gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2014 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "insn-codes.h"
  26 #include "rtl.h"
  27 #include "insn-attr.h"
  28 #include "tree.h"
  29 #include "stringpool.h"
  30 #include "stor-layout.h"
  31 #include "calls.h"
  32 #include "varasm.h"
  33 #include "regs.h"
  34 #include "df.h"
  35 #include "hard-reg-set.h"
  36 #include "output.h"
  37 #include "expr.h"
  38 #include "reload.h"
  39 #include "toplev.h"
  40 #include "target.h"
  41 #include "target-def.h"
  42 #include "targhooks.h"
  43 #include "ggc.h"
  44 #include "function.h"
  45 #include "tm_p.h"
  46 #include "recog.h"
  47 #include "langhooks.h"
  48 #include "diagnostic-core.h"
  49 #include "hash-table.h"
  50 #include "vec.h"
  51 #include "basic-block.h"
  52 #include "tree-ssa-alias.h"
  53 #include "internal-fn.h"
  54 #include "gimple-fold.h"
  55 #include "tree-eh.h"
  56 #include "gimple-expr.h"
  57 #include "is-a.h"
  58 #include "gimple.h"
  59 #include "gimplify.h"
  60 #include "optabs.h"
  61 #include "dwarf2.h"
  62 #include "cfgloop.h"
  63 #include "tree-vectorizer.h"
  64 #include "config/arm/aarch-cost-tables.h"
  65 #include "dumpfile.h"
  66 #include "builtins.h"
  67
  68 /* Defined for convenience.  */
  69 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  70
  71 /* Classifies an address.
  72
  73    ADDRESS_REG_IMM
  74        A simple base register plus immediate offset.
  75
  76    ADDRESS_REG_WB
  77        A base register indexed by immediate offset with writeback.
  78
  79    ADDRESS_REG_REG
  80        A base register indexed by (optionally scaled) register.
  81
  82    ADDRESS_REG_UXTW
  83        A base register indexed by (optionally scaled) zero-extended register.
  84
  85    ADDRESS_REG_SXTW
  86        A base register indexed by (optionally scaled) sign-extended register.
  87
  88    ADDRESS_LO_SUM
  89        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  90
  91    ADDRESS_SYMBOLIC:
  92        A constant symbolic address, in pc-relative literal pool.  */
  93
  94 enum aarch64_address_type {
  95   ADDRESS_REG_IMM,
  96   ADDRESS_REG_WB,
  97   ADDRESS_REG_REG,
  98   ADDRESS_REG_UXTW,
  99   ADDRESS_REG_SXTW,
 100   ADDRESS_LO_SUM,
 101   ADDRESS_SYMBOLIC
 102 };
 103
 104 struct aarch64_address_info {
 105   enum aarch64_address_type type;
 106   rtx base;
 107   rtx offset;
 108   int shift;
 109   enum aarch64_symbol_type symbol_type;
 110 };
 111
 112 struct simd_immediate_info
 113 {
 114   rtx value;
 115   int shift;
 116   int element_width;
 117   bool mvn;
 118   bool msl;
 119 };
 120
 121 /* The current code model.  */
 122 enum aarch64_code_model aarch64_cmodel;
 123
 124 #ifdef HAVE_AS_TLS
 125 #undef TARGET_HAVE_TLS
 126 #define TARGET_HAVE_TLS 1
 127 #endif
 128
 129 static bool aarch64_lra_p (void);
 130 static bool aarch64_composite_type_p (const_tree, enum machine_mode);
 131 static bool aarch64_vfp_is_call_or_return_candidate (enum machine_mode,
 132                                                      const_tree,
 133                                                      enum machine_mode *, int *,
 134                                                      bool *);
 135 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 136 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 137 static void aarch64_override_options_after_change (void);
 138 static bool aarch64_vector_mode_supported_p (enum machine_mode);
 139 static unsigned bit_count (unsigned HOST_WIDE_INT);
 140 static bool aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
 141                                                  const unsigned char *sel);
 142 static int aarch64_address_cost (rtx, enum machine_mode, addr_space_t, bool);
 143
 144 /* The processor for which instructions should be scheduled.  */
 145 enum aarch64_processor aarch64_tune = cortexa53;
 146
 147 /* The current tuning set.  */
 148 const struct tune_params *aarch64_tune_params;
 149
 150 /* Mask to specify which instructions we are allowed to generate.  */
 151 unsigned long aarch64_isa_flags = 0;
 152
 153 /* Mask to specify which instruction scheduling options should be used.  */
 154 unsigned long aarch64_tune_flags = 0;
 155
 156 /* Tuning parameters.  */
 157
 158 #if HAVE_DESIGNATED_INITIALIZERS
 159 #define NAMED_PARAM(NAME, VAL) .NAME = (VAL)
 160 #else
 161 #define NAMED_PARAM(NAME, VAL) (VAL)
 162 #endif
 163
 164 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 165 __extension__
 166 #endif
 167
 168 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 169 __extension__
 170 #endif
 171 static const struct cpu_addrcost_table generic_addrcost_table =
 172 {
 173 #if HAVE_DESIGNATED_INITIALIZERS
 174   .addr_scale_costs =
 175 #endif
 176     {
 177       NAMED_PARAM (hi, 0),
 178       NAMED_PARAM (si, 0),
 179       NAMED_PARAM (di, 0),
 180       NAMED_PARAM (ti, 0),
 181     },
 182   NAMED_PARAM (pre_modify, 0),
 183   NAMED_PARAM (post_modify, 0),
 184   NAMED_PARAM (register_offset, 0),
 185   NAMED_PARAM (register_extend, 0),
 186   NAMED_PARAM (imm_offset, 0)
 187 };
 188
 189 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 190 __extension__
 191 #endif
 192 static const struct cpu_addrcost_table cortexa57_addrcost_table =
 193 {
 194 #if HAVE_DESIGNATED_INITIALIZERS
 195   .addr_scale_costs =
 196 #endif
 197     {
 198       NAMED_PARAM (hi, 1),
 199       NAMED_PARAM (si, 0),
 200       NAMED_PARAM (di, 0),
 201       NAMED_PARAM (ti, 1),
 202     },
 203   NAMED_PARAM (pre_modify, 0),
 204   NAMED_PARAM (post_modify, 0),
 205   NAMED_PARAM (register_offset, 0),
 206   NAMED_PARAM (register_extend, 0),
 207   NAMED_PARAM (imm_offset, 0),
 208 };
 209
 210 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 211 __extension__
 212 #endif
 213 static const struct cpu_regmove_cost generic_regmove_cost =
 214 {
 215   NAMED_PARAM (GP2GP, 1),
 216   NAMED_PARAM (GP2FP, 2),
 217   NAMED_PARAM (FP2GP, 2),
 218   NAMED_PARAM (FP2FP, 2)
 219 };
 220
 221 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 222 {
 223   NAMED_PARAM (GP2GP, 1),
 224   /* Avoid the use of slow int<->fp moves for spilling by setting
 225      their cost higher than memmov_cost.  */
 226   NAMED_PARAM (GP2FP, 5),
 227   NAMED_PARAM (FP2GP, 5),
 228   NAMED_PARAM (FP2FP, 2)
 229 };
 230
 231 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 232 {
 233   NAMED_PARAM (GP2GP, 1),
 234   /* Avoid the use of slow int<->fp moves for spilling by setting
 235      their cost higher than memmov_cost.  */
 236   NAMED_PARAM (GP2FP, 5),
 237   NAMED_PARAM (FP2GP, 5),
 238   NAMED_PARAM (FP2FP, 2)
 239 };
 240
 241 /* Generic costs for vector insn classes.  */
 242 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 243 __extension__
 244 #endif
 245 static const struct cpu_vector_cost generic_vector_cost =
 246 {
 247   NAMED_PARAM (scalar_stmt_cost, 1),
 248   NAMED_PARAM (scalar_load_cost, 1),
 249   NAMED_PARAM (scalar_store_cost, 1),
 250   NAMED_PARAM (vec_stmt_cost, 1),
 251   NAMED_PARAM (vec_to_scalar_cost, 1),
 252   NAMED_PARAM (scalar_to_vec_cost, 1),
 253   NAMED_PARAM (vec_align_load_cost, 1),
 254   NAMED_PARAM (vec_unalign_load_cost, 1),
 255   NAMED_PARAM (vec_unalign_store_cost, 1),
 256   NAMED_PARAM (vec_store_cost, 1),
 257   NAMED_PARAM (cond_taken_branch_cost, 3),
 258   NAMED_PARAM (cond_not_taken_branch_cost, 1)
 259 };
 260
 261 /* Generic costs for vector insn classes.  */
 262 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 263 __extension__
 264 #endif
 265 static const struct cpu_vector_cost cortexa57_vector_cost =
 266 {
 267   NAMED_PARAM (scalar_stmt_cost, 1),
 268   NAMED_PARAM (scalar_load_cost, 4),
 269   NAMED_PARAM (scalar_store_cost, 1),
 270   NAMED_PARAM (vec_stmt_cost, 3),
 271   NAMED_PARAM (vec_to_scalar_cost, 8),
 272   NAMED_PARAM (scalar_to_vec_cost, 8),
 273   NAMED_PARAM (vec_align_load_cost, 5),
 274   NAMED_PARAM (vec_unalign_load_cost, 5),
 275   NAMED_PARAM (vec_unalign_store_cost, 1),
 276   NAMED_PARAM (vec_store_cost, 1),
 277   NAMED_PARAM (cond_taken_branch_cost, 1),
 278   NAMED_PARAM (cond_not_taken_branch_cost, 1)
 279 };
 280
 281 #if HAVE_DESIGNATED_INITIALIZERS && GCC_VERSION >= 2007
 282 __extension__
 283 #endif
 284 static const struct tune_params generic_tunings =
 285 {
 286   &cortexa57_extra_costs,
 287   &generic_addrcost_table,
 288   &generic_regmove_cost,
 289   &generic_vector_cost,
 290   NAMED_PARAM (memmov_cost, 4),
 291   NAMED_PARAM (issue_rate, 2)
 292 };
 293
 294 static const struct tune_params cortexa53_tunings =
 295 {
 296   &cortexa53_extra_costs,
 297   &generic_addrcost_table,
 298   &cortexa53_regmove_cost,
 299   &generic_vector_cost,
 300   NAMED_PARAM (memmov_cost, 4),
 301   NAMED_PARAM (issue_rate, 2)
 302 };
 303
 304 static const struct tune_params cortexa57_tunings =
 305 {
 306   &cortexa57_extra_costs,
 307   &cortexa57_addrcost_table,
 308   &cortexa57_regmove_cost,
 309   &cortexa57_vector_cost,
 310   NAMED_PARAM (memmov_cost, 4),
 311   NAMED_PARAM (issue_rate, 3)
 312 };
 313
 314 /* A processor implementing AArch64.  */
 315 struct processor
 316 {
 317   const char *const name;
 318   enum aarch64_processor core;
 319   const char *arch;
 320   const unsigned long flags;
 321   const struct tune_params *const tune;
 322 };
 323
 324 /* Processor cores implementing AArch64.  */
 325 static const struct processor all_cores[] =
 326 {
 327 #define AARCH64_CORE(NAME, X, IDENT, ARCH, FLAGS, COSTS) \
 328   {NAME, IDENT, #ARCH, FLAGS | AARCH64_FL_FOR_ARCH##ARCH, &COSTS##_tunings},
 329 #include "aarch64-cores.def"
 330 #undef AARCH64_CORE
 331   {"generic", cortexa53, "8", AARCH64_FL_FPSIMD | AARCH64_FL_FOR_ARCH8, &generic_tunings},
 332   {NULL, aarch64_none, NULL, 0, NULL}
 333 };
 334
 335 /* Architectures implementing AArch64.  */
 336 static const struct processor all_architectures[] =
 337 {
 338 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
 339   {NAME, CORE, #ARCH, FLAGS, NULL},
 340 #include "aarch64-arches.def"
 341 #undef AARCH64_ARCH
 342   {NULL, aarch64_none, NULL, 0, NULL}
 343 };
 344
 345 /* Target specification.  These are populated as commandline arguments
 346    are processed, or NULL if not specified.  */
 347 static const struct processor *selected_arch;
 348 static const struct processor *selected_cpu;
 349 static const struct processor *selected_tune;
 350
 351 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 352
 353 /* An ISA extension in the co-processor and main instruction set space.  */
 354 struct aarch64_option_extension
 355 {
 356   const char *const name;
 357   const unsigned long flags_on;
 358   const unsigned long flags_off;
 359 };
 360
 361 /* ISA extensions in AArch64.  */
 362 static const struct aarch64_option_extension all_extensions[] =
 363 {
 364 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF) \
 365   {NAME, FLAGS_ON, FLAGS_OFF},
 366 #include "aarch64-option-extensions.def"
 367 #undef AARCH64_OPT_EXTENSION
 368   {NULL, 0, 0}
 369 };
 370
 371 /* Used to track the size of an address when generating a pre/post
 372    increment address.  */
 373 static enum machine_mode aarch64_memory_reference_mode;
 374
 375 /* Used to force GTY into this file.  */
 376 static GTY(()) int gty_dummy;
 377
 378 /* A table of valid AArch64 "bitmask immediate" values for
 379    logical instructions.  */
 380
 381 #define AARCH64_NUM_BITMASKS  5334
 382 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
 383
 384 typedef enum aarch64_cond_code
 385 {
 386   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 387   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 388   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 389 }
 390 aarch64_cc;
 391
 392 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 393
 394 /* The condition codes of the processor, and the inverse function.  */
 395 static const char * const aarch64_condition_codes[] =
 396 {
 397   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 398   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 399 };
 400
 401 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
 402 unsigned
 403 aarch64_dbx_register_number (unsigned regno)
 404 {
 405    if (GP_REGNUM_P (regno))
 406      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
 407    else if (regno == SP_REGNUM)
 408      return AARCH64_DWARF_SP;
 409    else if (FP_REGNUM_P (regno))
 410      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
 411
 412    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
 413       equivalent DWARF register.  */
 414    return DWARF_FRAME_REGISTERS;
 415 }
 416
 417 /* Return TRUE if MODE is any of the large INT modes.  */
 418 static bool
 419 aarch64_vect_struct_mode_p (enum machine_mode mode)
 420 {
 421   return mode == OImode || mode == CImode || mode == XImode;
 422 }
 423
 424 /* Return TRUE if MODE is any of the vector modes.  */
 425 static bool
 426 aarch64_vector_mode_p (enum machine_mode mode)
 427 {
 428   return aarch64_vector_mode_supported_p (mode)
 429          || aarch64_vect_struct_mode_p (mode);
 430 }
 431
 432 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
 433 static bool
 434 aarch64_array_mode_supported_p (enum machine_mode mode,
 435                                 unsigned HOST_WIDE_INT nelems)
 436 {
 437   if (TARGET_SIMD
 438       && AARCH64_VALID_SIMD_QREG_MODE (mode)
 439       && (nelems >= 2 && nelems <= 4))
 440     return true;
 441
 442   return false;
 443 }
 444
 445 /* Implement HARD_REGNO_NREGS.  */
 446
 447 int
 448 aarch64_hard_regno_nregs (unsigned regno, enum machine_mode mode)
 449 {
 450   switch (aarch64_regno_regclass (regno))
 451     {
 452     case FP_REGS:
 453     case FP_LO_REGS:
 454       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
 455     default:
 456       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
 457     }
 458   gcc_unreachable ();
 459 }
 460
 461 /* Implement HARD_REGNO_MODE_OK.  */
 462
 463 int
 464 aarch64_hard_regno_mode_ok (unsigned regno, enum machine_mode mode)
 465 {
 466   if (GET_MODE_CLASS (mode) == MODE_CC)
 467     return regno == CC_REGNUM;
 468
 469   if (regno == SP_REGNUM)
 470     /* The purpose of comparing with ptr_mode is to support the
 471        global register variable associated with the stack pointer
 472        register via the syntax of asm ("wsp") in ILP32.  */
 473     return mode == Pmode || mode == ptr_mode;
 474
 475   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
 476     return mode == Pmode;
 477
 478   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
 479     return 1;
 480
 481   if (FP_REGNUM_P (regno))
 482     {
 483       if (aarch64_vect_struct_mode_p (mode))
 484         return
 485           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
 486       else
 487         return 1;
 488     }
 489
 490   return 0;
 491 }
 492
 493 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
 494 enum machine_mode
 495 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
 496                                      enum machine_mode mode)
 497 {
 498   /* Handle modes that fit within single registers.  */
 499   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
 500     {
 501       if (GET_MODE_SIZE (mode) >= 4)
 502         return mode;
 503       else
 504         return SImode;
 505     }
 506   /* Fall back to generic for multi-reg and very large modes.  */
 507   else
 508     return choose_hard_reg_mode (regno, nregs, false);
 509 }
 510
 511 /* Return true if calls to DECL should be treated as
 512    long-calls (ie called via a register).  */
 513 static bool
 514 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
 515 {
 516   return false;
 517 }
 518
 519 /* Return true if calls to symbol-ref SYM should be treated as
 520    long-calls (ie called via a register).  */
 521 bool
 522 aarch64_is_long_call_p (rtx sym)
 523 {
 524   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
 525 }
 526
 527 /* Return true if the offsets to a zero/sign-extract operation
 528    represent an expression that matches an extend operation.  The
 529    operands represent the paramters from
 530
 531    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
 532 bool
 533 aarch64_is_extend_from_extract (enum machine_mode mode, rtx mult_imm,
 534                                 rtx extract_imm)
 535 {
 536   HOST_WIDE_INT mult_val, extract_val;
 537
 538   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
 539     return false;
 540
 541   mult_val = INTVAL (mult_imm);
 542   extract_val = INTVAL (extract_imm);
 543
 544   if (extract_val > 8
 545       && extract_val < GET_MODE_BITSIZE (mode)
 546       && exact_log2 (extract_val & ~7) > 0
 547       && (extract_val & 7) <= 4
 548       && mult_val == (1 << (extract_val & 7)))
 549     return true;
 550
 551   return false;
 552 }
 553
 554 /* Emit an insn that's a simple single-set.  Both the operands must be
 555    known to be valid.  */
 556 inline static rtx
 557 emit_set_insn (rtx x, rtx y)
 558 {
 559   return emit_insn (gen_rtx_SET (VOIDmode, x, y));
 560 }
 561
 562 /* X and Y are two things to compare using CODE.  Emit the compare insn and
 563    return the rtx for register 0 in the proper mode.  */
 564 rtx
 565 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 566 {
 567   enum machine_mode mode = SELECT_CC_MODE (code, x, y);
 568   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
 569
 570   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
 571   return cc_reg;
 572 }
 573
 574 /* Build the SYMBOL_REF for __tls_get_addr.  */
 575
 576 static GTY(()) rtx tls_get_addr_libfunc;
 577
 578 rtx
 579 aarch64_tls_get_addr (void)
 580 {
 581   if (!tls_get_addr_libfunc)
 582     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
 583   return tls_get_addr_libfunc;
 584 }
 585
 586 /* Return the TLS model to use for ADDR.  */
 587
 588 static enum tls_model
 589 tls_symbolic_operand_type (rtx addr)
 590 {
 591   enum tls_model tls_kind = TLS_MODEL_NONE;
 592   rtx sym, addend;
 593
 594   if (GET_CODE (addr) == CONST)
 595     {
 596       split_const (addr, &sym, &addend);
 597       if (GET_CODE (sym) == SYMBOL_REF)
 598         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
 599     }
 600   else if (GET_CODE (addr) == SYMBOL_REF)
 601     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
 602
 603   return tls_kind;
 604 }
 605
 606 /* We'll allow lo_sum's in addresses in our legitimate addresses
 607    so that combine would take care of combining addresses where
 608    necessary, but for generation purposes, we'll generate the address
 609    as :
 610    RTL                               Absolute
 611    tmp = hi (symbol_ref);            adrp  x1, foo
 612    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
 613                                      nop
 614
 615    PIC                               TLS
 616    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
 617    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
 618                                      bl   __tls_get_addr
 619                                      nop
 620
 621    Load TLS symbol, depending on TLS mechanism and TLS access model.
 622
 623    Global Dynamic - Traditional TLS:
 624    adrp tmp, :tlsgd:imm
 625    add  dest, tmp, #:tlsgd_lo12:imm
 626    bl   __tls_get_addr
 627
 628    Global Dynamic - TLS Descriptors:
 629    adrp dest, :tlsdesc:imm
 630    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
 631    add  dest, dest, #:tlsdesc_lo12:imm
 632    blr  tmp
 633    mrs  tp, tpidr_el0
 634    add  dest, dest, tp
 635
 636    Initial Exec:
 637    mrs  tp, tpidr_el0
 638    adrp tmp, :gottprel:imm
 639    ldr  dest, [tmp, #:gottprel_lo12:imm]
 640    add  dest, dest, tp
 641
 642    Local Exec:
 643    mrs  tp, tpidr_el0
 644    add  t0, tp, #:tprel_hi12:imm
 645    add  t0, #:tprel_lo12_nc:imm
 646 */
 647
 648 static void
 649 aarch64_load_symref_appropriately (rtx dest, rtx imm,
 650                                    enum aarch64_symbol_type type)
 651 {
 652   switch (type)
 653     {
 654     case SYMBOL_SMALL_ABSOLUTE:
 655       {
 656         /* In ILP32, the mode of dest can be either SImode or DImode.  */
 657         rtx tmp_reg = dest;
 658         enum machine_mode mode = GET_MODE (dest);
 659
 660         gcc_assert (mode == Pmode || mode == ptr_mode);
 661
 662         if (can_create_pseudo_p ())
 663           tmp_reg = gen_reg_rtx (mode);
 664
 665         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
 666         emit_insn (gen_add_losym (dest, tmp_reg, imm));
 667         return;
 668       }
 669
 670     case SYMBOL_TINY_ABSOLUTE:
 671       emit_insn (gen_rtx_SET (Pmode, dest, imm));
 672       return;
 673
 674     case SYMBOL_SMALL_GOT:
 675       {
 676         /* In ILP32, the mode of dest can be either SImode or DImode,
 677            while the got entry is always of SImode size.  The mode of
 678            dest depends on how dest is used: if dest is assigned to a
 679            pointer (e.g. in the memory), it has SImode; it may have
 680            DImode if dest is dereferenced to access the memeory.
 681            This is why we have to handle three different ldr_got_small
 682            patterns here (two patterns for ILP32).  */
 683         rtx tmp_reg = dest;
 684         enum machine_mode mode = GET_MODE (dest);
 685
 686         if (can_create_pseudo_p ())
 687           tmp_reg = gen_reg_rtx (mode);
 688
 689         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
 690         if (mode == ptr_mode)
 691           {
 692             if (mode == DImode)
 693               emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
 694             else
 695               emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
 696           }
 697         else
 698           {
 699             gcc_assert (mode == Pmode);
 700             emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
 701           }
 702
 703         return;
 704       }
 705
 706     case SYMBOL_SMALL_TLSGD:
 707       {
 708         rtx_insn *insns;
 709         rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
 710
 711         start_sequence ();
 712         aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
 713         insns = get_insns ();
 714         end_sequence ();
 715
 716         RTL_CONST_CALL_P (insns) = 1;
 717         emit_libcall_block (insns, dest, result, imm);
 718         return;
 719       }
 720
 721     case SYMBOL_SMALL_TLSDESC:
 722       {
 723         enum machine_mode mode = GET_MODE (dest);
 724         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
 725         rtx tp;
 726
 727         gcc_assert (mode == Pmode || mode == ptr_mode);
 728
 729         /* In ILP32, the got entry is always of SImode size.  Unlike
 730            small GOT, the dest is fixed at reg 0.  */
 731         if (TARGET_ILP32)
 732           emit_insn (gen_tlsdesc_small_si (imm));
 733         else
 734           emit_insn (gen_tlsdesc_small_di (imm));
 735         tp = aarch64_load_tp (NULL);
 736
 737         if (mode != Pmode)
 738           tp = gen_lowpart (mode, tp);
 739
 740         emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
 741         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
 742         return;
 743       }
 744
 745     case SYMBOL_SMALL_GOTTPREL:
 746       {
 747         /* In ILP32, the mode of dest can be either SImode or DImode,
 748            while the got entry is always of SImode size.  The mode of
 749            dest depends on how dest is used: if dest is assigned to a
 750            pointer (e.g. in the memory), it has SImode; it may have
 751            DImode if dest is dereferenced to access the memeory.
 752            This is why we have to handle three different tlsie_small
 753            patterns here (two patterns for ILP32).  */
 754         enum machine_mode mode = GET_MODE (dest);
 755         rtx tmp_reg = gen_reg_rtx (mode);
 756         rtx tp = aarch64_load_tp (NULL);
 757
 758         if (mode == ptr_mode)
 759           {
 760             if (mode == DImode)
 761               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
 762             else
 763               {
 764                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
 765                 tp = gen_lowpart (mode, tp);
 766               }
 767           }
 768         else
 769           {
 770             gcc_assert (mode == Pmode);
 771             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
 772           }
 773
 774         emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
 775         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
 776         return;
 777       }
 778
 779     case SYMBOL_SMALL_TPREL:
 780       {
 781         rtx tp = aarch64_load_tp (NULL);
 782         emit_insn (gen_tlsle_small (dest, tp, imm));
 783         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
 784         return;
 785       }
 786
 787     case SYMBOL_TINY_GOT:
 788       emit_insn (gen_ldr_got_tiny (dest, imm));
 789       return;
 790
 791     default:
 792       gcc_unreachable ();
 793     }
 794 }
 795
 796 /* Emit a move from SRC to DEST.  Assume that the move expanders can
 797    handle all moves if !can_create_pseudo_p ().  The distinction is
 798    important because, unlike emit_move_insn, the move expanders know
 799    how to force Pmode objects into the constant pool even when the
 800    constant pool address is not itself legitimate.  */
 801 static rtx
 802 aarch64_emit_move (rtx dest, rtx src)
 803 {
 804   return (can_create_pseudo_p ()
 805           ? emit_move_insn (dest, src)
 806           : emit_move_insn_1 (dest, src));
 807 }
 808
 809 /* Split a 128-bit move operation into two 64-bit move operations,
 810    taking care to handle partial overlap of register to register
 811    copies.  Special cases are needed when moving between GP regs and
 812    FP regs.  SRC can be a register, constant or memory; DST a register
 813    or memory.  If either operand is memory it must not have any side
 814    effects.  */
 815 void
 816 aarch64_split_128bit_move (rtx dst, rtx src)
 817 {
 818   rtx dst_lo, dst_hi;
 819   rtx src_lo, src_hi;
 820
 821   enum machine_mode mode = GET_MODE (dst);
 822
 823   gcc_assert (mode == TImode || mode == TFmode);
 824   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
 825   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
 826
 827   if (REG_P (dst) && REG_P (src))
 828     {
 829       int src_regno = REGNO (src);
 830       int dst_regno = REGNO (dst);
 831
 832       /* Handle FP <-> GP regs.  */
 833       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
 834         {
 835           src_lo = gen_lowpart (word_mode, src);
 836           src_hi = gen_highpart (word_mode, src);
 837
 838           if (mode == TImode)
 839             {
 840               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
 841               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
 842             }
 843           else
 844             {
 845               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
 846               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
 847             }
 848           return;
 849         }
 850       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
 851         {
 852           dst_lo = gen_lowpart (word_mode, dst);
 853           dst_hi = gen_highpart (word_mode, dst);
 854
 855           if (mode == TImode)
 856             {
 857               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
 858               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
 859             }
 860           else
 861             {
 862               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
 863               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
 864             }
 865           return;
 866         }
 867     }
 868
 869   dst_lo = gen_lowpart (word_mode, dst);
 870   dst_hi = gen_highpart (word_mode, dst);
 871   src_lo = gen_lowpart (word_mode, src);
 872   src_hi = gen_highpart_mode (word_mode, mode, src);
 873
 874   /* At most one pairing may overlap.  */
 875   if (reg_overlap_mentioned_p (dst_lo, src_hi))
 876     {
 877       aarch64_emit_move (dst_hi, src_hi);
 878       aarch64_emit_move (dst_lo, src_lo);
 879     }
 880   else
 881     {
 882       aarch64_emit_move (dst_lo, src_lo);
 883       aarch64_emit_move (dst_hi, src_hi);
 884     }
 885 }
 886
 887 bool
 888 aarch64_split_128bit_move_p (rtx dst, rtx src)
 889 {
 890   return (! REG_P (src)
 891           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
 892 }
 893
 894 /* Split a complex SIMD combine.  */
 895
 896 void
 897 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
 898 {
 899   enum machine_mode src_mode = GET_MODE (src1);
 900   enum machine_mode dst_mode = GET_MODE (dst);
 901
 902   gcc_assert (VECTOR_MODE_P (dst_mode));
 903
 904   if (REG_P (dst) && REG_P (src1) && REG_P (src2))
 905     {
 906       rtx (*gen) (rtx, rtx, rtx);
 907
 908       switch (src_mode)
 909         {
 910         case V8QImode:
 911           gen = gen_aarch64_simd_combinev8qi;
 912           break;
 913         case V4HImode:
 914           gen = gen_aarch64_simd_combinev4hi;
 915           break;
 916         case V2SImode:
 917           gen = gen_aarch64_simd_combinev2si;
 918           break;
 919         case V2SFmode:
 920           gen = gen_aarch64_simd_combinev2sf;
 921           break;
 922         case DImode:
 923           gen = gen_aarch64_simd_combinedi;
 924           break;
 925         case DFmode:
 926           gen = gen_aarch64_simd_combinedf;
 927           break;
 928         default:
 929           gcc_unreachable ();
 930         }
 931
 932       emit_insn (gen (dst, src1, src2));
 933       return;
 934     }
 935 }
 936
 937 /* Split a complex SIMD move.  */
 938
 939 void
 940 aarch64_split_simd_move (rtx dst, rtx src)
 941 {
 942   enum machine_mode src_mode = GET_MODE (src);
 943   enum machine_mode dst_mode = GET_MODE (dst);
 944
 945   gcc_assert (VECTOR_MODE_P (dst_mode));
 946
 947   if (REG_P (dst) && REG_P (src))
 948     {
 949       rtx (*gen) (rtx, rtx);
 950
 951       gcc_assert (VECTOR_MODE_P (src_mode));
 952
 953       switch (src_mode)
 954         {
 955         case V16QImode:
 956           gen = gen_aarch64_split_simd_movv16qi;
 957           break;
 958         case V8HImode:
 959           gen = gen_aarch64_split_simd_movv8hi;
 960           break;
 961         case V4SImode:
 962           gen = gen_aarch64_split_simd_movv4si;
 963           break;
 964         case V2DImode:
 965           gen = gen_aarch64_split_simd_movv2di;
 966           break;
 967         case V4SFmode:
 968           gen = gen_aarch64_split_simd_movv4sf;
 969           break;
 970         case V2DFmode:
 971           gen = gen_aarch64_split_simd_movv2df;
 972           break;
 973         default:
 974           gcc_unreachable ();
 975         }
 976
 977       emit_insn (gen (dst, src));
 978       return;
 979     }
 980 }
 981
 982 static rtx
 983 aarch64_force_temporary (enum machine_mode mode, rtx x, rtx value)
 984 {
 985   if (can_create_pseudo_p ())
 986     return force_reg (mode, value);
 987   else
 988     {
 989       x = aarch64_emit_move (x, value);
 990       return x;
 991     }
 992 }
 993
 994
 995 static rtx
 996 aarch64_add_offset (enum machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
 997 {
 998   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
 999     {
1000       rtx high;
1001       /* Load the full offset into a register.  This
1002          might be improvable in the future.  */
1003       high = GEN_INT (offset);
1004       offset = 0;
1005       high = aarch64_force_temporary (mode, temp, high);
1006       reg = aarch64_force_temporary (mode, temp,
1007                                      gen_rtx_PLUS (mode, high, reg));
1008     }
1009   return plus_constant (mode, reg, offset);
1010 }
1011
1012 void
1013 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1014 {
1015   enum machine_mode mode = GET_MODE (dest);
1016   unsigned HOST_WIDE_INT mask;
1017   int i;
1018   bool first;
1019   unsigned HOST_WIDE_INT val;
1020   bool subtargets;
1021   rtx subtarget;
1022   int one_match, zero_match, first_not_ffff_match;
1023
1024   gcc_assert (mode == SImode || mode == DImode);
1025
1026   /* Check on what type of symbol it is.  */
1027   if (GET_CODE (imm) == SYMBOL_REF
1028       || GET_CODE (imm) == LABEL_REF
1029       || GET_CODE (imm) == CONST)
1030     {
1031       rtx mem, base, offset;
1032       enum aarch64_symbol_type sty;
1033
1034       /* If we have (const (plus symbol offset)), separate out the offset
1035          before we start classifying the symbol.  */
1036       split_const (imm, &base, &offset);
1037
1038       sty = aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR);
1039       switch (sty)
1040         {
1041         case SYMBOL_FORCE_TO_MEM:
1042           if (offset != const0_rtx
1043               && targetm.cannot_force_const_mem (mode, imm))
1044             {
1045               gcc_assert (can_create_pseudo_p ());
1046               base = aarch64_force_temporary (mode, dest, base);
1047               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1048               aarch64_emit_move (dest, base);
1049               return;
1050             }
1051           mem = force_const_mem (ptr_mode, imm);
1052           gcc_assert (mem);
1053           if (mode != ptr_mode)
1054             mem = gen_rtx_ZERO_EXTEND (mode, mem);
1055           emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1056           return;
1057
1058         case SYMBOL_SMALL_TLSGD:
1059         case SYMBOL_SMALL_TLSDESC:
1060         case SYMBOL_SMALL_GOTTPREL:
1061         case SYMBOL_SMALL_GOT:
1062         case SYMBOL_TINY_GOT:
1063           if (offset != const0_rtx)
1064             {
1065               gcc_assert(can_create_pseudo_p ());
1066               base = aarch64_force_temporary (mode, dest, base);
1067               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1068               aarch64_emit_move (dest, base);
1069               return;
1070             }
1071           /* FALLTHRU */
1072
1073         case SYMBOL_SMALL_TPREL:
1074         case SYMBOL_SMALL_ABSOLUTE:
1075         case SYMBOL_TINY_ABSOLUTE:
1076           aarch64_load_symref_appropriately (dest, imm, sty);
1077           return;
1078
1079         default:
1080           gcc_unreachable ();
1081         }
1082     }
1083
1084   if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1085     {
1086       emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1087       return;
1088     }
1089
1090   if (!CONST_INT_P (imm))
1091     {
1092       if (GET_CODE (imm) == HIGH)
1093         emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1094       else
1095         {
1096           rtx mem = force_const_mem (mode, imm);
1097           gcc_assert (mem);
1098           emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1099         }
1100
1101       return;
1102     }
1103
1104   if (mode == SImode)
1105     {
1106       /* We know we can't do this in 1 insn, and we must be able to do it
1107          in two; so don't mess around looking for sequences that don't buy
1108          us anything.  */
1109       emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (INTVAL (imm) & 0xffff)));
1110       emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1111                                  GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1112       return;
1113     }
1114
1115   /* Remaining cases are all for DImode.  */
1116
1117   val = INTVAL (imm);
1118   subtargets = optimize && can_create_pseudo_p ();
1119
1120   one_match = 0;
1121   zero_match = 0;
1122   mask = 0xffff;
1123   first_not_ffff_match = -1;
1124
1125   for (i = 0; i < 64; i += 16, mask <<= 16)
1126     {
1127       if ((val & mask) == mask)
1128         one_match++;
1129       else
1130         {
1131           if (first_not_ffff_match < 0)
1132             first_not_ffff_match = i;
1133           if ((val & mask) == 0)
1134             zero_match++;
1135         }
1136     }
1137
1138   if (one_match == 2)
1139     {
1140       /* Set one of the quarters and then insert back into result.  */
1141       mask = 0xffffll << first_not_ffff_match;
1142       emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1143       emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1144                                  GEN_INT ((val >> first_not_ffff_match)
1145                                           & 0xffff)));
1146       return;
1147     }
1148
1149   if (zero_match == 2)
1150     goto simple_sequence;
1151
1152   mask = 0x0ffff0000UL;
1153   for (i = 16; i < 64; i += 16, mask <<= 16)
1154     {
1155       HOST_WIDE_INT comp = mask & ~(mask - 1);
1156
1157       if (aarch64_uimm12_shift (val - (val & mask)))
1158         {
1159           subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1160
1161           emit_insn (gen_rtx_SET (VOIDmode, subtarget, GEN_INT (val & mask)));
1162           emit_insn (gen_adddi3 (dest, subtarget,
1163                                  GEN_INT (val - (val & mask))));
1164           return;
1165         }
1166       else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1167         {
1168           subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1169
1170           emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1171                                   GEN_INT ((val + comp) & mask)));
1172           emit_insn (gen_adddi3 (dest, subtarget,
1173                                  GEN_INT (val - ((val + comp) & mask))));
1174           return;
1175         }
1176       else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1177         {
1178           subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1179
1180           emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1181                                   GEN_INT ((val - comp) | ~mask)));
1182           emit_insn (gen_adddi3 (dest, subtarget,
1183                                  GEN_INT (val - ((val - comp) | ~mask))));
1184           return;
1185         }
1186       else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1187         {
1188           subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1189
1190           emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1191                                   GEN_INT (val | ~mask)));
1192           emit_insn (gen_adddi3 (dest, subtarget,
1193                                  GEN_INT (val - (val | ~mask))));
1194           return;
1195         }
1196     }
1197
1198   /* See if we can do it by arithmetically combining two
1199      immediates.  */
1200   for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1201     {
1202       int j;
1203       mask = 0xffff;
1204
1205       if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1206           || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1207         {
1208           subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1209           emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1210                                   GEN_INT (aarch64_bitmasks[i])));
1211           emit_insn (gen_adddi3 (dest, subtarget,
1212                                  GEN_INT (val - aarch64_bitmasks[i])));
1213           return;
1214         }
1215
1216       for (j = 0; j < 64; j += 16, mask <<= 16)
1217         {
1218           if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1219             {
1220               emit_insn (gen_rtx_SET (VOIDmode, dest,
1221                                       GEN_INT (aarch64_bitmasks[i])));
1222               emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1223                                          GEN_INT ((val >> j) & 0xffff)));
1224               return;
1225             }
1226         }
1227     }
1228
1229   /* See if we can do it by logically combining two immediates.  */
1230   for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1231     {
1232       if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1233         {
1234           int j;
1235
1236           for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1237             if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1238               {
1239                 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1240                 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1241                                         GEN_INT (aarch64_bitmasks[i])));
1242                 emit_insn (gen_iordi3 (dest, subtarget,
1243                                        GEN_INT (aarch64_bitmasks[j])));
1244                 return;
1245               }
1246         }
1247       else if ((val & aarch64_bitmasks[i]) == val)
1248         {
1249           int j;
1250
1251           for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1252             if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1253               {
1254
1255                 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1256                 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1257                                         GEN_INT (aarch64_bitmasks[j])));
1258                 emit_insn (gen_anddi3 (dest, subtarget,
1259                                        GEN_INT (aarch64_bitmasks[i])));
1260                 return;
1261               }
1262         }
1263     }
1264
1265   if (one_match > zero_match)
1266     {
1267       /* Set either first three quarters or all but the third.   */
1268       mask = 0xffffll << (16 - first_not_ffff_match);
1269       emit_insn (gen_rtx_SET (VOIDmode, dest,
1270                               GEN_INT (val | mask | 0xffffffff00000000ull)));
1271
1272       /* Now insert other two quarters.  */
1273       for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1274            i < 64; i += 16, mask <<= 16)
1275         {
1276           if ((val & mask) != mask)
1277             emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1278                                        GEN_INT ((val >> i) & 0xffff)));
1279         }
1280       return;
1281     }
1282
1283  simple_sequence:
1284   first = true;
1285   mask = 0xffff;
1286   for (i = 0; i < 64; i += 16, mask <<= 16)
1287     {
1288       if ((val & mask) != 0)
1289         {
1290           if (first)
1291             {
1292               emit_insn (gen_rtx_SET (VOIDmode, dest,
1293                                       GEN_INT (val & mask)));
1294               first = false;
1295             }
1296           else
1297             emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1298                                        GEN_INT ((val >> i) & 0xffff)));
1299         }
1300     }
1301 }
1302
1303 static bool
1304 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1305                                  tree exp ATTRIBUTE_UNUSED)
1306 {
1307   /* Currently, always true.  */
1308   return true;
1309 }
1310
1311 /* Implement TARGET_PASS_BY_REFERENCE.  */
1312
1313 static bool
1314 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1315                            enum machine_mode mode,
1316                            const_tree type,
1317                            bool named ATTRIBUTE_UNUSED)
1318 {
1319   HOST_WIDE_INT size;
1320   enum machine_mode dummymode;
1321   int nregs;
1322
1323   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
1324   size = (mode == BLKmode && type)
1325     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1326
1327   /* Aggregates are passed by reference based on their size.  */
1328   if (type && AGGREGATE_TYPE_P (type))
1329     {
1330       size = int_size_in_bytes (type);
1331     }
1332
1333   /* Variable sized arguments are always returned by reference.  */
1334   if (size < 0)
1335     return true;
1336
1337   /* Can this be a candidate to be passed in fp/simd register(s)?  */
1338   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1339                                                &dummymode, &nregs,
1340                                                NULL))
1341     return false;
1342
1343   /* Arguments which are variable sized or larger than 2 registers are
1344      passed by reference unless they are a homogenous floating point
1345      aggregate.  */
1346   return size > 2 * UNITS_PER_WORD;
1347 }
1348
1349 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
1350 static bool
1351 aarch64_return_in_msb (const_tree valtype)
1352 {
1353   enum machine_mode dummy_mode;
1354   int dummy_int;
1355
1356   /* Never happens in little-endian mode.  */
1357   if (!BYTES_BIG_ENDIAN)
1358     return false;
1359
1360   /* Only composite types smaller than or equal to 16 bytes can
1361      be potentially returned in registers.  */
1362   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1363       || int_size_in_bytes (valtype) <= 0
1364       || int_size_in_bytes (valtype) > 16)
1365     return false;
1366
1367   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1368      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1369      is always passed/returned in the least significant bits of fp/simd
1370      register(s).  */
1371   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1372                                                &dummy_mode, &dummy_int, NULL))
1373     return false;
1374
1375   return true;
1376 }
1377
1378 /* Implement TARGET_FUNCTION_VALUE.
1379    Define how to find the value returned by a function.  */
1380
1381 static rtx
1382 aarch64_function_value (const_tree type, const_tree func,
1383                         bool outgoing ATTRIBUTE_UNUSED)
1384 {
1385   enum machine_mode mode;
1386   int unsignedp;
1387   int count;
1388   enum machine_mode ag_mode;
1389
1390   mode = TYPE_MODE (type);
1391   if (INTEGRAL_TYPE_P (type))
1392     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1393
1394   if (aarch64_return_in_msb (type))
1395     {
1396       HOST_WIDE_INT size = int_size_in_bytes (type);
1397
1398       if (size % UNITS_PER_WORD != 0)
1399         {
1400           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1401           mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1402         }
1403     }
1404
1405   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1406                                                &ag_mode, &count, NULL))
1407     {
1408       if (!aarch64_composite_type_p (type, mode))
1409         {
1410           gcc_assert (count == 1 && mode == ag_mode);
1411           return gen_rtx_REG (mode, V0_REGNUM);
1412         }
1413       else
1414         {
1415           int i;
1416           rtx par;
1417
1418           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1419           for (i = 0; i < count; i++)
1420             {
1421               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1422               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1423                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1424               XVECEXP (par, 0, i) = tmp;
1425             }
1426           return par;
1427         }
1428     }
1429   else
1430     return gen_rtx_REG (mode, R0_REGNUM);
1431 }
1432
1433 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1434    Return true if REGNO is the number of a hard register in which the values
1435    of called function may come back.  */
1436
1437 static bool
1438 aarch64_function_value_regno_p (const unsigned int regno)
1439 {
1440   /* Maximum of 16 bytes can be returned in the general registers.  Examples
1441      of 16-byte return values are: 128-bit integers and 16-byte small
1442      structures (excluding homogeneous floating-point aggregates).  */
1443   if (regno == R0_REGNUM || regno == R1_REGNUM)
1444     return true;
1445
1446   /* Up to four fp/simd registers can return a function value, e.g. a
1447      homogeneous floating-point aggregate having four members.  */
1448   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1449     return !TARGET_GENERAL_REGS_ONLY;
1450
1451   return false;
1452 }
1453
1454 /* Implement TARGET_RETURN_IN_MEMORY.
1455
1456    If the type T of the result of a function is such that
1457      void func (T arg)
1458    would require that arg be passed as a value in a register (or set of
1459    registers) according to the parameter passing rules, then the result
1460    is returned in the same registers as would be used for such an
1461    argument.  */
1462
1463 static bool
1464 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1465 {
1466   HOST_WIDE_INT size;
1467   enum machine_mode ag_mode;
1468   int count;
1469
1470   if (!AGGREGATE_TYPE_P (type)
1471       && TREE_CODE (type) != COMPLEX_TYPE
1472       && TREE_CODE (type) != VECTOR_TYPE)
1473     /* Simple scalar types always returned in registers.  */
1474     return false;
1475
1476   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1477                                                type,
1478                                                &ag_mode,
1479                                                &count,
1480                                                NULL))
1481     return false;
1482
1483   /* Types larger than 2 registers returned in memory.  */
1484   size = int_size_in_bytes (type);
1485   return (size < 0 || size > 2 * UNITS_PER_WORD);
1486 }
1487
1488 static bool
1489 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, enum machine_mode mode,
1490                                const_tree type, int *nregs)
1491 {
1492   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1493   return aarch64_vfp_is_call_or_return_candidate (mode,
1494                                                   type,
1495                                                   &pcum->aapcs_vfp_rmode,
1496                                                   nregs,
1497                                                   NULL);
1498 }
1499
1500 /* Given MODE and TYPE of a function argument, return the alignment in
1501    bits.  The idea is to suppress any stronger alignment requested by
1502    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1503    This is a helper function for local use only.  */
1504
1505 static unsigned int
1506 aarch64_function_arg_alignment (enum machine_mode mode, const_tree type)
1507 {
1508   unsigned int alignment;
1509
1510   if (type)
1511     {
1512       if (!integer_zerop (TYPE_SIZE (type)))
1513         {
1514           if (TYPE_MODE (type) == mode)
1515             alignment = TYPE_ALIGN (type);
1516           else
1517             alignment = GET_MODE_ALIGNMENT (mode);
1518         }
1519       else
1520         alignment = 0;
1521     }
1522   else
1523     alignment = GET_MODE_ALIGNMENT (mode);
1524
1525   return alignment;
1526 }
1527
1528 /* Layout a function argument according to the AAPCS64 rules.  The rule
1529    numbers refer to the rule numbers in the AAPCS64.  */
1530
1531 static void
1532 aarch64_layout_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1533                     const_tree type,
1534                     bool named ATTRIBUTE_UNUSED)
1535 {
1536   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1537   int ncrn, nvrn, nregs;
1538   bool allocate_ncrn, allocate_nvrn;
1539   HOST_WIDE_INT size;
1540
1541   /* We need to do this once per argument.  */
1542   if (pcum->aapcs_arg_processed)
1543     return;
1544
1545   pcum->aapcs_arg_processed = true;
1546
1547   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
1548   size
1549     = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1550                         UNITS_PER_WORD);
1551
1552   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1553   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1554                                                  mode,
1555                                                  type,
1556                                                  &nregs);
1557
1558   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1559      The following code thus handles passing by SIMD/FP registers first.  */
1560
1561   nvrn = pcum->aapcs_nvrn;
1562
1563   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
1564      and homogenous short-vector aggregates (HVA).  */
1565   if (allocate_nvrn)
1566     {
1567       if (nvrn + nregs <= NUM_FP_ARG_REGS)
1568         {
1569           pcum->aapcs_nextnvrn = nvrn + nregs;
1570           if (!aarch64_composite_type_p (type, mode))
1571             {
1572               gcc_assert (nregs == 1);
1573               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1574             }
1575           else
1576             {
1577               rtx par;
1578               int i;
1579               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1580               for (i = 0; i < nregs; i++)
1581                 {
1582                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1583                                          V0_REGNUM + nvrn + i);
1584                   tmp = gen_rtx_EXPR_LIST
1585                     (VOIDmode, tmp,
1586                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1587                   XVECEXP (par, 0, i) = tmp;
1588                 }
1589               pcum->aapcs_reg = par;
1590             }
1591           return;
1592         }
1593       else
1594         {
1595           /* C.3 NSRN is set to 8.  */
1596           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1597           goto on_stack;
1598         }
1599     }
1600
1601   ncrn = pcum->aapcs_ncrn;
1602   nregs = size / UNITS_PER_WORD;
1603
1604   /* C6 - C9.  though the sign and zero extension semantics are
1605      handled elsewhere.  This is the case where the argument fits
1606      entirely general registers.  */
1607   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1608     {
1609       unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1610
1611       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1612
1613       /* C.8 if the argument has an alignment of 16 then the NGRN is
1614          rounded up to the next even number.  */
1615       if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1616         {
1617           ++ncrn;
1618           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1619         }
1620       /* NREGS can be 0 when e.g. an empty structure is to be passed.
1621          A reg is still generated for it, but the caller should be smart
1622          enough not to use it.  */
1623       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1624         {
1625           pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1626         }
1627       else
1628         {
1629           rtx par;
1630           int i;
1631
1632           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1633           for (i = 0; i < nregs; i++)
1634             {
1635               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1636               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1637                                        GEN_INT (i * UNITS_PER_WORD));
1638               XVECEXP (par, 0, i) = tmp;
1639             }
1640           pcum->aapcs_reg = par;
1641         }
1642
1643       pcum->aapcs_nextncrn = ncrn + nregs;
1644       return;
1645     }
1646
1647   /* C.11  */
1648   pcum->aapcs_nextncrn = NUM_ARG_REGS;
1649
1650   /* The argument is passed on stack; record the needed number of words for
1651      this argument and align the total size if necessary.  */
1652 on_stack:
1653   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1654   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1655     pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1656                                                16 / UNITS_PER_WORD);
1657   return;
1658 }
1659
1660 /* Implement TARGET_FUNCTION_ARG.  */
1661
1662 static rtx
1663 aarch64_function_arg (cumulative_args_t pcum_v, enum machine_mode mode,
1664                       const_tree type, bool named)
1665 {
1666   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1667   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1668
1669   if (mode == VOIDmode)
1670     return NULL_RTX;
1671
1672   aarch64_layout_arg (pcum_v, mode, type, named);
1673   return pcum->aapcs_reg;
1674 }
1675
1676 void
1677 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1678                            const_tree fntype ATTRIBUTE_UNUSED,
1679                            rtx libname ATTRIBUTE_UNUSED,
1680                            const_tree fndecl ATTRIBUTE_UNUSED,
1681                            unsigned n_named ATTRIBUTE_UNUSED)
1682 {
1683   pcum->aapcs_ncrn = 0;
1684   pcum->aapcs_nvrn = 0;
1685   pcum->aapcs_nextncrn = 0;
1686   pcum->aapcs_nextnvrn = 0;
1687   pcum->pcs_variant = ARM_PCS_AAPCS64;
1688   pcum->aapcs_reg = NULL_RTX;
1689   pcum->aapcs_arg_processed = false;
1690   pcum->aapcs_stack_words = 0;
1691   pcum->aapcs_stack_size = 0;
1692
1693   return;
1694 }
1695
1696 static void
1697 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1698                               enum machine_mode mode,
1699                               const_tree type,
1700                               bool named)
1701 {
1702   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1703   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1704     {
1705       aarch64_layout_arg (pcum_v, mode, type, named);
1706       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1707                   != (pcum->aapcs_stack_words != 0));
1708       pcum->aapcs_arg_processed = false;
1709       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1710       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1711       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1712       pcum->aapcs_stack_words = 0;
1713       pcum->aapcs_reg = NULL_RTX;
1714     }
1715 }
1716
1717 bool
1718 aarch64_function_arg_regno_p (unsigned regno)
1719 {
1720   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1721           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1722 }
1723
1724 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
1725    PARM_BOUNDARY bits of alignment, but will be given anything up
1726    to STACK_BOUNDARY bits if the type requires it.  This makes sure
1727    that both before and after the layout of each argument, the Next
1728    Stacked Argument Address (NSAA) will have a minimum alignment of
1729    8 bytes.  */
1730
1731 static unsigned int
1732 aarch64_function_arg_boundary (enum machine_mode mode, const_tree type)
1733 {
1734   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1735
1736   if (alignment < PARM_BOUNDARY)
1737     alignment = PARM_BOUNDARY;
1738   if (alignment > STACK_BOUNDARY)
1739     alignment = STACK_BOUNDARY;
1740   return alignment;
1741 }
1742
1743 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1744
1745    Return true if an argument passed on the stack should be padded upwards,
1746    i.e. if the least-significant byte of the stack slot has useful data.
1747
1748    Small aggregate types are placed in the lowest memory address.
1749
1750    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
1751
1752 bool
1753 aarch64_pad_arg_upward (enum machine_mode mode, const_tree type)
1754 {
1755   /* On little-endian targets, the least significant byte of every stack
1756      argument is passed at the lowest byte address of the stack slot.  */
1757   if (!BYTES_BIG_ENDIAN)
1758     return true;
1759
1760   /* Otherwise, integral, floating-point and pointer types are padded downward:
1761      the least significant byte of a stack argument is passed at the highest
1762      byte address of the stack slot.  */
1763   if (type
1764       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1765          || POINTER_TYPE_P (type))
1766       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1767     return false;
1768
1769   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
1770   return true;
1771 }
1772
1773 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1774
1775    It specifies padding for the last (may also be the only)
1776    element of a block move between registers and memory.  If
1777    assuming the block is in the memory, padding upward means that
1778    the last element is padded after its highest significant byte,
1779    while in downward padding, the last element is padded at the
1780    its least significant byte side.
1781
1782    Small aggregates and small complex types are always padded
1783    upwards.
1784
1785    We don't need to worry about homogeneous floating-point or
1786    short-vector aggregates; their move is not affected by the
1787    padding direction determined here.  Regardless of endianness,
1788    each element of such an aggregate is put in the least
1789    significant bits of a fp/simd register.
1790
1791    Return !BYTES_BIG_ENDIAN if the least significant byte of the
1792    register has useful data, and return the opposite if the most
1793    significant byte does.  */
1794
1795 bool
1796 aarch64_pad_reg_upward (enum machine_mode mode, const_tree type,
1797                      bool first ATTRIBUTE_UNUSED)
1798 {
1799
1800   /* Small composite types are always padded upward.  */
1801   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
1802     {
1803       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
1804                             : GET_MODE_SIZE (mode));
1805       if (size < 2 * UNITS_PER_WORD)
1806         return true;
1807     }
1808
1809   /* Otherwise, use the default padding.  */
1810   return !BYTES_BIG_ENDIAN;
1811 }
1812
1813 static enum machine_mode
1814 aarch64_libgcc_cmp_return_mode (void)
1815 {
1816   return SImode;
1817 }
1818
1819 static bool
1820 aarch64_frame_pointer_required (void)
1821 {
1822   /* In aarch64_override_options_after_change
1823      flag_omit_leaf_frame_pointer turns off the frame pointer by
1824      default.  Turn it back on now if we've not got a leaf
1825      function.  */
1826   if (flag_omit_leaf_frame_pointer
1827       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
1828     return true;
1829
1830   return false;
1831 }
1832
1833 /* Mark the registers that need to be saved by the callee and calculate
1834    the size of the callee-saved registers area and frame record (both FP
1835    and LR may be omitted).  */
1836 static void
1837 aarch64_layout_frame (void)
1838 {
1839   HOST_WIDE_INT offset = 0;
1840   int regno;
1841
1842   if (reload_completed && cfun->machine->frame.laid_out)
1843     return;
1844
1845 #define SLOT_NOT_REQUIRED (-2)
1846 #define SLOT_REQUIRED     (-1)
1847
1848   cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
1849   cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
1850
1851   /* First mark all the registers that really need to be saved...  */
1852   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1853     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1854
1855   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1856     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
1857
1858   /* ... that includes the eh data registers (if needed)...  */
1859   if (crtl->calls_eh_return)
1860     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
1861       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
1862         = SLOT_REQUIRED;
1863
1864   /* ... and any callee saved register that dataflow says is live.  */
1865   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1866     if (df_regs_ever_live_p (regno)
1867         && !call_used_regs[regno])
1868       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1869
1870   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1871     if (df_regs_ever_live_p (regno)
1872         && !call_used_regs[regno])
1873       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
1874
1875   if (frame_pointer_needed)
1876     {
1877       /* FP and LR are placed in the linkage record.  */
1878       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
1879       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
1880       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
1881       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
1882       cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
1883       offset += 2 * UNITS_PER_WORD;
1884     }
1885
1886   /* Now assign stack slots for them.  */
1887   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
1888     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1889       {
1890         cfun->machine->frame.reg_offset[regno] = offset;
1891         if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1892           cfun->machine->frame.wb_candidate1 = regno;
1893         else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
1894           cfun->machine->frame.wb_candidate2 = regno;
1895         offset += UNITS_PER_WORD;
1896       }
1897
1898   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
1899     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
1900       {
1901         cfun->machine->frame.reg_offset[regno] = offset;
1902         if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
1903           cfun->machine->frame.wb_candidate1 = regno;
1904         else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
1905                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
1906           cfun->machine->frame.wb_candidate2 = regno;
1907         offset += UNITS_PER_WORD;
1908       }
1909
1910   cfun->machine->frame.padding0 =
1911     (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
1912   offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
1913
1914   cfun->machine->frame.saved_regs_size = offset;
1915
1916   cfun->machine->frame.hard_fp_offset
1917     = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
1918                         + get_frame_size ()
1919                         + cfun->machine->frame.saved_regs_size,
1920                         STACK_BOUNDARY / BITS_PER_UNIT);
1921
1922   cfun->machine->frame.frame_size
1923     = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
1924                         + crtl->outgoing_args_size,
1925                         STACK_BOUNDARY / BITS_PER_UNIT);
1926
1927   cfun->machine->frame.laid_out = true;
1928 }
1929
1930 static bool
1931 aarch64_register_saved_on_entry (int regno)
1932 {
1933   return cfun->machine->frame.reg_offset[regno] >= 0;
1934 }
1935
1936 static unsigned
1937 aarch64_next_callee_save (unsigned regno, unsigned limit)
1938 {
1939   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
1940     regno ++;
1941   return regno;
1942 }
1943
1944 static void
1945 aarch64_pushwb_single_reg (enum machine_mode mode, unsigned regno,
1946                            HOST_WIDE_INT adjustment)
1947  {
1948   rtx base_rtx = stack_pointer_rtx;
1949   rtx insn, reg, mem;
1950
1951   reg = gen_rtx_REG (mode, regno);
1952   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
1953                             plus_constant (Pmode, base_rtx, -adjustment));
1954   mem = gen_rtx_MEM (mode, mem);
1955
1956   insn = emit_move_insn (mem, reg);
1957   RTX_FRAME_RELATED_P (insn) = 1;
1958 }
1959
1960 static rtx
1961 aarch64_gen_storewb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
1962                           HOST_WIDE_INT adjustment)
1963 {
1964   switch (mode)
1965     {
1966     case DImode:
1967       return gen_storewb_pairdi_di (base, base, reg, reg2,
1968                                     GEN_INT (-adjustment),
1969                                     GEN_INT (UNITS_PER_WORD - adjustment));
1970     case DFmode:
1971       return gen_storewb_pairdf_di (base, base, reg, reg2,
1972                                     GEN_INT (-adjustment),
1973                                     GEN_INT (UNITS_PER_WORD - adjustment));
1974     default:
1975       gcc_unreachable ();
1976     }
1977 }
1978
1979 static void
1980 aarch64_pushwb_pair_reg (enum machine_mode mode, unsigned regno1,
1981                          unsigned regno2, HOST_WIDE_INT adjustment)
1982 {
1983   rtx_insn *insn;
1984   rtx reg1 = gen_rtx_REG (mode, regno1);
1985   rtx reg2 = gen_rtx_REG (mode, regno2);
1986
1987   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
1988                                               reg2, adjustment));
1989   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
1990   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
1991   RTX_FRAME_RELATED_P (insn) = 1;
1992 }
1993
1994 static rtx
1995 aarch64_gen_loadwb_pair (enum machine_mode mode, rtx base, rtx reg, rtx reg2,
1996                          HOST_WIDE_INT adjustment)
1997 {
1998   switch (mode)
1999     {
2000     case DImode:
2001       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2002                                    GEN_INT (UNITS_PER_WORD));
2003     case DFmode:
2004       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2005                                    GEN_INT (UNITS_PER_WORD));
2006     default:
2007       gcc_unreachable ();
2008     }
2009 }
2010
2011 static rtx
2012 aarch64_gen_store_pair (enum machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2013                         rtx reg2)
2014 {
2015   switch (mode)
2016     {
2017     case DImode:
2018       return gen_store_pairdi (mem1, reg1, mem2, reg2);
2019
2020     case DFmode:
2021       return gen_store_pairdf (mem1, reg1, mem2, reg2);
2022
2023     default:
2024       gcc_unreachable ();
2025     }
2026 }
2027
2028 static rtx
2029 aarch64_gen_load_pair (enum machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2030                        rtx mem2)
2031 {
2032   switch (mode)
2033     {
2034     case DImode:
2035       return gen_load_pairdi (reg1, mem1, reg2, mem2);
2036
2037     case DFmode:
2038       return gen_load_pairdf (reg1, mem1, reg2, mem2);
2039
2040     default:
2041       gcc_unreachable ();
2042     }
2043 }
2044
2045
2046 static void
2047 aarch64_save_callee_saves (enum machine_mode mode, HOST_WIDE_INT start_offset,
2048                            unsigned start, unsigned limit, bool skip_wb)
2049 {
2050   rtx_insn *insn;
2051   rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
2052                                                  ? gen_frame_mem : gen_rtx_MEM);
2053   unsigned regno;
2054   unsigned regno2;
2055
2056   for (regno = aarch64_next_callee_save (start, limit);
2057        regno <= limit;
2058        regno = aarch64_next_callee_save (regno + 1, limit))
2059     {
2060       rtx reg, mem;
2061       HOST_WIDE_INT offset;
2062
2063       if (skip_wb
2064           && (regno == cfun->machine->frame.wb_candidate1
2065               || regno == cfun->machine->frame.wb_candidate2))
2066         continue;
2067
2068       reg = gen_rtx_REG (mode, regno);
2069       offset = start_offset + cfun->machine->frame.reg_offset[regno];
2070       mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2071                                               offset));
2072
2073       regno2 = aarch64_next_callee_save (regno + 1, limit);
2074
2075       if (regno2 <= limit
2076           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2077               == cfun->machine->frame.reg_offset[regno2]))
2078
2079         {
2080           rtx reg2 = gen_rtx_REG (mode, regno2);
2081           rtx mem2;
2082
2083           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2084           mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2085                                                    offset));
2086           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2087                                                     reg2));
2088
2089           /* The first part of a frame-related parallel insn is
2090              always assumed to be relevant to the frame
2091              calculations; subsequent parts, are only
2092              frame-related if explicitly marked.  */
2093           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2094           regno = regno2;
2095         }
2096       else
2097         insn = emit_move_insn (mem, reg);
2098
2099       RTX_FRAME_RELATED_P (insn) = 1;
2100     }
2101 }
2102
2103 static void
2104 aarch64_restore_callee_saves (enum machine_mode mode,
2105                               HOST_WIDE_INT start_offset, unsigned start,
2106                               unsigned limit, bool skip_wb, rtx *cfi_ops)
2107 {
2108   rtx base_rtx = stack_pointer_rtx;
2109   rtx (*gen_mem_ref) (enum machine_mode, rtx) = (frame_pointer_needed
2110                                                  ? gen_frame_mem : gen_rtx_MEM);
2111   unsigned regno;
2112   unsigned regno2;
2113   HOST_WIDE_INT offset;
2114
2115   for (regno = aarch64_next_callee_save (start, limit);
2116        regno <= limit;
2117        regno = aarch64_next_callee_save (regno + 1, limit))
2118     {
2119       rtx reg, mem;
2120
2121       if (skip_wb
2122           && (regno == cfun->machine->frame.wb_candidate1
2123               || regno == cfun->machine->frame.wb_candidate2))
2124         continue;
2125
2126       reg = gen_rtx_REG (mode, regno);
2127       offset = start_offset + cfun->machine->frame.reg_offset[regno];
2128       mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2129
2130       regno2 = aarch64_next_callee_save (regno + 1, limit);
2131
2132       if (regno2 <= limit
2133           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2134               == cfun->machine->frame.reg_offset[regno2]))
2135         {
2136           rtx reg2 = gen_rtx_REG (mode, regno2);
2137           rtx mem2;
2138
2139           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2140           mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2141           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2142
2143           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2144           regno = regno2;
2145         }
2146       else
2147         emit_move_insn (reg, mem);
2148       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2149     }
2150 }
2151
2152 /* AArch64 stack frames generated by this compiler look like:
2153
2154         +-------------------------------+
2155         |                               |
2156         |  incoming stack arguments     |
2157         |                               |
2158         +-------------------------------+
2159         |                               | <-- incoming stack pointer (aligned)
2160         |  callee-allocated save area   |
2161         |  for register varargs         |
2162         |                               |
2163         +-------------------------------+
2164         |  local variables              | <-- frame_pointer_rtx
2165         |                               |
2166         +-------------------------------+
2167         |  padding0                     | \
2168         +-------------------------------+  |
2169         |  callee-saved registers       |  | frame.saved_regs_size
2170         +-------------------------------+  |
2171         |  LR'                          |  |
2172         +-------------------------------+  |
2173         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
2174         +-------------------------------+
2175         |  dynamic allocation           |
2176         +-------------------------------+
2177         |  padding                      |
2178         +-------------------------------+
2179         |  outgoing stack arguments     | <-- arg_pointer
2180         |                               |
2181         +-------------------------------+
2182         |                               | <-- stack_pointer_rtx (aligned)
2183
2184    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2185    but leave frame_pointer_rtx and hard_frame_pointer_rtx
2186    unchanged.  */
2187
2188 /* Generate the prologue instructions for entry into a function.
2189    Establish the stack frame by decreasing the stack pointer with a
2190    properly calculated size and, if necessary, create a frame record
2191    filled with the values of LR and previous frame pointer.  The
2192    current FP is also set up if it is in use.  */
2193
2194 void
2195 aarch64_expand_prologue (void)
2196 {
2197   /* sub sp, sp, #<frame_size>
2198      stp {fp, lr}, [sp, #<frame_size> - 16]
2199      add fp, sp, #<frame_size> - hardfp_offset
2200      stp {cs_reg}, [fp, #-16] etc.
2201
2202      sub sp, sp, <final_adjustment_if_any>
2203   */
2204   HOST_WIDE_INT frame_size, offset;
2205   HOST_WIDE_INT fp_offset;              /* Offset from hard FP to SP.  */
2206   HOST_WIDE_INT hard_fp_offset;
2207   rtx_insn *insn;
2208
2209   aarch64_layout_frame ();
2210
2211   offset = frame_size = cfun->machine->frame.frame_size;
2212   hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2213   fp_offset = frame_size - hard_fp_offset;
2214
2215   if (flag_stack_usage_info)
2216     current_function_static_stack_size = frame_size;
2217
2218   /* Store pairs and load pairs have a range only -512 to 504.  */
2219   if (offset >= 512)
2220     {
2221       /* When the frame has a large size, an initial decrease is done on
2222          the stack pointer to jump over the callee-allocated save area for
2223          register varargs, the local variable area and/or the callee-saved
2224          register area.  This will allow the pre-index write-back
2225          store pair instructions to be used for setting up the stack frame
2226          efficiently.  */
2227       offset = hard_fp_offset;
2228       if (offset >= 512)
2229         offset = cfun->machine->frame.saved_regs_size;
2230
2231       frame_size -= (offset + crtl->outgoing_args_size);
2232       fp_offset = 0;
2233
2234       if (frame_size >= 0x1000000)
2235         {
2236           rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2237           emit_move_insn (op0, GEN_INT (-frame_size));
2238           insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2239
2240           add_reg_note (insn, REG_CFA_ADJUST_CFA,
2241                         gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2242                                      plus_constant (Pmode, stack_pointer_rtx,
2243                                                     -frame_size)));
2244           RTX_FRAME_RELATED_P (insn) = 1;
2245         }
2246       else if (frame_size > 0)
2247         {
2248           int hi_ofs = frame_size & 0xfff000;
2249           int lo_ofs = frame_size & 0x000fff;
2250
2251           if (hi_ofs)
2252             {
2253               insn = emit_insn (gen_add2_insn
2254                                 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2255               RTX_FRAME_RELATED_P (insn) = 1;
2256             }
2257           if (lo_ofs)
2258             {
2259               insn = emit_insn (gen_add2_insn
2260                                 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2261               RTX_FRAME_RELATED_P (insn) = 1;
2262             }
2263         }
2264     }
2265   else
2266     frame_size = -1;
2267
2268   if (offset > 0)
2269     {
2270       bool skip_wb = false;
2271
2272       if (frame_pointer_needed)
2273         {
2274           skip_wb = true;
2275
2276           if (fp_offset)
2277             {
2278               insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2279                                                GEN_INT (-offset)));
2280               RTX_FRAME_RELATED_P (insn) = 1;
2281
2282               aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2283                                          R30_REGNUM, false);
2284             }
2285           else
2286             aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2287
2288           /* Set up frame pointer to point to the location of the
2289              previous frame pointer on the stack.  */
2290           insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2291                                            stack_pointer_rtx,
2292                                            GEN_INT (fp_offset)));
2293           RTX_FRAME_RELATED_P (insn) = 1;
2294           emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2295         }
2296       else
2297         {
2298           unsigned reg1 = cfun->machine->frame.wb_candidate1;
2299           unsigned reg2 = cfun->machine->frame.wb_candidate2;
2300
2301           if (fp_offset
2302               || reg1 == FIRST_PSEUDO_REGISTER
2303               || (reg2 == FIRST_PSEUDO_REGISTER
2304                   && offset >= 256))
2305             {
2306               insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2307                                                GEN_INT (-offset)));
2308               RTX_FRAME_RELATED_P (insn) = 1;
2309             }
2310           else
2311             {
2312               enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2313
2314               skip_wb = true;
2315
2316               if (reg2 == FIRST_PSEUDO_REGISTER)
2317                 aarch64_pushwb_single_reg (mode1, reg1, offset);
2318               else
2319                 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2320             }
2321         }
2322
2323       aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2324                                  skip_wb);
2325       aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2326                                  skip_wb);
2327     }
2328
2329   /* when offset >= 512,
2330      sub sp, sp, #<outgoing_args_size> */
2331   if (frame_size > -1)
2332     {
2333       if (crtl->outgoing_args_size > 0)
2334         {
2335           insn = emit_insn (gen_add2_insn
2336                             (stack_pointer_rtx,
2337                              GEN_INT (- crtl->outgoing_args_size)));
2338           RTX_FRAME_RELATED_P (insn) = 1;
2339         }
2340     }
2341 }
2342
2343 /* Return TRUE if we can use a simple_return insn.
2344
2345    This function checks whether the callee saved stack is empty, which
2346    means no restore actions are need. The pro_and_epilogue will use
2347    this to check whether shrink-wrapping opt is feasible.  */
2348
2349 bool
2350 aarch64_use_return_insn_p (void)
2351 {
2352   if (!reload_completed)
2353     return false;
2354
2355   if (crtl->profile)
2356     return false;
2357
2358   aarch64_layout_frame ();
2359
2360   return cfun->machine->frame.frame_size == 0;
2361 }
2362
2363 /* Generate the epilogue instructions for returning from a function.  */
2364 void
2365 aarch64_expand_epilogue (bool for_sibcall)
2366 {
2367   HOST_WIDE_INT frame_size, offset;
2368   HOST_WIDE_INT fp_offset;
2369   HOST_WIDE_INT hard_fp_offset;
2370   rtx_insn *insn;
2371
2372   aarch64_layout_frame ();
2373
2374   offset = frame_size = cfun->machine->frame.frame_size;
2375   hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2376   fp_offset = frame_size - hard_fp_offset;
2377
2378   /* Store pairs and load pairs have a range only -512 to 504.  */
2379   if (offset >= 512)
2380     {
2381       offset = hard_fp_offset;
2382       if (offset >= 512)
2383         offset = cfun->machine->frame.saved_regs_size;
2384
2385       frame_size -= (offset + crtl->outgoing_args_size);
2386       fp_offset = 0;
2387       if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2388         {
2389           insn = emit_insn (gen_add2_insn
2390                             (stack_pointer_rtx,
2391                              GEN_INT (crtl->outgoing_args_size)));
2392           RTX_FRAME_RELATED_P (insn) = 1;
2393         }
2394     }
2395   else
2396     frame_size = -1;
2397
2398   /* If there were outgoing arguments or we've done dynamic stack
2399      allocation, then restore the stack pointer from the frame
2400      pointer.  This is at most one insn and more efficient than using
2401      GCC's internal mechanism.  */
2402   if (frame_pointer_needed
2403       && (crtl->outgoing_args_size || cfun->calls_alloca))
2404     {
2405       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2406                                        hard_frame_pointer_rtx,
2407                                        GEN_INT (0)));
2408       offset = offset - fp_offset;
2409     }
2410
2411   if (offset > 0)
2412     {
2413       unsigned reg1 = cfun->machine->frame.wb_candidate1;
2414       unsigned reg2 = cfun->machine->frame.wb_candidate2;
2415       bool skip_wb = true;
2416       rtx cfi_ops = NULL;
2417
2418       if (frame_pointer_needed)
2419         fp_offset = 0;
2420       else if (fp_offset
2421                || reg1 == FIRST_PSEUDO_REGISTER
2422                || (reg2 == FIRST_PSEUDO_REGISTER
2423                    && offset >= 256))
2424         skip_wb = false;
2425
2426       aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2427                                     skip_wb, &cfi_ops);
2428       aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2429                                     skip_wb, &cfi_ops);
2430
2431       if (skip_wb)
2432         {
2433           enum machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2434           rtx rreg1 = gen_rtx_REG (mode1, reg1);
2435
2436           cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2437           if (reg2 == FIRST_PSEUDO_REGISTER)
2438             {
2439               rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2440               mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2441               mem = gen_rtx_MEM (mode1, mem);
2442               insn = emit_move_insn (rreg1, mem);
2443             }
2444           else
2445             {
2446               rtx rreg2 = gen_rtx_REG (mode1, reg2);
2447
2448               cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2449               insn = emit_insn (aarch64_gen_loadwb_pair
2450                                 (mode1, stack_pointer_rtx, rreg1,
2451                                  rreg2, offset));
2452             }
2453         }
2454       else
2455         {
2456           insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2457                                            GEN_INT (offset)));
2458         }
2459
2460       /* Reset the CFA to be SP + FRAME_SIZE.  */
2461       rtx new_cfa = stack_pointer_rtx;
2462       if (frame_size > 0)
2463         new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2464       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2465       REG_NOTES (insn) = cfi_ops;
2466       RTX_FRAME_RELATED_P (insn) = 1;
2467     }
2468
2469   if (frame_size > 0)
2470     {
2471       if (frame_size >= 0x1000000)
2472         {
2473           rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2474           emit_move_insn (op0, GEN_INT (frame_size));
2475           insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2476         }
2477       else
2478         {
2479           int hi_ofs = frame_size & 0xfff000;
2480           int lo_ofs = frame_size & 0x000fff;
2481
2482           if (hi_ofs && lo_ofs)
2483             {
2484               insn = emit_insn (gen_add2_insn
2485                                 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2486               RTX_FRAME_RELATED_P (insn) = 1;
2487               frame_size = lo_ofs;
2488             }
2489           insn = emit_insn (gen_add2_insn
2490                             (stack_pointer_rtx, GEN_INT (frame_size)));
2491         }
2492
2493       /* Reset the CFA to be SP + 0.  */
2494       add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2495       RTX_FRAME_RELATED_P (insn) = 1;
2496     }
2497
2498   /* Stack adjustment for exception handler.  */
2499   if (crtl->calls_eh_return)
2500     {
2501       /* We need to unwind the stack by the offset computed by
2502          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
2503          to be SP; letting the CFA move during this adjustment
2504          is just as correct as retaining the CFA from the body
2505          of the function.  Therefore, do nothing special.  */
2506       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2507     }
2508
2509   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2510   if (!for_sibcall)
2511     emit_jump_insn (ret_rtx);
2512 }
2513
2514 /* Return the place to copy the exception unwinding return address to.
2515    This will probably be a stack slot, but could (in theory be the
2516    return register).  */
2517 rtx
2518 aarch64_final_eh_return_addr (void)
2519 {
2520   HOST_WIDE_INT fp_offset;
2521
2522   aarch64_layout_frame ();
2523
2524   fp_offset = cfun->machine->frame.frame_size
2525               - cfun->machine->frame.hard_fp_offset;
2526
2527   if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2528     return gen_rtx_REG (DImode, LR_REGNUM);
2529
2530   /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2.  This can
2531      result in a store to save LR introduced by builtin_eh_return () being
2532      incorrectly deleted because the alias is not detected.
2533      So in the calculation of the address to copy the exception unwinding
2534      return address to, we note 2 cases.
2535      If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2536      we return a SP-relative location since all the addresses are SP-relative
2537      in this case.  This prevents the store from being optimized away.
2538      If the fp_offset is not 0, then the addresses will be FP-relative and
2539      therefore we return a FP-relative location.  */
2540
2541   if (frame_pointer_needed)
2542     {
2543       if (fp_offset)
2544         return gen_frame_mem (DImode,
2545                               plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2546       else
2547         return gen_frame_mem (DImode,
2548                               plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2549     }
2550
2551   /* If FP is not needed, we calculate the location of LR, which would be
2552      at the top of the saved registers block.  */
2553
2554   return gen_frame_mem (DImode,
2555                         plus_constant (Pmode,
2556                                        stack_pointer_rtx,
2557                                        fp_offset
2558                                        + cfun->machine->frame.saved_regs_size
2559                                        - 2 * UNITS_PER_WORD));
2560 }
2561
2562 /* Possibly output code to build up a constant in a register.  For
2563    the benefit of the costs infrastructure, returns the number of
2564    instructions which would be emitted.  GENERATE inhibits or
2565    enables code generation.  */
2566
2567 static int
2568 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2569 {
2570   int insns = 0;
2571
2572   if (aarch64_bitmask_imm (val, DImode))
2573     {
2574       if (generate)
2575         emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2576       insns = 1;
2577     }
2578   else
2579     {
2580       int i;
2581       int ncount = 0;
2582       int zcount = 0;
2583       HOST_WIDE_INT valp = val >> 16;
2584       HOST_WIDE_INT valm;
2585       HOST_WIDE_INT tval;
2586
2587       for (i = 16; i < 64; i += 16)
2588         {
2589           valm = (valp & 0xffff);
2590
2591           if (valm != 0)
2592             ++ zcount;
2593
2594           if (valm != 0xffff)
2595             ++ ncount;
2596
2597           valp >>= 16;
2598         }
2599
2600       /* zcount contains the number of additional MOVK instructions
2601          required if the constant is built up with an initial MOVZ instruction,
2602          while ncount is the number of MOVK instructions required if starting
2603          with a MOVN instruction.  Choose the sequence that yields the fewest
2604          number of instructions, preferring MOVZ instructions when they are both
2605          the same.  */
2606       if (ncount < zcount)
2607         {
2608           if (generate)
2609             emit_move_insn (gen_rtx_REG (Pmode, regnum),
2610                             GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2611           tval = 0xffff;
2612           insns++;
2613         }
2614       else
2615         {
2616           if (generate)
2617             emit_move_insn (gen_rtx_REG (Pmode, regnum),
2618                             GEN_INT (val & 0xffff));
2619           tval = 0;
2620           insns++;
2621         }
2622
2623       val >>= 16;
2624
2625       for (i = 16; i < 64; i += 16)
2626         {
2627           if ((val & 0xffff) != tval)
2628             {
2629               if (generate)
2630                 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2631                                            GEN_INT (i),
2632                                            GEN_INT (val & 0xffff)));
2633               insns++;
2634             }
2635           val >>= 16;
2636         }
2637     }
2638   return insns;
2639 }
2640
2641 static void
2642 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2643 {
2644   HOST_WIDE_INT mdelta = delta;
2645   rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2646   rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2647
2648   if (mdelta < 0)
2649     mdelta = -mdelta;
2650
2651   if (mdelta >= 4096 * 4096)
2652     {
2653       (void) aarch64_build_constant (scratchreg, delta, true);
2654       emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2655     }
2656   else if (mdelta > 0)
2657     {
2658       if (mdelta >= 4096)
2659         {
2660           emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2661           rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2662           if (delta < 0)
2663             emit_insn (gen_rtx_SET (Pmode, this_rtx,
2664                                     gen_rtx_MINUS (Pmode, this_rtx, shift)));
2665           else
2666             emit_insn (gen_rtx_SET (Pmode, this_rtx,
2667                                     gen_rtx_PLUS (Pmode, this_rtx, shift)));
2668         }
2669       if (mdelta % 4096 != 0)
2670         {
2671           scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2672           emit_insn (gen_rtx_SET (Pmode, this_rtx,
2673                                   gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2674         }
2675     }
2676 }
2677
2678 /* Output code to add DELTA to the first argument, and then jump
2679    to FUNCTION.  Used for C++ multiple inheritance.  */
2680 static void
2681 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2682                          HOST_WIDE_INT delta,
2683                          HOST_WIDE_INT vcall_offset,
2684                          tree function)
2685 {
2686   /* The this pointer is always in x0.  Note that this differs from
2687      Arm where the this pointer maybe bumped to r1 if r0 is required
2688      to return a pointer to an aggregate.  On AArch64 a result value
2689      pointer will be in x8.  */
2690   int this_regno = R0_REGNUM;
2691   rtx this_rtx, temp0, temp1, addr, funexp;
2692   rtx_insn *insn;
2693
2694   reload_completed = 1;
2695   emit_note (NOTE_INSN_PROLOGUE_END);
2696
2697   if (vcall_offset == 0)
2698     aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2699   else
2700     {
2701       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2702
2703       this_rtx = gen_rtx_REG (Pmode, this_regno);
2704       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2705       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2706
2707       addr = this_rtx;
2708       if (delta != 0)
2709         {
2710           if (delta >= -256 && delta < 256)
2711             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2712                                        plus_constant (Pmode, this_rtx, delta));
2713           else
2714             aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2715         }
2716
2717       if (Pmode == ptr_mode)
2718         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2719       else
2720         aarch64_emit_move (temp0,
2721                            gen_rtx_ZERO_EXTEND (Pmode,
2722                                                 gen_rtx_MEM (ptr_mode, addr)));
2723
2724       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2725           addr = plus_constant (Pmode, temp0, vcall_offset);
2726       else
2727         {
2728           (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2729           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2730         }
2731
2732       if (Pmode == ptr_mode)
2733         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2734       else
2735         aarch64_emit_move (temp1,
2736                            gen_rtx_SIGN_EXTEND (Pmode,
2737                                                 gen_rtx_MEM (ptr_mode, addr)));
2738
2739       emit_insn (gen_add2_insn (this_rtx, temp1));
2740     }
2741
2742   /* Generate a tail call to the target function.  */
2743   if (!TREE_USED (function))
2744     {
2745       assemble_external (function);
2746       TREE_USED (function) = 1;
2747     }
2748   funexp = XEXP (DECL_RTL (function), 0);
2749   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2750   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2751   SIBLING_CALL_P (insn) = 1;
2752
2753   insn = get_insns ();
2754   shorten_branches (insn);
2755   final_start_function (insn, file, 1);
2756   final (insn, file, 1);
2757   final_end_function ();
2758
2759   /* Stop pretending to be a post-reload pass.  */
2760   reload_completed = 0;
2761 }
2762
2763 static int
2764 aarch64_tls_operand_p_1 (rtx *x, void *data ATTRIBUTE_UNUSED)
2765 {
2766   if (GET_CODE (*x) == SYMBOL_REF)
2767     return SYMBOL_REF_TLS_MODEL (*x) != 0;
2768
2769   /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2770      TLS offsets, not real symbol references.  */
2771   if (GET_CODE (*x) == UNSPEC
2772       && XINT (*x, 1) == UNSPEC_TLS)
2773     return -1;
2774
2775   return 0;
2776 }
2777
2778 static bool
2779 aarch64_tls_referenced_p (rtx x)
2780 {
2781   if (!TARGET_HAVE_TLS)
2782     return false;
2783
2784   return for_each_rtx (&x, aarch64_tls_operand_p_1, NULL);
2785 }
2786
2787
2788 static int
2789 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2790 {
2791   const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
2792   const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
2793
2794   if (*imm1 < *imm2)
2795     return -1;
2796   if (*imm1 > *imm2)
2797     return +1;
2798   return 0;
2799 }
2800
2801
2802 static void
2803 aarch64_build_bitmask_table (void)
2804 {
2805   unsigned HOST_WIDE_INT mask, imm;
2806   unsigned int log_e, e, s, r;
2807   unsigned int nimms = 0;
2808
2809   for (log_e = 1; log_e <= 6; log_e++)
2810     {
2811       e = 1 << log_e;
2812       if (e == 64)
2813         mask = ~(HOST_WIDE_INT) 0;
2814       else
2815         mask = ((HOST_WIDE_INT) 1 << e) - 1;
2816       for (s = 1; s < e; s++)
2817         {
2818           for (r = 0; r < e; r++)
2819             {
2820               /* set s consecutive bits to 1 (s < 64) */
2821               imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
2822               /* rotate right by r */
2823               if (r != 0)
2824                 imm = ((imm >> r) | (imm << (e - r))) & mask;
2825               /* replicate the constant depending on SIMD size */
2826               switch (log_e) {
2827               case 1: imm |= (imm <<  2);
2828               case 2: imm |= (imm <<  4);
2829               case 3: imm |= (imm <<  8);
2830               case 4: imm |= (imm << 16);
2831               case 5: imm |= (imm << 32);
2832               case 6:
2833                 break;
2834               default:
2835                 gcc_unreachable ();
2836               }
2837               gcc_assert (nimms < AARCH64_NUM_BITMASKS);
2838               aarch64_bitmasks[nimms++] = imm;
2839             }
2840         }
2841     }
2842
2843   gcc_assert (nimms == AARCH64_NUM_BITMASKS);
2844   qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
2845          aarch64_bitmasks_cmp);
2846 }
2847
2848
2849 /* Return true if val can be encoded as a 12-bit unsigned immediate with
2850    a left shift of 0 or 12 bits.  */
2851 bool
2852 aarch64_uimm12_shift (HOST_WIDE_INT val)
2853 {
2854   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
2855           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
2856           );
2857 }
2858
2859
2860 /* Return true if val is an immediate that can be loaded into a
2861    register by a MOVZ instruction.  */
2862 static bool
2863 aarch64_movw_imm (HOST_WIDE_INT val, enum machine_mode mode)
2864 {
2865   if (GET_MODE_SIZE (mode) > 4)
2866     {
2867       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
2868           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
2869         return 1;
2870     }
2871   else
2872     {
2873       /* Ignore sign extension.  */
2874       val &= (HOST_WIDE_INT) 0xffffffff;
2875     }
2876   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
2877           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
2878 }
2879
2880
2881 /* Return true if val is a valid bitmask immediate.  */
2882 bool
2883 aarch64_bitmask_imm (HOST_WIDE_INT val, enum machine_mode mode)
2884 {
2885   if (GET_MODE_SIZE (mode) < 8)
2886     {
2887       /* Replicate bit pattern.  */
2888       val &= (HOST_WIDE_INT) 0xffffffff;
2889       val |= val << 32;
2890     }
2891   return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
2892                   sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
2893 }
2894
2895
2896 /* Return true if val is an immediate that can be loaded into a
2897    register in a single instruction.  */
2898 bool
2899 aarch64_move_imm (HOST_WIDE_INT val, enum machine_mode mode)
2900 {
2901   if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
2902     return 1;
2903   return aarch64_bitmask_imm (val, mode);
2904 }
2905
2906 static bool
2907 aarch64_cannot_force_const_mem (enum machine_mode mode ATTRIBUTE_UNUSED, rtx x)
2908 {
2909   rtx base, offset;
2910
2911   if (GET_CODE (x) == HIGH)
2912     return true;
2913
2914   split_const (x, &base, &offset);
2915   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
2916     {
2917       if (aarch64_classify_symbol (base, SYMBOL_CONTEXT_ADR)
2918           != SYMBOL_FORCE_TO_MEM)
2919         return true;
2920       else
2921         /* Avoid generating a 64-bit relocation in ILP32; leave
2922            to aarch64_expand_mov_immediate to handle it properly.  */
2923         return mode != ptr_mode;
2924     }
2925
2926   return aarch64_tls_referenced_p (x);
2927 }
2928
2929 /* Return true if register REGNO is a valid index register.
2930    STRICT_P is true if REG_OK_STRICT is in effect.  */
2931
2932 bool
2933 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
2934 {
2935   if (!HARD_REGISTER_NUM_P (regno))
2936     {
2937       if (!strict_p)
2938         return true;
2939
2940       if (!reg_renumber)
2941         return false;
2942
2943       regno = reg_renumber[regno];
2944     }
2945   return GP_REGNUM_P (regno);
2946 }
2947
2948 /* Return true if register REGNO is a valid base register for mode MODE.
2949    STRICT_P is true if REG_OK_STRICT is in effect.  */
2950
2951 bool
2952 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
2953 {
2954   if (!HARD_REGISTER_NUM_P (regno))
2955     {
2956       if (!strict_p)
2957         return true;
2958
2959       if (!reg_renumber)
2960         return false;
2961
2962       regno = reg_renumber[regno];
2963     }
2964
2965   /* The fake registers will be eliminated to either the stack or
2966      hard frame pointer, both of which are usually valid base registers.
2967      Reload deals with the cases where the eliminated form isn't valid.  */
2968   return (GP_REGNUM_P (regno)
2969           || regno == SP_REGNUM
2970           || regno == FRAME_POINTER_REGNUM
2971           || regno == ARG_POINTER_REGNUM);
2972 }
2973
2974 /* Return true if X is a valid base register for mode MODE.
2975    STRICT_P is true if REG_OK_STRICT is in effect.  */
2976
2977 static bool
2978 aarch64_base_register_rtx_p (rtx x, bool strict_p)
2979 {
2980   if (!strict_p && GET_CODE (x) == SUBREG)
2981     x = SUBREG_REG (x);
2982
2983   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
2984 }
2985
2986 /* Return true if address offset is a valid index.  If it is, fill in INFO
2987    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
2988
2989 static bool
2990 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
2991                         enum machine_mode mode, bool strict_p)
2992 {
2993   enum aarch64_address_type type;
2994   rtx index;
2995   int shift;
2996
2997   /* (reg:P) */
2998   if ((REG_P (x) || GET_CODE (x) == SUBREG)
2999       && GET_MODE (x) == Pmode)
3000     {
3001       type = ADDRESS_REG_REG;
3002       index = x;
3003       shift = 0;
3004     }
3005   /* (sign_extend:DI (reg:SI)) */
3006   else if ((GET_CODE (x) == SIGN_EXTEND
3007             || GET_CODE (x) == ZERO_EXTEND)
3008            && GET_MODE (x) == DImode
3009            && GET_MODE (XEXP (x, 0)) == SImode)
3010     {
3011       type = (GET_CODE (x) == SIGN_EXTEND)
3012         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3013       index = XEXP (x, 0);
3014       shift = 0;
3015     }
3016   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3017   else if (GET_CODE (x) == MULT
3018            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3019                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3020            && GET_MODE (XEXP (x, 0)) == DImode
3021            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3022            && CONST_INT_P (XEXP (x, 1)))
3023     {
3024       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3025         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3026       index = XEXP (XEXP (x, 0), 0);
3027       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3028     }
3029   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3030   else if (GET_CODE (x) == ASHIFT
3031            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3032                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3033            && GET_MODE (XEXP (x, 0)) == DImode
3034            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3035            && CONST_INT_P (XEXP (x, 1)))
3036     {
3037       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3038         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3039       index = XEXP (XEXP (x, 0), 0);
3040       shift = INTVAL (XEXP (x, 1));
3041     }
3042   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3043   else if ((GET_CODE (x) == SIGN_EXTRACT
3044             || GET_CODE (x) == ZERO_EXTRACT)
3045            && GET_MODE (x) == DImode
3046            && GET_CODE (XEXP (x, 0)) == MULT
3047            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3048            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3049     {
3050       type = (GET_CODE (x) == SIGN_EXTRACT)
3051         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3052       index = XEXP (XEXP (x, 0), 0);
3053       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3054       if (INTVAL (XEXP (x, 1)) != 32 + shift
3055           || INTVAL (XEXP (x, 2)) != 0)
3056         shift = -1;
3057     }
3058   /* (and:DI (mult:DI (reg:DI) (const_int scale))
3059      (const_int 0xffffffff<<shift)) */
3060   else if (GET_CODE (x) == AND
3061            && GET_MODE (x) == DImode
3062            && GET_CODE (XEXP (x, 0)) == MULT
3063            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3064            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3065            && CONST_INT_P (XEXP (x, 1)))
3066     {
3067       type = ADDRESS_REG_UXTW;
3068       index = XEXP (XEXP (x, 0), 0);
3069       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3070       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3071         shift = -1;
3072     }
3073   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3074   else if ((GET_CODE (x) == SIGN_EXTRACT
3075             || GET_CODE (x) == ZERO_EXTRACT)
3076            && GET_MODE (x) == DImode
3077            && GET_CODE (XEXP (x, 0)) == ASHIFT
3078            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3079            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3080     {
3081       type = (GET_CODE (x) == SIGN_EXTRACT)
3082         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3083       index = XEXP (XEXP (x, 0), 0);
3084       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3085       if (INTVAL (XEXP (x, 1)) != 32 + shift
3086           || INTVAL (XEXP (x, 2)) != 0)
3087         shift = -1;
3088     }
3089   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3090      (const_int 0xffffffff<<shift)) */
3091   else if (GET_CODE (x) == AND
3092            && GET_MODE (x) == DImode
3093            && GET_CODE (XEXP (x, 0)) == ASHIFT
3094            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3095            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3096            && CONST_INT_P (XEXP (x, 1)))
3097     {
3098       type = ADDRESS_REG_UXTW;
3099       index = XEXP (XEXP (x, 0), 0);
3100       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3101       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3102         shift = -1;
3103     }
3104   /* (mult:P (reg:P) (const_int scale)) */
3105   else if (GET_CODE (x) == MULT
3106            && GET_MODE (x) == Pmode
3107            && GET_MODE (XEXP (x, 0)) == Pmode
3108            && CONST_INT_P (XEXP (x, 1)))
3109     {
3110       type = ADDRESS_REG_REG;
3111       index = XEXP (x, 0);
3112       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3113     }
3114   /* (ashift:P (reg:P) (const_int shift)) */
3115   else if (GET_CODE (x) == ASHIFT
3116            && GET_MODE (x) == Pmode
3117            && GET_MODE (XEXP (x, 0)) == Pmode
3118            && CONST_INT_P (XEXP (x, 1)))
3119     {
3120       type = ADDRESS_REG_REG;
3121       index = XEXP (x, 0);
3122       shift = INTVAL (XEXP (x, 1));
3123     }
3124   else
3125     return false;
3126
3127   if (GET_CODE (index) == SUBREG)
3128     index = SUBREG_REG (index);
3129
3130   if ((shift == 0 ||
3131        (shift > 0 && shift <= 3
3132         && (1 << shift) == GET_MODE_SIZE (mode)))
3133       && REG_P (index)
3134       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3135     {
3136       info->type = type;
3137       info->offset = index;
3138       info->shift = shift;
3139       return true;
3140     }
3141
3142   return false;
3143 }
3144
3145 bool
3146 aarch64_offset_7bit_signed_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3147 {
3148   return (offset >= -64 * GET_MODE_SIZE (mode)
3149           && offset < 64 * GET_MODE_SIZE (mode)
3150           && offset % GET_MODE_SIZE (mode) == 0);
3151 }
3152
3153 static inline bool
3154 offset_9bit_signed_unscaled_p (enum machine_mode mode ATTRIBUTE_UNUSED,
3155                                HOST_WIDE_INT offset)
3156 {
3157   return offset >= -256 && offset < 256;
3158 }
3159
3160 static inline bool
3161 offset_12bit_unsigned_scaled_p (enum machine_mode mode, HOST_WIDE_INT offset)
3162 {
3163   return (offset >= 0
3164           && offset < 4096 * GET_MODE_SIZE (mode)
3165           && offset % GET_MODE_SIZE (mode) == 0);
3166 }
3167
3168 /* Return true if X is a valid address for machine mode MODE.  If it is,
3169    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
3170    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
3171
3172 static bool
3173 aarch64_classify_address (struct aarch64_address_info *info,
3174                           rtx x, enum machine_mode mode,
3175                           RTX_CODE outer_code, bool strict_p)
3176 {
3177   enum rtx_code code = GET_CODE (x);
3178   rtx op0, op1;
3179   bool allow_reg_index_p =
3180     outer_code != PARALLEL && (GET_MODE_SIZE (mode) != 16
3181                                || aarch64_vector_mode_supported_p (mode));
3182   /* Don't support anything other than POST_INC or REG addressing for
3183      AdvSIMD.  */
3184   if (aarch64_vect_struct_mode_p (mode)
3185       && (code != POST_INC && code != REG))
3186     return false;
3187
3188   switch (code)
3189     {
3190     case REG:
3191     case SUBREG:
3192       info->type = ADDRESS_REG_IMM;
3193       info->base = x;
3194       info->offset = const0_rtx;
3195       return aarch64_base_register_rtx_p (x, strict_p);
3196
3197     case PLUS:
3198       op0 = XEXP (x, 0);
3199       op1 = XEXP (x, 1);
3200
3201       if (! strict_p
3202           && REG_P (op0)
3203           && (op0 == virtual_stack_vars_rtx
3204               || op0 == frame_pointer_rtx
3205               || op0 == arg_pointer_rtx)
3206           && CONST_INT_P (op1))
3207         {
3208           info->type = ADDRESS_REG_IMM;
3209           info->base = op0;
3210           info->offset = op1;
3211
3212           return true;
3213         }
3214
3215       if (GET_MODE_SIZE (mode) != 0
3216           && CONST_INT_P (op1)
3217           && aarch64_base_register_rtx_p (op0, strict_p))
3218         {
3219           HOST_WIDE_INT offset = INTVAL (op1);
3220
3221           info->type = ADDRESS_REG_IMM;
3222           info->base = op0;
3223           info->offset = op1;
3224
3225           /* TImode and TFmode values are allowed in both pairs of X
3226              registers and individual Q registers.  The available
3227              address modes are:
3228              X,X: 7-bit signed scaled offset
3229              Q:   9-bit signed offset
3230              We conservatively require an offset representable in either mode.
3231            */
3232           if (mode == TImode || mode == TFmode)
3233             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3234                     && offset_9bit_signed_unscaled_p (mode, offset));
3235
3236           if (outer_code == PARALLEL)
3237             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3238                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3239           else
3240             return (offset_9bit_signed_unscaled_p (mode, offset)
3241                     || offset_12bit_unsigned_scaled_p (mode, offset));
3242         }
3243
3244       if (allow_reg_index_p)
3245         {
3246           /* Look for base + (scaled/extended) index register.  */
3247           if (aarch64_base_register_rtx_p (op0, strict_p)
3248               && aarch64_classify_index (info, op1, mode, strict_p))
3249             {
3250               info->base = op0;
3251               return true;
3252             }
3253           if (aarch64_base_register_rtx_p (op1, strict_p)
3254               && aarch64_classify_index (info, op0, mode, strict_p))
3255             {
3256               info->base = op1;
3257               return true;
3258             }
3259         }
3260
3261       return false;
3262
3263     case POST_INC:
3264     case POST_DEC:
3265     case PRE_INC:
3266     case PRE_DEC:
3267       info->type = ADDRESS_REG_WB;
3268       info->base = XEXP (x, 0);
3269       info->offset = NULL_RTX;
3270       return aarch64_base_register_rtx_p (info->base, strict_p);
3271
3272     case POST_MODIFY:
3273     case PRE_MODIFY:
3274       info->type = ADDRESS_REG_WB;
3275       info->base = XEXP (x, 0);
3276       if (GET_CODE (XEXP (x, 1)) == PLUS
3277           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3278           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3279           && aarch64_base_register_rtx_p (info->base, strict_p))
3280         {
3281           HOST_WIDE_INT offset;
3282           info->offset = XEXP (XEXP (x, 1), 1);
3283           offset = INTVAL (info->offset);
3284
3285           /* TImode and TFmode values are allowed in both pairs of X
3286              registers and individual Q registers.  The available
3287              address modes are:
3288              X,X: 7-bit signed scaled offset
3289              Q:   9-bit signed offset
3290              We conservatively require an offset representable in either mode.
3291            */
3292           if (mode == TImode || mode == TFmode)
3293             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3294                     && offset_9bit_signed_unscaled_p (mode, offset));
3295
3296           if (outer_code == PARALLEL)
3297             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3298                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3299           else
3300             return offset_9bit_signed_unscaled_p (mode, offset);
3301         }
3302       return false;
3303
3304     case CONST:
3305     case SYMBOL_REF:
3306     case LABEL_REF:
3307       /* load literal: pc-relative constant pool entry.  Only supported
3308          for SI mode or larger.  */
3309       info->type = ADDRESS_SYMBOLIC;
3310       if (outer_code != PARALLEL && GET_MODE_SIZE (mode) >= 4)
3311         {
3312           rtx sym, addend;
3313
3314           split_const (x, &sym, &addend);
3315           return (GET_CODE (sym) == LABEL_REF
3316                   || (GET_CODE (sym) == SYMBOL_REF
3317                       && CONSTANT_POOL_ADDRESS_P (sym)));
3318         }
3319       return false;
3320
3321     case LO_SUM:
3322       info->type = ADDRESS_LO_SUM;
3323       info->base = XEXP (x, 0);
3324       info->offset = XEXP (x, 1);
3325       if (allow_reg_index_p
3326           && aarch64_base_register_rtx_p (info->base, strict_p))
3327         {
3328           rtx sym, offs;
3329           split_const (info->offset, &sym, &offs);
3330           if (GET_CODE (sym) == SYMBOL_REF
3331               && (aarch64_classify_symbol (sym, SYMBOL_CONTEXT_MEM)
3332                   == SYMBOL_SMALL_ABSOLUTE))
3333             {
3334               /* The symbol and offset must be aligned to the access size.  */
3335               unsigned int align;
3336               unsigned int ref_size;
3337
3338               if (CONSTANT_POOL_ADDRESS_P (sym))
3339                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3340               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3341                 {
3342                   tree exp = SYMBOL_REF_DECL (sym);
3343                   align = TYPE_ALIGN (TREE_TYPE (exp));
3344                   align = CONSTANT_ALIGNMENT (exp, align);
3345                 }
3346               else if (SYMBOL_REF_DECL (sym))
3347                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3348               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3349                        && SYMBOL_REF_BLOCK (sym) != NULL)
3350                 align = SYMBOL_REF_BLOCK (sym)->alignment;
3351               else
3352                 align = BITS_PER_UNIT;
3353
3354               ref_size = GET_MODE_SIZE (mode);
3355               if (ref_size == 0)
3356                 ref_size = GET_MODE_SIZE (DImode);
3357
3358               return ((INTVAL (offs) & (ref_size - 1)) == 0
3359                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3360             }
3361         }
3362       return false;
3363
3364     default:
3365       return false;
3366     }
3367 }
3368
3369 bool
3370 aarch64_symbolic_address_p (rtx x)
3371 {
3372   rtx offset;
3373
3374   split_const (x, &x, &offset);
3375   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3376 }
3377
3378 /* Classify the base of symbolic expression X, given that X appears in
3379    context CONTEXT.  */
3380
3381 enum aarch64_symbol_type
3382 aarch64_classify_symbolic_expression (rtx x,
3383                                       enum aarch64_symbol_context context)
3384 {
3385   rtx offset;
3386
3387   split_const (x, &x, &offset);
3388   return aarch64_classify_symbol (x, context);
3389 }
3390
3391
3392 /* Return TRUE if X is a legitimate address for accessing memory in
3393    mode MODE.  */
3394 static bool
3395 aarch64_legitimate_address_hook_p (enum machine_mode mode, rtx x, bool strict_p)
3396 {
3397   struct aarch64_address_info addr;
3398
3399   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3400 }
3401
3402 /* Return TRUE if X is a legitimate address for accessing memory in
3403    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
3404    pair operation.  */
3405 bool
3406 aarch64_legitimate_address_p (enum machine_mode mode, rtx x,
3407                               RTX_CODE outer_code, bool strict_p)
3408 {
3409   struct aarch64_address_info addr;
3410
3411   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3412 }
3413
3414 /* Return TRUE if rtx X is immediate constant 0.0 */
3415 bool
3416 aarch64_float_const_zero_rtx_p (rtx x)
3417 {
3418   REAL_VALUE_TYPE r;
3419
3420   if (GET_MODE (x) == VOIDmode)
3421     return false;
3422
3423   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3424   if (REAL_VALUE_MINUS_ZERO (r))
3425     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3426   return REAL_VALUES_EQUAL (r, dconst0);
3427 }
3428
3429 /* Return the fixed registers used for condition codes.  */
3430
3431 static bool
3432 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3433 {
3434   *p1 = CC_REGNUM;
3435   *p2 = INVALID_REGNUM;
3436   return true;
3437 }
3438
3439 /* Emit call insn with PAT and do aarch64-specific handling.  */
3440
3441 void
3442 aarch64_emit_call_insn (rtx pat)
3443 {
3444   rtx insn = emit_call_insn (pat);
3445
3446   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3447   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3448   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3449 }
3450
3451 enum machine_mode
3452 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3453 {
3454   /* All floating point compares return CCFP if it is an equality
3455      comparison, and CCFPE otherwise.  */
3456   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3457     {
3458       switch (code)
3459         {
3460         case EQ:
3461         case NE:
3462         case UNORDERED:
3463         case ORDERED:
3464         case UNLT:
3465         case UNLE:
3466         case UNGT:
3467         case UNGE:
3468         case UNEQ:
3469         case LTGT:
3470           return CCFPmode;
3471
3472         case LT:
3473         case LE:
3474         case GT:
3475         case GE:
3476           return CCFPEmode;
3477
3478         default:
3479           gcc_unreachable ();
3480         }
3481     }
3482
3483   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3484       && y == const0_rtx
3485       && (code == EQ || code == NE || code == LT || code == GE)
3486       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3487           || GET_CODE (x) == NEG))
3488     return CC_NZmode;
3489
3490   /* A compare with a shifted operand.  Because of canonicalization,
3491      the comparison will have to be swapped when we emit the assembly
3492      code.  */
3493   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3494       && (REG_P (y) || GET_CODE (y) == SUBREG)
3495       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3496           || GET_CODE (x) == LSHIFTRT
3497           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3498     return CC_SWPmode;
3499
3500   /* Similarly for a negated operand, but we can only do this for
3501      equalities.  */
3502   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3503       && (REG_P (y) || GET_CODE (y) == SUBREG)
3504       && (code == EQ || code == NE)
3505       && GET_CODE (x) == NEG)
3506     return CC_Zmode;
3507
3508   /* A compare of a mode narrower than SI mode against zero can be done
3509      by extending the value in the comparison.  */
3510   if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3511       && y == const0_rtx)
3512     /* Only use sign-extension if we really need it.  */
3513     return ((code == GT || code == GE || code == LE || code == LT)
3514             ? CC_SESWPmode : CC_ZESWPmode);
3515
3516   /* For everything else, return CCmode.  */
3517   return CCmode;
3518 }
3519
3520 int
3521 aarch64_get_condition_code (rtx x)
3522 {
3523   enum machine_mode mode = GET_MODE (XEXP (x, 0));
3524   enum rtx_code comp_code = GET_CODE (x);
3525
3526   if (GET_MODE_CLASS (mode) != MODE_CC)
3527     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3528
3529   switch (mode)
3530     {
3531     case CCFPmode:
3532     case CCFPEmode:
3533       switch (comp_code)
3534         {
3535         case GE: return AARCH64_GE;
3536         case GT: return AARCH64_GT;
3537         case LE: return AARCH64_LS;
3538         case LT: return AARCH64_MI;
3539         case NE: return AARCH64_NE;
3540         case EQ: return AARCH64_EQ;
3541         case ORDERED: return AARCH64_VC;
3542         case UNORDERED: return AARCH64_VS;
3543         case UNLT: return AARCH64_LT;
3544         case UNLE: return AARCH64_LE;
3545         case UNGT: return AARCH64_HI;
3546         case UNGE: return AARCH64_PL;
3547         default: return -1;
3548         }
3549       break;
3550
3551     case CCmode:
3552       switch (comp_code)
3553         {
3554         case NE: return AARCH64_NE;
3555         case EQ: return AARCH64_EQ;
3556         case GE: return AARCH64_GE;
3557         case GT: return AARCH64_GT;
3558         case LE: return AARCH64_LE;
3559         case LT: return AARCH64_LT;
3560         case GEU: return AARCH64_CS;
3561         case GTU: return AARCH64_HI;
3562         case LEU: return AARCH64_LS;
3563         case LTU: return AARCH64_CC;
3564         default: return -1;
3565         }
3566       break;
3567
3568     case CC_SWPmode:
3569     case CC_ZESWPmode:
3570     case CC_SESWPmode:
3571       switch (comp_code)
3572         {
3573         case NE: return AARCH64_NE;
3574         case EQ: return AARCH64_EQ;
3575         case GE: return AARCH64_LE;
3576         case GT: return AARCH64_LT;
3577         case LE: return AARCH64_GE;
3578         case LT: return AARCH64_GT;
3579         case GEU: return AARCH64_LS;
3580         case GTU: return AARCH64_CC;
3581         case LEU: return AARCH64_CS;
3582         case LTU: return AARCH64_HI;
3583         default: return -1;
3584         }
3585       break;
3586
3587     case CC_NZmode:
3588       switch (comp_code)
3589         {
3590         case NE: return AARCH64_NE;
3591         case EQ: return AARCH64_EQ;
3592         case GE: return AARCH64_PL;
3593         case LT: return AARCH64_MI;
3594         default: return -1;
3595         }
3596       break;
3597
3598     case CC_Zmode:
3599       switch (comp_code)
3600         {
3601         case NE: return AARCH64_NE;
3602         case EQ: return AARCH64_EQ;
3603         default: return -1;
3604         }
3605       break;
3606
3607     default:
3608       return -1;
3609       break;
3610     }
3611 }
3612
3613 bool
3614 aarch64_const_vec_all_same_in_range_p (rtx x,
3615                                   HOST_WIDE_INT minval,
3616                                   HOST_WIDE_INT maxval)
3617 {
3618   HOST_WIDE_INT firstval;
3619   int count, i;
3620
3621   if (GET_CODE (x) != CONST_VECTOR
3622       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3623     return false;
3624
3625   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3626   if (firstval < minval || firstval > maxval)
3627     return false;
3628
3629   count = CONST_VECTOR_NUNITS (x);
3630   for (i = 1; i < count; i++)
3631     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3632       return false;
3633
3634   return true;
3635 }
3636
3637 bool
3638 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3639 {
3640   return aarch64_const_vec_all_same_in_range_p (x, val, val);
3641 }
3642
3643 static unsigned
3644 bit_count (unsigned HOST_WIDE_INT value)
3645 {
3646   unsigned count = 0;
3647
3648   while (value)
3649     {
3650       count++;
3651       value &= value - 1;
3652     }
3653
3654   return count;
3655 }
3656
3657 void
3658 aarch64_print_operand (FILE *f, rtx x, char code)
3659 {
3660   switch (code)
3661     {
3662     /* An integer or symbol address without a preceding # sign.  */
3663     case 'c':
3664       switch (GET_CODE (x))
3665         {
3666         case CONST_INT:
3667           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
3668           break;
3669
3670         case SYMBOL_REF:
3671           output_addr_const (f, x);
3672           break;
3673
3674         case CONST:
3675           if (GET_CODE (XEXP (x, 0)) == PLUS
3676               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
3677             {
3678               output_addr_const (f, x);
3679               break;
3680             }
3681           /* Fall through.  */
3682
3683         default:
3684           output_operand_lossage ("Unsupported operand for code '%c'", code);
3685         }
3686       break;
3687
3688     case 'e':
3689       /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w.  */
3690       {
3691         int n;
3692
3693         if (!CONST_INT_P (x)
3694             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
3695           {
3696             output_operand_lossage ("invalid operand for '%%%c'", code);
3697             return;
3698           }
3699
3700         switch (n)
3701           {
3702           case 3:
3703             fputc ('b', f);
3704             break;
3705           case 4:
3706             fputc ('h', f);
3707             break;
3708           case 5:
3709             fputc ('w', f);
3710             break;
3711           default:
3712             output_operand_lossage ("invalid operand for '%%%c'", code);
3713             return;
3714           }
3715       }
3716       break;
3717
3718     case 'p':
3719       {
3720         int n;
3721
3722         /* Print N such that 2^N == X.  */
3723         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
3724           {
3725             output_operand_lossage ("invalid operand for '%%%c'", code);
3726             return;
3727           }
3728
3729         asm_fprintf (f, "%d", n);
3730       }
3731       break;
3732
3733     case 'P':
3734       /* Print the number of non-zero bits in X (a const_int).  */
3735       if (!CONST_INT_P (x))
3736         {
3737           output_operand_lossage ("invalid operand for '%%%c'", code);
3738           return;
3739         }
3740
3741       asm_fprintf (f, "%u", bit_count (INTVAL (x)));
3742       break;
3743
3744     case 'H':
3745       /* Print the higher numbered register of a pair (TImode) of regs.  */
3746       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
3747         {
3748           output_operand_lossage ("invalid operand for '%%%c'", code);
3749           return;
3750         }
3751
3752       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
3753       break;
3754
3755     case 'm':
3756       {
3757         int cond_code;
3758         /* Print a condition (eq, ne, etc).  */
3759
3760         /* CONST_TRUE_RTX means always -- that's the default.  */
3761         if (x == const_true_rtx)
3762           return;
3763
3764         if (!COMPARISON_P (x))
3765           {
3766             output_operand_lossage ("invalid operand for '%%%c'", code);
3767             return;
3768           }
3769
3770         cond_code = aarch64_get_condition_code (x);
3771         gcc_assert (cond_code >= 0);
3772         fputs (aarch64_condition_codes[cond_code], f);
3773       }
3774       break;
3775
3776     case 'M':
3777       {
3778         int cond_code;
3779         /* Print the inverse of a condition (eq <-> ne, etc).  */
3780
3781         /* CONST_TRUE_RTX means never -- that's the default.  */
3782         if (x == const_true_rtx)
3783           {
3784             fputs ("nv", f);
3785             return;
3786           }
3787
3788         if (!COMPARISON_P (x))
3789           {
3790             output_operand_lossage ("invalid operand for '%%%c'", code);
3791             return;
3792           }
3793         cond_code = aarch64_get_condition_code (x);
3794         gcc_assert (cond_code >= 0);
3795         fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
3796                                        (cond_code)], f);
3797       }
3798       break;
3799
3800     case 'b':
3801     case 'h':
3802     case 's':
3803     case 'd':
3804     case 'q':
3805       /* Print a scalar FP/SIMD register name.  */
3806       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3807         {
3808           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3809           return;
3810         }
3811       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
3812       break;
3813
3814     case 'S':
3815     case 'T':
3816     case 'U':
3817     case 'V':
3818       /* Print the first FP/SIMD register name in a list.  */
3819       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
3820         {
3821           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
3822           return;
3823         }
3824       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
3825       break;
3826
3827     case 'X':
3828       /* Print bottom 16 bits of integer constant in hex.  */
3829       if (!CONST_INT_P (x))
3830         {
3831           output_operand_lossage ("invalid operand for '%%%c'", code);
3832           return;
3833         }
3834       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
3835       break;
3836
3837     case 'w':
3838     case 'x':
3839       /* Print a general register name or the zero register (32-bit or
3840          64-bit).  */
3841       if (x == const0_rtx
3842           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
3843         {
3844           asm_fprintf (f, "%czr", code);
3845           break;
3846         }
3847
3848       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
3849         {
3850           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
3851           break;
3852         }
3853
3854       if (REG_P (x) && REGNO (x) == SP_REGNUM)
3855         {
3856           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
3857           break;
3858         }
3859
3860       /* Fall through */
3861
3862     case 0:
3863       /* Print a normal operand, if it's a general register, then we
3864          assume DImode.  */
3865       if (x == NULL)
3866         {
3867           output_operand_lossage ("missing operand");
3868           return;
3869         }
3870
3871       switch (GET_CODE (x))
3872         {
3873         case REG:
3874           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
3875           break;
3876
3877         case MEM:
3878           aarch64_memory_reference_mode = GET_MODE (x);
3879           output_address (XEXP (x, 0));
3880           break;
3881
3882         case LABEL_REF:
3883         case SYMBOL_REF:
3884           output_addr_const (asm_out_file, x);
3885           break;
3886
3887         case CONST_INT:
3888           asm_fprintf (f, "%wd", INTVAL (x));
3889           break;
3890
3891         case CONST_VECTOR:
3892           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
3893             {
3894               gcc_assert (
3895                   aarch64_const_vec_all_same_in_range_p (x,
3896                                                          HOST_WIDE_INT_MIN,
3897                                                          HOST_WIDE_INT_MAX));
3898               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
3899             }
3900           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
3901             {
3902               fputc ('0', f);
3903             }
3904           else
3905             gcc_unreachable ();
3906           break;
3907
3908         case CONST_DOUBLE:
3909           /* CONST_DOUBLE can represent a double-width integer.
3910              In this case, the mode of x is VOIDmode.  */
3911           if (GET_MODE (x) == VOIDmode)
3912             ; /* Do Nothing.  */
3913           else if (aarch64_float_const_zero_rtx_p (x))
3914             {
3915               fputc ('0', f);
3916               break;
3917             }
3918           else if (aarch64_float_const_representable_p (x))
3919             {
3920 #define buf_size 20
3921               char float_buf[buf_size] = {'\0'};
3922               REAL_VALUE_TYPE r;
3923               REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3924               real_to_decimal_for_mode (float_buf, &r,
3925                                         buf_size, buf_size,
3926                                         1, GET_MODE (x));
3927               asm_fprintf (asm_out_file, "%s", float_buf);
3928               break;
3929 #undef buf_size
3930             }
3931           output_operand_lossage ("invalid constant");
3932           return;
3933         default:
3934           output_operand_lossage ("invalid operand");
3935           return;
3936         }
3937       break;
3938
3939     case 'A':
3940       if (GET_CODE (x) == HIGH)
3941         x = XEXP (x, 0);
3942
3943       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3944         {
3945         case SYMBOL_SMALL_GOT:
3946           asm_fprintf (asm_out_file, ":got:");
3947           break;
3948
3949         case SYMBOL_SMALL_TLSGD:
3950           asm_fprintf (asm_out_file, ":tlsgd:");
3951           break;
3952
3953         case SYMBOL_SMALL_TLSDESC:
3954           asm_fprintf (asm_out_file, ":tlsdesc:");
3955           break;
3956
3957         case SYMBOL_SMALL_GOTTPREL:
3958           asm_fprintf (asm_out_file, ":gottprel:");
3959           break;
3960
3961         case SYMBOL_SMALL_TPREL:
3962           asm_fprintf (asm_out_file, ":tprel:");
3963           break;
3964
3965         case SYMBOL_TINY_GOT:
3966           gcc_unreachable ();
3967           break;
3968
3969         default:
3970           break;
3971         }
3972       output_addr_const (asm_out_file, x);
3973       break;
3974
3975     case 'L':
3976       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
3977         {
3978         case SYMBOL_SMALL_GOT:
3979           asm_fprintf (asm_out_file, ":lo12:");
3980           break;
3981
3982         case SYMBOL_SMALL_TLSGD:
3983           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
3984           break;
3985
3986         case SYMBOL_SMALL_TLSDESC:
3987           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
3988           break;
3989
3990         case SYMBOL_SMALL_GOTTPREL:
3991           asm_fprintf (asm_out_file, ":gottprel_lo12:");
3992           break;
3993
3994         case SYMBOL_SMALL_TPREL:
3995           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
3996           break;
3997
3998         case SYMBOL_TINY_GOT:
3999           asm_fprintf (asm_out_file, ":got:");
4000           break;
4001
4002         default:
4003           break;
4004         }
4005       output_addr_const (asm_out_file, x);
4006       break;
4007
4008     case 'G':
4009
4010       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4011         {
4012         case SYMBOL_SMALL_TPREL:
4013           asm_fprintf (asm_out_file, ":tprel_hi12:");
4014           break;
4015         default:
4016           break;
4017         }
4018       output_addr_const (asm_out_file, x);
4019       break;
4020
4021     default:
4022       output_operand_lossage ("invalid operand prefix '%%%c'", code);
4023       return;
4024     }
4025 }
4026
4027 void
4028 aarch64_print_operand_address (FILE *f, rtx x)
4029 {
4030   struct aarch64_address_info addr;
4031
4032   if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4033                              MEM, true))
4034     switch (addr.type)
4035       {
4036       case ADDRESS_REG_IMM:
4037         if (addr.offset == const0_rtx)
4038           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4039         else
4040           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4041                        INTVAL (addr.offset));
4042         return;
4043
4044       case ADDRESS_REG_REG:
4045         if (addr.shift == 0)
4046           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4047                        reg_names [REGNO (addr.offset)]);
4048         else
4049           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4050                        reg_names [REGNO (addr.offset)], addr.shift);
4051         return;
4052
4053       case ADDRESS_REG_UXTW:
4054         if (addr.shift == 0)
4055           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4056                        REGNO (addr.offset) - R0_REGNUM);
4057         else
4058           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4059                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4060         return;
4061
4062       case ADDRESS_REG_SXTW:
4063         if (addr.shift == 0)
4064           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4065                        REGNO (addr.offset) - R0_REGNUM);
4066         else
4067           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4068                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4069         return;
4070
4071       case ADDRESS_REG_WB:
4072         switch (GET_CODE (x))
4073           {
4074           case PRE_INC:
4075             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4076                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4077             return;
4078           case POST_INC:
4079             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4080                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4081             return;
4082           case PRE_DEC:
4083             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4084                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4085             return;
4086           case POST_DEC:
4087             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4088                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4089             return;
4090           case PRE_MODIFY:
4091             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4092                          INTVAL (addr.offset));
4093             return;
4094           case POST_MODIFY:
4095             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4096                          INTVAL (addr.offset));
4097             return;
4098           default:
4099             break;
4100           }
4101         break;
4102
4103       case ADDRESS_LO_SUM:
4104         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4105         output_addr_const (f, addr.offset);
4106         asm_fprintf (f, "]");
4107         return;
4108
4109       case ADDRESS_SYMBOLIC:
4110         break;
4111       }
4112
4113   output_addr_const (f, x);
4114 }
4115
4116 bool
4117 aarch64_label_mentioned_p (rtx x)
4118 {
4119   const char *fmt;
4120   int i;
4121
4122   if (GET_CODE (x) == LABEL_REF)
4123     return true;
4124
4125   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4126      referencing instruction, but they are constant offsets, not
4127      symbols.  */
4128   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4129     return false;
4130
4131   fmt = GET_RTX_FORMAT (GET_CODE (x));
4132   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4133     {
4134       if (fmt[i] == 'E')
4135         {
4136           int j;
4137
4138           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4139             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4140               return 1;
4141         }
4142       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4143         return 1;
4144     }
4145
4146   return 0;
4147 }
4148
4149 /* Implement REGNO_REG_CLASS.  */
4150
4151 enum reg_class
4152 aarch64_regno_regclass (unsigned regno)
4153 {
4154   if (GP_REGNUM_P (regno))
4155     return GENERAL_REGS;
4156
4157   if (regno == SP_REGNUM)
4158     return STACK_REG;
4159
4160   if (regno == FRAME_POINTER_REGNUM
4161       || regno == ARG_POINTER_REGNUM)
4162     return POINTER_REGS;
4163
4164   if (FP_REGNUM_P (regno))
4165     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
4166
4167   return NO_REGS;
4168 }
4169
4170 /* Try a machine-dependent way of reloading an illegitimate address
4171    operand.  If we find one, push the reload and return the new rtx.  */
4172
4173 rtx
4174 aarch64_legitimize_reload_address (rtx *x_p,
4175                                    enum machine_mode mode,
4176                                    int opnum, int type,
4177                                    int ind_levels ATTRIBUTE_UNUSED)
4178 {
4179   rtx x = *x_p;
4180
4181   /* Do not allow mem (plus (reg, const)) if vector struct mode.  */
4182   if (aarch64_vect_struct_mode_p (mode)
4183       && GET_CODE (x) == PLUS
4184       && REG_P (XEXP (x, 0))
4185       && CONST_INT_P (XEXP (x, 1)))
4186     {
4187       rtx orig_rtx = x;
4188       x = copy_rtx (x);
4189       push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4190                    BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4191                    opnum, (enum reload_type) type);
4192       return x;
4193     }
4194
4195   /* We must recognize output that we have already generated ourselves.  */
4196   if (GET_CODE (x) == PLUS
4197       && GET_CODE (XEXP (x, 0)) == PLUS
4198       && REG_P (XEXP (XEXP (x, 0), 0))
4199       && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4200       && CONST_INT_P (XEXP (x, 1)))
4201     {
4202       push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4203                    BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4204                    opnum, (enum reload_type) type);
4205       return x;
4206     }
4207
4208   /* We wish to handle large displacements off a base register by splitting
4209      the addend across an add and the mem insn.  This can cut the number of
4210      extra insns needed from 3 to 1.  It is only useful for load/store of a
4211      single register with 12 bit offset field.  */
4212   if (GET_CODE (x) == PLUS
4213       && REG_P (XEXP (x, 0))
4214       && CONST_INT_P (XEXP (x, 1))
4215       && HARD_REGISTER_P (XEXP (x, 0))
4216       && mode != TImode
4217       && mode != TFmode
4218       && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4219     {
4220       HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4221       HOST_WIDE_INT low = val & 0xfff;
4222       HOST_WIDE_INT high = val - low;
4223       HOST_WIDE_INT offs;
4224       rtx cst;
4225       enum machine_mode xmode = GET_MODE (x);
4226
4227       /* In ILP32, xmode can be either DImode or SImode.  */
4228       gcc_assert (xmode == DImode || xmode == SImode);
4229
4230       /* Reload non-zero BLKmode offsets.  This is because we cannot ascertain
4231          BLKmode alignment.  */
4232       if (GET_MODE_SIZE (mode) == 0)
4233         return NULL_RTX;
4234
4235       offs = low % GET_MODE_SIZE (mode);
4236
4237       /* Align misaligned offset by adjusting high part to compensate.  */
4238       if (offs != 0)
4239         {
4240           if (aarch64_uimm12_shift (high + offs))
4241             {
4242               /* Align down.  */
4243               low = low - offs;
4244               high = high + offs;
4245             }
4246           else
4247             {
4248               /* Align up.  */
4249               offs = GET_MODE_SIZE (mode) - offs;
4250               low = low + offs;
4251               high = high + (low & 0x1000) - offs;
4252               low &= 0xfff;
4253             }
4254         }
4255
4256       /* Check for overflow.  */
4257       if (high + low != val)
4258         return NULL_RTX;
4259
4260       cst = GEN_INT (high);
4261       if (!aarch64_uimm12_shift (high))
4262         cst = force_const_mem (xmode, cst);
4263
4264       /* Reload high part into base reg, leaving the low part
4265          in the mem instruction.
4266          Note that replacing this gen_rtx_PLUS with plus_constant is
4267          wrong in this case because we rely on the
4268          (plus (plus reg c1) c2) structure being preserved so that
4269          XEXP (*p, 0) in push_reload below uses the correct term.  */
4270       x = gen_rtx_PLUS (xmode,
4271                         gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4272                         GEN_INT (low));
4273
4274       push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4275                    BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4276                    opnum, (enum reload_type) type);
4277       return x;
4278     }
4279
4280   return NULL_RTX;
4281 }
4282
4283
4284 static reg_class_t
4285 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4286                           reg_class_t rclass,
4287                           enum machine_mode mode,
4288                           secondary_reload_info *sri)
4289 {
4290   /* Without the TARGET_SIMD instructions we cannot move a Q register
4291      to a Q register directly.  We need a scratch.  */
4292   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4293       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4294       && reg_class_subset_p (rclass, FP_REGS))
4295     {
4296       if (mode == TFmode)
4297         sri->icode = CODE_FOR_aarch64_reload_movtf;
4298       else if (mode == TImode)
4299         sri->icode = CODE_FOR_aarch64_reload_movti;
4300       return NO_REGS;
4301     }
4302
4303   /* A TFmode or TImode memory access should be handled via an FP_REGS
4304      because AArch64 has richer addressing modes for LDR/STR instructions
4305      than LDP/STP instructions.  */
4306   if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4307       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4308     return FP_REGS;
4309
4310   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4311       return GENERAL_REGS;
4312
4313   return NO_REGS;
4314 }
4315
4316 static bool
4317 aarch64_can_eliminate (const int from, const int to)
4318 {
4319   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4320      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
4321
4322   if (frame_pointer_needed)
4323     {
4324       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4325         return true;
4326       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4327         return false;
4328       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4329           && !cfun->calls_alloca)
4330         return true;
4331       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4332         return true;
4333
4334       return false;
4335     }
4336
4337   return true;
4338 }
4339
4340 HOST_WIDE_INT
4341 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4342 {
4343   aarch64_layout_frame ();
4344
4345   if (to == HARD_FRAME_POINTER_REGNUM)
4346     {
4347       if (from == ARG_POINTER_REGNUM)
4348         return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4349
4350       if (from == FRAME_POINTER_REGNUM)
4351         return (cfun->machine->frame.hard_fp_offset
4352                 - cfun->machine->frame.saved_varargs_size);
4353     }
4354
4355   if (to == STACK_POINTER_REGNUM)
4356     {
4357       if (from == FRAME_POINTER_REGNUM)
4358           return (cfun->machine->frame.frame_size
4359                   - cfun->machine->frame.saved_varargs_size);
4360     }
4361
4362   return cfun->machine->frame.frame_size;
4363 }
4364
4365 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
4366    previous frame.  */
4367
4368 rtx
4369 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4370 {
4371   if (count != 0)
4372     return const0_rtx;
4373   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4374 }
4375
4376
4377 static void
4378 aarch64_asm_trampoline_template (FILE *f)
4379 {
4380   if (TARGET_ILP32)
4381     {
4382       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4383       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4384     }
4385   else
4386     {
4387       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4388       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4389     }
4390   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4391   assemble_aligned_integer (4, const0_rtx);
4392   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4393   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4394 }
4395
4396 static void
4397 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4398 {
4399   rtx fnaddr, mem, a_tramp;
4400   const int tramp_code_sz = 16;
4401
4402   /* Don't need to copy the trailing D-words, we fill those in below.  */
4403   emit_block_move (m_tramp, assemble_trampoline_template (),
4404                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4405   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4406   fnaddr = XEXP (DECL_RTL (fndecl), 0);
4407   if (GET_MODE (fnaddr) != ptr_mode)
4408     fnaddr = convert_memory_address (ptr_mode, fnaddr);
4409   emit_move_insn (mem, fnaddr);
4410
4411   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4412   emit_move_insn (mem, chain_value);
4413
4414   /* XXX We should really define a "clear_cache" pattern and use
4415      gen_clear_cache().  */
4416   a_tramp = XEXP (m_tramp, 0);
4417   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4418                      LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4419                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4420                      ptr_mode);
4421 }
4422
4423 static unsigned char
4424 aarch64_class_max_nregs (reg_class_t regclass, enum machine_mode mode)
4425 {
4426   switch (regclass)
4427     {
4428     case CALLER_SAVE_REGS:
4429     case POINTER_REGS:
4430     case GENERAL_REGS:
4431     case ALL_REGS:
4432     case FP_REGS:
4433     case FP_LO_REGS:
4434       return
4435         aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4436                                        (GET_MODE_SIZE (mode) + 7) / 8;
4437     case STACK_REG:
4438       return 1;
4439
4440     case NO_REGS:
4441       return 0;
4442
4443     default:
4444       break;
4445     }
4446   gcc_unreachable ();
4447 }
4448
4449 static reg_class_t
4450 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4451 {
4452   if (regclass == POINTER_REGS)
4453     return GENERAL_REGS;
4454
4455   if (regclass == STACK_REG)
4456     {
4457       if (REG_P(x)
4458           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4459           return regclass;
4460
4461       return NO_REGS;
4462     }
4463
4464   /* If it's an integer immediate that MOVI can't handle, then
4465      FP_REGS is not an option, so we return NO_REGS instead.  */
4466   if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4467       && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4468     return NO_REGS;
4469
4470   /* Register eliminiation can result in a request for
4471      SP+constant->FP_REGS.  We cannot support such operations which
4472      use SP as source and an FP_REG as destination, so reject out
4473      right now.  */
4474   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4475     {
4476       rtx lhs = XEXP (x, 0);
4477
4478       /* Look through a possible SUBREG introduced by ILP32.  */
4479       if (GET_CODE (lhs) == SUBREG)
4480         lhs = SUBREG_REG (lhs);
4481
4482       gcc_assert (REG_P (lhs));
4483       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4484                                       POINTER_REGS));
4485       return NO_REGS;
4486     }
4487
4488   return regclass;
4489 }
4490
4491 void
4492 aarch64_asm_output_labelref (FILE* f, const char *name)
4493 {
4494   asm_fprintf (f, "%U%s", name);
4495 }
4496
4497 static void
4498 aarch64_elf_asm_constructor (rtx symbol, int priority)
4499 {
4500   if (priority == DEFAULT_INIT_PRIORITY)
4501     default_ctor_section_asm_out_constructor (symbol, priority);
4502   else
4503     {
4504       section *s;
4505       char buf[18];
4506       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4507       s = get_section (buf, SECTION_WRITE, NULL);
4508       switch_to_section (s);
4509       assemble_align (POINTER_SIZE);
4510       assemble_aligned_integer (POINTER_BYTES, symbol);
4511     }
4512 }
4513
4514 static void
4515 aarch64_elf_asm_destructor (rtx symbol, int priority)
4516 {
4517   if (priority == DEFAULT_INIT_PRIORITY)
4518     default_dtor_section_asm_out_destructor (symbol, priority);
4519   else
4520     {
4521       section *s;
4522       char buf[18];
4523       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4524       s = get_section (buf, SECTION_WRITE, NULL);
4525       switch_to_section (s);
4526       assemble_align (POINTER_SIZE);
4527       assemble_aligned_integer (POINTER_BYTES, symbol);
4528     }
4529 }
4530
4531 const char*
4532 aarch64_output_casesi (rtx *operands)
4533 {
4534   char buf[100];
4535   char label[100];
4536   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
4537   int index;
4538   static const char *const patterns[4][2] =
4539   {
4540     {
4541       "ldrb\t%w3, [%0,%w1,uxtw]",
4542       "add\t%3, %4, %w3, sxtb #2"
4543     },
4544     {
4545       "ldrh\t%w3, [%0,%w1,uxtw #1]",
4546       "add\t%3, %4, %w3, sxth #2"
4547     },
4548     {
4549       "ldr\t%w3, [%0,%w1,uxtw #2]",
4550       "add\t%3, %4, %w3, sxtw #2"
4551     },
4552     /* We assume that DImode is only generated when not optimizing and
4553        that we don't really need 64-bit address offsets.  That would
4554        imply an object file with 8GB of code in a single function!  */
4555     {
4556       "ldr\t%w3, [%0,%w1,uxtw #2]",
4557       "add\t%3, %4, %w3, sxtw #2"
4558     }
4559   };
4560
4561   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
4562
4563   index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
4564
4565   gcc_assert (index >= 0 && index <= 3);
4566
4567   /* Need to implement table size reduction, by chaning the code below.  */
4568   output_asm_insn (patterns[index][0], operands);
4569   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
4570   snprintf (buf, sizeof (buf),
4571             "adr\t%%4, %s", targetm.strip_name_encoding (label));
4572   output_asm_insn (buf, operands);
4573   output_asm_insn (patterns[index][1], operands);
4574   output_asm_insn ("br\t%3", operands);
4575   assemble_label (asm_out_file, label);
4576   return "";
4577 }
4578
4579
4580 /* Return size in bits of an arithmetic operand which is shifted/scaled and
4581    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
4582    operator.  */
4583
4584 int
4585 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
4586 {
4587   if (shift >= 0 && shift <= 3)
4588     {
4589       int size;
4590       for (size = 8; size <= 32; size *= 2)
4591         {
4592           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
4593           if (mask == bits << shift)
4594             return size;
4595         }
4596     }
4597   return 0;
4598 }
4599
4600 static bool
4601 aarch64_use_blocks_for_constant_p (enum machine_mode mode ATTRIBUTE_UNUSED,
4602                                    const_rtx x ATTRIBUTE_UNUSED)
4603 {
4604   /* We can't use blocks for constants when we're using a per-function
4605      constant pool.  */
4606   return false;
4607 }
4608
4609 static section *
4610 aarch64_select_rtx_section (enum machine_mode mode ATTRIBUTE_UNUSED,
4611                             rtx x ATTRIBUTE_UNUSED,
4612                             unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
4613 {
4614   /* Force all constant pool entries into the current function section.  */
4615   return function_section (current_function_decl);
4616 }
4617
4618
4619 /* Costs.  */
4620
4621 /* Helper function for rtx cost calculation.  Strip a shift expression
4622    from X.  Returns the inner operand if successful, or the original
4623    expression on failure.  */
4624 static rtx
4625 aarch64_strip_shift (rtx x)
4626 {
4627   rtx op = x;
4628
4629   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
4630      we can convert both to ROR during final output.  */
4631   if ((GET_CODE (op) == ASHIFT
4632        || GET_CODE (op) == ASHIFTRT
4633        || GET_CODE (op) == LSHIFTRT
4634        || GET_CODE (op) == ROTATERT
4635        || GET_CODE (op) == ROTATE)
4636       && CONST_INT_P (XEXP (op, 1)))
4637     return XEXP (op, 0);
4638
4639   if (GET_CODE (op) == MULT
4640       && CONST_INT_P (XEXP (op, 1))
4641       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
4642     return XEXP (op, 0);
4643
4644   return x;
4645 }
4646
4647 /* Helper function for rtx cost calculation.  Strip an extend
4648    expression from X.  Returns the inner operand if successful, or the
4649    original expression on failure.  We deal with a number of possible
4650    canonicalization variations here.  */
4651 static rtx
4652 aarch64_strip_extend (rtx x)
4653 {
4654   rtx op = x;
4655
4656   /* Zero and sign extraction of a widened value.  */
4657   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
4658       && XEXP (op, 2) == const0_rtx
4659       && GET_CODE (XEXP (op, 0)) == MULT
4660       && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
4661                                          XEXP (op, 1)))
4662     return XEXP (XEXP (op, 0), 0);
4663
4664   /* It can also be represented (for zero-extend) as an AND with an
4665      immediate.  */
4666   if (GET_CODE (op) == AND
4667       && GET_CODE (XEXP (op, 0)) == MULT
4668       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
4669       && CONST_INT_P (XEXP (op, 1))
4670       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
4671                            INTVAL (XEXP (op, 1))) != 0)
4672     return XEXP (XEXP (op, 0), 0);
4673
4674   /* Now handle extended register, as this may also have an optional
4675      left shift by 1..4.  */
4676   if (GET_CODE (op) == ASHIFT
4677       && CONST_INT_P (XEXP (op, 1))
4678       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
4679     op = XEXP (op, 0);
4680
4681   if (GET_CODE (op) == ZERO_EXTEND
4682       || GET_CODE (op) == SIGN_EXTEND)
4683     op = XEXP (op, 0);
4684
4685   if (op != x)
4686     return op;
4687
4688   return x;
4689 }
4690
4691 /* Helper function for rtx cost calculation.  Calculate the cost of
4692    a MULT, which may be part of a multiply-accumulate rtx.  Return
4693    the calculated cost of the expression, recursing manually in to
4694    operands where needed.  */
4695
4696 static int
4697 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
4698 {
4699   rtx op0, op1;
4700   const struct cpu_cost_table *extra_cost
4701     = aarch64_tune_params->insn_extra_cost;
4702   int cost = 0;
4703   bool maybe_fma = (outer == PLUS || outer == MINUS);
4704   enum machine_mode mode = GET_MODE (x);
4705
4706   gcc_checking_assert (code == MULT);
4707
4708   op0 = XEXP (x, 0);
4709   op1 = XEXP (x, 1);
4710
4711   if (VECTOR_MODE_P (mode))
4712     mode = GET_MODE_INNER (mode);
4713
4714   /* Integer multiply/fma.  */
4715   if (GET_MODE_CLASS (mode) == MODE_INT)
4716     {
4717       /* The multiply will be canonicalized as a shift, cost it as such.  */
4718       if (CONST_INT_P (op1)
4719           && exact_log2 (INTVAL (op1)) > 0)
4720         {
4721           if (speed)
4722             {
4723               if (maybe_fma)
4724                 /* ADD (shifted register).  */
4725                 cost += extra_cost->alu.arith_shift;
4726               else
4727                 /* LSL (immediate).  */
4728                 cost += extra_cost->alu.shift;
4729             }
4730
4731           cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
4732
4733           return cost;
4734         }
4735
4736       /* Integer multiplies or FMAs have zero/sign extending variants.  */
4737       if ((GET_CODE (op0) == ZERO_EXTEND
4738            && GET_CODE (op1) == ZERO_EXTEND)
4739           || (GET_CODE (op0) == SIGN_EXTEND
4740               && GET_CODE (op1) == SIGN_EXTEND))
4741         {
4742           cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
4743                   + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
4744
4745           if (speed)
4746             {
4747               if (maybe_fma)
4748                 /* MADD/SMADDL/UMADDL.  */
4749                 cost += extra_cost->mult[0].extend_add;
4750               else
4751                 /* MUL/SMULL/UMULL.  */
4752                 cost += extra_cost->mult[0].extend;
4753             }
4754
4755           return cost;
4756         }
4757
4758       /* This is either an integer multiply or an FMA.  In both cases
4759          we want to recurse and cost the operands.  */
4760       cost += rtx_cost (op0, MULT, 0, speed)
4761               + rtx_cost (op1, MULT, 1, speed);
4762
4763       if (speed)
4764         {
4765           if (maybe_fma)
4766             /* MADD.  */
4767             cost += extra_cost->mult[mode == DImode].add;
4768           else
4769             /* MUL.  */
4770             cost += extra_cost->mult[mode == DImode].simple;
4771         }
4772
4773       return cost;
4774     }
4775   else
4776     {
4777       if (speed)
4778         {
4779           /* Floating-point FMA/FMUL can also support negations of the
4780              operands.  */
4781           if (GET_CODE (op0) == NEG)
4782             op0 = XEXP (op0, 0);
4783           if (GET_CODE (op1) == NEG)
4784             op1 = XEXP (op1, 0);
4785
4786           if (maybe_fma)
4787             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
4788             cost += extra_cost->fp[mode == DFmode].fma;
4789           else
4790             /* FMUL/FNMUL.  */
4791             cost += extra_cost->fp[mode == DFmode].mult;
4792         }
4793
4794       cost += rtx_cost (op0, MULT, 0, speed)
4795               + rtx_cost (op1, MULT, 1, speed);
4796       return cost;
4797     }
4798 }
4799
4800 static int
4801 aarch64_address_cost (rtx x,
4802                       enum machine_mode mode,
4803                       addr_space_t as ATTRIBUTE_UNUSED,
4804                       bool speed)
4805 {
4806   enum rtx_code c = GET_CODE (x);
4807   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
4808   struct aarch64_address_info info;
4809   int cost = 0;
4810   info.shift = 0;
4811
4812   if (!aarch64_classify_address (&info, x, mode, c, false))
4813     {
4814       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
4815         {
4816           /* This is a CONST or SYMBOL ref which will be split
4817              in a different way depending on the code model in use.
4818              Cost it through the generic infrastructure.  */
4819           int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
4820           /* Divide through by the cost of one instruction to
4821              bring it to the same units as the address costs.  */
4822           cost_symbol_ref /= COSTS_N_INSNS (1);
4823           /* The cost is then the cost of preparing the address,
4824              followed by an immediate (possibly 0) offset.  */
4825           return cost_symbol_ref + addr_cost->imm_offset;
4826         }
4827       else
4828         {
4829           /* This is most likely a jump table from a case
4830              statement.  */
4831           return addr_cost->register_offset;
4832         }
4833     }
4834
4835   switch (info.type)
4836     {
4837       case ADDRESS_LO_SUM:
4838       case ADDRESS_SYMBOLIC:
4839       case ADDRESS_REG_IMM:
4840         cost += addr_cost->imm_offset;
4841         break;
4842
4843       case ADDRESS_REG_WB:
4844         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
4845           cost += addr_cost->pre_modify;
4846         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
4847           cost += addr_cost->post_modify;
4848         else
4849           gcc_unreachable ();
4850
4851         break;
4852
4853       case ADDRESS_REG_REG:
4854         cost += addr_cost->register_offset;
4855         break;
4856
4857       case ADDRESS_REG_UXTW:
4858       case ADDRESS_REG_SXTW:
4859         cost += addr_cost->register_extend;
4860         break;
4861
4862       default:
4863         gcc_unreachable ();
4864     }
4865
4866
4867   if (info.shift > 0)
4868     {
4869       /* For the sake of calculating the cost of the shifted register
4870          component, we can treat same sized modes in the same way.  */
4871       switch (GET_MODE_BITSIZE (mode))
4872         {
4873           case 16:
4874             cost += addr_cost->addr_scale_costs.hi;
4875             break;
4876
4877           case 32:
4878             cost += addr_cost->addr_scale_costs.si;
4879             break;
4880
4881           case 64:
4882             cost += addr_cost->addr_scale_costs.di;
4883             break;
4884
4885           /* We can't tell, or this is a 128-bit vector.  */
4886           default:
4887             cost += addr_cost->addr_scale_costs.ti;
4888             break;
4889         }
4890     }
4891
4892   return cost;
4893 }
4894
4895 /* Return true if the RTX X in mode MODE is a zero or sign extract
4896    usable in an ADD or SUB (extended register) instruction.  */
4897 static bool
4898 aarch64_rtx_arith_op_extract_p (rtx x, enum machine_mode mode)
4899 {
4900   /* Catch add with a sign extract.
4901      This is add_<optab><mode>_multp2.  */
4902   if (GET_CODE (x) == SIGN_EXTRACT
4903       || GET_CODE (x) == ZERO_EXTRACT)
4904     {
4905       rtx op0 = XEXP (x, 0);
4906       rtx op1 = XEXP (x, 1);
4907       rtx op2 = XEXP (x, 2);
4908
4909       if (GET_CODE (op0) == MULT
4910           && CONST_INT_P (op1)
4911           && op2 == const0_rtx
4912           && CONST_INT_P (XEXP (op0, 1))
4913           && aarch64_is_extend_from_extract (mode,
4914                                              XEXP (op0, 1),
4915                                              op1))
4916         {
4917           return true;
4918         }
4919     }
4920
4921   return false;
4922 }
4923
4924 static bool
4925 aarch64_frint_unspec_p (unsigned int u)
4926 {
4927   switch (u)
4928     {
4929       case UNSPEC_FRINTZ:
4930       case UNSPEC_FRINTP:
4931       case UNSPEC_FRINTM:
4932       case UNSPEC_FRINTA:
4933       case UNSPEC_FRINTN:
4934       case UNSPEC_FRINTX:
4935       case UNSPEC_FRINTI:
4936         return true;
4937
4938       default:
4939         return false;
4940     }
4941 }
4942
4943 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
4944    storing it in *COST.  Result is true if the total cost of the operation
4945    has now been calculated.  */
4946 static bool
4947 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
4948 {
4949   rtx inner;
4950   rtx comparator;
4951   enum rtx_code cmpcode;
4952
4953   if (COMPARISON_P (op0))
4954     {
4955       inner = XEXP (op0, 0);
4956       comparator = XEXP (op0, 1);
4957       cmpcode = GET_CODE (op0);
4958     }
4959   else
4960     {
4961       inner = op0;
4962       comparator = const0_rtx;
4963       cmpcode = NE;
4964     }
4965
4966   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
4967     {
4968       /* Conditional branch.  */
4969       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
4970         return true;
4971       else
4972         {
4973           if (cmpcode == NE || cmpcode == EQ)
4974             {
4975               if (comparator == const0_rtx)
4976                 {
4977                   /* TBZ/TBNZ/CBZ/CBNZ.  */
4978                   if (GET_CODE (inner) == ZERO_EXTRACT)
4979                     /* TBZ/TBNZ.  */
4980                     *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
4981                                        0, speed);
4982                 else
4983                   /* CBZ/CBNZ.  */
4984                   *cost += rtx_cost (inner, cmpcode, 0, speed);
4985
4986                 return true;
4987               }
4988             }
4989           else if (cmpcode == LT || cmpcode == GE)
4990             {
4991               /* TBZ/TBNZ.  */
4992               if (comparator == const0_rtx)
4993                 return true;
4994             }
4995         }
4996     }
4997   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
4998     {
4999       /* It's a conditional operation based on the status flags,
5000          so it must be some flavor of CSEL.  */
5001
5002       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
5003       if (GET_CODE (op1) == NEG
5004           || GET_CODE (op1) == NOT
5005           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5006         op1 = XEXP (op1, 0);
5007
5008       *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5009       *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5010       return true;
5011     }
5012
5013   /* We don't know what this is, cost all operands.  */
5014   return false;
5015 }
5016
5017 /* Calculate the cost of calculating X, storing it in *COST.  Result
5018    is true if the total cost of the operation has now been calculated.  */
5019 static bool
5020 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5021                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5022 {
5023   rtx op0, op1, op2;
5024   const struct cpu_cost_table *extra_cost
5025     = aarch64_tune_params->insn_extra_cost;
5026   enum machine_mode mode = GET_MODE (x);
5027
5028   /* By default, assume that everything has equivalent cost to the
5029      cheapest instruction.  Any additional costs are applied as a delta
5030      above this default.  */
5031   *cost = COSTS_N_INSNS (1);
5032
5033   /* TODO: The cost infrastructure currently does not handle
5034      vector operations.  Assume that all vector operations
5035      are equally expensive.  */
5036   if (VECTOR_MODE_P (mode))
5037     {
5038       if (speed)
5039         *cost += extra_cost->vect.alu;
5040       return true;
5041     }
5042
5043   switch (code)
5044     {
5045     case SET:
5046       /* The cost depends entirely on the operands to SET.  */
5047       *cost = 0;
5048       op0 = SET_DEST (x);
5049       op1 = SET_SRC (x);
5050
5051       switch (GET_CODE (op0))
5052         {
5053         case MEM:
5054           if (speed)
5055             {
5056               rtx address = XEXP (op0, 0);
5057               if (GET_MODE_CLASS (mode) == MODE_INT)
5058                 *cost += extra_cost->ldst.store;
5059               else if (mode == SFmode)
5060                 *cost += extra_cost->ldst.storef;
5061               else if (mode == DFmode)
5062                 *cost += extra_cost->ldst.stored;
5063
5064               *cost +=
5065                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5066                                                      0, speed));
5067             }
5068
5069           *cost += rtx_cost (op1, SET, 1, speed);
5070           return true;
5071
5072         case SUBREG:
5073           if (! REG_P (SUBREG_REG (op0)))
5074             *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5075
5076           /* Fall through.  */
5077         case REG:
5078           /* const0_rtx is in general free, but we will use an
5079              instruction to set a register to 0.  */
5080           if (REG_P (op1) || op1 == const0_rtx)
5081             {
5082               /* The cost is 1 per register copied.  */
5083               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5084                               / UNITS_PER_WORD;
5085               *cost = COSTS_N_INSNS (n_minus_1 + 1);
5086             }
5087           else
5088             /* Cost is just the cost of the RHS of the set.  */
5089             *cost += rtx_cost (op1, SET, 1, speed);
5090           return true;
5091
5092         case ZERO_EXTRACT:
5093         case SIGN_EXTRACT:
5094           /* Bit-field insertion.  Strip any redundant widening of
5095              the RHS to meet the width of the target.  */
5096           if (GET_CODE (op1) == SUBREG)
5097             op1 = SUBREG_REG (op1);
5098           if ((GET_CODE (op1) == ZERO_EXTEND
5099                || GET_CODE (op1) == SIGN_EXTEND)
5100               && CONST_INT_P (XEXP (op0, 1))
5101               && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5102                   >= INTVAL (XEXP (op0, 1))))
5103             op1 = XEXP (op1, 0);
5104
5105           if (CONST_INT_P (op1))
5106             {
5107               /* MOV immediate is assumed to always be cheap.  */
5108               *cost = COSTS_N_INSNS (1);
5109             }
5110           else
5111             {
5112               /* BFM.  */
5113               if (speed)
5114                 *cost += extra_cost->alu.bfi;
5115               *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5116             }
5117
5118           return true;
5119
5120         default:
5121           /* We can't make sense of this, assume default cost.  */
5122           *cost = COSTS_N_INSNS (1);
5123           return false;
5124         }
5125       return false;
5126
5127     case CONST_INT:
5128       /* If an instruction can incorporate a constant within the
5129          instruction, the instruction's expression avoids calling
5130          rtx_cost() on the constant.  If rtx_cost() is called on a
5131          constant, then it is usually because the constant must be
5132          moved into a register by one or more instructions.
5133
5134          The exception is constant 0, which can be expressed
5135          as XZR/WZR and is therefore free.  The exception to this is
5136          if we have (set (reg) (const0_rtx)) in which case we must cost
5137          the move.  However, we can catch that when we cost the SET, so
5138          we don't need to consider that here.  */
5139       if (x == const0_rtx)
5140         *cost = 0;
5141       else
5142         {
5143           /* To an approximation, building any other constant is
5144              proportionally expensive to the number of instructions
5145              required to build that constant.  This is true whether we
5146              are compiling for SPEED or otherwise.  */
5147           *cost = COSTS_N_INSNS (aarch64_build_constant (0,
5148                                                          INTVAL (x),
5149                                                          false));
5150         }
5151       return true;
5152
5153     case CONST_DOUBLE:
5154       if (speed)
5155         {
5156           /* mov[df,sf]_aarch64.  */
5157           if (aarch64_float_const_representable_p (x))
5158             /* FMOV (scalar immediate).  */
5159             *cost += extra_cost->fp[mode == DFmode].fpconst;
5160           else if (!aarch64_float_const_zero_rtx_p (x))
5161             {
5162               /* This will be a load from memory.  */
5163               if (mode == DFmode)
5164                 *cost += extra_cost->ldst.loadd;
5165               else
5166                 *cost += extra_cost->ldst.loadf;
5167             }
5168           else
5169             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
5170                or MOV v0.s[0], wzr - neither of which are modeled by the
5171                cost tables.  Just use the default cost.  */
5172             {
5173             }
5174         }
5175
5176       return true;
5177
5178     case MEM:
5179       if (speed)
5180         {
5181           /* For loads we want the base cost of a load, plus an
5182              approximation for the additional cost of the addressing
5183              mode.  */
5184           rtx address = XEXP (x, 0);
5185           if (GET_MODE_CLASS (mode) == MODE_INT)
5186             *cost += extra_cost->ldst.load;
5187           else if (mode == SFmode)
5188             *cost += extra_cost->ldst.loadf;
5189           else if (mode == DFmode)
5190             *cost += extra_cost->ldst.loadd;
5191
5192           *cost +=
5193                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5194                                                      0, speed));
5195         }
5196
5197       return true;
5198
5199     case NEG:
5200       op0 = XEXP (x, 0);
5201
5202       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5203        {
5204           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5205               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5206             {
5207               /* CSETM.  */
5208               *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5209               return true;
5210             }
5211
5212           /* Cost this as SUB wzr, X.  */
5213           op0 = CONST0_RTX (GET_MODE (x));
5214           op1 = XEXP (x, 0);
5215           goto cost_minus;
5216         }
5217
5218       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5219         {
5220           /* Support (neg(fma...)) as a single instruction only if
5221              sign of zeros is unimportant.  This matches the decision
5222              making in aarch64.md.  */
5223           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5224             {
5225               /* FNMADD.  */
5226               *cost = rtx_cost (op0, NEG, 0, speed);
5227               return true;
5228             }
5229           if (speed)
5230             /* FNEG.  */
5231             *cost += extra_cost->fp[mode == DFmode].neg;
5232           return false;
5233         }
5234
5235       return false;
5236
5237     case CLRSB:
5238     case CLZ:
5239       if (speed)
5240         *cost += extra_cost->alu.clz;
5241
5242       return false;
5243
5244     case COMPARE:
5245       op0 = XEXP (x, 0);
5246       op1 = XEXP (x, 1);
5247
5248       if (op1 == const0_rtx
5249           && GET_CODE (op0) == AND)
5250         {
5251           x = op0;
5252           goto cost_logic;
5253         }
5254
5255       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5256         {
5257           /* TODO: A write to the CC flags possibly costs extra, this
5258              needs encoding in the cost tables.  */
5259
5260           /* CC_ZESWPmode supports zero extend for free.  */
5261           if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5262             op0 = XEXP (op0, 0);
5263
5264           /* ANDS.  */
5265           if (GET_CODE (op0) == AND)
5266             {
5267               x = op0;
5268               goto cost_logic;
5269             }
5270
5271           if (GET_CODE (op0) == PLUS)
5272             {
5273               /* ADDS (and CMN alias).  */
5274               x = op0;
5275               goto cost_plus;
5276             }
5277
5278           if (GET_CODE (op0) == MINUS)
5279             {
5280               /* SUBS.  */
5281               x = op0;
5282               goto cost_minus;
5283             }
5284
5285           if (GET_CODE (op1) == NEG)
5286             {
5287               /* CMN.  */
5288               if (speed)
5289                 *cost += extra_cost->alu.arith;
5290
5291               *cost += rtx_cost (op0, COMPARE, 0, speed);
5292               *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5293               return true;
5294             }
5295
5296           /* CMP.
5297
5298              Compare can freely swap the order of operands, and
5299              canonicalization puts the more complex operation first.
5300              But the integer MINUS logic expects the shift/extend
5301              operation in op1.  */
5302           if (! (REG_P (op0)
5303                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5304           {
5305             op0 = XEXP (x, 1);
5306             op1 = XEXP (x, 0);
5307           }
5308           goto cost_minus;
5309         }
5310
5311       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5312         {
5313           /* FCMP.  */
5314           if (speed)
5315             *cost += extra_cost->fp[mode == DFmode].compare;
5316
5317           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5318             {
5319               /* FCMP supports constant 0.0 for no extra cost. */
5320               return true;
5321             }
5322           return false;
5323         }
5324
5325       return false;
5326
5327     case MINUS:
5328       {
5329         op0 = XEXP (x, 0);
5330         op1 = XEXP (x, 1);
5331
5332 cost_minus:
5333         /* Detect valid immediates.  */
5334         if ((GET_MODE_CLASS (mode) == MODE_INT
5335              || (GET_MODE_CLASS (mode) == MODE_CC
5336                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5337             && CONST_INT_P (op1)
5338             && aarch64_uimm12_shift (INTVAL (op1)))
5339           {
5340             *cost += rtx_cost (op0, MINUS, 0, speed);
5341
5342             if (speed)
5343               /* SUB(S) (immediate).  */
5344               *cost += extra_cost->alu.arith;
5345             return true;
5346
5347           }
5348
5349         /* Look for SUB (extended register).  */
5350         if (aarch64_rtx_arith_op_extract_p (op1, mode))
5351           {
5352             if (speed)
5353               *cost += extra_cost->alu.arith_shift;
5354
5355             *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5356                                (enum rtx_code) GET_CODE (op1),
5357                                0, speed);
5358             return true;
5359           }
5360
5361         rtx new_op1 = aarch64_strip_extend (op1);
5362
5363         /* Cost this as an FMA-alike operation.  */
5364         if ((GET_CODE (new_op1) == MULT
5365              || GET_CODE (new_op1) == ASHIFT)
5366             && code != COMPARE)
5367           {
5368             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5369                                             (enum rtx_code) code,
5370                                             speed);
5371             *cost += rtx_cost (op0, MINUS, 0, speed);
5372             return true;
5373           }
5374
5375         *cost += rtx_cost (new_op1, MINUS, 1, speed);
5376
5377         if (speed)
5378           {
5379             if (GET_MODE_CLASS (mode) == MODE_INT)
5380               /* SUB(S).  */
5381               *cost += extra_cost->alu.arith;
5382             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5383               /* FSUB.  */
5384               *cost += extra_cost->fp[mode == DFmode].addsub;
5385           }
5386         return true;
5387       }
5388
5389     case PLUS:
5390       {
5391         rtx new_op0;
5392
5393         op0 = XEXP (x, 0);
5394         op1 = XEXP (x, 1);
5395
5396 cost_plus:
5397         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5398             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5399           {
5400             /* CSINC.  */
5401             *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5402             *cost += rtx_cost (op1, PLUS, 1, speed);
5403             return true;
5404           }
5405
5406         if (GET_MODE_CLASS (mode) == MODE_INT
5407             && CONST_INT_P (op1)
5408             && aarch64_uimm12_shift (INTVAL (op1)))
5409           {
5410             *cost += rtx_cost (op0, PLUS, 0, speed);
5411
5412             if (speed)
5413               /* ADD (immediate).  */
5414               *cost += extra_cost->alu.arith;
5415             return true;
5416           }
5417
5418         /* Look for ADD (extended register).  */
5419         if (aarch64_rtx_arith_op_extract_p (op0, mode))
5420           {
5421             if (speed)
5422               *cost += extra_cost->alu.arith_shift;
5423
5424             *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5425                                (enum rtx_code) GET_CODE (op0),
5426                                0, speed);
5427             return true;
5428           }
5429
5430         /* Strip any extend, leave shifts behind as we will
5431            cost them through mult_cost.  */
5432         new_op0 = aarch64_strip_extend (op0);
5433
5434         if (GET_CODE (new_op0) == MULT
5435             || GET_CODE (new_op0) == ASHIFT)
5436           {
5437             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5438                                             speed);
5439             *cost += rtx_cost (op1, PLUS, 1, speed);
5440             return true;
5441           }
5442
5443         *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5444                   + rtx_cost (op1, PLUS, 1, speed));
5445
5446         if (speed)
5447           {
5448             if (GET_MODE_CLASS (mode) == MODE_INT)
5449               /* ADD.  */
5450               *cost += extra_cost->alu.arith;
5451             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5452               /* FADD.  */
5453               *cost += extra_cost->fp[mode == DFmode].addsub;
5454           }
5455         return true;
5456       }
5457
5458     case BSWAP:
5459       *cost = COSTS_N_INSNS (1);
5460
5461       if (speed)
5462         *cost += extra_cost->alu.rev;
5463
5464       return false;
5465
5466     case IOR:
5467       if (aarch_rev16_p (x))
5468         {
5469           *cost = COSTS_N_INSNS (1);
5470
5471           if (speed)
5472             *cost += extra_cost->alu.rev;
5473
5474           return true;
5475         }
5476     /* Fall through.  */
5477     case XOR:
5478     case AND:
5479     cost_logic:
5480       op0 = XEXP (x, 0);
5481       op1 = XEXP (x, 1);
5482
5483       if (code == AND
5484           && GET_CODE (op0) == MULT
5485           && CONST_INT_P (XEXP (op0, 1))
5486           && CONST_INT_P (op1)
5487           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5488                                INTVAL (op1)) != 0)
5489         {
5490           /* This is a UBFM/SBFM.  */
5491           *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5492           if (speed)
5493             *cost += extra_cost->alu.bfx;
5494           return true;
5495         }
5496
5497       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5498         {
5499           /* We possibly get the immediate for free, this is not
5500              modelled.  */
5501           if (CONST_INT_P (op1)
5502               && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
5503             {
5504               *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5505
5506               if (speed)
5507                 *cost += extra_cost->alu.logical;
5508
5509               return true;
5510             }
5511           else
5512             {
5513               rtx new_op0 = op0;
5514
5515               /* Handle ORN, EON, or BIC.  */
5516               if (GET_CODE (op0) == NOT)
5517                 op0 = XEXP (op0, 0);
5518
5519               new_op0 = aarch64_strip_shift (op0);
5520
5521               /* If we had a shift on op0 then this is a logical-shift-
5522                  by-register/immediate operation.  Otherwise, this is just
5523                  a logical operation.  */
5524               if (speed)
5525                 {
5526                   if (new_op0 != op0)
5527                     {
5528                       /* Shift by immediate.  */
5529                       if (CONST_INT_P (XEXP (op0, 1)))
5530                         *cost += extra_cost->alu.log_shift;
5531                       else
5532                         *cost += extra_cost->alu.log_shift_reg;
5533                     }
5534                   else
5535                     *cost += extra_cost->alu.logical;
5536                 }
5537
5538               /* In both cases we want to cost both operands.  */
5539               *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
5540                        + rtx_cost (op1, (enum rtx_code) code, 1, speed);
5541
5542               return true;
5543             }
5544         }
5545       return false;
5546
5547     case NOT:
5548       /* MVN.  */
5549       if (speed)
5550         *cost += extra_cost->alu.logical;
5551
5552       /* The logical instruction could have the shifted register form,
5553          but the cost is the same if the shift is processed as a separate
5554          instruction, so we don't bother with it here.  */
5555       return false;
5556
5557     case ZERO_EXTEND:
5558
5559       op0 = XEXP (x, 0);
5560       /* If a value is written in SI mode, then zero extended to DI
5561          mode, the operation will in general be free as a write to
5562          a 'w' register implicitly zeroes the upper bits of an 'x'
5563          register.  However, if this is
5564
5565            (set (reg) (zero_extend (reg)))
5566
5567          we must cost the explicit register move.  */
5568       if (mode == DImode
5569           && GET_MODE (op0) == SImode
5570           && outer == SET)
5571         {
5572           int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
5573
5574           if (!op_cost && speed)
5575             /* MOV.  */
5576             *cost += extra_cost->alu.extend;
5577           else
5578             /* Free, the cost is that of the SI mode operation.  */
5579             *cost = op_cost;
5580
5581           return true;
5582         }
5583       else if (MEM_P (XEXP (x, 0)))
5584         {
5585           /* All loads can zero extend to any size for free.  */
5586           *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
5587           return true;
5588         }
5589
5590       /* UXTB/UXTH.  */
5591       if (speed)
5592         *cost += extra_cost->alu.extend;
5593
5594       return false;
5595
5596     case SIGN_EXTEND:
5597       if (MEM_P (XEXP (x, 0)))
5598         {
5599           /* LDRSH.  */
5600           if (speed)
5601             {
5602               rtx address = XEXP (XEXP (x, 0), 0);
5603               *cost += extra_cost->ldst.load_sign_extend;
5604
5605               *cost +=
5606                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5607                                                      0, speed));
5608             }
5609           return true;
5610         }
5611
5612       if (speed)
5613         *cost += extra_cost->alu.extend;
5614       return false;
5615
5616     case ASHIFT:
5617       op0 = XEXP (x, 0);
5618       op1 = XEXP (x, 1);
5619
5620       if (CONST_INT_P (op1))
5621         {
5622           /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
5623              aliases.  */
5624           if (speed)
5625             *cost += extra_cost->alu.shift;
5626
5627           /* We can incorporate zero/sign extend for free.  */
5628           if (GET_CODE (op0) == ZERO_EXTEND
5629               || GET_CODE (op0) == SIGN_EXTEND)
5630             op0 = XEXP (op0, 0);
5631
5632           *cost += rtx_cost (op0, ASHIFT, 0, speed);
5633           return true;
5634         }
5635       else
5636         {
5637           /* LSLV.  */
5638           if (speed)
5639             *cost += extra_cost->alu.shift_reg;
5640
5641           return false;  /* All arguments need to be in registers.  */
5642         }
5643
5644     case ROTATE:
5645     case ROTATERT:
5646     case LSHIFTRT:
5647     case ASHIFTRT:
5648       op0 = XEXP (x, 0);
5649       op1 = XEXP (x, 1);
5650
5651       if (CONST_INT_P (op1))
5652         {
5653           /* ASR (immediate) and friends.  */
5654           if (speed)
5655             *cost += extra_cost->alu.shift;
5656
5657           *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
5658           return true;
5659         }
5660       else
5661         {
5662
5663           /* ASR (register) and friends.  */
5664           if (speed)
5665             *cost += extra_cost->alu.shift_reg;
5666
5667           return false;  /* All arguments need to be in registers.  */
5668         }
5669
5670     case SYMBOL_REF:
5671
5672       if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
5673         {
5674           /* LDR.  */
5675           if (speed)
5676             *cost += extra_cost->ldst.load;
5677         }
5678       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
5679                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
5680         {
5681           /* ADRP, followed by ADD.  */
5682           *cost += COSTS_N_INSNS (1);
5683           if (speed)
5684             *cost += 2 * extra_cost->alu.arith;
5685         }
5686       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
5687                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
5688         {
5689           /* ADR.  */
5690           if (speed)
5691             *cost += extra_cost->alu.arith;
5692         }
5693
5694       if (flag_pic)
5695         {
5696           /* One extra load instruction, after accessing the GOT.  */
5697           *cost += COSTS_N_INSNS (1);
5698           if (speed)
5699             *cost += extra_cost->ldst.load;
5700         }
5701       return true;
5702
5703     case HIGH:
5704     case LO_SUM:
5705       /* ADRP/ADD (immediate).  */
5706       if (speed)
5707         *cost += extra_cost->alu.arith;
5708       return true;
5709
5710     case ZERO_EXTRACT:
5711     case SIGN_EXTRACT:
5712       /* UBFX/SBFX.  */
5713       if (speed)
5714         *cost += extra_cost->alu.bfx;
5715
5716       /* We can trust that the immediates used will be correct (there
5717          are no by-register forms), so we need only cost op0.  */
5718       *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
5719       return true;
5720
5721     case MULT:
5722       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
5723       /* aarch64_rtx_mult_cost always handles recursion to its
5724          operands.  */
5725       return true;
5726
5727     case MOD:
5728     case UMOD:
5729       if (speed)
5730         {
5731           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5732             *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
5733                       + extra_cost->mult[GET_MODE (x) == DImode].idiv);
5734           else if (GET_MODE (x) == DFmode)
5735             *cost += (extra_cost->fp[1].mult
5736                       + extra_cost->fp[1].div);
5737           else if (GET_MODE (x) == SFmode)
5738             *cost += (extra_cost->fp[0].mult
5739                       + extra_cost->fp[0].div);
5740         }
5741       return false;  /* All arguments need to be in registers.  */
5742
5743     case DIV:
5744     case UDIV:
5745     case SQRT:
5746       if (speed)
5747         {
5748           if (GET_MODE_CLASS (mode) == MODE_INT)
5749             /* There is no integer SQRT, so only DIV and UDIV can get
5750                here.  */
5751             *cost += extra_cost->mult[mode == DImode].idiv;
5752           else
5753             *cost += extra_cost->fp[mode == DFmode].div;
5754         }
5755       return false;  /* All arguments need to be in registers.  */
5756
5757     case IF_THEN_ELSE:
5758       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
5759                                          XEXP (x, 2), cost, speed);
5760
5761     case EQ:
5762     case NE:
5763     case GT:
5764     case GTU:
5765     case LT:
5766     case LTU:
5767     case GE:
5768     case GEU:
5769     case LE:
5770     case LEU:
5771
5772       return false; /* All arguments must be in registers.  */
5773
5774     case FMA:
5775       op0 = XEXP (x, 0);
5776       op1 = XEXP (x, 1);
5777       op2 = XEXP (x, 2);
5778
5779       if (speed)
5780         *cost += extra_cost->fp[mode == DFmode].fma;
5781
5782       /* FMSUB, FNMADD, and FNMSUB are free.  */
5783       if (GET_CODE (op0) == NEG)
5784         op0 = XEXP (op0, 0);
5785
5786       if (GET_CODE (op2) == NEG)
5787         op2 = XEXP (op2, 0);
5788
5789       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
5790          and the by-element operand as operand 0.  */
5791       if (GET_CODE (op1) == NEG)
5792         op1 = XEXP (op1, 0);
5793
5794       /* Catch vector-by-element operations.  The by-element operand can
5795          either be (vec_duplicate (vec_select (x))) or just
5796          (vec_select (x)), depending on whether we are multiplying by
5797          a vector or a scalar.
5798
5799          Canonicalization is not very good in these cases, FMA4 will put the
5800          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
5801       if (GET_CODE (op0) == VEC_DUPLICATE)
5802         op0 = XEXP (op0, 0);
5803       else if (GET_CODE (op1) == VEC_DUPLICATE)
5804         op1 = XEXP (op1, 0);
5805
5806       if (GET_CODE (op0) == VEC_SELECT)
5807         op0 = XEXP (op0, 0);
5808       else if (GET_CODE (op1) == VEC_SELECT)
5809         op1 = XEXP (op1, 0);
5810
5811       /* If the remaining parameters are not registers,
5812          get the cost to put them into registers.  */
5813       *cost += rtx_cost (op0, FMA, 0, speed);
5814       *cost += rtx_cost (op1, FMA, 1, speed);
5815       *cost += rtx_cost (op2, FMA, 2, speed);
5816       return true;
5817
5818     case FLOAT_EXTEND:
5819       if (speed)
5820         *cost += extra_cost->fp[mode == DFmode].widen;
5821       return false;
5822
5823     case FLOAT_TRUNCATE:
5824       if (speed)
5825         *cost += extra_cost->fp[mode == DFmode].narrow;
5826       return false;
5827
5828     case FIX:
5829     case UNSIGNED_FIX:
5830       x = XEXP (x, 0);
5831       /* Strip the rounding part.  They will all be implemented
5832          by the fcvt* family of instructions anyway.  */
5833       if (GET_CODE (x) == UNSPEC)
5834         {
5835           unsigned int uns_code = XINT (x, 1);
5836
5837           if (uns_code == UNSPEC_FRINTA
5838               || uns_code == UNSPEC_FRINTM
5839               || uns_code == UNSPEC_FRINTN
5840               || uns_code == UNSPEC_FRINTP
5841               || uns_code == UNSPEC_FRINTZ)
5842             x = XVECEXP (x, 0, 0);
5843         }
5844
5845       if (speed)
5846         *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
5847
5848       *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
5849       return true;
5850
5851     case ABS:
5852       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5853         {
5854           /* FABS and FNEG are analogous.  */
5855           if (speed)
5856             *cost += extra_cost->fp[mode == DFmode].neg;
5857         }
5858       else
5859         {
5860           /* Integer ABS will either be split to
5861              two arithmetic instructions, or will be an ABS
5862              (scalar), which we don't model.  */
5863           *cost = COSTS_N_INSNS (2);
5864           if (speed)
5865             *cost += 2 * extra_cost->alu.arith;
5866         }
5867       return false;
5868
5869     case SMAX:
5870     case SMIN:
5871       if (speed)
5872         {
5873           /* FMAXNM/FMINNM/FMAX/FMIN.
5874              TODO: This may not be accurate for all implementations, but
5875              we do not model this in the cost tables.  */
5876           *cost += extra_cost->fp[mode == DFmode].addsub;
5877         }
5878       return false;
5879
5880     case UNSPEC:
5881       /* The floating point round to integer frint* instructions.  */
5882       if (aarch64_frint_unspec_p (XINT (x, 1)))
5883         {
5884           if (speed)
5885             *cost += extra_cost->fp[mode == DFmode].roundint;
5886
5887           return false;
5888         }
5889
5890       if (XINT (x, 1) == UNSPEC_RBIT)
5891         {
5892           if (speed)
5893             *cost += extra_cost->alu.rev;
5894
5895           return false;
5896         }
5897       break;
5898
5899     case TRUNCATE:
5900
5901       /* Decompose <su>muldi3_highpart.  */
5902       if (/* (truncate:DI  */
5903           mode == DImode
5904           /*   (lshiftrt:TI  */
5905           && GET_MODE (XEXP (x, 0)) == TImode
5906           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
5907           /*      (mult:TI  */
5908           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
5909           /*        (ANY_EXTEND:TI (reg:DI))
5910                     (ANY_EXTEND:TI (reg:DI)))  */
5911           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
5912                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
5913               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
5914                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
5915           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
5916           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
5917           /*     (const_int 64)  */
5918           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5919           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
5920         {
5921           /* UMULH/SMULH.  */
5922           if (speed)
5923             *cost += extra_cost->mult[mode == DImode].extend;
5924           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
5925                              MULT, 0, speed);
5926           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
5927                              MULT, 1, speed);
5928           return true;
5929         }
5930
5931       /* Fall through.  */
5932     default:
5933       break;
5934     }
5935
5936   if (dump_file && (dump_flags & TDF_DETAILS))
5937     fprintf (dump_file,
5938       "\nFailed to cost RTX.  Assuming default cost.\n");
5939
5940   return true;
5941 }
5942
5943 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
5944    calculated for X.  This cost is stored in *COST.  Returns true
5945    if the total cost of X was calculated.  */
5946 static bool
5947 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
5948                    int param, int *cost, bool speed)
5949 {
5950   bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
5951
5952   if (dump_file && (dump_flags & TDF_DETAILS))
5953     {
5954       print_rtl_single (dump_file, x);
5955       fprintf (dump_file, "\n%s cost: %d (%s)\n",
5956                speed ? "Hot" : "Cold",
5957                *cost, result ? "final" : "partial");
5958     }
5959
5960   return result;
5961 }
5962
5963 static int
5964 aarch64_register_move_cost (enum machine_mode mode,
5965                             reg_class_t from_i, reg_class_t to_i)
5966 {
5967   enum reg_class from = (enum reg_class) from_i;
5968   enum reg_class to = (enum reg_class) to_i;
5969   const struct cpu_regmove_cost *regmove_cost
5970     = aarch64_tune_params->regmove_cost;
5971
5972   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
5973   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
5974     to = GENERAL_REGS;
5975
5976   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
5977     from = GENERAL_REGS;
5978
5979   /* Moving between GPR and stack cost is the same as GP2GP.  */
5980   if ((from == GENERAL_REGS && to == STACK_REG)
5981       || (to == GENERAL_REGS && from == STACK_REG))
5982     return regmove_cost->GP2GP;
5983
5984   /* To/From the stack register, we move via the gprs.  */
5985   if (to == STACK_REG || from == STACK_REG)
5986     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
5987             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
5988
5989   if (GET_MODE_SIZE (mode) == 16)
5990     {
5991       /* 128-bit operations on general registers require 2 instructions.  */
5992       if (from == GENERAL_REGS && to == GENERAL_REGS)
5993         return regmove_cost->GP2GP * 2;
5994       else if (from == GENERAL_REGS)
5995         return regmove_cost->GP2FP * 2;
5996       else if (to == GENERAL_REGS)
5997         return regmove_cost->FP2GP * 2;
5998
5999       /* When AdvSIMD instructions are disabled it is not possible to move
6000          a 128-bit value directly between Q registers.  This is handled in
6001          secondary reload.  A general register is used as a scratch to move
6002          the upper DI value and the lower DI value is moved directly,
6003          hence the cost is the sum of three moves. */
6004       if (! TARGET_SIMD)
6005         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6006
6007       return regmove_cost->FP2FP;
6008     }
6009
6010   if (from == GENERAL_REGS && to == GENERAL_REGS)
6011     return regmove_cost->GP2GP;
6012   else if (from == GENERAL_REGS)
6013     return regmove_cost->GP2FP;
6014   else if (to == GENERAL_REGS)
6015     return regmove_cost->FP2GP;
6016
6017   return regmove_cost->FP2FP;
6018 }
6019
6020 static int
6021 aarch64_memory_move_cost (enum machine_mode mode ATTRIBUTE_UNUSED,
6022                           reg_class_t rclass ATTRIBUTE_UNUSED,
6023                           bool in ATTRIBUTE_UNUSED)
6024 {
6025   return aarch64_tune_params->memmov_cost;
6026 }
6027
6028 /* Return the number of instructions that can be issued per cycle.  */
6029 static int
6030 aarch64_sched_issue_rate (void)
6031 {
6032   return aarch64_tune_params->issue_rate;
6033 }
6034
6035 /* Vectorizer cost model target hooks.  */
6036
6037 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
6038 static int
6039 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6040                                     tree vectype,
6041                                     int misalign ATTRIBUTE_UNUSED)
6042 {
6043   unsigned elements;
6044
6045   switch (type_of_cost)
6046     {
6047       case scalar_stmt:
6048         return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6049
6050       case scalar_load:
6051         return aarch64_tune_params->vec_costs->scalar_load_cost;
6052
6053       case scalar_store:
6054         return aarch64_tune_params->vec_costs->scalar_store_cost;
6055
6056       case vector_stmt:
6057         return aarch64_tune_params->vec_costs->vec_stmt_cost;
6058
6059       case vector_load:
6060         return aarch64_tune_params->vec_costs->vec_align_load_cost;
6061
6062       case vector_store:
6063         return aarch64_tune_params->vec_costs->vec_store_cost;
6064
6065       case vec_to_scalar:
6066         return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6067
6068       case scalar_to_vec:
6069         return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6070
6071       case unaligned_load:
6072         return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6073
6074       case unaligned_store:
6075         return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6076
6077       case cond_branch_taken:
6078         return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6079
6080       case cond_branch_not_taken:
6081         return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6082
6083       case vec_perm:
6084       case vec_promote_demote:
6085         return aarch64_tune_params->vec_costs->vec_stmt_cost;
6086
6087       case vec_construct:
6088         elements = TYPE_VECTOR_SUBPARTS (vectype);
6089         return elements / 2 + 1;
6090
6091       default:
6092         gcc_unreachable ();
6093     }
6094 }
6095
6096 /* Implement targetm.vectorize.add_stmt_cost.  */
6097 static unsigned
6098 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6099                        struct _stmt_vec_info *stmt_info, int misalign,
6100                        enum vect_cost_model_location where)
6101 {
6102   unsigned *cost = (unsigned *) data;
6103   unsigned retval = 0;
6104
6105   if (flag_vect_cost_model)
6106     {
6107       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6108       int stmt_cost =
6109             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6110
6111       /* Statements in an inner loop relative to the loop being
6112          vectorized are weighted more heavily.  The value here is
6113          a function (linear for now) of the loop nest level.  */
6114       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6115         {
6116           loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6117           struct loop *loop =  LOOP_VINFO_LOOP (loop_info);
6118           unsigned nest_level = loop_depth (loop);
6119
6120           count *= nest_level;
6121         }
6122
6123       retval = (unsigned) (count * stmt_cost);
6124       cost[where] += retval;
6125     }
6126
6127   return retval;
6128 }
6129
6130 static void initialize_aarch64_code_model (void);
6131
6132 /* Parse the architecture extension string.  */
6133
6134 static void
6135 aarch64_parse_extension (char *str)
6136 {
6137   /* The extension string is parsed left to right.  */
6138   const struct aarch64_option_extension *opt = NULL;
6139
6140   /* Flag to say whether we are adding or removing an extension.  */
6141   int adding_ext = -1;
6142
6143   while (str != NULL && *str != 0)
6144     {
6145       char *ext;
6146       size_t len;
6147
6148       str++;
6149       ext = strchr (str, '+');
6150
6151       if (ext != NULL)
6152         len = ext - str;
6153       else
6154         len = strlen (str);
6155
6156       if (len >= 2 && strncmp (str, "no", 2) == 0)
6157         {
6158           adding_ext = 0;
6159           len -= 2;
6160           str += 2;
6161         }
6162       else if (len > 0)
6163         adding_ext = 1;
6164
6165       if (len == 0)
6166         {
6167           error ("missing feature modifier after %qs", "+no");
6168           return;
6169         }
6170
6171       /* Scan over the extensions table trying to find an exact match.  */
6172       for (opt = all_extensions; opt->name != NULL; opt++)
6173         {
6174           if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6175             {
6176               /* Add or remove the extension.  */
6177               if (adding_ext)
6178                 aarch64_isa_flags |= opt->flags_on;
6179               else
6180                 aarch64_isa_flags &= ~(opt->flags_off);
6181               break;
6182             }
6183         }
6184
6185       if (opt->name == NULL)
6186         {
6187           /* Extension not found in list.  */
6188           error ("unknown feature modifier %qs", str);
6189           return;
6190         }
6191
6192       str = ext;
6193     };
6194
6195   return;
6196 }
6197
6198 /* Parse the ARCH string.  */
6199
6200 static void
6201 aarch64_parse_arch (void)
6202 {
6203   char *ext;
6204   const struct processor *arch;
6205   char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6206   size_t len;
6207
6208   strcpy (str, aarch64_arch_string);
6209
6210   ext = strchr (str, '+');
6211
6212   if (ext != NULL)
6213     len = ext - str;
6214   else
6215     len = strlen (str);
6216
6217   if (len == 0)
6218     {
6219       error ("missing arch name in -march=%qs", str);
6220       return;
6221     }
6222
6223   /* Loop through the list of supported ARCHs to find a match.  */
6224   for (arch = all_architectures; arch->name != NULL; arch++)
6225     {
6226       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6227         {
6228           selected_arch = arch;
6229           aarch64_isa_flags = selected_arch->flags;
6230
6231           if (!selected_cpu)
6232             selected_cpu = &all_cores[selected_arch->core];
6233
6234           if (ext != NULL)
6235             {
6236               /* ARCH string contains at least one extension.  */
6237               aarch64_parse_extension (ext);
6238             }
6239
6240           if (strcmp (selected_arch->arch, selected_cpu->arch))
6241             {
6242               warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6243                        selected_cpu->name, selected_arch->name);
6244             }
6245
6246           return;
6247         }
6248     }
6249
6250   /* ARCH name not found in list.  */
6251   error ("unknown value %qs for -march", str);
6252   return;
6253 }
6254
6255 /* Parse the CPU string.  */
6256
6257 static void
6258 aarch64_parse_cpu (void)
6259 {
6260   char *ext;
6261   const struct processor *cpu;
6262   char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6263   size_t len;
6264
6265   strcpy (str, aarch64_cpu_string);
6266
6267   ext = strchr (str, '+');
6268
6269   if (ext != NULL)
6270     len = ext - str;
6271   else
6272     len = strlen (str);
6273
6274   if (len == 0)
6275     {
6276       error ("missing cpu name in -mcpu=%qs", str);
6277       return;
6278     }
6279
6280   /* Loop through the list of supported CPUs to find a match.  */
6281   for (cpu = all_cores; cpu->name != NULL; cpu++)
6282     {
6283       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6284         {
6285           selected_cpu = cpu;
6286           selected_tune = cpu;
6287           aarch64_isa_flags = selected_cpu->flags;
6288
6289           if (ext != NULL)
6290             {
6291               /* CPU string contains at least one extension.  */
6292               aarch64_parse_extension (ext);
6293             }
6294
6295           return;
6296         }
6297     }
6298
6299   /* CPU name not found in list.  */
6300   error ("unknown value %qs for -mcpu", str);
6301   return;
6302 }
6303
6304 /* Parse the TUNE string.  */
6305
6306 static void
6307 aarch64_parse_tune (void)
6308 {
6309   const struct processor *cpu;
6310   char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6311   strcpy (str, aarch64_tune_string);
6312
6313   /* Loop through the list of supported CPUs to find a match.  */
6314   for (cpu = all_cores; cpu->name != NULL; cpu++)
6315     {
6316       if (strcmp (cpu->name, str) == 0)
6317         {
6318           selected_tune = cpu;
6319           return;
6320         }
6321     }
6322
6323   /* CPU name not found in list.  */
6324   error ("unknown value %qs for -mtune", str);
6325   return;
6326 }
6327
6328
6329 /* Implement TARGET_OPTION_OVERRIDE.  */
6330
6331 static void
6332 aarch64_override_options (void)
6333 {
6334   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6335      If either of -march or -mtune is given, they override their
6336      respective component of -mcpu.
6337
6338      So, first parse AARCH64_CPU_STRING, then the others, be careful
6339      with -march as, if -mcpu is not present on the command line, march
6340      must set a sensible default CPU.  */
6341   if (aarch64_cpu_string)
6342     {
6343       aarch64_parse_cpu ();
6344     }
6345
6346   if (aarch64_arch_string)
6347     {
6348       aarch64_parse_arch ();
6349     }
6350
6351   if (aarch64_tune_string)
6352     {
6353       aarch64_parse_tune ();
6354     }
6355
6356 #ifndef HAVE_AS_MABI_OPTION
6357   /* The compiler may have been configured with 2.23.* binutils, which does
6358      not have support for ILP32.  */
6359   if (TARGET_ILP32)
6360     error ("Assembler does not support -mabi=ilp32");
6361 #endif
6362
6363   initialize_aarch64_code_model ();
6364
6365   aarch64_build_bitmask_table ();
6366
6367   /* This target defaults to strict volatile bitfields.  */
6368   if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6369     flag_strict_volatile_bitfields = 1;
6370
6371   /* If the user did not specify a processor, choose the default
6372      one for them.  This will be the CPU set during configuration using
6373      --with-cpu, otherwise it is "generic".  */
6374   if (!selected_cpu)
6375     {
6376       selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6377       aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6378     }
6379
6380   gcc_assert (selected_cpu);
6381
6382   /* The selected cpu may be an architecture, so lookup tuning by core ID.  */
6383   if (!selected_tune)
6384     selected_tune = &all_cores[selected_cpu->core];
6385
6386   aarch64_tune_flags = selected_tune->flags;
6387   aarch64_tune = selected_tune->core;
6388   aarch64_tune_params = selected_tune->tune;
6389
6390   aarch64_override_options_after_change ();
6391 }
6392
6393 /* Implement targetm.override_options_after_change.  */
6394
6395 static void
6396 aarch64_override_options_after_change (void)
6397 {
6398   if (flag_omit_frame_pointer)
6399     flag_omit_leaf_frame_pointer = false;
6400   else if (flag_omit_leaf_frame_pointer)
6401     flag_omit_frame_pointer = true;
6402 }
6403
6404 static struct machine_function *
6405 aarch64_init_machine_status (void)
6406 {
6407   struct machine_function *machine;
6408   machine = ggc_cleared_alloc<machine_function> ();
6409   return machine;
6410 }
6411
6412 void
6413 aarch64_init_expanders (void)
6414 {
6415   init_machine_status = aarch64_init_machine_status;
6416 }
6417
6418 /* A checking mechanism for the implementation of the various code models.  */
6419 static void
6420 initialize_aarch64_code_model (void)
6421 {
6422    if (flag_pic)
6423      {
6424        switch (aarch64_cmodel_var)
6425          {
6426          case AARCH64_CMODEL_TINY:
6427            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6428            break;
6429          case AARCH64_CMODEL_SMALL:
6430            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6431            break;
6432          case AARCH64_CMODEL_LARGE:
6433            sorry ("code model %qs with -f%s", "large",
6434                   flag_pic > 1 ? "PIC" : "pic");
6435          default:
6436            gcc_unreachable ();
6437          }
6438      }
6439    else
6440      aarch64_cmodel = aarch64_cmodel_var;
6441 }
6442
6443 /* Return true if SYMBOL_REF X binds locally.  */
6444
6445 static bool
6446 aarch64_symbol_binds_local_p (const_rtx x)
6447 {
6448   return (SYMBOL_REF_DECL (x)
6449           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
6450           : SYMBOL_REF_LOCAL_P (x));
6451 }
6452
6453 /* Return true if SYMBOL_REF X is thread local */
6454 static bool
6455 aarch64_tls_symbol_p (rtx x)
6456 {
6457   if (! TARGET_HAVE_TLS)
6458     return false;
6459
6460   if (GET_CODE (x) != SYMBOL_REF)
6461     return false;
6462
6463   return SYMBOL_REF_TLS_MODEL (x) != 0;
6464 }
6465
6466 /* Classify a TLS symbol into one of the TLS kinds.  */
6467 enum aarch64_symbol_type
6468 aarch64_classify_tls_symbol (rtx x)
6469 {
6470   enum tls_model tls_kind = tls_symbolic_operand_type (x);
6471
6472   switch (tls_kind)
6473     {
6474     case TLS_MODEL_GLOBAL_DYNAMIC:
6475     case TLS_MODEL_LOCAL_DYNAMIC:
6476       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
6477
6478     case TLS_MODEL_INITIAL_EXEC:
6479       return SYMBOL_SMALL_GOTTPREL;
6480
6481     case TLS_MODEL_LOCAL_EXEC:
6482       return SYMBOL_SMALL_TPREL;
6483
6484     case TLS_MODEL_EMULATED:
6485     case TLS_MODEL_NONE:
6486       return SYMBOL_FORCE_TO_MEM;
6487
6488     default:
6489       gcc_unreachable ();
6490     }
6491 }
6492
6493 /* Return the method that should be used to access SYMBOL_REF or
6494    LABEL_REF X in context CONTEXT.  */
6495
6496 enum aarch64_symbol_type
6497 aarch64_classify_symbol (rtx x,
6498                          enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
6499 {
6500   if (GET_CODE (x) == LABEL_REF)
6501     {
6502       switch (aarch64_cmodel)
6503         {
6504         case AARCH64_CMODEL_LARGE:
6505           return SYMBOL_FORCE_TO_MEM;
6506
6507         case AARCH64_CMODEL_TINY_PIC:
6508         case AARCH64_CMODEL_TINY:
6509           return SYMBOL_TINY_ABSOLUTE;
6510
6511         case AARCH64_CMODEL_SMALL_PIC:
6512         case AARCH64_CMODEL_SMALL:
6513           return SYMBOL_SMALL_ABSOLUTE;
6514
6515         default:
6516           gcc_unreachable ();
6517         }
6518     }
6519
6520   if (GET_CODE (x) == SYMBOL_REF)
6521     {
6522       if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6523           return SYMBOL_FORCE_TO_MEM;
6524
6525       if (aarch64_tls_symbol_p (x))
6526         return aarch64_classify_tls_symbol (x);
6527
6528       switch (aarch64_cmodel)
6529         {
6530         case AARCH64_CMODEL_TINY:
6531           if (SYMBOL_REF_WEAK (x))
6532             return SYMBOL_FORCE_TO_MEM;
6533           return SYMBOL_TINY_ABSOLUTE;
6534
6535         case AARCH64_CMODEL_SMALL:
6536           if (SYMBOL_REF_WEAK (x))
6537             return SYMBOL_FORCE_TO_MEM;
6538           return SYMBOL_SMALL_ABSOLUTE;
6539
6540         case AARCH64_CMODEL_TINY_PIC:
6541           if (!aarch64_symbol_binds_local_p (x))
6542             return SYMBOL_TINY_GOT;
6543           return SYMBOL_TINY_ABSOLUTE;
6544
6545         case AARCH64_CMODEL_SMALL_PIC:
6546           if (!aarch64_symbol_binds_local_p (x))
6547             return SYMBOL_SMALL_GOT;
6548           return SYMBOL_SMALL_ABSOLUTE;
6549
6550         default:
6551           gcc_unreachable ();
6552         }
6553     }
6554
6555   /* By default push everything into the constant pool.  */
6556   return SYMBOL_FORCE_TO_MEM;
6557 }
6558
6559 bool
6560 aarch64_constant_address_p (rtx x)
6561 {
6562   return (CONSTANT_P (x) && memory_address_p (DImode, x));
6563 }
6564
6565 bool
6566 aarch64_legitimate_pic_operand_p (rtx x)
6567 {
6568   if (GET_CODE (x) == SYMBOL_REF
6569       || (GET_CODE (x) == CONST
6570           && GET_CODE (XEXP (x, 0)) == PLUS
6571           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
6572      return false;
6573
6574   return true;
6575 }
6576
6577 /* Return true if X holds either a quarter-precision or
6578      floating-point +0.0 constant.  */
6579 static bool
6580 aarch64_valid_floating_const (enum machine_mode mode, rtx x)
6581 {
6582   if (!CONST_DOUBLE_P (x))
6583     return false;
6584
6585   /* TODO: We could handle moving 0.0 to a TFmode register,
6586      but first we would like to refactor the movtf_aarch64
6587      to be more amicable to split moves properly and
6588      correctly gate on TARGET_SIMD.  For now - reject all
6589      constants which are not to SFmode or DFmode registers.  */
6590   if (!(mode == SFmode || mode == DFmode))
6591     return false;
6592
6593   if (aarch64_float_const_zero_rtx_p (x))
6594     return true;
6595   return aarch64_float_const_representable_p (x);
6596 }
6597
6598 static bool
6599 aarch64_legitimate_constant_p (enum machine_mode mode, rtx x)
6600 {
6601   /* Do not allow vector struct mode constants.  We could support
6602      0 and -1 easily, but they need support in aarch64-simd.md.  */
6603   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
6604     return false;
6605
6606   /* This could probably go away because
6607      we now decompose CONST_INTs according to expand_mov_immediate.  */
6608   if ((GET_CODE (x) == CONST_VECTOR
6609        && aarch64_simd_valid_immediate (x, mode, false, NULL))
6610       || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
6611         return !targetm.cannot_force_const_mem (mode, x);
6612
6613   if (GET_CODE (x) == HIGH
6614       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
6615     return true;
6616
6617   return aarch64_constant_address_p (x);
6618 }
6619
6620 rtx
6621 aarch64_load_tp (rtx target)
6622 {
6623   if (!target
6624       || GET_MODE (target) != Pmode
6625       || !register_operand (target, Pmode))
6626     target = gen_reg_rtx (Pmode);
6627
6628   /* Can return in any reg.  */
6629   emit_insn (gen_aarch64_load_tp_hard (target));
6630   return target;
6631 }
6632
6633 /* On AAPCS systems, this is the "struct __va_list".  */
6634 static GTY(()) tree va_list_type;
6635
6636 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
6637    Return the type to use as __builtin_va_list.
6638
6639    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
6640
6641    struct __va_list
6642    {
6643      void *__stack;
6644      void *__gr_top;
6645      void *__vr_top;
6646      int   __gr_offs;
6647      int   __vr_offs;
6648    };  */
6649
6650 static tree
6651 aarch64_build_builtin_va_list (void)
6652 {
6653   tree va_list_name;
6654   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6655
6656   /* Create the type.  */
6657   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
6658   /* Give it the required name.  */
6659   va_list_name = build_decl (BUILTINS_LOCATION,
6660                              TYPE_DECL,
6661                              get_identifier ("__va_list"),
6662                              va_list_type);
6663   DECL_ARTIFICIAL (va_list_name) = 1;
6664   TYPE_NAME (va_list_type) = va_list_name;
6665   TYPE_STUB_DECL (va_list_type) = va_list_name;
6666
6667   /* Create the fields.  */
6668   f_stack = build_decl (BUILTINS_LOCATION,
6669                         FIELD_DECL, get_identifier ("__stack"),
6670                         ptr_type_node);
6671   f_grtop = build_decl (BUILTINS_LOCATION,
6672                         FIELD_DECL, get_identifier ("__gr_top"),
6673                         ptr_type_node);
6674   f_vrtop = build_decl (BUILTINS_LOCATION,
6675                         FIELD_DECL, get_identifier ("__vr_top"),
6676                         ptr_type_node);
6677   f_groff = build_decl (BUILTINS_LOCATION,
6678                         FIELD_DECL, get_identifier ("__gr_offs"),
6679                         integer_type_node);
6680   f_vroff = build_decl (BUILTINS_LOCATION,
6681                         FIELD_DECL, get_identifier ("__vr_offs"),
6682                         integer_type_node);
6683
6684   DECL_ARTIFICIAL (f_stack) = 1;
6685   DECL_ARTIFICIAL (f_grtop) = 1;
6686   DECL_ARTIFICIAL (f_vrtop) = 1;
6687   DECL_ARTIFICIAL (f_groff) = 1;
6688   DECL_ARTIFICIAL (f_vroff) = 1;
6689
6690   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
6691   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
6692   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
6693   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
6694   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
6695
6696   TYPE_FIELDS (va_list_type) = f_stack;
6697   DECL_CHAIN (f_stack) = f_grtop;
6698   DECL_CHAIN (f_grtop) = f_vrtop;
6699   DECL_CHAIN (f_vrtop) = f_groff;
6700   DECL_CHAIN (f_groff) = f_vroff;
6701
6702   /* Compute its layout.  */
6703   layout_type (va_list_type);
6704
6705   return va_list_type;
6706 }
6707
6708 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
6709 static void
6710 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
6711 {
6712   const CUMULATIVE_ARGS *cum;
6713   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6714   tree stack, grtop, vrtop, groff, vroff;
6715   tree t;
6716   int gr_save_area_size;
6717   int vr_save_area_size;
6718   int vr_offset;
6719
6720   cum = &crtl->args.info;
6721   gr_save_area_size
6722     = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
6723   vr_save_area_size
6724     = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
6725
6726   if (TARGET_GENERAL_REGS_ONLY)
6727     {
6728       if (cum->aapcs_nvrn > 0)
6729         sorry ("%qs and floating point or vector arguments",
6730                "-mgeneral-regs-only");
6731       vr_save_area_size = 0;
6732     }
6733
6734   f_stack = TYPE_FIELDS (va_list_type_node);
6735   f_grtop = DECL_CHAIN (f_stack);
6736   f_vrtop = DECL_CHAIN (f_grtop);
6737   f_groff = DECL_CHAIN (f_vrtop);
6738   f_vroff = DECL_CHAIN (f_groff);
6739
6740   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
6741                   NULL_TREE);
6742   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
6743                   NULL_TREE);
6744   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
6745                   NULL_TREE);
6746   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
6747                   NULL_TREE);
6748   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
6749                   NULL_TREE);
6750
6751   /* Emit code to initialize STACK, which points to the next varargs stack
6752      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
6753      by named arguments.  STACK is 8-byte aligned.  */
6754   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
6755   if (cum->aapcs_stack_size > 0)
6756     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
6757   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
6758   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6759
6760   /* Emit code to initialize GRTOP, the top of the GR save area.
6761      virtual_incoming_args_rtx should have been 16 byte aligned.  */
6762   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
6763   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
6764   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6765
6766   /* Emit code to initialize VRTOP, the top of the VR save area.
6767      This address is gr_save_area_bytes below GRTOP, rounded
6768      down to the next 16-byte boundary.  */
6769   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
6770   vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
6771                              STACK_BOUNDARY / BITS_PER_UNIT);
6772
6773   if (vr_offset)
6774     t = fold_build_pointer_plus_hwi (t, -vr_offset);
6775   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
6776   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6777
6778   /* Emit code to initialize GROFF, the offset from GRTOP of the
6779      next GPR argument.  */
6780   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
6781               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
6782   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6783
6784   /* Likewise emit code to initialize VROFF, the offset from FTOP
6785      of the next VR argument.  */
6786   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
6787               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
6788   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
6789 }
6790
6791 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
6792
6793 static tree
6794 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
6795                               gimple_seq *post_p ATTRIBUTE_UNUSED)
6796 {
6797   tree addr;
6798   bool indirect_p;
6799   bool is_ha;           /* is HFA or HVA.  */
6800   bool dw_align;        /* double-word align.  */
6801   enum machine_mode ag_mode = VOIDmode;
6802   int nregs;
6803   enum machine_mode mode;
6804
6805   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
6806   tree stack, f_top, f_off, off, arg, roundup, on_stack;
6807   HOST_WIDE_INT size, rsize, adjust, align;
6808   tree t, u, cond1, cond2;
6809
6810   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
6811   if (indirect_p)
6812     type = build_pointer_type (type);
6813
6814   mode = TYPE_MODE (type);
6815
6816   f_stack = TYPE_FIELDS (va_list_type_node);
6817   f_grtop = DECL_CHAIN (f_stack);
6818   f_vrtop = DECL_CHAIN (f_grtop);
6819   f_groff = DECL_CHAIN (f_vrtop);
6820   f_vroff = DECL_CHAIN (f_groff);
6821
6822   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
6823                   f_stack, NULL_TREE);
6824   size = int_size_in_bytes (type);
6825   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
6826
6827   dw_align = false;
6828   adjust = 0;
6829   if (aarch64_vfp_is_call_or_return_candidate (mode,
6830                                                type,
6831                                                &ag_mode,
6832                                                &nregs,
6833                                                &is_ha))
6834     {
6835       /* TYPE passed in fp/simd registers.  */
6836       if (TARGET_GENERAL_REGS_ONLY)
6837         sorry ("%qs and floating point or vector arguments",
6838                "-mgeneral-regs-only");
6839
6840       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
6841                       unshare_expr (valist), f_vrtop, NULL_TREE);
6842       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
6843                       unshare_expr (valist), f_vroff, NULL_TREE);
6844
6845       rsize = nregs * UNITS_PER_VREG;
6846
6847       if (is_ha)
6848         {
6849           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
6850             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
6851         }
6852       else if (BLOCK_REG_PADDING (mode, type, 1) == downward
6853                && size < UNITS_PER_VREG)
6854         {
6855           adjust = UNITS_PER_VREG - size;
6856         }
6857     }
6858   else
6859     {
6860       /* TYPE passed in general registers.  */
6861       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
6862                       unshare_expr (valist), f_grtop, NULL_TREE);
6863       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
6864                       unshare_expr (valist), f_groff, NULL_TREE);
6865       rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
6866       nregs = rsize / UNITS_PER_WORD;
6867
6868       if (align > 8)
6869         dw_align = true;
6870
6871       if (BLOCK_REG_PADDING (mode, type, 1) == downward
6872           && size < UNITS_PER_WORD)
6873         {
6874           adjust = UNITS_PER_WORD  - size;
6875         }
6876     }
6877
6878   /* Get a local temporary for the field value.  */
6879   off = get_initialized_tmp_var (f_off, pre_p, NULL);
6880
6881   /* Emit code to branch if off >= 0.  */
6882   t = build2 (GE_EXPR, boolean_type_node, off,
6883               build_int_cst (TREE_TYPE (off), 0));
6884   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
6885
6886   if (dw_align)
6887     {
6888       /* Emit: offs = (offs + 15) & -16.  */
6889       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6890                   build_int_cst (TREE_TYPE (off), 15));
6891       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
6892                   build_int_cst (TREE_TYPE (off), -16));
6893       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
6894     }
6895   else
6896     roundup = NULL;
6897
6898   /* Update ap.__[g|v]r_offs  */
6899   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
6900               build_int_cst (TREE_TYPE (off), rsize));
6901   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
6902
6903   /* String up.  */
6904   if (roundup)
6905     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6906
6907   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
6908   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
6909               build_int_cst (TREE_TYPE (f_off), 0));
6910   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
6911
6912   /* String up: make sure the assignment happens before the use.  */
6913   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
6914   COND_EXPR_ELSE (cond1) = t;
6915
6916   /* Prepare the trees handling the argument that is passed on the stack;
6917      the top level node will store in ON_STACK.  */
6918   arg = get_initialized_tmp_var (stack, pre_p, NULL);
6919   if (align > 8)
6920     {
6921       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
6922       t = fold_convert (intDI_type_node, arg);
6923       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6924                   build_int_cst (TREE_TYPE (t), 15));
6925       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6926                   build_int_cst (TREE_TYPE (t), -16));
6927       t = fold_convert (TREE_TYPE (arg), t);
6928       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
6929     }
6930   else
6931     roundup = NULL;
6932   /* Advance ap.__stack  */
6933   t = fold_convert (intDI_type_node, arg);
6934   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
6935               build_int_cst (TREE_TYPE (t), size + 7));
6936   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
6937               build_int_cst (TREE_TYPE (t), -8));
6938   t = fold_convert (TREE_TYPE (arg), t);
6939   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
6940   /* String up roundup and advance.  */
6941   if (roundup)
6942     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
6943   /* String up with arg */
6944   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
6945   /* Big-endianness related address adjustment.  */
6946   if (BLOCK_REG_PADDING (mode, type, 1) == downward
6947       && size < UNITS_PER_WORD)
6948   {
6949     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
6950                 size_int (UNITS_PER_WORD - size));
6951     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
6952   }
6953
6954   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
6955   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
6956
6957   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
6958   t = off;
6959   if (adjust)
6960     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
6961                 build_int_cst (TREE_TYPE (off), adjust));
6962
6963   t = fold_convert (sizetype, t);
6964   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
6965
6966   if (is_ha)
6967     {
6968       /* type ha; // treat as "struct {ftype field[n];}"
6969          ... [computing offs]
6970          for (i = 0; i <nregs; ++i, offs += 16)
6971            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
6972          return ha;  */
6973       int i;
6974       tree tmp_ha, field_t, field_ptr_t;
6975
6976       /* Declare a local variable.  */
6977       tmp_ha = create_tmp_var_raw (type, "ha");
6978       gimple_add_tmp_var (tmp_ha);
6979
6980       /* Establish the base type.  */
6981       switch (ag_mode)
6982         {
6983         case SFmode:
6984           field_t = float_type_node;
6985           field_ptr_t = float_ptr_type_node;
6986           break;
6987         case DFmode:
6988           field_t = double_type_node;
6989           field_ptr_t = double_ptr_type_node;
6990           break;
6991         case TFmode:
6992           field_t = long_double_type_node;
6993           field_ptr_t = long_double_ptr_type_node;
6994           break;
6995 /* The half precision and quad precision are not fully supported yet.  Enable
6996    the following code after the support is complete.  Need to find the correct
6997    type node for __fp16 *.  */
6998 #if 0
6999         case HFmode:
7000           field_t = float_type_node;
7001           field_ptr_t = float_ptr_type_node;
7002           break;
7003 #endif
7004         case V2SImode:
7005         case V4SImode:
7006             {
7007               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7008               field_t = build_vector_type_for_mode (innertype, ag_mode);
7009               field_ptr_t = build_pointer_type (field_t);
7010             }
7011           break;
7012         default:
7013           gcc_assert (0);
7014         }
7015
7016       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
7017       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7018       addr = t;
7019       t = fold_convert (field_ptr_t, addr);
7020       t = build2 (MODIFY_EXPR, field_t,
7021                   build1 (INDIRECT_REF, field_t, tmp_ha),
7022                   build1 (INDIRECT_REF, field_t, t));
7023
7024       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
7025       for (i = 1; i < nregs; ++i)
7026         {
7027           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7028           u = fold_convert (field_ptr_t, addr);
7029           u = build2 (MODIFY_EXPR, field_t,
7030                       build2 (MEM_REF, field_t, tmp_ha,
7031                               build_int_cst (field_ptr_t,
7032                                              (i *
7033                                               int_size_in_bytes (field_t)))),
7034                       build1 (INDIRECT_REF, field_t, u));
7035           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7036         }
7037
7038       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7039       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7040     }
7041
7042   COND_EXPR_ELSE (cond2) = t;
7043   addr = fold_convert (build_pointer_type (type), cond1);
7044   addr = build_va_arg_indirect_ref (addr);
7045
7046   if (indirect_p)
7047     addr = build_va_arg_indirect_ref (addr);
7048
7049   return addr;
7050 }
7051
7052 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
7053
7054 static void
7055 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, enum machine_mode mode,
7056                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7057                                 int no_rtl)
7058 {
7059   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7060   CUMULATIVE_ARGS local_cum;
7061   int gr_saved, vr_saved;
7062
7063   /* The caller has advanced CUM up to, but not beyond, the last named
7064      argument.  Advance a local copy of CUM past the last "real" named
7065      argument, to find out how many registers are left over.  */
7066   local_cum = *cum;
7067   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7068
7069   /* Found out how many registers we need to save.  */
7070   gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7071   vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7072
7073   if (TARGET_GENERAL_REGS_ONLY)
7074     {
7075       if (local_cum.aapcs_nvrn > 0)
7076         sorry ("%qs and floating point or vector arguments",
7077                "-mgeneral-regs-only");
7078       vr_saved = 0;
7079     }
7080
7081   if (!no_rtl)
7082     {
7083       if (gr_saved > 0)
7084         {
7085           rtx ptr, mem;
7086
7087           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
7088           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7089                                - gr_saved * UNITS_PER_WORD);
7090           mem = gen_frame_mem (BLKmode, ptr);
7091           set_mem_alias_set (mem, get_varargs_alias_set ());
7092
7093           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7094                                mem, gr_saved);
7095         }
7096       if (vr_saved > 0)
7097         {
7098           /* We can't use move_block_from_reg, because it will use
7099              the wrong mode, storing D regs only.  */
7100           enum machine_mode mode = TImode;
7101           int off, i;
7102
7103           /* Set OFF to the offset from virtual_incoming_args_rtx of
7104              the first vector register.  The VR save area lies below
7105              the GR one, and is aligned to 16 bytes.  */
7106           off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7107                                    STACK_BOUNDARY / BITS_PER_UNIT);
7108           off -= vr_saved * UNITS_PER_VREG;
7109
7110           for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7111             {
7112               rtx ptr, mem;
7113
7114               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7115               mem = gen_frame_mem (mode, ptr);
7116               set_mem_alias_set (mem, get_varargs_alias_set ());
7117               aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7118               off += UNITS_PER_VREG;
7119             }
7120         }
7121     }
7122
7123   /* We don't save the size into *PRETEND_SIZE because we want to avoid
7124      any complication of having crtl->args.pretend_args_size changed.  */
7125   cfun->machine->frame.saved_varargs_size
7126     = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7127                       STACK_BOUNDARY / BITS_PER_UNIT)
7128        + vr_saved * UNITS_PER_VREG);
7129 }
7130
7131 static void
7132 aarch64_conditional_register_usage (void)
7133 {
7134   int i;
7135   if (!TARGET_FLOAT)
7136     {
7137       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7138         {
7139           fixed_regs[i] = 1;
7140           call_used_regs[i] = 1;
7141         }
7142     }
7143 }
7144
7145 /* Walk down the type tree of TYPE counting consecutive base elements.
7146    If *MODEP is VOIDmode, then set it to the first valid floating point
7147    type.  If a non-floating point type is found, or if a floating point
7148    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7149    otherwise return the count in the sub-tree.  */
7150 static int
7151 aapcs_vfp_sub_candidate (const_tree type, enum machine_mode *modep)
7152 {
7153   enum machine_mode mode;
7154   HOST_WIDE_INT size;
7155
7156   switch (TREE_CODE (type))
7157     {
7158     case REAL_TYPE:
7159       mode = TYPE_MODE (type);
7160       if (mode != DFmode && mode != SFmode && mode != TFmode)
7161         return -1;
7162
7163       if (*modep == VOIDmode)
7164         *modep = mode;
7165
7166       if (*modep == mode)
7167         return 1;
7168
7169       break;
7170
7171     case COMPLEX_TYPE:
7172       mode = TYPE_MODE (TREE_TYPE (type));
7173       if (mode != DFmode && mode != SFmode && mode != TFmode)
7174         return -1;
7175
7176       if (*modep == VOIDmode)
7177         *modep = mode;
7178
7179       if (*modep == mode)
7180         return 2;
7181
7182       break;
7183
7184     case VECTOR_TYPE:
7185       /* Use V2SImode and V4SImode as representatives of all 64-bit
7186          and 128-bit vector types.  */
7187       size = int_size_in_bytes (type);
7188       switch (size)
7189         {
7190         case 8:
7191           mode = V2SImode;
7192           break;
7193         case 16:
7194           mode = V4SImode;
7195           break;
7196         default:
7197           return -1;
7198         }
7199
7200       if (*modep == VOIDmode)
7201         *modep = mode;
7202
7203       /* Vector modes are considered to be opaque: two vectors are
7204          equivalent for the purposes of being homogeneous aggregates
7205          if they are the same size.  */
7206       if (*modep == mode)
7207         return 1;
7208
7209       break;
7210
7211     case ARRAY_TYPE:
7212       {
7213         int count;
7214         tree index = TYPE_DOMAIN (type);
7215
7216         /* Can't handle incomplete types nor sizes that are not
7217            fixed.  */
7218         if (!COMPLETE_TYPE_P (type)
7219             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7220           return -1;
7221
7222         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7223         if (count == -1
7224             || !index
7225             || !TYPE_MAX_VALUE (index)
7226             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7227             || !TYPE_MIN_VALUE (index)
7228             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7229             || count < 0)
7230           return -1;
7231
7232         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7233                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7234
7235         /* There must be no padding.  */
7236         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7237           return -1;
7238
7239         return count;
7240       }
7241
7242     case RECORD_TYPE:
7243       {
7244         int count = 0;
7245         int sub_count;
7246         tree field;
7247
7248         /* Can't handle incomplete types nor sizes that are not
7249            fixed.  */
7250         if (!COMPLETE_TYPE_P (type)
7251             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7252           return -1;
7253
7254         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7255           {
7256             if (TREE_CODE (field) != FIELD_DECL)
7257               continue;
7258
7259             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7260             if (sub_count < 0)
7261               return -1;
7262             count += sub_count;
7263           }
7264
7265         /* There must be no padding.  */
7266         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7267           return -1;
7268
7269         return count;
7270       }
7271
7272     case UNION_TYPE:
7273     case QUAL_UNION_TYPE:
7274       {
7275         /* These aren't very interesting except in a degenerate case.  */
7276         int count = 0;
7277         int sub_count;
7278         tree field;
7279
7280         /* Can't handle incomplete types nor sizes that are not
7281            fixed.  */
7282         if (!COMPLETE_TYPE_P (type)
7283             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7284           return -1;
7285
7286         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7287           {
7288             if (TREE_CODE (field) != FIELD_DECL)
7289               continue;
7290
7291             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7292             if (sub_count < 0)
7293               return -1;
7294             count = count > sub_count ? count : sub_count;
7295           }
7296
7297         /* There must be no padding.  */
7298         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7299           return -1;
7300
7301         return count;
7302       }
7303
7304     default:
7305       break;
7306     }
7307
7308   return -1;
7309 }
7310
7311 /* Return true if we use LRA instead of reload pass.  */
7312 static bool
7313 aarch64_lra_p (void)
7314 {
7315   return aarch64_lra_flag;
7316 }
7317
7318 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7319    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
7320    array types.  The C99 floating-point complex types are also considered
7321    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
7322    types, which are GCC extensions and out of the scope of AAPCS64, are
7323    treated as composite types here as well.
7324
7325    Note that MODE itself is not sufficient in determining whether a type
7326    is such a composite type or not.  This is because
7327    stor-layout.c:compute_record_mode may have already changed the MODE
7328    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
7329    structure with only one field may have its MODE set to the mode of the
7330    field.  Also an integer mode whose size matches the size of the
7331    RECORD_TYPE type may be used to substitute the original mode
7332    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
7333    solely relied on.  */
7334
7335 static bool
7336 aarch64_composite_type_p (const_tree type,
7337                           enum machine_mode mode)
7338 {
7339   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7340     return true;
7341
7342   if (mode == BLKmode
7343       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7344       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7345     return true;
7346
7347   return false;
7348 }
7349
7350 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7351    type as described in AAPCS64 \S 4.1.2.
7352
7353    See the comment above aarch64_composite_type_p for the notes on MODE.  */
7354
7355 static bool
7356 aarch64_short_vector_p (const_tree type,
7357                         enum machine_mode mode)
7358 {
7359   HOST_WIDE_INT size = -1;
7360
7361   if (type && TREE_CODE (type) == VECTOR_TYPE)
7362     size = int_size_in_bytes (type);
7363   else if (!aarch64_composite_type_p (type, mode)
7364            && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7365                || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7366     size = GET_MODE_SIZE (mode);
7367
7368   return (size == 8 || size == 16) ? true : false;
7369 }
7370
7371 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7372    shall be passed or returned in simd/fp register(s) (providing these
7373    parameter passing registers are available).
7374
7375    Upon successful return, *COUNT returns the number of needed registers,
7376    *BASE_MODE returns the mode of the individual register and when IS_HAF
7377    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7378    floating-point aggregate or a homogeneous short-vector aggregate.  */
7379
7380 static bool
7381 aarch64_vfp_is_call_or_return_candidate (enum machine_mode mode,
7382                                          const_tree type,
7383                                          enum machine_mode *base_mode,
7384                                          int *count,
7385                                          bool *is_ha)
7386 {
7387   enum machine_mode new_mode = VOIDmode;
7388   bool composite_p = aarch64_composite_type_p (type, mode);
7389
7390   if (is_ha != NULL) *is_ha = false;
7391
7392   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7393       || aarch64_short_vector_p (type, mode))
7394     {
7395       *count = 1;
7396       new_mode = mode;
7397     }
7398   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7399     {
7400       if (is_ha != NULL) *is_ha = true;
7401       *count = 2;
7402       new_mode = GET_MODE_INNER (mode);
7403     }
7404   else if (type && composite_p)
7405     {
7406       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7407
7408       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7409         {
7410           if (is_ha != NULL) *is_ha = true;
7411           *count = ag_count;
7412         }
7413       else
7414         return false;
7415     }
7416   else
7417     return false;
7418
7419   *base_mode = new_mode;
7420   return true;
7421 }
7422
7423 /* Implement TARGET_STRUCT_VALUE_RTX.  */
7424
7425 static rtx
7426 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
7427                           int incoming ATTRIBUTE_UNUSED)
7428 {
7429   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
7430 }
7431
7432 /* Implements target hook vector_mode_supported_p.  */
7433 static bool
7434 aarch64_vector_mode_supported_p (enum machine_mode mode)
7435 {
7436   if (TARGET_SIMD
7437       && (mode == V4SImode  || mode == V8HImode
7438           || mode == V16QImode || mode == V2DImode
7439           || mode == V2SImode  || mode == V4HImode
7440           || mode == V8QImode || mode == V2SFmode
7441           || mode == V4SFmode || mode == V2DFmode
7442           || mode == V1DFmode))
7443     return true;
7444
7445   return false;
7446 }
7447
7448 /* Return appropriate SIMD container
7449    for MODE within a vector of WIDTH bits.  */
7450 static enum machine_mode
7451 aarch64_simd_container_mode (enum machine_mode mode, unsigned width)
7452 {
7453   gcc_assert (width == 64 || width == 128);
7454   if (TARGET_SIMD)
7455     {
7456       if (width == 128)
7457         switch (mode)
7458           {
7459           case DFmode:
7460             return V2DFmode;
7461           case SFmode:
7462             return V4SFmode;
7463           case SImode:
7464             return V4SImode;
7465           case HImode:
7466             return V8HImode;
7467           case QImode:
7468             return V16QImode;
7469           case DImode:
7470             return V2DImode;
7471           default:
7472             break;
7473           }
7474       else
7475         switch (mode)
7476           {
7477           case SFmode:
7478             return V2SFmode;
7479           case SImode:
7480             return V2SImode;
7481           case HImode:
7482             return V4HImode;
7483           case QImode:
7484             return V8QImode;
7485           default:
7486             break;
7487           }
7488     }
7489   return word_mode;
7490 }
7491
7492 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
7493 static enum machine_mode
7494 aarch64_preferred_simd_mode (enum machine_mode mode)
7495 {
7496   return aarch64_simd_container_mode (mode, 128);
7497 }
7498
7499 /* Return the bitmask of possible vector sizes for the vectorizer
7500    to iterate over.  */
7501 static unsigned int
7502 aarch64_autovectorize_vector_sizes (void)
7503 {
7504   return (16 | 8);
7505 }
7506
7507 /* A table to help perform AArch64-specific name mangling for AdvSIMD
7508    vector types in order to conform to the AAPCS64 (see "Procedure
7509    Call Standard for the ARM 64-bit Architecture", Appendix A).  To
7510    qualify for emission with the mangled names defined in that document,
7511    a vector type must not only be of the correct mode but also be
7512    composed of AdvSIMD vector element types (e.g.
7513    _builtin_aarch64_simd_qi); these types are registered by
7514    aarch64_init_simd_builtins ().  In other words, vector types defined
7515    in other ways e.g. via vector_size attribute will get default
7516    mangled names.  */
7517 typedef struct
7518 {
7519   enum machine_mode mode;
7520   const char *element_type_name;
7521   const char *mangled_name;
7522 } aarch64_simd_mangle_map_entry;
7523
7524 static aarch64_simd_mangle_map_entry aarch64_simd_mangle_map[] = {
7525   /* 64-bit containerized types.  */
7526   { V8QImode,  "__builtin_aarch64_simd_qi",     "10__Int8x8_t" },
7527   { V8QImode,  "__builtin_aarch64_simd_uqi",    "11__Uint8x8_t" },
7528   { V4HImode,  "__builtin_aarch64_simd_hi",     "11__Int16x4_t" },
7529   { V4HImode,  "__builtin_aarch64_simd_uhi",    "12__Uint16x4_t" },
7530   { V2SImode,  "__builtin_aarch64_simd_si",     "11__Int32x2_t" },
7531   { V2SImode,  "__builtin_aarch64_simd_usi",    "12__Uint32x2_t" },
7532   { V2SFmode,  "__builtin_aarch64_simd_sf",     "13__Float32x2_t" },
7533   { DImode,    "__builtin_aarch64_simd_di",     "11__Int64x1_t" },
7534   { DImode,    "__builtin_aarch64_simd_udi",    "12__Uint64x1_t" },
7535   { V1DFmode,  "__builtin_aarch64_simd_df",     "13__Float64x1_t" },
7536   { V8QImode,  "__builtin_aarch64_simd_poly8",  "11__Poly8x8_t" },
7537   { V4HImode,  "__builtin_aarch64_simd_poly16", "12__Poly16x4_t" },
7538   /* 128-bit containerized types.  */
7539   { V16QImode, "__builtin_aarch64_simd_qi",     "11__Int8x16_t" },
7540   { V16QImode, "__builtin_aarch64_simd_uqi",    "12__Uint8x16_t" },
7541   { V8HImode,  "__builtin_aarch64_simd_hi",     "11__Int16x8_t" },
7542   { V8HImode,  "__builtin_aarch64_simd_uhi",    "12__Uint16x8_t" },
7543   { V4SImode,  "__builtin_aarch64_simd_si",     "11__Int32x4_t" },
7544   { V4SImode,  "__builtin_aarch64_simd_usi",    "12__Uint32x4_t" },
7545   { V2DImode,  "__builtin_aarch64_simd_di",     "11__Int64x2_t" },
7546   { V2DImode,  "__builtin_aarch64_simd_udi",    "12__Uint64x2_t" },
7547   { V4SFmode,  "__builtin_aarch64_simd_sf",     "13__Float32x4_t" },
7548   { V2DFmode,  "__builtin_aarch64_simd_df",     "13__Float64x2_t" },
7549   { V16QImode, "__builtin_aarch64_simd_poly8",  "12__Poly8x16_t" },
7550   { V8HImode,  "__builtin_aarch64_simd_poly16", "12__Poly16x8_t" },
7551   { V2DImode,  "__builtin_aarch64_simd_poly64", "12__Poly64x2_t" },
7552   { VOIDmode, NULL, NULL }
7553 };
7554
7555 /* Implement TARGET_MANGLE_TYPE.  */
7556
7557 static const char *
7558 aarch64_mangle_type (const_tree type)
7559 {
7560   /* The AArch64 ABI documents say that "__va_list" has to be
7561      managled as if it is in the "std" namespace.  */
7562   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
7563     return "St9__va_list";
7564
7565   /* Check the mode of the vector type, and the name of the vector
7566      element type, against the table.  */
7567   if (TREE_CODE (type) == VECTOR_TYPE)
7568     {
7569       aarch64_simd_mangle_map_entry *pos = aarch64_simd_mangle_map;
7570
7571       while (pos->mode != VOIDmode)
7572         {
7573           tree elt_type = TREE_TYPE (type);
7574
7575           if (pos->mode == TYPE_MODE (type)
7576               && TREE_CODE (TYPE_NAME (elt_type)) == TYPE_DECL
7577               && !strcmp (IDENTIFIER_POINTER (DECL_NAME (TYPE_NAME (elt_type))),
7578                           pos->element_type_name))
7579             return pos->mangled_name;
7580
7581           pos++;
7582         }
7583     }
7584
7585   /* Use the default mangling.  */
7586   return NULL;
7587 }
7588
7589 /* Return the equivalent letter for size.  */
7590 static char
7591 sizetochar (int size)
7592 {
7593   switch (size)
7594     {
7595     case 64: return 'd';
7596     case 32: return 's';
7597     case 16: return 'h';
7598     case 8 : return 'b';
7599     default: gcc_unreachable ();
7600     }
7601 }
7602
7603 /* Return true iff x is a uniform vector of floating-point
7604    constants, and the constant can be represented in
7605    quarter-precision form.  Note, as aarch64_float_const_representable
7606    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
7607 static bool
7608 aarch64_vect_float_const_representable_p (rtx x)
7609 {
7610   int i = 0;
7611   REAL_VALUE_TYPE r0, ri;
7612   rtx x0, xi;
7613
7614   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
7615     return false;
7616
7617   x0 = CONST_VECTOR_ELT (x, 0);
7618   if (!CONST_DOUBLE_P (x0))
7619     return false;
7620
7621   REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
7622
7623   for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
7624     {
7625       xi = CONST_VECTOR_ELT (x, i);
7626       if (!CONST_DOUBLE_P (xi))
7627         return false;
7628
7629       REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
7630       if (!REAL_VALUES_EQUAL (r0, ri))
7631         return false;
7632     }
7633
7634   return aarch64_float_const_representable_p (x0);
7635 }
7636
7637 /* Return true for valid and false for invalid.  */
7638 bool
7639 aarch64_simd_valid_immediate (rtx op, enum machine_mode mode, bool inverse,
7640                               struct simd_immediate_info *info)
7641 {
7642 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
7643   matches = 1;                                          \
7644   for (i = 0; i < idx; i += (STRIDE))                   \
7645     if (!(TEST))                                        \
7646       matches = 0;                                      \
7647   if (matches)                                          \
7648     {                                                   \
7649       immtype = (CLASS);                                \
7650       elsize = (ELSIZE);                                \
7651       eshift = (SHIFT);                                 \
7652       emvn = (NEG);                                     \
7653       break;                                            \
7654     }
7655
7656   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
7657   unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
7658   unsigned char bytes[16];
7659   int immtype = -1, matches;
7660   unsigned int invmask = inverse ? 0xff : 0;
7661   int eshift, emvn;
7662
7663   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
7664     {
7665       if (! (aarch64_simd_imm_zero_p (op, mode)
7666              || aarch64_vect_float_const_representable_p (op)))
7667         return false;
7668
7669       if (info)
7670         {
7671           info->value = CONST_VECTOR_ELT (op, 0);
7672           info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
7673           info->mvn = false;
7674           info->shift = 0;
7675         }
7676
7677       return true;
7678     }
7679
7680   /* Splat vector constant out into a byte vector.  */
7681   for (i = 0; i < n_elts; i++)
7682     {
7683       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
7684          it must be laid out in the vector register in reverse order.  */
7685       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
7686       unsigned HOST_WIDE_INT elpart;
7687       unsigned int part, parts;
7688
7689       if (CONST_INT_P (el))
7690         {
7691           elpart = INTVAL (el);
7692           parts = 1;
7693         }
7694       else if (GET_CODE (el) == CONST_DOUBLE)
7695         {
7696           elpart = CONST_DOUBLE_LOW (el);
7697           parts = 2;
7698         }
7699       else
7700         gcc_unreachable ();
7701
7702       for (part = 0; part < parts; part++)
7703         {
7704           unsigned int byte;
7705           for (byte = 0; byte < innersize; byte++)
7706             {
7707               bytes[idx++] = (elpart & 0xff) ^ invmask;
7708               elpart >>= BITS_PER_UNIT;
7709             }
7710           if (GET_CODE (el) == CONST_DOUBLE)
7711             elpart = CONST_DOUBLE_HIGH (el);
7712         }
7713     }
7714
7715   /* Sanity check.  */
7716   gcc_assert (idx == GET_MODE_SIZE (mode));
7717
7718   do
7719     {
7720       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
7721              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
7722
7723       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7724              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7725
7726       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
7727              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7728
7729       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
7730              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
7731
7732       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
7733
7734       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
7735
7736       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
7737              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
7738
7739       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7740              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7741
7742       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
7743              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7744
7745       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
7746              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
7747
7748       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
7749
7750       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
7751
7752       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
7753              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
7754
7755       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
7756              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
7757
7758       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
7759              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
7760
7761       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
7762              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
7763
7764       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
7765
7766       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
7767              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
7768     }
7769   while (0);
7770
7771   if (immtype == -1)
7772     return false;
7773
7774   if (info)
7775     {
7776       info->element_width = elsize;
7777       info->mvn = emvn != 0;
7778       info->shift = eshift;
7779
7780       unsigned HOST_WIDE_INT imm = 0;
7781
7782       if (immtype >= 12 && immtype <= 15)
7783         info->msl = true;
7784
7785       /* Un-invert bytes of recognized vector, if necessary.  */
7786       if (invmask != 0)
7787         for (i = 0; i < idx; i++)
7788           bytes[i] ^= invmask;
7789
7790       if (immtype == 17)
7791         {
7792           /* FIXME: Broken on 32-bit H_W_I hosts.  */
7793           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
7794
7795           for (i = 0; i < 8; i++)
7796             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
7797               << (i * BITS_PER_UNIT);
7798
7799
7800           info->value = GEN_INT (imm);
7801         }
7802       else
7803         {
7804           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
7805             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
7806
7807           /* Construct 'abcdefgh' because the assembler cannot handle
7808              generic constants.  */
7809           if (info->mvn)
7810             imm = ~imm;
7811           imm = (imm >> info->shift) & 0xff;
7812           info->value = GEN_INT (imm);
7813         }
7814     }
7815
7816   return true;
7817 #undef CHECK
7818 }
7819
7820 /* Check of immediate shift constants are within range.  */
7821 bool
7822 aarch64_simd_shift_imm_p (rtx x, enum machine_mode mode, bool left)
7823 {
7824   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
7825   if (left)
7826     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
7827   else
7828     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
7829 }
7830
7831 /* Return true if X is a uniform vector where all elements
7832    are either the floating-point constant 0.0 or the
7833    integer constant 0.  */
7834 bool
7835 aarch64_simd_imm_zero_p (rtx x, enum machine_mode mode)
7836 {
7837   return x == CONST0_RTX (mode);
7838 }
7839
7840 bool
7841 aarch64_simd_imm_scalar_p (rtx x, enum machine_mode mode ATTRIBUTE_UNUSED)
7842 {
7843   HOST_WIDE_INT imm = INTVAL (x);
7844   int i;
7845
7846   for (i = 0; i < 8; i++)
7847     {
7848       unsigned int byte = imm & 0xff;
7849       if (byte != 0xff && byte != 0)
7850        return false;
7851       imm >>= 8;
7852     }
7853
7854   return true;
7855 }
7856
7857 bool
7858 aarch64_mov_operand_p (rtx x,
7859                        enum aarch64_symbol_context context,
7860                        enum machine_mode mode)
7861 {
7862   if (GET_CODE (x) == HIGH
7863       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7864     return true;
7865
7866   if (CONST_INT_P (x) && aarch64_move_imm (INTVAL (x), mode))
7867     return true;
7868
7869   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
7870     return true;
7871
7872   return aarch64_classify_symbolic_expression (x, context)
7873     == SYMBOL_TINY_ABSOLUTE;
7874 }
7875
7876 /* Return a const_int vector of VAL.  */
7877 rtx
7878 aarch64_simd_gen_const_vector_dup (enum machine_mode mode, int val)
7879 {
7880   int nunits = GET_MODE_NUNITS (mode);
7881   rtvec v = rtvec_alloc (nunits);
7882   int i;
7883
7884   for (i=0; i < nunits; i++)
7885     RTVEC_ELT (v, i) = GEN_INT (val);
7886
7887   return gen_rtx_CONST_VECTOR (mode, v);
7888 }
7889
7890 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
7891
7892 bool
7893 aarch64_simd_scalar_immediate_valid_for_move (rtx op, enum machine_mode mode)
7894 {
7895   enum machine_mode vmode;
7896
7897   gcc_assert (!VECTOR_MODE_P (mode));
7898   vmode = aarch64_preferred_simd_mode (mode);
7899   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
7900   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
7901 }
7902
7903 /* Construct and return a PARALLEL RTX vector with elements numbering the
7904    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
7905    the vector - from the perspective of the architecture.  This does not
7906    line up with GCC's perspective on lane numbers, so we end up with
7907    different masks depending on our target endian-ness.  The diagram
7908    below may help.  We must draw the distinction when building masks
7909    which select one half of the vector.  An instruction selecting
7910    architectural low-lanes for a big-endian target, must be described using
7911    a mask selecting GCC high-lanes.
7912
7913                  Big-Endian             Little-Endian
7914
7915 GCC             0   1   2   3           3   2   1   0
7916               | x | x | x | x |       | x | x | x | x |
7917 Architecture    3   2   1   0           3   2   1   0
7918
7919 Low Mask:         { 2, 3 }                { 0, 1 }
7920 High Mask:        { 0, 1 }                { 2, 3 }
7921 */
7922
7923 rtx
7924 aarch64_simd_vect_par_cnst_half (enum machine_mode mode, bool high)
7925 {
7926   int nunits = GET_MODE_NUNITS (mode);
7927   rtvec v = rtvec_alloc (nunits / 2);
7928   int high_base = nunits / 2;
7929   int low_base = 0;
7930   int base;
7931   rtx t1;
7932   int i;
7933
7934   if (BYTES_BIG_ENDIAN)
7935     base = high ? low_base : high_base;
7936   else
7937     base = high ? high_base : low_base;
7938
7939   for (i = 0; i < nunits / 2; i++)
7940     RTVEC_ELT (v, i) = GEN_INT (base + i);
7941
7942   t1 = gen_rtx_PARALLEL (mode, v);
7943   return t1;
7944 }
7945
7946 /* Check OP for validity as a PARALLEL RTX vector with elements
7947    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
7948    from the perspective of the architecture.  See the diagram above
7949    aarch64_simd_vect_par_cnst_half for more details.  */
7950
7951 bool
7952 aarch64_simd_check_vect_par_cnst_half (rtx op, enum machine_mode mode,
7953                                        bool high)
7954 {
7955   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
7956   HOST_WIDE_INT count_op = XVECLEN (op, 0);
7957   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
7958   int i = 0;
7959
7960   if (!VECTOR_MODE_P (mode))
7961     return false;
7962
7963   if (count_op != count_ideal)
7964     return false;
7965
7966   for (i = 0; i < count_ideal; i++)
7967     {
7968       rtx elt_op = XVECEXP (op, 0, i);
7969       rtx elt_ideal = XVECEXP (ideal, 0, i);
7970
7971       if (!CONST_INT_P (elt_op)
7972           || INTVAL (elt_ideal) != INTVAL (elt_op))
7973         return false;
7974     }
7975   return true;
7976 }
7977
7978 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
7979    HIGH (exclusive).  */
7980 void
7981 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high)
7982 {
7983   HOST_WIDE_INT lane;
7984   gcc_assert (CONST_INT_P (operand));
7985   lane = INTVAL (operand);
7986
7987   if (lane < low || lane >= high)
7988     error ("lane out of range");
7989 }
7990
7991 /* Emit code to place a AdvSIMD pair result in memory locations (with equal
7992    registers).  */
7993 void
7994 aarch64_simd_emit_pair_result_insn (enum machine_mode mode,
7995                             rtx (*intfn) (rtx, rtx, rtx), rtx destaddr,
7996                             rtx op1)
7997 {
7998   rtx mem = gen_rtx_MEM (mode, destaddr);
7999   rtx tmp1 = gen_reg_rtx (mode);
8000   rtx tmp2 = gen_reg_rtx (mode);
8001
8002   emit_insn (intfn (tmp1, op1, tmp2));
8003
8004   emit_move_insn (mem, tmp1);
8005   mem = adjust_address (mem, mode, GET_MODE_SIZE (mode));
8006   emit_move_insn (mem, tmp2);
8007 }
8008
8009 /* Return TRUE if OP is a valid vector addressing mode.  */
8010 bool
8011 aarch64_simd_mem_operand_p (rtx op)
8012 {
8013   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8014                         || REG_P (XEXP (op, 0)));
8015 }
8016
8017 /* Set up OPERANDS for a register copy from SRC to DEST, taking care
8018    not to early-clobber SRC registers in the process.
8019
8020    We assume that the operands described by SRC and DEST represent a
8021    decomposed copy of OPERANDS[1] into OPERANDS[0].  COUNT is the
8022    number of components into which the copy has been decomposed.  */
8023 void
8024 aarch64_simd_disambiguate_copy (rtx *operands, rtx *dest,
8025                                 rtx *src, unsigned int count)
8026 {
8027   unsigned int i;
8028
8029   if (!reg_overlap_mentioned_p (operands[0], operands[1])
8030       || REGNO (operands[0]) < REGNO (operands[1]))
8031     {
8032       for (i = 0; i < count; i++)
8033         {
8034           operands[2 * i] = dest[i];
8035           operands[2 * i + 1] = src[i];
8036         }
8037     }
8038   else
8039     {
8040       for (i = 0; i < count; i++)
8041         {
8042           operands[2 * i] = dest[count - i - 1];
8043           operands[2 * i + 1] = src[count - i - 1];
8044         }
8045     }
8046 }
8047
8048 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8049    one of VSTRUCT modes: OI, CI or XI.  */
8050 int
8051 aarch64_simd_attr_length_move (rtx_insn *insn)
8052 {
8053   enum machine_mode mode;
8054
8055   extract_insn_cached (insn);
8056
8057   if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8058     {
8059       mode = GET_MODE (recog_data.operand[0]);
8060       switch (mode)
8061         {
8062         case OImode:
8063           return 8;
8064         case CImode:
8065           return 12;
8066         case XImode:
8067           return 16;
8068         default:
8069           gcc_unreachable ();
8070         }
8071     }
8072   return 4;
8073 }
8074
8075 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
8076    alignment of a vector to 128 bits.  */
8077 static HOST_WIDE_INT
8078 aarch64_simd_vector_alignment (const_tree type)
8079 {
8080   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8081   return MIN (align, 128);
8082 }
8083
8084 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
8085 static bool
8086 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8087 {
8088   if (is_packed)
8089     return false;
8090
8091   /* We guarantee alignment for vectors up to 128-bits.  */
8092   if (tree_int_cst_compare (TYPE_SIZE (type),
8093                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8094     return false;
8095
8096   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
8097   return true;
8098 }
8099
8100 /* If VALS is a vector constant that can be loaded into a register
8101    using DUP, generate instructions to do so and return an RTX to
8102    assign to the register.  Otherwise return NULL_RTX.  */
8103 static rtx
8104 aarch64_simd_dup_constant (rtx vals)
8105 {
8106   enum machine_mode mode = GET_MODE (vals);
8107   enum machine_mode inner_mode = GET_MODE_INNER (mode);
8108   int n_elts = GET_MODE_NUNITS (mode);
8109   bool all_same = true;
8110   rtx x;
8111   int i;
8112
8113   if (GET_CODE (vals) != CONST_VECTOR)
8114     return NULL_RTX;
8115
8116   for (i = 1; i < n_elts; ++i)
8117     {
8118       x = CONST_VECTOR_ELT (vals, i);
8119       if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8120         all_same = false;
8121     }
8122
8123   if (!all_same)
8124     return NULL_RTX;
8125
8126   /* We can load this constant by using DUP and a constant in a
8127      single ARM register.  This will be cheaper than a vector
8128      load.  */
8129   x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8130   return gen_rtx_VEC_DUPLICATE (mode, x);
8131 }
8132
8133
8134 /* Generate code to load VALS, which is a PARALLEL containing only
8135    constants (for vec_init) or CONST_VECTOR, efficiently into a
8136    register.  Returns an RTX to copy into the register, or NULL_RTX
8137    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
8138 static rtx
8139 aarch64_simd_make_constant (rtx vals)
8140 {
8141   enum machine_mode mode = GET_MODE (vals);
8142   rtx const_dup;
8143   rtx const_vec = NULL_RTX;
8144   int n_elts = GET_MODE_NUNITS (mode);
8145   int n_const = 0;
8146   int i;
8147
8148   if (GET_CODE (vals) == CONST_VECTOR)
8149     const_vec = vals;
8150   else if (GET_CODE (vals) == PARALLEL)
8151     {
8152       /* A CONST_VECTOR must contain only CONST_INTs and
8153          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8154          Only store valid constants in a CONST_VECTOR.  */
8155       for (i = 0; i < n_elts; ++i)
8156         {
8157           rtx x = XVECEXP (vals, 0, i);
8158           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8159             n_const++;
8160         }
8161       if (n_const == n_elts)
8162         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8163     }
8164   else
8165     gcc_unreachable ();
8166
8167   if (const_vec != NULL_RTX
8168       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8169     /* Load using MOVI/MVNI.  */
8170     return const_vec;
8171   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8172     /* Loaded using DUP.  */
8173     return const_dup;
8174   else if (const_vec != NULL_RTX)
8175     /* Load from constant pool. We can not take advantage of single-cycle
8176        LD1 because we need a PC-relative addressing mode.  */
8177     return const_vec;
8178   else
8179     /* A PARALLEL containing something not valid inside CONST_VECTOR.
8180        We can not construct an initializer.  */
8181     return NULL_RTX;
8182 }
8183
8184 void
8185 aarch64_expand_vector_init (rtx target, rtx vals)
8186 {
8187   enum machine_mode mode = GET_MODE (target);
8188   enum machine_mode inner_mode = GET_MODE_INNER (mode);
8189   int n_elts = GET_MODE_NUNITS (mode);
8190   int n_var = 0, one_var = -1;
8191   bool all_same = true;
8192   rtx x, mem;
8193   int i;
8194
8195   x = XVECEXP (vals, 0, 0);
8196   if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8197     n_var = 1, one_var = 0;
8198
8199   for (i = 1; i < n_elts; ++i)
8200     {
8201       x = XVECEXP (vals, 0, i);
8202       if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8203         ++n_var, one_var = i;
8204
8205       if (!rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8206         all_same = false;
8207     }
8208
8209   if (n_var == 0)
8210     {
8211       rtx constant = aarch64_simd_make_constant (vals);
8212       if (constant != NULL_RTX)
8213         {
8214           emit_move_insn (target, constant);
8215           return;
8216         }
8217     }
8218
8219   /* Splat a single non-constant element if we can.  */
8220   if (all_same)
8221     {
8222       x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8223       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8224       return;
8225     }
8226
8227   /* One field is non-constant.  Load constant then overwrite varying
8228      field.  This is more efficient than using the stack.  */
8229   if (n_var == 1)
8230     {
8231       rtx copy = copy_rtx (vals);
8232       rtx index = GEN_INT (one_var);
8233       enum insn_code icode;
8234
8235       /* Load constant part of vector, substitute neighboring value for
8236          varying element.  */
8237       XVECEXP (copy, 0, one_var) = XVECEXP (vals, 0, one_var ^ 1);
8238       aarch64_expand_vector_init (target, copy);
8239
8240       /* Insert variable.  */
8241       x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, one_var));
8242       icode = optab_handler (vec_set_optab, mode);
8243       gcc_assert (icode != CODE_FOR_nothing);
8244       emit_insn (GEN_FCN (icode) (target, x, index));
8245       return;
8246     }
8247
8248   /* Construct the vector in memory one field at a time
8249      and load the whole vector.  */
8250   mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8251   for (i = 0; i < n_elts; i++)
8252     emit_move_insn (adjust_address_nv (mem, inner_mode,
8253                                     i * GET_MODE_SIZE (inner_mode)),
8254                     XVECEXP (vals, 0, i));
8255   emit_move_insn (target, mem);
8256
8257 }
8258
8259 static unsigned HOST_WIDE_INT
8260 aarch64_shift_truncation_mask (enum machine_mode mode)
8261 {
8262   return
8263     (aarch64_vector_mode_supported_p (mode)
8264      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8265 }
8266
8267 #ifndef TLS_SECTION_ASM_FLAG
8268 #define TLS_SECTION_ASM_FLAG 'T'
8269 #endif
8270
8271 void
8272 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8273                                tree decl ATTRIBUTE_UNUSED)
8274 {
8275   char flagchars[10], *f = flagchars;
8276
8277   /* If we have already declared this section, we can use an
8278      abbreviated form to switch back to it -- unless this section is
8279      part of a COMDAT groups, in which case GAS requires the full
8280      declaration every time.  */
8281   if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8282       && (flags & SECTION_DECLARED))
8283     {
8284       fprintf (asm_out_file, "\t.section\t%s\n", name);
8285       return;
8286     }
8287
8288   if (!(flags & SECTION_DEBUG))
8289     *f++ = 'a';
8290   if (flags & SECTION_WRITE)
8291     *f++ = 'w';
8292   if (flags & SECTION_CODE)
8293     *f++ = 'x';
8294   if (flags & SECTION_SMALL)
8295     *f++ = 's';
8296   if (flags & SECTION_MERGE)
8297     *f++ = 'M';
8298   if (flags & SECTION_STRINGS)
8299     *f++ = 'S';
8300   if (flags & SECTION_TLS)
8301     *f++ = TLS_SECTION_ASM_FLAG;
8302   if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8303     *f++ = 'G';
8304   *f = '\0';
8305
8306   fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8307
8308   if (!(flags & SECTION_NOTYPE))
8309     {
8310       const char *type;
8311       const char *format;
8312
8313       if (flags & SECTION_BSS)
8314         type = "nobits";
8315       else
8316         type = "progbits";
8317
8318 #ifdef TYPE_OPERAND_FMT
8319       format = "," TYPE_OPERAND_FMT;
8320 #else
8321       format = ",@%s";
8322 #endif
8323
8324       fprintf (asm_out_file, format, type);
8325
8326       if (flags & SECTION_ENTSIZE)
8327         fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8328       if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8329         {
8330           if (TREE_CODE (decl) == IDENTIFIER_NODE)
8331             fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8332           else
8333             fprintf (asm_out_file, ",%s,comdat",
8334                      IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8335         }
8336     }
8337
8338   putc ('\n', asm_out_file);
8339 }
8340
8341 /* Select a format to encode pointers in exception handling data.  */
8342 int
8343 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
8344 {
8345    int type;
8346    switch (aarch64_cmodel)
8347      {
8348      case AARCH64_CMODEL_TINY:
8349      case AARCH64_CMODEL_TINY_PIC:
8350      case AARCH64_CMODEL_SMALL:
8351      case AARCH64_CMODEL_SMALL_PIC:
8352        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
8353           for everything.  */
8354        type = DW_EH_PE_sdata4;
8355        break;
8356      default:
8357        /* No assumptions here.  8-byte relocs required.  */
8358        type = DW_EH_PE_sdata8;
8359        break;
8360      }
8361    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
8362 }
8363
8364 /* Emit load exclusive.  */
8365
8366 static void
8367 aarch64_emit_load_exclusive (enum machine_mode mode, rtx rval,
8368                              rtx mem, rtx model_rtx)
8369 {
8370   rtx (*gen) (rtx, rtx, rtx);
8371
8372   switch (mode)
8373     {
8374     case QImode: gen = gen_aarch64_load_exclusiveqi; break;
8375     case HImode: gen = gen_aarch64_load_exclusivehi; break;
8376     case SImode: gen = gen_aarch64_load_exclusivesi; break;
8377     case DImode: gen = gen_aarch64_load_exclusivedi; break;
8378     default:
8379       gcc_unreachable ();
8380     }
8381
8382   emit_insn (gen (rval, mem, model_rtx));
8383 }
8384
8385 /* Emit store exclusive.  */
8386
8387 static void
8388 aarch64_emit_store_exclusive (enum machine_mode mode, rtx bval,
8389                               rtx rval, rtx mem, rtx model_rtx)
8390 {
8391   rtx (*gen) (rtx, rtx, rtx, rtx);
8392
8393   switch (mode)
8394     {
8395     case QImode: gen = gen_aarch64_store_exclusiveqi; break;
8396     case HImode: gen = gen_aarch64_store_exclusivehi; break;
8397     case SImode: gen = gen_aarch64_store_exclusivesi; break;
8398     case DImode: gen = gen_aarch64_store_exclusivedi; break;
8399     default:
8400       gcc_unreachable ();
8401     }
8402
8403   emit_insn (gen (bval, rval, mem, model_rtx));
8404 }
8405
8406 /* Mark the previous jump instruction as unlikely.  */
8407
8408 static void
8409 aarch64_emit_unlikely_jump (rtx insn)
8410 {
8411   int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
8412
8413   insn = emit_jump_insn (insn);
8414   add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
8415 }
8416
8417 /* Expand a compare and swap pattern.  */
8418
8419 void
8420 aarch64_expand_compare_and_swap (rtx operands[])
8421 {
8422   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
8423   enum machine_mode mode, cmp_mode;
8424   rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
8425
8426   bval = operands[0];
8427   rval = operands[1];
8428   mem = operands[2];
8429   oldval = operands[3];
8430   newval = operands[4];
8431   is_weak = operands[5];
8432   mod_s = operands[6];
8433   mod_f = operands[7];
8434   mode = GET_MODE (mem);
8435   cmp_mode = mode;
8436
8437   /* Normally the succ memory model must be stronger than fail, but in the
8438      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
8439      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
8440
8441   if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
8442       && INTVAL (mod_s) == MEMMODEL_RELEASE)
8443     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
8444
8445   switch (mode)
8446     {
8447     case QImode:
8448     case HImode:
8449       /* For short modes, we're going to perform the comparison in SImode,
8450          so do the zero-extension now.  */
8451       cmp_mode = SImode;
8452       rval = gen_reg_rtx (SImode);
8453       oldval = convert_modes (SImode, mode, oldval, true);
8454       /* Fall through.  */
8455
8456     case SImode:
8457     case DImode:
8458       /* Force the value into a register if needed.  */
8459       if (!aarch64_plus_operand (oldval, mode))
8460         oldval = force_reg (cmp_mode, oldval);
8461       break;
8462
8463     default:
8464       gcc_unreachable ();
8465     }
8466
8467   switch (mode)
8468     {
8469     case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
8470     case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
8471     case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
8472     case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
8473     default:
8474       gcc_unreachable ();
8475     }
8476
8477   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
8478
8479   if (mode == QImode || mode == HImode)
8480     emit_move_insn (operands[1], gen_lowpart (mode, rval));
8481
8482   x = gen_rtx_REG (CCmode, CC_REGNUM);
8483   x = gen_rtx_EQ (SImode, x, const0_rtx);
8484   emit_insn (gen_rtx_SET (VOIDmode, bval, x));
8485 }
8486
8487 /* Split a compare and swap pattern.  */
8488
8489 void
8490 aarch64_split_compare_and_swap (rtx operands[])
8491 {
8492   rtx rval, mem, oldval, newval, scratch;
8493   enum machine_mode mode;
8494   bool is_weak;
8495   rtx_code_label *label1, *label2;
8496   rtx x, cond;
8497
8498   rval = operands[0];
8499   mem = operands[1];
8500   oldval = operands[2];
8501   newval = operands[3];
8502   is_weak = (operands[4] != const0_rtx);
8503   scratch = operands[7];
8504   mode = GET_MODE (mem);
8505
8506   label1 = NULL;
8507   if (!is_weak)
8508     {
8509       label1 = gen_label_rtx ();
8510       emit_label (label1);
8511     }
8512   label2 = gen_label_rtx ();
8513
8514   aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
8515
8516   cond = aarch64_gen_compare_reg (NE, rval, oldval);
8517   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8518   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8519                             gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
8520   aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8521
8522   aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
8523
8524   if (!is_weak)
8525     {
8526       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
8527       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8528                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
8529       aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8530     }
8531   else
8532     {
8533       cond = gen_rtx_REG (CCmode, CC_REGNUM);
8534       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
8535       emit_insn (gen_rtx_SET (VOIDmode, cond, x));
8536     }
8537
8538   emit_label (label2);
8539 }
8540
8541 /* Split an atomic operation.  */
8542
8543 void
8544 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
8545                      rtx value, rtx model_rtx, rtx cond)
8546 {
8547   enum machine_mode mode = GET_MODE (mem);
8548   enum machine_mode wmode = (mode == DImode ? DImode : SImode);
8549   rtx_code_label *label;
8550   rtx x;
8551
8552   label = gen_label_rtx ();
8553   emit_label (label);
8554
8555   if (new_out)
8556     new_out = gen_lowpart (wmode, new_out);
8557   if (old_out)
8558     old_out = gen_lowpart (wmode, old_out);
8559   else
8560     old_out = new_out;
8561   value = simplify_gen_subreg (wmode, value, mode, 0);
8562
8563   aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
8564
8565   switch (code)
8566     {
8567     case SET:
8568       new_out = value;
8569       break;
8570
8571     case NOT:
8572       x = gen_rtx_AND (wmode, old_out, value);
8573       emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8574       x = gen_rtx_NOT (wmode, new_out);
8575       emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8576       break;
8577
8578     case MINUS:
8579       if (CONST_INT_P (value))
8580         {
8581           value = GEN_INT (-INTVAL (value));
8582           code = PLUS;
8583         }
8584       /* Fall through.  */
8585
8586     default:
8587       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
8588       emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
8589       break;
8590     }
8591
8592   aarch64_emit_store_exclusive (mode, cond, mem,
8593                                 gen_lowpart (mode, new_out), model_rtx);
8594
8595   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
8596   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
8597                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
8598   aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
8599 }
8600
8601 static void
8602 aarch64_print_extension (void)
8603 {
8604   const struct aarch64_option_extension *opt = NULL;
8605
8606   for (opt = all_extensions; opt->name != NULL; opt++)
8607     if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
8608       asm_fprintf (asm_out_file, "+%s", opt->name);
8609
8610   asm_fprintf (asm_out_file, "\n");
8611 }
8612
8613 static void
8614 aarch64_start_file (void)
8615 {
8616   if (selected_arch)
8617     {
8618       asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
8619       aarch64_print_extension ();
8620     }
8621   else if (selected_cpu)
8622     {
8623       const char *truncated_name
8624             = aarch64_rewrite_selected_cpu (selected_cpu->name);
8625       asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
8626       aarch64_print_extension ();
8627     }
8628   default_file_start();
8629 }
8630
8631 /* Target hook for c_mode_for_suffix.  */
8632 static enum machine_mode
8633 aarch64_c_mode_for_suffix (char suffix)
8634 {
8635   if (suffix == 'q')
8636     return TFmode;
8637
8638   return VOIDmode;
8639 }
8640
8641 /* We can only represent floating point constants which will fit in
8642    "quarter-precision" values.  These values are characterised by
8643    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
8644    by:
8645
8646    (-1)^s * (n/16) * 2^r
8647
8648    Where:
8649      's' is the sign bit.
8650      'n' is an integer in the range 16 <= n <= 31.
8651      'r' is an integer in the range -3 <= r <= 4.  */
8652
8653 /* Return true iff X can be represented by a quarter-precision
8654    floating point immediate operand X.  Note, we cannot represent 0.0.  */
8655 bool
8656 aarch64_float_const_representable_p (rtx x)
8657 {
8658   /* This represents our current view of how many bits
8659      make up the mantissa.  */
8660   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
8661   int exponent;
8662   unsigned HOST_WIDE_INT mantissa, mask;
8663   REAL_VALUE_TYPE r, m;
8664   bool fail;
8665
8666   if (!CONST_DOUBLE_P (x))
8667     return false;
8668
8669   if (GET_MODE (x) == VOIDmode)
8670     return false;
8671
8672   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
8673
8674   /* We cannot represent infinities, NaNs or +/-zero.  We won't
8675      know if we have +zero until we analyse the mantissa, but we
8676      can reject the other invalid values.  */
8677   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
8678       || REAL_VALUE_MINUS_ZERO (r))
8679     return false;
8680
8681   /* Extract exponent.  */
8682   r = real_value_abs (&r);
8683   exponent = REAL_EXP (&r);
8684
8685   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
8686      highest (sign) bit, with a fixed binary point at bit point_pos.
8687      m1 holds the low part of the mantissa, m2 the high part.
8688      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
8689      bits for the mantissa, this can fail (low bits will be lost).  */
8690   real_ldexp (&m, &r, point_pos - exponent);
8691   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
8692
8693   /* If the low part of the mantissa has bits set we cannot represent
8694      the value.  */
8695   if (w.elt (0) != 0)
8696     return false;
8697   /* We have rejected the lower HOST_WIDE_INT, so update our
8698      understanding of how many bits lie in the mantissa and
8699      look only at the high HOST_WIDE_INT.  */
8700   mantissa = w.elt (1);
8701   point_pos -= HOST_BITS_PER_WIDE_INT;
8702
8703   /* We can only represent values with a mantissa of the form 1.xxxx.  */
8704   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
8705   if ((mantissa & mask) != 0)
8706     return false;
8707
8708   /* Having filtered unrepresentable values, we may now remove all
8709      but the highest 5 bits.  */
8710   mantissa >>= point_pos - 5;
8711
8712   /* We cannot represent the value 0.0, so reject it.  This is handled
8713      elsewhere.  */
8714   if (mantissa == 0)
8715     return false;
8716
8717   /* Then, as bit 4 is always set, we can mask it off, leaving
8718      the mantissa in the range [0, 15].  */
8719   mantissa &= ~(1 << 4);
8720   gcc_assert (mantissa <= 15);
8721
8722   /* GCC internally does not use IEEE754-like encoding (where normalized
8723      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
8724      Our mantissa values are shifted 4 places to the left relative to
8725      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
8726      by 5 places to correct for GCC's representation.  */
8727   exponent = 5 - exponent;
8728
8729   return (exponent >= 0 && exponent <= 7);
8730 }
8731
8732 char*
8733 aarch64_output_simd_mov_immediate (rtx const_vector,
8734                                    enum machine_mode mode,
8735                                    unsigned width)
8736 {
8737   bool is_valid;
8738   static char templ[40];
8739   const char *mnemonic;
8740   const char *shift_op;
8741   unsigned int lane_count = 0;
8742   char element_char;
8743
8744   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
8745
8746   /* This will return true to show const_vector is legal for use as either
8747      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
8748      also update INFO to show how the immediate should be generated.  */
8749   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
8750   gcc_assert (is_valid);
8751
8752   element_char = sizetochar (info.element_width);
8753   lane_count = width / info.element_width;
8754
8755   mode = GET_MODE_INNER (mode);
8756   if (mode == SFmode || mode == DFmode)
8757     {
8758       gcc_assert (info.shift == 0 && ! info.mvn);
8759       if (aarch64_float_const_zero_rtx_p (info.value))
8760         info.value = GEN_INT (0);
8761       else
8762         {
8763 #define buf_size 20
8764           REAL_VALUE_TYPE r;
8765           REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
8766           char float_buf[buf_size] = {'\0'};
8767           real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
8768 #undef buf_size
8769
8770           if (lane_count == 1)
8771             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
8772           else
8773             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
8774                       lane_count, element_char, float_buf);
8775           return templ;
8776         }
8777     }
8778
8779   mnemonic = info.mvn ? "mvni" : "movi";
8780   shift_op = info.msl ? "msl" : "lsl";
8781
8782   if (lane_count == 1)
8783     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
8784               mnemonic, UINTVAL (info.value));
8785   else if (info.shift)
8786     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
8787               ", %s %d", mnemonic, lane_count, element_char,
8788               UINTVAL (info.value), shift_op, info.shift);
8789   else
8790     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
8791               mnemonic, lane_count, element_char, UINTVAL (info.value));
8792   return templ;
8793 }
8794
8795 char*
8796 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
8797                                           enum machine_mode mode)
8798 {
8799   enum machine_mode vmode;
8800
8801   gcc_assert (!VECTOR_MODE_P (mode));
8802   vmode = aarch64_simd_container_mode (mode, 64);
8803   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
8804   return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
8805 }
8806
8807 /* Split operands into moves from op[1] + op[2] into op[0].  */
8808
8809 void
8810 aarch64_split_combinev16qi (rtx operands[3])
8811 {
8812   unsigned int dest = REGNO (operands[0]);
8813   unsigned int src1 = REGNO (operands[1]);
8814   unsigned int src2 = REGNO (operands[2]);
8815   enum machine_mode halfmode = GET_MODE (operands[1]);
8816   unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
8817   rtx destlo, desthi;
8818
8819   gcc_assert (halfmode == V16QImode);
8820
8821   if (src1 == dest && src2 == dest + halfregs)
8822     {
8823       /* No-op move.  Can't split to nothing; emit something.  */
8824       emit_note (NOTE_INSN_DELETED);
8825       return;
8826     }
8827
8828   /* Preserve register attributes for variable tracking.  */
8829   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
8830   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
8831                                GET_MODE_SIZE (halfmode));
8832
8833   /* Special case of reversed high/low parts.  */
8834   if (reg_overlap_mentioned_p (operands[2], destlo)
8835       && reg_overlap_mentioned_p (operands[1], desthi))
8836     {
8837       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8838       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
8839       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
8840     }
8841   else if (!reg_overlap_mentioned_p (operands[2], destlo))
8842     {
8843       /* Try to avoid unnecessary moves if part of the result
8844          is in the right place already.  */
8845       if (src1 != dest)
8846         emit_move_insn (destlo, operands[1]);
8847       if (src2 != dest + halfregs)
8848         emit_move_insn (desthi, operands[2]);
8849     }
8850   else
8851     {
8852       if (src2 != dest + halfregs)
8853         emit_move_insn (desthi, operands[2]);
8854       if (src1 != dest)
8855         emit_move_insn (destlo, operands[1]);
8856     }
8857 }
8858
8859 /* vec_perm support.  */
8860
8861 #define MAX_VECT_LEN 16
8862
8863 struct expand_vec_perm_d
8864 {
8865   rtx target, op0, op1;
8866   unsigned char perm[MAX_VECT_LEN];
8867   enum machine_mode vmode;
8868   unsigned char nelt;
8869   bool one_vector_p;
8870   bool testing_p;
8871 };
8872
8873 /* Generate a variable permutation.  */
8874
8875 static void
8876 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
8877 {
8878   enum machine_mode vmode = GET_MODE (target);
8879   bool one_vector_p = rtx_equal_p (op0, op1);
8880
8881   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
8882   gcc_checking_assert (GET_MODE (op0) == vmode);
8883   gcc_checking_assert (GET_MODE (op1) == vmode);
8884   gcc_checking_assert (GET_MODE (sel) == vmode);
8885   gcc_checking_assert (TARGET_SIMD);
8886
8887   if (one_vector_p)
8888     {
8889       if (vmode == V8QImode)
8890         {
8891           /* Expand the argument to a V16QI mode by duplicating it.  */
8892           rtx pair = gen_reg_rtx (V16QImode);
8893           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
8894           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8895         }
8896       else
8897         {
8898           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
8899         }
8900     }
8901   else
8902     {
8903       rtx pair;
8904
8905       if (vmode == V8QImode)
8906         {
8907           pair = gen_reg_rtx (V16QImode);
8908           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
8909           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
8910         }
8911       else
8912         {
8913           pair = gen_reg_rtx (OImode);
8914           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
8915           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
8916         }
8917     }
8918 }
8919
8920 void
8921 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
8922 {
8923   enum machine_mode vmode = GET_MODE (target);
8924   unsigned int nelt = GET_MODE_NUNITS (vmode);
8925   bool one_vector_p = rtx_equal_p (op0, op1);
8926   rtx mask;
8927
8928   /* The TBL instruction does not use a modulo index, so we must take care
8929      of that ourselves.  */
8930   mask = aarch64_simd_gen_const_vector_dup (vmode,
8931       one_vector_p ? nelt - 1 : 2 * nelt - 1);
8932   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
8933
8934   /* For big-endian, we also need to reverse the index within the vector
8935      (but not which vector).  */
8936   if (BYTES_BIG_ENDIAN)
8937     {
8938       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
8939       if (!one_vector_p)
8940         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
8941       sel = expand_simple_binop (vmode, XOR, sel, mask,
8942                                  NULL, 0, OPTAB_LIB_WIDEN);
8943     }
8944   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
8945 }
8946
8947 /* Recognize patterns suitable for the TRN instructions.  */
8948 static bool
8949 aarch64_evpc_trn (struct expand_vec_perm_d *d)
8950 {
8951   unsigned int i, odd, mask, nelt = d->nelt;
8952   rtx out, in0, in1, x;
8953   rtx (*gen) (rtx, rtx, rtx);
8954   enum machine_mode vmode = d->vmode;
8955
8956   if (GET_MODE_UNIT_SIZE (vmode) > 8)
8957     return false;
8958
8959   /* Note that these are little-endian tests.
8960      We correct for big-endian later.  */
8961   if (d->perm[0] == 0)
8962     odd = 0;
8963   else if (d->perm[0] == 1)
8964     odd = 1;
8965   else
8966     return false;
8967   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
8968
8969   for (i = 0; i < nelt; i += 2)
8970     {
8971       if (d->perm[i] != i + odd)
8972         return false;
8973       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
8974         return false;
8975     }
8976
8977   /* Success!  */
8978   if (d->testing_p)
8979     return true;
8980
8981   in0 = d->op0;
8982   in1 = d->op1;
8983   if (BYTES_BIG_ENDIAN)
8984     {
8985       x = in0, in0 = in1, in1 = x;
8986       odd = !odd;
8987     }
8988   out = d->target;
8989
8990   if (odd)
8991     {
8992       switch (vmode)
8993         {
8994         case V16QImode: gen = gen_aarch64_trn2v16qi; break;
8995         case V8QImode: gen = gen_aarch64_trn2v8qi; break;
8996         case V8HImode: gen = gen_aarch64_trn2v8hi; break;
8997         case V4HImode: gen = gen_aarch64_trn2v4hi; break;
8998         case V4SImode: gen = gen_aarch64_trn2v4si; break;
8999         case V2SImode: gen = gen_aarch64_trn2v2si; break;
9000         case V2DImode: gen = gen_aarch64_trn2v2di; break;
9001         case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9002         case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9003         case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9004         default:
9005           return false;
9006         }
9007     }
9008   else
9009     {
9010       switch (vmode)
9011         {
9012         case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9013         case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9014         case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9015         case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9016         case V4SImode: gen = gen_aarch64_trn1v4si; break;
9017         case V2SImode: gen = gen_aarch64_trn1v2si; break;
9018         case V2DImode: gen = gen_aarch64_trn1v2di; break;
9019         case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9020         case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9021         case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9022         default:
9023           return false;
9024         }
9025     }
9026
9027   emit_insn (gen (out, in0, in1));
9028   return true;
9029 }
9030
9031 /* Recognize patterns suitable for the UZP instructions.  */
9032 static bool
9033 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9034 {
9035   unsigned int i, odd, mask, nelt = d->nelt;
9036   rtx out, in0, in1, x;
9037   rtx (*gen) (rtx, rtx, rtx);
9038   enum machine_mode vmode = d->vmode;
9039
9040   if (GET_MODE_UNIT_SIZE (vmode) > 8)
9041     return false;
9042
9043   /* Note that these are little-endian tests.
9044      We correct for big-endian later.  */
9045   if (d->perm[0] == 0)
9046     odd = 0;
9047   else if (d->perm[0] == 1)
9048     odd = 1;
9049   else
9050     return false;
9051   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9052
9053   for (i = 0; i < nelt; i++)
9054     {
9055       unsigned elt = (i * 2 + odd) & mask;
9056       if (d->perm[i] != elt)
9057         return false;
9058     }
9059
9060   /* Success!  */
9061   if (d->testing_p)
9062     return true;
9063
9064   in0 = d->op0;
9065   in1 = d->op1;
9066   if (BYTES_BIG_ENDIAN)
9067     {
9068       x = in0, in0 = in1, in1 = x;
9069       odd = !odd;
9070     }
9071   out = d->target;
9072
9073   if (odd)
9074     {
9075       switch (vmode)
9076         {
9077         case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9078         case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9079         case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9080         case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9081         case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9082         case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9083         case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9084         case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9085         case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9086         case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9087         default:
9088           return false;
9089         }
9090     }
9091   else
9092     {
9093       switch (vmode)
9094         {
9095         case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9096         case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9097         case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9098         case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9099         case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9100         case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9101         case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9102         case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9103         case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9104         case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9105         default:
9106           return false;
9107         }
9108     }
9109
9110   emit_insn (gen (out, in0, in1));
9111   return true;
9112 }
9113
9114 /* Recognize patterns suitable for the ZIP instructions.  */
9115 static bool
9116 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9117 {
9118   unsigned int i, high, mask, nelt = d->nelt;
9119   rtx out, in0, in1, x;
9120   rtx (*gen) (rtx, rtx, rtx);
9121   enum machine_mode vmode = d->vmode;
9122
9123   if (GET_MODE_UNIT_SIZE (vmode) > 8)
9124     return false;
9125
9126   /* Note that these are little-endian tests.
9127      We correct for big-endian later.  */
9128   high = nelt / 2;
9129   if (d->perm[0] == high)
9130     /* Do Nothing.  */
9131     ;
9132   else if (d->perm[0] == 0)
9133     high = 0;
9134   else
9135     return false;
9136   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9137
9138   for (i = 0; i < nelt / 2; i++)
9139     {
9140       unsigned elt = (i + high) & mask;
9141       if (d->perm[i * 2] != elt)
9142         return false;
9143       elt = (elt + nelt) & mask;
9144       if (d->perm[i * 2 + 1] != elt)
9145         return false;
9146     }
9147
9148   /* Success!  */
9149   if (d->testing_p)
9150     return true;
9151
9152   in0 = d->op0;
9153   in1 = d->op1;
9154   if (BYTES_BIG_ENDIAN)
9155     {
9156       x = in0, in0 = in1, in1 = x;
9157       high = !high;
9158     }
9159   out = d->target;
9160
9161   if (high)
9162     {
9163       switch (vmode)
9164         {
9165         case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9166         case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9167         case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9168         case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9169         case V4SImode: gen = gen_aarch64_zip2v4si; break;
9170         case V2SImode: gen = gen_aarch64_zip2v2si; break;
9171         case V2DImode: gen = gen_aarch64_zip2v2di; break;
9172         case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9173         case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9174         case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9175         default:
9176           return false;
9177         }
9178     }
9179   else
9180     {
9181       switch (vmode)
9182         {
9183         case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9184         case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9185         case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9186         case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9187         case V4SImode: gen = gen_aarch64_zip1v4si; break;
9188         case V2SImode: gen = gen_aarch64_zip1v2si; break;
9189         case V2DImode: gen = gen_aarch64_zip1v2di; break;
9190         case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9191         case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9192         case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9193         default:
9194           return false;
9195         }
9196     }
9197
9198   emit_insn (gen (out, in0, in1));
9199   return true;
9200 }
9201
9202 /* Recognize patterns for the EXT insn.  */
9203
9204 static bool
9205 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9206 {
9207   unsigned int i, nelt = d->nelt;
9208   rtx (*gen) (rtx, rtx, rtx, rtx);
9209   rtx offset;
9210
9211   unsigned int location = d->perm[0]; /* Always < nelt.  */
9212
9213   /* Check if the extracted indices are increasing by one.  */
9214   for (i = 1; i < nelt; i++)
9215     {
9216       unsigned int required = location + i;
9217       if (d->one_vector_p)
9218         {
9219           /* We'll pass the same vector in twice, so allow indices to wrap.  */
9220           required &= (nelt - 1);
9221         }
9222       if (d->perm[i] != required)
9223         return false;
9224     }
9225
9226   switch (d->vmode)
9227     {
9228     case V16QImode: gen = gen_aarch64_extv16qi; break;
9229     case V8QImode: gen = gen_aarch64_extv8qi; break;
9230     case V4HImode: gen = gen_aarch64_extv4hi; break;
9231     case V8HImode: gen = gen_aarch64_extv8hi; break;
9232     case V2SImode: gen = gen_aarch64_extv2si; break;
9233     case V4SImode: gen = gen_aarch64_extv4si; break;
9234     case V2SFmode: gen = gen_aarch64_extv2sf; break;
9235     case V4SFmode: gen = gen_aarch64_extv4sf; break;
9236     case V2DImode: gen = gen_aarch64_extv2di; break;
9237     case V2DFmode: gen = gen_aarch64_extv2df; break;
9238     default:
9239       return false;
9240     }
9241
9242   /* Success! */
9243   if (d->testing_p)
9244     return true;
9245
9246   /* The case where (location == 0) is a no-op for both big- and little-endian,
9247      and is removed by the mid-end at optimization levels -O1 and higher.  */
9248
9249   if (BYTES_BIG_ENDIAN && (location != 0))
9250     {
9251       /* After setup, we want the high elements of the first vector (stored
9252          at the LSB end of the register), and the low elements of the second
9253          vector (stored at the MSB end of the register). So swap.  */
9254       rtx temp = d->op0;
9255       d->op0 = d->op1;
9256       d->op1 = temp;
9257       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
9258       location = nelt - location;
9259     }
9260
9261   offset = GEN_INT (location);
9262   emit_insn (gen (d->target, d->op0, d->op1, offset));
9263   return true;
9264 }
9265
9266 /* Recognize patterns for the REV insns.  */
9267
9268 static bool
9269 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9270 {
9271   unsigned int i, j, diff, nelt = d->nelt;
9272   rtx (*gen) (rtx, rtx);
9273
9274   if (!d->one_vector_p)
9275     return false;
9276
9277   diff = d->perm[0];
9278   switch (diff)
9279     {
9280     case 7:
9281       switch (d->vmode)
9282         {
9283         case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9284         case V8QImode: gen = gen_aarch64_rev64v8qi;  break;
9285         default:
9286           return false;
9287         }
9288       break;
9289     case 3:
9290       switch (d->vmode)
9291         {
9292         case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9293         case V8QImode: gen = gen_aarch64_rev32v8qi;  break;
9294         case V8HImode: gen = gen_aarch64_rev64v8hi;  break;
9295         case V4HImode: gen = gen_aarch64_rev64v4hi;  break;
9296         default:
9297           return false;
9298         }
9299       break;
9300     case 1:
9301       switch (d->vmode)
9302         {
9303         case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9304         case V8QImode: gen = gen_aarch64_rev16v8qi;  break;
9305         case V8HImode: gen = gen_aarch64_rev32v8hi;  break;
9306         case V4HImode: gen = gen_aarch64_rev32v4hi;  break;
9307         case V4SImode: gen = gen_aarch64_rev64v4si;  break;
9308         case V2SImode: gen = gen_aarch64_rev64v2si;  break;
9309         case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
9310         case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
9311         default:
9312           return false;
9313         }
9314       break;
9315     default:
9316       return false;
9317     }
9318
9319   for (i = 0; i < nelt ; i += diff + 1)
9320     for (j = 0; j <= diff; j += 1)
9321       {
9322         /* This is guaranteed to be true as the value of diff
9323            is 7, 3, 1 and we should have enough elements in the
9324            queue to generate this.  Getting a vector mask with a
9325            value of diff other than these values implies that
9326            something is wrong by the time we get here.  */
9327         gcc_assert (i + j < nelt);
9328         if (d->perm[i + j] != i + diff - j)
9329           return false;
9330       }
9331
9332   /* Success! */
9333   if (d->testing_p)
9334     return true;
9335
9336   emit_insn (gen (d->target, d->op0));
9337   return true;
9338 }
9339
9340 static bool
9341 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9342 {
9343   rtx (*gen) (rtx, rtx, rtx);
9344   rtx out = d->target;
9345   rtx in0;
9346   enum machine_mode vmode = d->vmode;
9347   unsigned int i, elt, nelt = d->nelt;
9348   rtx lane;
9349
9350   elt = d->perm[0];
9351   for (i = 1; i < nelt; i++)
9352     {
9353       if (elt != d->perm[i])
9354         return false;
9355     }
9356
9357   /* The generic preparation in aarch64_expand_vec_perm_const_1
9358      swaps the operand order and the permute indices if it finds
9359      d->perm[0] to be in the second operand.  Thus, we can always
9360      use d->op0 and need not do any extra arithmetic to get the
9361      correct lane number.  */
9362   in0 = d->op0;
9363   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
9364
9365   switch (vmode)
9366     {
9367     case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
9368     case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
9369     case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
9370     case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
9371     case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
9372     case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
9373     case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
9374     case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
9375     case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
9376     case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
9377     default:
9378       return false;
9379     }
9380
9381   emit_insn (gen (out, in0, lane));
9382   return true;
9383 }
9384
9385 static bool
9386 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
9387 {
9388   rtx rperm[MAX_VECT_LEN], sel;
9389   enum machine_mode vmode = d->vmode;
9390   unsigned int i, nelt = d->nelt;
9391
9392   if (d->testing_p)
9393     return true;
9394
9395   /* Generic code will try constant permutation twice.  Once with the
9396      original mode and again with the elements lowered to QImode.
9397      So wait and don't do the selector expansion ourselves.  */
9398   if (vmode != V8QImode && vmode != V16QImode)
9399     return false;
9400
9401   for (i = 0; i < nelt; ++i)
9402     {
9403       int nunits = GET_MODE_NUNITS (vmode);
9404
9405       /* If big-endian and two vectors we end up with a weird mixed-endian
9406          mode on NEON.  Reverse the index within each word but not the word
9407          itself.  */
9408       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
9409                                            : d->perm[i]);
9410     }
9411   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
9412   sel = force_reg (vmode, sel);
9413
9414   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
9415   return true;
9416 }
9417
9418 static bool
9419 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
9420 {
9421   /* The pattern matching functions above are written to look for a small
9422      number to begin the sequence (0, 1, N/2).  If we begin with an index
9423      from the second operand, we can swap the operands.  */
9424   if (d->perm[0] >= d->nelt)
9425     {
9426       unsigned i, nelt = d->nelt;
9427       rtx x;
9428
9429       gcc_assert (nelt == (nelt & -nelt));
9430       for (i = 0; i < nelt; ++i)
9431         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
9432
9433       x = d->op0;
9434       d->op0 = d->op1;
9435       d->op1 = x;
9436     }
9437
9438   if (TARGET_SIMD)
9439     {
9440       if (aarch64_evpc_rev (d))
9441         return true;
9442       else if (aarch64_evpc_ext (d))
9443         return true;
9444       else if (aarch64_evpc_dup (d))
9445         return true;
9446       else if (aarch64_evpc_zip (d))
9447         return true;
9448       else if (aarch64_evpc_uzp (d))
9449         return true;
9450       else if (aarch64_evpc_trn (d))
9451         return true;
9452       return aarch64_evpc_tbl (d);
9453     }
9454   return false;
9455 }
9456
9457 /* Expand a vec_perm_const pattern.  */
9458
9459 bool
9460 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
9461 {
9462   struct expand_vec_perm_d d;
9463   int i, nelt, which;
9464
9465   d.target = target;
9466   d.op0 = op0;
9467   d.op1 = op1;
9468
9469   d.vmode = GET_MODE (target);
9470   gcc_assert (VECTOR_MODE_P (d.vmode));
9471   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9472   d.testing_p = false;
9473
9474   for (i = which = 0; i < nelt; ++i)
9475     {
9476       rtx e = XVECEXP (sel, 0, i);
9477       int ei = INTVAL (e) & (2 * nelt - 1);
9478       which |= (ei < nelt ? 1 : 2);
9479       d.perm[i] = ei;
9480     }
9481
9482   switch (which)
9483     {
9484     default:
9485       gcc_unreachable ();
9486
9487     case 3:
9488       d.one_vector_p = false;
9489       if (!rtx_equal_p (op0, op1))
9490         break;
9491
9492       /* The elements of PERM do not suggest that only the first operand
9493          is used, but both operands are identical.  Allow easier matching
9494          of the permutation by folding the permutation into the single
9495          input vector.  */
9496       /* Fall Through.  */
9497     case 2:
9498       for (i = 0; i < nelt; ++i)
9499         d.perm[i] &= nelt - 1;
9500       d.op0 = op1;
9501       d.one_vector_p = true;
9502       break;
9503
9504     case 1:
9505       d.op1 = op0;
9506       d.one_vector_p = true;
9507       break;
9508     }
9509
9510   return aarch64_expand_vec_perm_const_1 (&d);
9511 }
9512
9513 static bool
9514 aarch64_vectorize_vec_perm_const_ok (enum machine_mode vmode,
9515                                      const unsigned char *sel)
9516 {
9517   struct expand_vec_perm_d d;
9518   unsigned int i, nelt, which;
9519   bool ret;
9520
9521   d.vmode = vmode;
9522   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
9523   d.testing_p = true;
9524   memcpy (d.perm, sel, nelt);
9525
9526   /* Calculate whether all elements are in one vector.  */
9527   for (i = which = 0; i < nelt; ++i)
9528     {
9529       unsigned char e = d.perm[i];
9530       gcc_assert (e < 2 * nelt);
9531       which |= (e < nelt ? 1 : 2);
9532     }
9533
9534   /* If all elements are from the second vector, reindex as if from the
9535      first vector.  */
9536   if (which == 2)
9537     for (i = 0; i < nelt; ++i)
9538       d.perm[i] -= nelt;
9539
9540   /* Check whether the mask can be applied to a single vector.  */
9541   d.one_vector_p = (which != 3);
9542
9543   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
9544   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
9545   if (!d.one_vector_p)
9546     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
9547
9548   start_sequence ();
9549   ret = aarch64_expand_vec_perm_const_1 (&d);
9550   end_sequence ();
9551
9552   return ret;
9553 }
9554
9555 /* Implement target hook CANNOT_CHANGE_MODE_CLASS.  */
9556 bool
9557 aarch64_cannot_change_mode_class (enum machine_mode from,
9558                                   enum machine_mode to,
9559                                   enum reg_class rclass)
9560 {
9561   /* Full-reg subregs are allowed on general regs or any class if they are
9562      the same size.  */
9563   if (GET_MODE_SIZE (from) == GET_MODE_SIZE (to)
9564       || !reg_classes_intersect_p (FP_REGS, rclass))
9565     return false;
9566
9567   /* Limited combinations of subregs are safe on FPREGs.  Particularly,
9568      1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
9569      2. Scalar to Scalar for integer modes or same size float modes.
9570      3. Vector to Vector modes.
9571      4. On little-endian only, Vector-Structure to Vector modes.  */
9572   if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
9573     {
9574       if (aarch64_vector_mode_supported_p (from)
9575           && GET_MODE_SIZE (GET_MODE_INNER (from)) == GET_MODE_SIZE (to))
9576         return false;
9577
9578       if (GET_MODE_NUNITS (from) == 1
9579           && GET_MODE_NUNITS (to) == 1
9580           && (GET_MODE_CLASS (from) == MODE_INT
9581               || from == to))
9582         return false;
9583
9584       if (aarch64_vector_mode_supported_p (from)
9585           && aarch64_vector_mode_supported_p (to))
9586         return false;
9587
9588       /* Within an vector structure straddling multiple vector registers
9589          we are in a mixed-endian representation.  As such, we can't
9590          easily change modes for BYTES_BIG_ENDIAN.  Otherwise, we can
9591          switch between vectors and vector structures cheaply.  */
9592       if (!BYTES_BIG_ENDIAN)
9593         if ((aarch64_vector_mode_supported_p (from)
9594               && aarch64_vect_struct_mode_p (to))
9595             || (aarch64_vector_mode_supported_p (to)
9596               && aarch64_vect_struct_mode_p (from)))
9597           return false;
9598     }
9599
9600   return true;
9601 }
9602
9603 /* Implement MODES_TIEABLE_P.  */
9604
9605 bool
9606 aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
9607 {
9608   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
9609     return true;
9610
9611   /* We specifically want to allow elements of "structure" modes to
9612      be tieable to the structure.  This more general condition allows
9613      other rarer situations too.  */
9614   if (TARGET_SIMD
9615       && aarch64_vector_mode_p (mode1)
9616       && aarch64_vector_mode_p (mode2))
9617     return true;
9618
9619   return false;
9620 }
9621
9622 /* Return a new RTX holding the result of moving POINTER forward by
9623    AMOUNT bytes.  */
9624
9625 static rtx
9626 aarch64_move_pointer (rtx pointer, int amount)
9627 {
9628   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
9629
9630   return adjust_automodify_address (pointer, GET_MODE (pointer),
9631                                     next, amount);
9632 }
9633
9634 /* Return a new RTX holding the result of moving POINTER forward by the
9635    size of the mode it points to.  */
9636
9637 static rtx
9638 aarch64_progress_pointer (rtx pointer)
9639 {
9640   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
9641
9642   return aarch64_move_pointer (pointer, amount);
9643 }
9644
9645 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
9646    MODE bytes.  */
9647
9648 static void
9649 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
9650                                               enum machine_mode mode)
9651 {
9652   rtx reg = gen_reg_rtx (mode);
9653
9654   /* "Cast" the pointers to the correct mode.  */
9655   *src = adjust_address (*src, mode, 0);
9656   *dst = adjust_address (*dst, mode, 0);
9657   /* Emit the memcpy.  */
9658   emit_move_insn (reg, *src);
9659   emit_move_insn (*dst, reg);
9660   /* Move the pointers forward.  */
9661   *src = aarch64_progress_pointer (*src);
9662   *dst = aarch64_progress_pointer (*dst);
9663 }
9664
9665 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
9666    we succeed, otherwise return false.  */
9667
9668 bool
9669 aarch64_expand_movmem (rtx *operands)
9670 {
9671   unsigned int n;
9672   rtx dst = operands[0];
9673   rtx src = operands[1];
9674   rtx base;
9675   bool speed_p = !optimize_function_for_size_p (cfun);
9676
9677   /* When optimizing for size, give a better estimate of the length of a
9678      memcpy call, but use the default otherwise.  */
9679   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
9680
9681   /* We can't do anything smart if the amount to copy is not constant.  */
9682   if (!CONST_INT_P (operands[2]))
9683     return false;
9684
9685   n = UINTVAL (operands[2]);
9686
9687   /* Try to keep the number of instructions low.  For cases below 16 bytes we
9688      need to make at most two moves.  For cases above 16 bytes it will be one
9689      move for each 16 byte chunk, then at most two additional moves.  */
9690   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
9691     return false;
9692
9693   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
9694   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
9695
9696   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
9697   src = adjust_automodify_address (src, VOIDmode, base, 0);
9698
9699   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
9700      1-byte chunk.  */
9701   if (n < 4)
9702     {
9703       if (n >= 2)
9704         {
9705           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9706           n -= 2;
9707         }
9708
9709       if (n == 1)
9710         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9711
9712       return true;
9713     }
9714
9715   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
9716      4-byte chunk, partially overlapping with the previously copied chunk.  */
9717   if (n < 8)
9718     {
9719       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9720       n -= 4;
9721       if (n > 0)
9722         {
9723           int move = n - 4;
9724
9725           src = aarch64_move_pointer (src, move);
9726           dst = aarch64_move_pointer (dst, move);
9727           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9728         }
9729       return true;
9730     }
9731
9732   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
9733      them, then (if applicable) an 8-byte chunk.  */
9734   while (n >= 8)
9735     {
9736       if (n / 16)
9737         {
9738           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
9739           n -= 16;
9740         }
9741       else
9742         {
9743           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9744           n -= 8;
9745         }
9746     }
9747
9748   /* Finish the final bytes of the copy.  We can always do this in one
9749      instruction.  We either copy the exact amount we need, or partially
9750      overlap with the previous chunk we copied and copy 8-bytes.  */
9751   if (n == 0)
9752     return true;
9753   else if (n == 1)
9754     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
9755   else if (n == 2)
9756     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
9757   else if (n == 4)
9758     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9759   else
9760     {
9761       if (n == 3)
9762         {
9763           src = aarch64_move_pointer (src, -1);
9764           dst = aarch64_move_pointer (dst, -1);
9765           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
9766         }
9767       else
9768         {
9769           int move = n - 8;
9770
9771           src = aarch64_move_pointer (src, move);
9772           dst = aarch64_move_pointer (dst, move);
9773           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
9774         }
9775     }
9776
9777   return true;
9778 }
9779
9780 #undef TARGET_ADDRESS_COST
9781 #define TARGET_ADDRESS_COST aarch64_address_cost
9782
9783 /* This hook will determines whether unnamed bitfields affect the alignment
9784    of the containing structure.  The hook returns true if the structure
9785    should inherit the alignment requirements of an unnamed bitfield's
9786    type.  */
9787 #undef TARGET_ALIGN_ANON_BITFIELD
9788 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
9789
9790 #undef TARGET_ASM_ALIGNED_DI_OP
9791 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
9792
9793 #undef TARGET_ASM_ALIGNED_HI_OP
9794 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
9795
9796 #undef TARGET_ASM_ALIGNED_SI_OP
9797 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
9798
9799 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
9800 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
9801   hook_bool_const_tree_hwi_hwi_const_tree_true
9802
9803 #undef TARGET_ASM_FILE_START
9804 #define TARGET_ASM_FILE_START aarch64_start_file
9805
9806 #undef TARGET_ASM_OUTPUT_MI_THUNK
9807 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
9808
9809 #undef TARGET_ASM_SELECT_RTX_SECTION
9810 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
9811
9812 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
9813 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
9814
9815 #undef TARGET_BUILD_BUILTIN_VA_LIST
9816 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
9817
9818 #undef TARGET_CALLEE_COPIES
9819 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
9820
9821 #undef TARGET_CAN_ELIMINATE
9822 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
9823
9824 #undef TARGET_CANNOT_FORCE_CONST_MEM
9825 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
9826
9827 #undef TARGET_CONDITIONAL_REGISTER_USAGE
9828 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
9829
9830 /* Only the least significant bit is used for initialization guard
9831    variables.  */
9832 #undef TARGET_CXX_GUARD_MASK_BIT
9833 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
9834
9835 #undef TARGET_C_MODE_FOR_SUFFIX
9836 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
9837
9838 #ifdef TARGET_BIG_ENDIAN_DEFAULT
9839 #undef  TARGET_DEFAULT_TARGET_FLAGS
9840 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
9841 #endif
9842
9843 #undef TARGET_CLASS_MAX_NREGS
9844 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
9845
9846 #undef TARGET_BUILTIN_DECL
9847 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
9848
9849 #undef  TARGET_EXPAND_BUILTIN
9850 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
9851
9852 #undef TARGET_EXPAND_BUILTIN_VA_START
9853 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
9854
9855 #undef TARGET_FOLD_BUILTIN
9856 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
9857
9858 #undef TARGET_FUNCTION_ARG
9859 #define TARGET_FUNCTION_ARG aarch64_function_arg
9860
9861 #undef TARGET_FUNCTION_ARG_ADVANCE
9862 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
9863
9864 #undef TARGET_FUNCTION_ARG_BOUNDARY
9865 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
9866
9867 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
9868 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
9869
9870 #undef TARGET_FUNCTION_VALUE
9871 #define TARGET_FUNCTION_VALUE aarch64_function_value
9872
9873 #undef TARGET_FUNCTION_VALUE_REGNO_P
9874 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
9875
9876 #undef TARGET_FRAME_POINTER_REQUIRED
9877 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
9878
9879 #undef TARGET_GIMPLE_FOLD_BUILTIN
9880 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
9881
9882 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
9883 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
9884
9885 #undef  TARGET_INIT_BUILTINS
9886 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
9887
9888 #undef TARGET_LEGITIMATE_ADDRESS_P
9889 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
9890
9891 #undef TARGET_LEGITIMATE_CONSTANT_P
9892 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
9893
9894 #undef TARGET_LIBGCC_CMP_RETURN_MODE
9895 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
9896
9897 #undef TARGET_LRA_P
9898 #define TARGET_LRA_P aarch64_lra_p
9899
9900 #undef TARGET_MANGLE_TYPE
9901 #define TARGET_MANGLE_TYPE aarch64_mangle_type
9902
9903 #undef TARGET_MEMORY_MOVE_COST
9904 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
9905
9906 #undef TARGET_MUST_PASS_IN_STACK
9907 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
9908
9909 /* This target hook should return true if accesses to volatile bitfields
9910    should use the narrowest mode possible.  It should return false if these
9911    accesses should use the bitfield container type.  */
9912 #undef TARGET_NARROW_VOLATILE_BITFIELD
9913 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
9914
9915 #undef  TARGET_OPTION_OVERRIDE
9916 #define TARGET_OPTION_OVERRIDE aarch64_override_options
9917
9918 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
9919 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
9920   aarch64_override_options_after_change
9921
9922 #undef TARGET_PASS_BY_REFERENCE
9923 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
9924
9925 #undef TARGET_PREFERRED_RELOAD_CLASS
9926 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
9927
9928 #undef TARGET_SECONDARY_RELOAD
9929 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
9930
9931 #undef TARGET_SHIFT_TRUNCATION_MASK
9932 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
9933
9934 #undef TARGET_SETUP_INCOMING_VARARGS
9935 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
9936
9937 #undef TARGET_STRUCT_VALUE_RTX
9938 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
9939
9940 #undef TARGET_REGISTER_MOVE_COST
9941 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
9942
9943 #undef TARGET_RETURN_IN_MEMORY
9944 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
9945
9946 #undef TARGET_RETURN_IN_MSB
9947 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
9948
9949 #undef TARGET_RTX_COSTS
9950 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
9951
9952 #undef TARGET_SCHED_ISSUE_RATE
9953 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
9954
9955 #undef TARGET_TRAMPOLINE_INIT
9956 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
9957
9958 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
9959 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
9960
9961 #undef TARGET_VECTOR_MODE_SUPPORTED_P
9962 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
9963
9964 #undef TARGET_ARRAY_MODE_SUPPORTED_P
9965 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
9966
9967 #undef TARGET_VECTORIZE_ADD_STMT_COST
9968 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
9969
9970 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
9971 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
9972   aarch64_builtin_vectorization_cost
9973
9974 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
9975 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
9976
9977 #undef TARGET_VECTORIZE_BUILTINS
9978 #define TARGET_VECTORIZE_BUILTINS
9979
9980 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
9981 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
9982   aarch64_builtin_vectorized_function
9983
9984 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
9985 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
9986   aarch64_autovectorize_vector_sizes
9987
9988 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
9989 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
9990   aarch64_atomic_assign_expand_fenv
9991
9992 /* Section anchor support.  */
9993
9994 #undef TARGET_MIN_ANCHOR_OFFSET
9995 #define TARGET_MIN_ANCHOR_OFFSET -256
9996
9997 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
9998    byte offset; we can do much more for larger data types, but have no way
9999    to determine the size of the access.  We assume accesses are aligned.  */
10000 #undef TARGET_MAX_ANCHOR_OFFSET
10001 #define TARGET_MAX_ANCHOR_OFFSET 4095
10002
10003 #undef TARGET_VECTOR_ALIGNMENT
10004 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
10005
10006 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
10007 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
10008   aarch64_simd_vector_alignment_reachable
10009
10010 /* vec_perm support.  */
10011
10012 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
10013 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
10014   aarch64_vectorize_vec_perm_const_ok
10015
10016
10017 #undef TARGET_FIXED_CONDITION_CODE_REGS
10018 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
10019
10020 #undef TARGET_FLAGS_REGNUM
10021 #define TARGET_FLAGS_REGNUM CC_REGNUM
10022
10023 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
10024 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
10025
10026 struct gcc_target targetm = TARGET_INITIALIZER;
10027
10028 #include "gt-aarch64.h"