gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "alias.h"
29 #include "symtab.h"
30 #include "tree.h"
31 #include "fold-const.h"
32 #include "stringpool.h"
33 #include "stor-layout.h"
34 #include "calls.h"
35 #include "varasm.h"
36 #include "regs.h"
37 #include "dominance.h"
38 #include "cfg.h"
39 #include "cfgrtl.h"
40 #include "cfganal.h"
41 #include "lcm.h"
42 #include "cfgbuild.h"
43 #include "cfgcleanup.h"
44 #include "predict.h"
45 #include "basic-block.h"
46 #include "df.h"
47 #include "hard-reg-set.h"
48 #include "output.h"
49 #include "function.h"
50 #include "flags.h"
51 #include "insn-config.h"
52 #include "expmed.h"
53 #include "dojump.h"
54 #include "explow.h"
55 #include "emit-rtl.h"
56 #include "stmt.h"
57 #include "expr.h"
58 #include "reload.h"
59 #include "toplev.h"
60 #include "target.h"
61 #include "targhooks.h"
62 #include "tm_p.h"
63 #include "recog.h"
64 #include "langhooks.h"
65 #include "diagnostic-core.h"
66 #include "tree-ssa-alias.h"
67 #include "internal-fn.h"
68 #include "gimple-fold.h"
69 #include "tree-eh.h"
70 #include "gimple-expr.h"
71 #include "gimple.h"
72 #include "gimplify.h"
73 #include "optabs.h"
74 #include "dwarf2.h"
75 #include "cfgloop.h"
76 #include "tree-vectorizer.h"
77 #include "aarch64-cost-tables.h"
78 #include "dumpfile.h"
79 #include "builtins.h"
80 #include "rtl-iter.h"
81 #include "tm-constrs.h"
82 #include "sched-int.h"
83 #include "cortex-a57-fma-steering.h"
85 #include "target-def.h"
87 /* Defined for convenience. */
88 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
90 /* Classifies an address.
92 ADDRESS_REG_IMM
93 A simple base register plus immediate offset.
95 ADDRESS_REG_WB
96 A base register indexed by immediate offset with writeback.
98 ADDRESS_REG_REG
99 A base register indexed by (optionally scaled) register.
101 ADDRESS_REG_UXTW
102 A base register indexed by (optionally scaled) zero-extended register.
104 ADDRESS_REG_SXTW
105 A base register indexed by (optionally scaled) sign-extended register.
107 ADDRESS_LO_SUM
108 A LO_SUM rtx with a base register and "LO12" symbol relocation.
 110   ADDRESS_SYMBOLIC
111 A constant symbolic address, in pc-relative literal pool. */
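/* For illustration (an editor's sketch, not part of the original source):
   typical assembly forms that fall into these classes are

     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!    or    ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr x0, label   (pc-relative literal load)  */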
113 enum aarch64_address_type {
114 ADDRESS_REG_IMM,
115 ADDRESS_REG_WB,
116 ADDRESS_REG_REG,
117 ADDRESS_REG_UXTW,
118 ADDRESS_REG_SXTW,
119 ADDRESS_LO_SUM,
120 ADDRESS_SYMBOLIC
123 struct aarch64_address_info {
124 enum aarch64_address_type type;
125 rtx base;
126 rtx offset;
127 int shift;
128 enum aarch64_symbol_type symbol_type;
131 struct simd_immediate_info
133 rtx value;
134 int shift;
135 int element_width;
136 bool mvn;
137 bool msl;
140 /* The current code model. */
141 enum aarch64_code_model aarch64_cmodel;
143 #ifdef HAVE_AS_TLS
144 #undef TARGET_HAVE_TLS
145 #define TARGET_HAVE_TLS 1
146 #endif
148 static bool aarch64_composite_type_p (const_tree, machine_mode);
149 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
150 const_tree,
151 machine_mode *, int *,
152 bool *);
153 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
154 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
155 static void aarch64_override_options_after_change (void);
156 static bool aarch64_vector_mode_supported_p (machine_mode);
157 static unsigned bit_count (unsigned HOST_WIDE_INT);
158 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
159 const unsigned char *sel);
160 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
162 /* Major revision number of the ARM Architecture implemented by the target. */
163 unsigned aarch64_architecture_version;
165 /* The processor for which instructions should be scheduled. */
166 enum aarch64_processor aarch64_tune = cortexa53;
168 /* The current tuning set. */
169 const struct tune_params *aarch64_tune_params;
171 /* Mask to specify which instructions we are allowed to generate. */
172 unsigned long aarch64_isa_flags = 0;
174 /* Mask to specify which instruction scheduling options should be used. */
175 unsigned long aarch64_tune_flags = 0;
177 /* Tuning parameters. */
179 static const struct cpu_addrcost_table generic_addrcost_table =
182 0, /* hi */
183 0, /* si */
184 0, /* di */
185 0, /* ti */
187 0, /* pre_modify */
188 0, /* post_modify */
189 0, /* register_offset */
190 0, /* register_extend */
191 0 /* imm_offset */
194 static const struct cpu_addrcost_table cortexa57_addrcost_table =
197 1, /* hi */
198 0, /* si */
199 0, /* di */
200 1, /* ti */
202 0, /* pre_modify */
203 0, /* post_modify */
204 0, /* register_offset */
205 0, /* register_extend */
206 0, /* imm_offset */
209 static const struct cpu_addrcost_table xgene1_addrcost_table =
212 1, /* hi */
213 0, /* si */
214 0, /* di */
215 1, /* ti */
217 1, /* pre_modify */
218 0, /* post_modify */
219 0, /* register_offset */
220 1, /* register_extend */
221 0, /* imm_offset */
224 static const struct cpu_regmove_cost generic_regmove_cost =
226 1, /* GP2GP */
227 /* Avoid the use of slow int<->fp moves for spilling by setting
228 their cost higher than memmov_cost. */
229 5, /* GP2FP */
230 5, /* FP2GP */
231 2 /* FP2FP */
234 static const struct cpu_regmove_cost cortexa57_regmove_cost =
236 1, /* GP2GP */
237 /* Avoid the use of slow int<->fp moves for spilling by setting
238 their cost higher than memmov_cost. */
239 5, /* GP2FP */
240 5, /* FP2GP */
241 2 /* FP2FP */
244 static const struct cpu_regmove_cost cortexa53_regmove_cost =
246 1, /* GP2GP */
247 /* Avoid the use of slow int<->fp moves for spilling by setting
248 their cost higher than memmov_cost. */
249 5, /* GP2FP */
250 5, /* FP2GP */
251 2 /* FP2FP */
254 static const struct cpu_regmove_cost thunderx_regmove_cost =
256 2, /* GP2GP */
257 2, /* GP2FP */
258 6, /* FP2GP */
259 4 /* FP2FP */
262 static const struct cpu_regmove_cost xgene1_regmove_cost =
264 1, /* GP2GP */
265 /* Avoid the use of slow int<->fp moves for spilling by setting
266 their cost higher than memmov_cost. */
267 8, /* GP2FP */
268 8, /* FP2GP */
269 2 /* FP2FP */
272 /* Generic costs for vector insn classes. */
273 static const struct cpu_vector_cost generic_vector_cost =
275 1, /* scalar_stmt_cost */
276 1, /* scalar_load_cost */
277 1, /* scalar_store_cost */
278 1, /* vec_stmt_cost */
279 1, /* vec_to_scalar_cost */
280 1, /* scalar_to_vec_cost */
281 1, /* vec_align_load_cost */
282 1, /* vec_unalign_load_cost */
283 1, /* vec_unalign_store_cost */
284 1, /* vec_store_cost */
285 3, /* cond_taken_branch_cost */
286 1 /* cond_not_taken_branch_cost */
 289 /* Costs for vector insn classes for Cortex-A57. */
290 static const struct cpu_vector_cost cortexa57_vector_cost =
292 1, /* scalar_stmt_cost */
293 4, /* scalar_load_cost */
294 1, /* scalar_store_cost */
295 3, /* vec_stmt_cost */
296 8, /* vec_to_scalar_cost */
297 8, /* scalar_to_vec_cost */
298 5, /* vec_align_load_cost */
299 5, /* vec_unalign_load_cost */
300 1, /* vec_unalign_store_cost */
301 1, /* vec_store_cost */
302 1, /* cond_taken_branch_cost */
303 1 /* cond_not_taken_branch_cost */
 306 /* Costs for vector insn classes for X-Gene 1. */
307 static const struct cpu_vector_cost xgene1_vector_cost =
309 1, /* scalar_stmt_cost */
310 5, /* scalar_load_cost */
311 1, /* scalar_store_cost */
312 2, /* vec_stmt_cost */
313 4, /* vec_to_scalar_cost */
314 4, /* scalar_to_vec_cost */
315 10, /* vec_align_load_cost */
316 10, /* vec_unalign_load_cost */
317 2, /* vec_unalign_store_cost */
318 2, /* vec_store_cost */
319 2, /* cond_taken_branch_cost */
320 1 /* cond_not_taken_branch_cost */
323 #define AARCH64_FUSE_NOTHING (0)
324 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
325 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
326 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
327 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
328 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
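/* For illustration (an editor's sketch, not part of the original source):
   AARCH64_FUSE_ADRP_ADD describes keeping a pair such as
       adrp x0, sym
       add  x0, x0, :lo12:sym
   adjacent, and AARCH64_FUSE_MOV_MOVK a pair such as
       mov  x0, #0x1234
       movk x0, #0x5678, lsl #16
   so that cores which can fuse them execute the pair more cheaply.  */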
330 /* Generic costs for branch instructions. */
331 static const struct cpu_branch_cost generic_branch_cost =
333 2, /* Predictable. */
334 2 /* Unpredictable. */
337 static const struct tune_params generic_tunings =
339 &cortexa57_extra_costs,
340 &generic_addrcost_table,
341 &generic_regmove_cost,
342 &generic_vector_cost,
343 &generic_branch_cost,
344 4, /* memmov_cost */
345 2, /* issue_rate */
346 AARCH64_FUSE_NOTHING, /* fusible_ops */
347 8, /* function_align. */
348 8, /* jump_align. */
349 4, /* loop_align. */
350 2, /* int_reassoc_width. */
351 4, /* fp_reassoc_width. */
352 1, /* vec_reassoc_width. */
353 2, /* min_div_recip_mul_sf. */
354 2 /* min_div_recip_mul_df. */
357 static const struct tune_params cortexa53_tunings =
359 &cortexa53_extra_costs,
360 &generic_addrcost_table,
361 &cortexa53_regmove_cost,
362 &generic_vector_cost,
363 &generic_branch_cost,
364 4, /* memmov_cost */
365 2, /* issue_rate */
366 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
367 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
368 8, /* function_align. */
369 8, /* jump_align. */
370 4, /* loop_align. */
371 2, /* int_reassoc_width. */
372 4, /* fp_reassoc_width. */
373 1, /* vec_reassoc_width. */
374 2, /* min_div_recip_mul_sf. */
375 2 /* min_div_recip_mul_df. */
378 static const struct tune_params cortexa57_tunings =
380 &cortexa57_extra_costs,
381 &cortexa57_addrcost_table,
382 &cortexa57_regmove_cost,
383 &cortexa57_vector_cost,
384 &generic_branch_cost,
385 4, /* memmov_cost */
386 3, /* issue_rate */
387 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
388 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
389 16, /* function_align. */
390 8, /* jump_align. */
391 4, /* loop_align. */
392 2, /* int_reassoc_width. */
393 4, /* fp_reassoc_width. */
394 1, /* vec_reassoc_width. */
395 2, /* min_div_recip_mul_sf. */
396 2 /* min_div_recip_mul_df. */
399 static const struct tune_params thunderx_tunings =
401 &thunderx_extra_costs,
402 &generic_addrcost_table,
403 &thunderx_regmove_cost,
404 &generic_vector_cost,
405 &generic_branch_cost,
406 6, /* memmov_cost */
407 2, /* issue_rate */
408 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
409 8, /* function_align. */
410 8, /* jump_align. */
411 8, /* loop_align. */
412 2, /* int_reassoc_width. */
413 4, /* fp_reassoc_width. */
414 1, /* vec_reassoc_width. */
415 2, /* min_div_recip_mul_sf. */
416 2 /* min_div_recip_mul_df. */
419 static const struct tune_params xgene1_tunings =
421 &xgene1_extra_costs,
422 &xgene1_addrcost_table,
423 &xgene1_regmove_cost,
424 &xgene1_vector_cost,
425 &generic_branch_cost,
426 6, /* memmov_cost */
427 4, /* issue_rate */
428 AARCH64_FUSE_NOTHING, /* fusible_ops */
429 16, /* function_align. */
430 8, /* jump_align. */
431 16, /* loop_align. */
432 2, /* int_reassoc_width. */
433 4, /* fp_reassoc_width. */
434 1, /* vec_reassoc_width. */
435 2, /* min_div_recip_mul_sf. */
436 2 /* min_div_recip_mul_df. */
439 /* A processor implementing AArch64. */
440 struct processor
442 const char *const name;
443 enum aarch64_processor core;
444 const char *arch;
445 unsigned architecture_version;
446 const unsigned long flags;
447 const struct tune_params *const tune;
450 /* Processor cores implementing AArch64. */
451 static const struct processor all_cores[] =
453 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
454 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
455 #include "aarch64-cores.def"
456 #undef AARCH64_CORE
457 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
458 {NULL, aarch64_none, NULL, 0, 0, NULL}
461 /* Architectures implementing AArch64. */
462 static const struct processor all_architectures[] =
464 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
465 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
466 #include "aarch64-arches.def"
467 #undef AARCH64_ARCH
468 {NULL, aarch64_none, NULL, 0, 0, NULL}
 471 /* Target specification. These are populated as command-line arguments
 472 are processed, or NULL if not specified. */
473 static const struct processor *selected_arch;
474 static const struct processor *selected_cpu;
475 static const struct processor *selected_tune;
477 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
479 /* An ISA extension in the co-processor and main instruction set space. */
480 struct aarch64_option_extension
482 const char *const name;
483 const unsigned long flags_on;
484 const unsigned long flags_off;
487 /* ISA extensions in AArch64. */
488 static const struct aarch64_option_extension all_extensions[] =
490 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
491 {NAME, FLAGS_ON, FLAGS_OFF},
492 #include "aarch64-option-extensions.def"
493 #undef AARCH64_OPT_EXTENSION
494 {NULL, 0, 0}
497 /* Used to track the size of an address when generating a pre/post
498 increment address. */
499 static machine_mode aarch64_memory_reference_mode;
501 /* A table of valid AArch64 "bitmask immediate" values for
502 logical instructions. */
504 #define AARCH64_NUM_BITMASKS 5334
505 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
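/* Editor's note (illustrative, not in the original source): AArch64 logical
   instructions only accept immediates that are a replicated, rotated run of
   ones, e.g. 0x00ff00ff00ff00ff or 0x0003fffc0003fffc.  There are 5334 such
   distinct 64-bit values, hence AARCH64_NUM_BITMASKS above; values such as 0
   or an arbitrary constant like 0x12345678 are not encodable this way.  */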
507 typedef enum aarch64_cond_code
509 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
510 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
511 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
513 aarch64_cc;
515 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
517 /* The condition codes of the processor, and the inverse function. */
518 static const char * const aarch64_condition_codes[] =
520 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
521 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
524 void
525 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
527 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
528 if (TARGET_GENERAL_REGS_ONLY)
529 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
530 else
531 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
534 static unsigned int
535 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
537 if (GET_MODE_UNIT_SIZE (mode) == 4)
538 return aarch64_tune_params->min_div_recip_mul_sf;
539 return aarch64_tune_params->min_div_recip_mul_df;
542 static int
543 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
544 enum machine_mode mode)
546 if (VECTOR_MODE_P (mode))
547 return aarch64_tune_params->vec_reassoc_width;
548 if (INTEGRAL_MODE_P (mode))
549 return aarch64_tune_params->int_reassoc_width;
550 if (FLOAT_MODE_P (mode))
551 return aarch64_tune_params->fp_reassoc_width;
552 return 1;
555 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
556 unsigned
557 aarch64_dbx_register_number (unsigned regno)
559 if (GP_REGNUM_P (regno))
560 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
561 else if (regno == SP_REGNUM)
562 return AARCH64_DWARF_SP;
563 else if (FP_REGNUM_P (regno))
564 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
566 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
567 equivalent DWARF register. */
568 return DWARF_FRAME_REGISTERS;
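/* Editor's note (illustrative, assuming the usual AArch64 DWARF numbering
   where AARCH64_DWARF_R0 is 0, AARCH64_DWARF_SP is 31 and AARCH64_DWARF_V0
   is 64): x0-x30 map to DWARF registers 0-30, sp to 31, and v0-v31 to 64-95;
   any other register reports DWARF_FRAME_REGISTERS, i.e. "no equivalent
   DWARF register".  */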
571 /* Return TRUE if MODE is any of the large INT modes. */
572 static bool
573 aarch64_vect_struct_mode_p (machine_mode mode)
575 return mode == OImode || mode == CImode || mode == XImode;
578 /* Return TRUE if MODE is any of the vector modes. */
579 static bool
580 aarch64_vector_mode_p (machine_mode mode)
582 return aarch64_vector_mode_supported_p (mode)
583 || aarch64_vect_struct_mode_p (mode);
586 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
587 static bool
588 aarch64_array_mode_supported_p (machine_mode mode,
589 unsigned HOST_WIDE_INT nelems)
591 if (TARGET_SIMD
592 && AARCH64_VALID_SIMD_QREG_MODE (mode)
593 && (nelems >= 2 && nelems <= 4))
594 return true;
596 return false;
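/* Editor's illustration (not in the original source): this hook is what lets
   arrays of 2, 3 or 4 full 128-bit Advanced SIMD vectors be given the
   OI/CI/XI opaque modes, matching the register lists used by LD2/ST2,
   LD3/ST3 and LD4/ST4.  */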
599 /* Implement HARD_REGNO_NREGS. */
602 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
604 switch (aarch64_regno_regclass (regno))
606 case FP_REGS:
607 case FP_LO_REGS:
608 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
609 default:
610 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
612 gcc_unreachable ();
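/* Editor's example (a sketch, assuming UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16 as on AArch64): a TImode value occupies two general
   registers, while a V4SImode value fits in a single FP/SIMD register and an
   OImode structure mode needs two.  */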
615 /* Implement HARD_REGNO_MODE_OK. */
618 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
620 if (GET_MODE_CLASS (mode) == MODE_CC)
621 return regno == CC_REGNUM;
623 if (regno == SP_REGNUM)
624 /* The purpose of comparing with ptr_mode is to support the
625 global register variable associated with the stack pointer
626 register via the syntax of asm ("wsp") in ILP32. */
627 return mode == Pmode || mode == ptr_mode;
629 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
630 return mode == Pmode;
632 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
633 return 1;
635 if (FP_REGNUM_P (regno))
637 if (aarch64_vect_struct_mode_p (mode))
638 return
639 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
640 else
641 return 1;
644 return 0;
647 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
648 machine_mode
649 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
650 machine_mode mode)
652 /* Handle modes that fit within single registers. */
653 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
655 if (GET_MODE_SIZE (mode) >= 4)
656 return mode;
657 else
658 return SImode;
660 /* Fall back to generic for multi-reg and very large modes. */
661 else
662 return choose_hard_reg_mode (regno, nregs, false);
665 /* Return true if calls to DECL should be treated as
 666 long-calls (i.e. called via a register). */
667 static bool
668 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
670 return false;
673 /* Return true if calls to symbol-ref SYM should be treated as
 674 long-calls (i.e. called via a register). */
675 bool
676 aarch64_is_long_call_p (rtx sym)
678 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
681 /* Return true if the offsets to a zero/sign-extract operation
682 represent an expression that matches an extend operation. The
 683 operands represent the parameters from
685 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
686 bool
687 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
688 rtx extract_imm)
690 HOST_WIDE_INT mult_val, extract_val;
692 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
693 return false;
695 mult_val = INTVAL (mult_imm);
696 extract_val = INTVAL (extract_imm);
698 if (extract_val > 8
699 && extract_val < GET_MODE_BITSIZE (mode)
700 && exact_log2 (extract_val & ~7) > 0
701 && (extract_val & 7) <= 4
702 && mult_val == (1 << (extract_val & 7)))
703 return true;
705 return false;
708 /* Emit an insn that's a simple single-set. Both the operands must be
709 known to be valid. */
710 inline static rtx
711 emit_set_insn (rtx x, rtx y)
713 return emit_insn (gen_rtx_SET (x, y));
716 /* X and Y are two things to compare using CODE. Emit the compare insn and
717 return the rtx for register 0 in the proper mode. */
719 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
721 machine_mode mode = SELECT_CC_MODE (code, x, y);
722 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
724 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
725 return cc_reg;
728 /* Build the SYMBOL_REF for __tls_get_addr. */
730 static GTY(()) rtx tls_get_addr_libfunc;
733 aarch64_tls_get_addr (void)
735 if (!tls_get_addr_libfunc)
736 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
737 return tls_get_addr_libfunc;
740 /* Return the TLS model to use for ADDR. */
742 static enum tls_model
743 tls_symbolic_operand_type (rtx addr)
745 enum tls_model tls_kind = TLS_MODEL_NONE;
746 rtx sym, addend;
748 if (GET_CODE (addr) == CONST)
750 split_const (addr, &sym, &addend);
751 if (GET_CODE (sym) == SYMBOL_REF)
752 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
754 else if (GET_CODE (addr) == SYMBOL_REF)
755 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
757 return tls_kind;
 760 /* We allow lo_sum expressions in our legitimate addresses so that
 761 combine can take care of combining addresses where necessary, but
 762 for generation purposes we generate the address
 763 as:
764 RTL Absolute
765 tmp = hi (symbol_ref); adrp x1, foo
766 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
769 PIC TLS
770 adrp x1, :got:foo adrp tmp, :tlsgd:foo
771 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
772 bl __tls_get_addr
775 Load TLS symbol, depending on TLS mechanism and TLS access model.
777 Global Dynamic - Traditional TLS:
778 adrp tmp, :tlsgd:imm
779 add dest, tmp, #:tlsgd_lo12:imm
780 bl __tls_get_addr
782 Global Dynamic - TLS Descriptors:
783 adrp dest, :tlsdesc:imm
784 ldr tmp, [dest, #:tlsdesc_lo12:imm]
785 add dest, dest, #:tlsdesc_lo12:imm
786 blr tmp
787 mrs tp, tpidr_el0
788 add dest, dest, tp
790 Initial Exec:
791 mrs tp, tpidr_el0
792 adrp tmp, :gottprel:imm
793 ldr dest, [tmp, #:gottprel_lo12:imm]
794 add dest, dest, tp
796 Local Exec:
797 mrs tp, tpidr_el0
798 add t0, tp, #:tprel_hi12:imm, lsl #12
799 add t0, t0, #:tprel_lo12_nc:imm
802 static void
803 aarch64_load_symref_appropriately (rtx dest, rtx imm,
804 enum aarch64_symbol_type type)
806 switch (type)
808 case SYMBOL_SMALL_ABSOLUTE:
810 /* In ILP32, the mode of dest can be either SImode or DImode. */
811 rtx tmp_reg = dest;
812 machine_mode mode = GET_MODE (dest);
814 gcc_assert (mode == Pmode || mode == ptr_mode);
816 if (can_create_pseudo_p ())
817 tmp_reg = gen_reg_rtx (mode);
819 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
820 emit_insn (gen_add_losym (dest, tmp_reg, imm));
821 return;
824 case SYMBOL_TINY_ABSOLUTE:
825 emit_insn (gen_rtx_SET (dest, imm));
826 return;
828 case SYMBOL_SMALL_GOT:
830 /* In ILP32, the mode of dest can be either SImode or DImode,
831 while the got entry is always of SImode size. The mode of
832 dest depends on how dest is used: if dest is assigned to a
 833 pointer (e.g. stored in memory), it has SImode; it may have
 834 DImode if dest is dereferenced to access memory.
835 This is why we have to handle three different ldr_got_small
836 patterns here (two patterns for ILP32). */
837 rtx tmp_reg = dest;
838 machine_mode mode = GET_MODE (dest);
840 if (can_create_pseudo_p ())
841 tmp_reg = gen_reg_rtx (mode);
843 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
844 if (mode == ptr_mode)
846 if (mode == DImode)
847 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
848 else
849 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
851 else
853 gcc_assert (mode == Pmode);
854 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
857 return;
860 case SYMBOL_SMALL_TLSGD:
862 rtx_insn *insns;
863 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
865 start_sequence ();
866 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
867 insns = get_insns ();
868 end_sequence ();
870 RTL_CONST_CALL_P (insns) = 1;
871 emit_libcall_block (insns, dest, result, imm);
872 return;
875 case SYMBOL_SMALL_TLSDESC:
877 machine_mode mode = GET_MODE (dest);
878 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
879 rtx tp;
881 gcc_assert (mode == Pmode || mode == ptr_mode);
883 /* In ILP32, the got entry is always of SImode size. Unlike
884 small GOT, the dest is fixed at reg 0. */
885 if (TARGET_ILP32)
886 emit_insn (gen_tlsdesc_small_si (imm));
887 else
888 emit_insn (gen_tlsdesc_small_di (imm));
889 tp = aarch64_load_tp (NULL);
891 if (mode != Pmode)
892 tp = gen_lowpart (mode, tp);
894 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
895 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
896 return;
899 case SYMBOL_SMALL_GOTTPREL:
901 /* In ILP32, the mode of dest can be either SImode or DImode,
902 while the got entry is always of SImode size. The mode of
903 dest depends on how dest is used: if dest is assigned to a
 904 pointer (e.g. stored in memory), it has SImode; it may have
 905 DImode if dest is dereferenced to access memory.
906 This is why we have to handle three different tlsie_small
907 patterns here (two patterns for ILP32). */
908 machine_mode mode = GET_MODE (dest);
909 rtx tmp_reg = gen_reg_rtx (mode);
910 rtx tp = aarch64_load_tp (NULL);
912 if (mode == ptr_mode)
914 if (mode == DImode)
915 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
916 else
918 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
919 tp = gen_lowpart (mode, tp);
922 else
924 gcc_assert (mode == Pmode);
925 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
928 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
929 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
930 return;
933 case SYMBOL_SMALL_TPREL:
935 rtx tp = aarch64_load_tp (NULL);
937 if (GET_MODE (dest) != Pmode)
938 tp = gen_lowpart (GET_MODE (dest), tp);
940 emit_insn (gen_tlsle_small (dest, tp, imm));
941 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
942 return;
945 case SYMBOL_TINY_GOT:
946 emit_insn (gen_ldr_got_tiny (dest, imm));
947 return;
949 default:
950 gcc_unreachable ();
954 /* Emit a move from SRC to DEST. Assume that the move expanders can
955 handle all moves if !can_create_pseudo_p (). The distinction is
956 important because, unlike emit_move_insn, the move expanders know
957 how to force Pmode objects into the constant pool even when the
958 constant pool address is not itself legitimate. */
959 static rtx
960 aarch64_emit_move (rtx dest, rtx src)
962 return (can_create_pseudo_p ()
963 ? emit_move_insn (dest, src)
964 : emit_move_insn_1 (dest, src));
967 /* Split a 128-bit move operation into two 64-bit move operations,
968 taking care to handle partial overlap of register to register
969 copies. Special cases are needed when moving between GP regs and
970 FP regs. SRC can be a register, constant or memory; DST a register
971 or memory. If either operand is memory it must not have any side
972 effects. */
973 void
974 aarch64_split_128bit_move (rtx dst, rtx src)
976 rtx dst_lo, dst_hi;
977 rtx src_lo, src_hi;
979 machine_mode mode = GET_MODE (dst);
981 gcc_assert (mode == TImode || mode == TFmode);
982 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
983 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
985 if (REG_P (dst) && REG_P (src))
987 int src_regno = REGNO (src);
988 int dst_regno = REGNO (dst);
990 /* Handle FP <-> GP regs. */
991 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
993 src_lo = gen_lowpart (word_mode, src);
994 src_hi = gen_highpart (word_mode, src);
996 if (mode == TImode)
998 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
999 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1001 else
1003 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1004 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1006 return;
1008 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1010 dst_lo = gen_lowpart (word_mode, dst);
1011 dst_hi = gen_highpart (word_mode, dst);
1013 if (mode == TImode)
1015 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1016 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1018 else
1020 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1021 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1023 return;
1027 dst_lo = gen_lowpart (word_mode, dst);
1028 dst_hi = gen_highpart (word_mode, dst);
1029 src_lo = gen_lowpart (word_mode, src);
1030 src_hi = gen_highpart_mode (word_mode, mode, src);
1032 /* At most one pairing may overlap. */
1033 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1035 aarch64_emit_move (dst_hi, src_hi);
1036 aarch64_emit_move (dst_lo, src_lo);
1038 else
1040 aarch64_emit_move (dst_lo, src_lo);
1041 aarch64_emit_move (dst_hi, src_hi);
1045 bool
1046 aarch64_split_128bit_move_p (rtx dst, rtx src)
1048 return (! REG_P (src)
1049 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1052 /* Split a complex SIMD combine. */
1054 void
1055 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1057 machine_mode src_mode = GET_MODE (src1);
1058 machine_mode dst_mode = GET_MODE (dst);
1060 gcc_assert (VECTOR_MODE_P (dst_mode));
1062 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1064 rtx (*gen) (rtx, rtx, rtx);
1066 switch (src_mode)
1068 case V8QImode:
1069 gen = gen_aarch64_simd_combinev8qi;
1070 break;
1071 case V4HImode:
1072 gen = gen_aarch64_simd_combinev4hi;
1073 break;
1074 case V2SImode:
1075 gen = gen_aarch64_simd_combinev2si;
1076 break;
1077 case V2SFmode:
1078 gen = gen_aarch64_simd_combinev2sf;
1079 break;
1080 case DImode:
1081 gen = gen_aarch64_simd_combinedi;
1082 break;
1083 case DFmode:
1084 gen = gen_aarch64_simd_combinedf;
1085 break;
1086 default:
1087 gcc_unreachable ();
1090 emit_insn (gen (dst, src1, src2));
1091 return;
1095 /* Split a complex SIMD move. */
1097 void
1098 aarch64_split_simd_move (rtx dst, rtx src)
1100 machine_mode src_mode = GET_MODE (src);
1101 machine_mode dst_mode = GET_MODE (dst);
1103 gcc_assert (VECTOR_MODE_P (dst_mode));
1105 if (REG_P (dst) && REG_P (src))
1107 rtx (*gen) (rtx, rtx);
1109 gcc_assert (VECTOR_MODE_P (src_mode));
1111 switch (src_mode)
1113 case V16QImode:
1114 gen = gen_aarch64_split_simd_movv16qi;
1115 break;
1116 case V8HImode:
1117 gen = gen_aarch64_split_simd_movv8hi;
1118 break;
1119 case V4SImode:
1120 gen = gen_aarch64_split_simd_movv4si;
1121 break;
1122 case V2DImode:
1123 gen = gen_aarch64_split_simd_movv2di;
1124 break;
1125 case V4SFmode:
1126 gen = gen_aarch64_split_simd_movv4sf;
1127 break;
1128 case V2DFmode:
1129 gen = gen_aarch64_split_simd_movv2df;
1130 break;
1131 default:
1132 gcc_unreachable ();
1135 emit_insn (gen (dst, src));
1136 return;
1140 static rtx
1141 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1143 if (can_create_pseudo_p ())
1144 return force_reg (mode, value);
1145 else
1147 x = aarch64_emit_move (x, value);
1148 return x;
1153 static rtx
1154 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1156 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1158 rtx high;
1159 /* Load the full offset into a register. This
1160 might be improvable in the future. */
1161 high = GEN_INT (offset);
1162 offset = 0;
1163 high = aarch64_force_temporary (mode, temp, high);
1164 reg = aarch64_force_temporary (mode, temp,
1165 gen_rtx_PLUS (mode, high, reg));
1167 return plus_constant (mode, reg, offset);
1170 static int
1171 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1172 machine_mode mode)
1174 unsigned HOST_WIDE_INT mask;
1175 int i;
1176 bool first;
1177 unsigned HOST_WIDE_INT val;
1178 bool subtargets;
1179 rtx subtarget;
1180 int one_match, zero_match, first_not_ffff_match;
1181 int num_insns = 0;
1183 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1185 if (generate)
1186 emit_insn (gen_rtx_SET (dest, imm));
1187 num_insns++;
1188 return num_insns;
1191 if (mode == SImode)
1193 /* We know we can't do this in 1 insn, and we must be able to do it
1194 in two; so don't mess around looking for sequences that don't buy
1195 us anything. */
1196 if (generate)
1198 emit_insn (gen_rtx_SET (dest, GEN_INT (INTVAL (imm) & 0xffff)));
1199 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1200 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1202 num_insns += 2;
1203 return num_insns;
1206 /* Remaining cases are all for DImode. */
1208 val = INTVAL (imm);
1209 subtargets = optimize && can_create_pseudo_p ();
1211 one_match = 0;
1212 zero_match = 0;
1213 mask = 0xffff;
1214 first_not_ffff_match = -1;
1216 for (i = 0; i < 64; i += 16, mask <<= 16)
1218 if ((val & mask) == mask)
1219 one_match++;
1220 else
1222 if (first_not_ffff_match < 0)
1223 first_not_ffff_match = i;
1224 if ((val & mask) == 0)
1225 zero_match++;
1229 if (one_match == 2)
1231 /* Set one of the quarters and then insert back into result. */
1232 mask = 0xffffll << first_not_ffff_match;
1233 if (generate)
1235 emit_insn (gen_rtx_SET (dest, GEN_INT (val | mask)));
1236 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1237 GEN_INT ((val >> first_not_ffff_match)
1238 & 0xffff)));
1240 num_insns += 2;
1241 return num_insns;
1244 if (zero_match == 2)
1245 goto simple_sequence;
1247 mask = 0x0ffff0000UL;
1248 for (i = 16; i < 64; i += 16, mask <<= 16)
1250 HOST_WIDE_INT comp = mask & ~(mask - 1);
1252 if (aarch64_uimm12_shift (val - (val & mask)))
1254 if (generate)
1256 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1257 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val & mask)));
1258 emit_insn (gen_adddi3 (dest, subtarget,
1259 GEN_INT (val - (val & mask))));
1261 num_insns += 2;
1262 return num_insns;
1264 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1266 if (generate)
1268 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1269 emit_insn (gen_rtx_SET (subtarget,
1270 GEN_INT ((val + comp) & mask)));
1271 emit_insn (gen_adddi3 (dest, subtarget,
1272 GEN_INT (val - ((val + comp) & mask))));
1274 num_insns += 2;
1275 return num_insns;
1277 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1279 if (generate)
1281 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1282 emit_insn (gen_rtx_SET (subtarget,
1283 GEN_INT ((val - comp) | ~mask)));
1284 emit_insn (gen_adddi3 (dest, subtarget,
1285 GEN_INT (val - ((val - comp) | ~mask))));
1287 num_insns += 2;
1288 return num_insns;
1290 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1292 if (generate)
1294 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1295 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val | ~mask)));
1296 emit_insn (gen_adddi3 (dest, subtarget,
1297 GEN_INT (val - (val | ~mask))));
1299 num_insns += 2;
1300 return num_insns;
1304 /* See if we can do it by arithmetically combining two
1305 immediates. */
1306 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1308 int j;
1309 mask = 0xffff;
1311 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1312 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1314 if (generate)
1316 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1317 emit_insn (gen_rtx_SET (subtarget,
1318 GEN_INT (aarch64_bitmasks[i])));
1319 emit_insn (gen_adddi3 (dest, subtarget,
1320 GEN_INT (val - aarch64_bitmasks[i])));
1322 num_insns += 2;
1323 return num_insns;
1326 for (j = 0; j < 64; j += 16, mask <<= 16)
1328 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1330 if (generate)
1332 emit_insn (gen_rtx_SET (dest,
1333 GEN_INT (aarch64_bitmasks[i])));
1334 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1335 GEN_INT ((val >> j) & 0xffff)));
1337 num_insns += 2;
1338 return num_insns;
1343 /* See if we can do it by logically combining two immediates. */
1344 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1346 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1348 int j;
1350 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1351 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1353 if (generate)
1355 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1356 emit_insn (gen_rtx_SET (subtarget,
1357 GEN_INT (aarch64_bitmasks[i])));
1358 emit_insn (gen_iordi3 (dest, subtarget,
1359 GEN_INT (aarch64_bitmasks[j])));
1361 num_insns += 2;
1362 return num_insns;
1365 else if ((val & aarch64_bitmasks[i]) == val)
1367 int j;
1369 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1370 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1372 if (generate)
1374 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1375 emit_insn (gen_rtx_SET (subtarget,
1376 GEN_INT (aarch64_bitmasks[j])));
1377 emit_insn (gen_anddi3 (dest, subtarget,
1378 GEN_INT (aarch64_bitmasks[i])));
1380 num_insns += 2;
1381 return num_insns;
1386 if (one_match > zero_match)
1388 /* Set either first three quarters or all but the third. */
1389 mask = 0xffffll << (16 - first_not_ffff_match);
1390 if (generate)
1391 emit_insn (gen_rtx_SET (dest,
1392 GEN_INT (val | mask | 0xffffffff00000000ull)));
1393 num_insns ++;
1395 /* Now insert other two quarters. */
1396 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1397 i < 64; i += 16, mask <<= 16)
1399 if ((val & mask) != mask)
1401 if (generate)
1402 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1403 GEN_INT ((val >> i) & 0xffff)));
1404 num_insns ++;
1407 return num_insns;
1410 simple_sequence:
1411 first = true;
1412 mask = 0xffff;
1413 for (i = 0; i < 64; i += 16, mask <<= 16)
1415 if ((val & mask) != 0)
1417 if (first)
1419 if (generate)
1420 emit_insn (gen_rtx_SET (dest, GEN_INT (val & mask)));
1421 num_insns ++;
1422 first = false;
1424 else
1426 if (generate)
1427 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1428 GEN_INT ((val >> i) & 0xffff)));
1429 num_insns ++;
1434 return num_insns;
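/* Editor's illustration (not part of the original source): the simple
   fallback above synthesizes a constant 16 bits at a time, e.g.
     0x0000123400005678  ->  mov  x0, #0x5678
                             movk x0, #0x1234, lsl #32
   so an arbitrary 64-bit constant needs at most four mov/movk instructions;
   the earlier cases try to do better than that.  */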
1438 void
1439 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1441 machine_mode mode = GET_MODE (dest);
1443 gcc_assert (mode == SImode || mode == DImode);
 1445 /* Check what type of symbol it is. */
1446 if (GET_CODE (imm) == SYMBOL_REF
1447 || GET_CODE (imm) == LABEL_REF
1448 || GET_CODE (imm) == CONST)
1450 rtx mem, base, offset;
1451 enum aarch64_symbol_type sty;
1453 /* If we have (const (plus symbol offset)), separate out the offset
1454 before we start classifying the symbol. */
1455 split_const (imm, &base, &offset);
1457 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1458 switch (sty)
1460 case SYMBOL_FORCE_TO_MEM:
1461 if (offset != const0_rtx
1462 && targetm.cannot_force_const_mem (mode, imm))
1464 gcc_assert (can_create_pseudo_p ());
1465 base = aarch64_force_temporary (mode, dest, base);
1466 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1467 aarch64_emit_move (dest, base);
1468 return;
1470 mem = force_const_mem (ptr_mode, imm);
1471 gcc_assert (mem);
1472 if (mode != ptr_mode)
1473 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1474 emit_insn (gen_rtx_SET (dest, mem));
1475 return;
1477 case SYMBOL_SMALL_TLSGD:
1478 case SYMBOL_SMALL_TLSDESC:
1479 case SYMBOL_SMALL_GOTTPREL:
1480 case SYMBOL_SMALL_GOT:
1481 case SYMBOL_TINY_GOT:
1482 if (offset != const0_rtx)
 1484 gcc_assert (can_create_pseudo_p ());
1485 base = aarch64_force_temporary (mode, dest, base);
1486 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1487 aarch64_emit_move (dest, base);
1488 return;
1490 /* FALLTHRU */
1492 case SYMBOL_SMALL_TPREL:
1493 case SYMBOL_SMALL_ABSOLUTE:
1494 case SYMBOL_TINY_ABSOLUTE:
1495 aarch64_load_symref_appropriately (dest, imm, sty);
1496 return;
1498 default:
1499 gcc_unreachable ();
1503 if (!CONST_INT_P (imm))
1505 if (GET_CODE (imm) == HIGH)
1506 emit_insn (gen_rtx_SET (dest, imm));
1507 else
1509 rtx mem = force_const_mem (mode, imm);
1510 gcc_assert (mem);
1511 emit_insn (gen_rtx_SET (dest, mem));
1514 return;
1517 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1520 static bool
1521 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1522 tree exp ATTRIBUTE_UNUSED)
1524 /* Currently, always true. */
1525 return true;
1528 /* Implement TARGET_PASS_BY_REFERENCE. */
1530 static bool
1531 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1532 machine_mode mode,
1533 const_tree type,
1534 bool named ATTRIBUTE_UNUSED)
1536 HOST_WIDE_INT size;
1537 machine_mode dummymode;
1538 int nregs;
1540 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1541 size = (mode == BLKmode && type)
1542 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1544 /* Aggregates are passed by reference based on their size. */
1545 if (type && AGGREGATE_TYPE_P (type))
1547 size = int_size_in_bytes (type);
 1550 /* Variable-sized arguments are always passed by reference. */
1551 if (size < 0)
1552 return true;
1554 /* Can this be a candidate to be passed in fp/simd register(s)? */
1555 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1556 &dummymode, &nregs,
1557 NULL))
1558 return false;
1560 /* Arguments which are variable sized or larger than 2 registers are
 1561 passed by reference unless they are a homogeneous floating-point
1562 aggregate. */
1563 return size > 2 * UNITS_PER_WORD;
1566 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1567 static bool
1568 aarch64_return_in_msb (const_tree valtype)
1570 machine_mode dummy_mode;
1571 int dummy_int;
1573 /* Never happens in little-endian mode. */
1574 if (!BYTES_BIG_ENDIAN)
1575 return false;
 1577 /* Only composite types smaller than or equal to 16 bytes can
 1578 potentially be returned in registers. */
1579 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1580 || int_size_in_bytes (valtype) <= 0
1581 || int_size_in_bytes (valtype) > 16)
1582 return false;
1584 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1585 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1586 is always passed/returned in the least significant bits of fp/simd
1587 register(s). */
1588 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1589 &dummy_mode, &dummy_int, NULL))
1590 return false;
1592 return true;
1595 /* Implement TARGET_FUNCTION_VALUE.
1596 Define how to find the value returned by a function. */
1598 static rtx
1599 aarch64_function_value (const_tree type, const_tree func,
1600 bool outgoing ATTRIBUTE_UNUSED)
1602 machine_mode mode;
1603 int unsignedp;
1604 int count;
1605 machine_mode ag_mode;
1607 mode = TYPE_MODE (type);
1608 if (INTEGRAL_TYPE_P (type))
1609 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1611 if (aarch64_return_in_msb (type))
1613 HOST_WIDE_INT size = int_size_in_bytes (type);
1615 if (size % UNITS_PER_WORD != 0)
1617 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1618 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1622 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1623 &ag_mode, &count, NULL))
1625 if (!aarch64_composite_type_p (type, mode))
1627 gcc_assert (count == 1 && mode == ag_mode);
1628 return gen_rtx_REG (mode, V0_REGNUM);
1630 else
1632 int i;
1633 rtx par;
1635 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1636 for (i = 0; i < count; i++)
1638 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1639 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1640 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1641 XVECEXP (par, 0, i) = tmp;
1643 return par;
1646 else
1647 return gen_rtx_REG (mode, R0_REGNUM);
1650 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1651 Return true if REGNO is the number of a hard register in which the values
1652 of called function may come back. */
1654 static bool
1655 aarch64_function_value_regno_p (const unsigned int regno)
 1657 /* A maximum of 16 bytes can be returned in the general registers. Examples
1658 of 16-byte return values are: 128-bit integers and 16-byte small
1659 structures (excluding homogeneous floating-point aggregates). */
1660 if (regno == R0_REGNUM || regno == R1_REGNUM)
1661 return true;
1663 /* Up to four fp/simd registers can return a function value, e.g. a
1664 homogeneous floating-point aggregate having four members. */
1665 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1666 return TARGET_FLOAT;
1668 return false;
1671 /* Implement TARGET_RETURN_IN_MEMORY.
1673 If the type T of the result of a function is such that
1674 void func (T arg)
1675 would require that arg be passed as a value in a register (or set of
1676 registers) according to the parameter passing rules, then the result
1677 is returned in the same registers as would be used for such an
1678 argument. */
1680 static bool
1681 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1683 HOST_WIDE_INT size;
1684 machine_mode ag_mode;
1685 int count;
1687 if (!AGGREGATE_TYPE_P (type)
1688 && TREE_CODE (type) != COMPLEX_TYPE
1689 && TREE_CODE (type) != VECTOR_TYPE)
 1690 /* Simple scalar types are always returned in registers. */
1691 return false;
1693 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1694 type,
1695 &ag_mode,
1696 &count,
1697 NULL))
1698 return false;
 1700 /* Types larger than 2 registers are returned in memory. */
1701 size = int_size_in_bytes (type);
1702 return (size < 0 || size > 2 * UNITS_PER_WORD);
1705 static bool
1706 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1707 const_tree type, int *nregs)
1709 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1710 return aarch64_vfp_is_call_or_return_candidate (mode,
1711 type,
1712 &pcum->aapcs_vfp_rmode,
1713 nregs,
1714 NULL);
1717 /* Given MODE and TYPE of a function argument, return the alignment in
1718 bits. The idea is to suppress any stronger alignment requested by
1719 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1720 This is a helper function for local use only. */
1722 static unsigned int
1723 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1725 unsigned int alignment;
1727 if (type)
1729 if (!integer_zerop (TYPE_SIZE (type)))
1731 if (TYPE_MODE (type) == mode)
1732 alignment = TYPE_ALIGN (type);
1733 else
1734 alignment = GET_MODE_ALIGNMENT (mode);
1736 else
1737 alignment = 0;
1739 else
1740 alignment = GET_MODE_ALIGNMENT (mode);
1742 return alignment;
1745 /* Layout a function argument according to the AAPCS64 rules. The rule
1746 numbers refer to the rule numbers in the AAPCS64. */
1748 static void
1749 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1750 const_tree type,
1751 bool named ATTRIBUTE_UNUSED)
1753 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1754 int ncrn, nvrn, nregs;
1755 bool allocate_ncrn, allocate_nvrn;
1756 HOST_WIDE_INT size;
1758 /* We need to do this once per argument. */
1759 if (pcum->aapcs_arg_processed)
1760 return;
1762 pcum->aapcs_arg_processed = true;
1764 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1765 size
1766 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1767 UNITS_PER_WORD);
1769 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1770 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1771 mode,
1772 type,
1773 &nregs);
 1775 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1776 The following code thus handles passing by SIMD/FP registers first. */
1778 nvrn = pcum->aapcs_nvrn;
 1780 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
 1781 and homogeneous short-vector aggregates (HVA). */
1782 if (allocate_nvrn)
1784 if (!TARGET_FLOAT)
1785 aarch64_err_no_fpadvsimd (mode, "argument");
1787 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1789 pcum->aapcs_nextnvrn = nvrn + nregs;
1790 if (!aarch64_composite_type_p (type, mode))
1792 gcc_assert (nregs == 1);
1793 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1795 else
1797 rtx par;
1798 int i;
1799 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1800 for (i = 0; i < nregs; i++)
1802 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1803 V0_REGNUM + nvrn + i);
1804 tmp = gen_rtx_EXPR_LIST
1805 (VOIDmode, tmp,
1806 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1807 XVECEXP (par, 0, i) = tmp;
1809 pcum->aapcs_reg = par;
1811 return;
1813 else
1815 /* C.3 NSRN is set to 8. */
1816 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1817 goto on_stack;
1821 ncrn = pcum->aapcs_ncrn;
1822 nregs = size / UNITS_PER_WORD;
 1824 /* C6 - C9, though the sign and zero extension semantics are
 1825 handled elsewhere. This is the case where the argument fits
 1826 entirely in general registers. */
1827 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1829 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1831 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1833 /* C.8 if the argument has an alignment of 16 then the NGRN is
1834 rounded up to the next even number. */
1835 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1837 ++ncrn;
1838 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1840 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1841 A reg is still generated for it, but the caller should be smart
1842 enough not to use it. */
1843 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1845 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1847 else
1849 rtx par;
1850 int i;
1852 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1853 for (i = 0; i < nregs; i++)
1855 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1856 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1857 GEN_INT (i * UNITS_PER_WORD));
1858 XVECEXP (par, 0, i) = tmp;
1860 pcum->aapcs_reg = par;
1863 pcum->aapcs_nextncrn = ncrn + nregs;
1864 return;
1867 /* C.11 */
1868 pcum->aapcs_nextncrn = NUM_ARG_REGS;
 1870 /* The argument is passed on the stack; record the needed number of words
 1871 for this argument and align the total size if necessary. */
1872 on_stack:
1873 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1874 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1875 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1876 16 / UNITS_PER_WORD);
1877 return;
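/* Editor's example (illustrative, not in the original source): for a call
   such as  f (int a, __int128 b)  the int is passed in w0; because __int128
   has 16-byte alignment, rule C.8 above rounds the NGRN up to an even
   number, so b is passed in the x2/x3 pair and x1 is left unused.  */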
1880 /* Implement TARGET_FUNCTION_ARG. */
1882 static rtx
1883 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1884 const_tree type, bool named)
1886 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1887 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1889 if (mode == VOIDmode)
1890 return NULL_RTX;
1892 aarch64_layout_arg (pcum_v, mode, type, named);
1893 return pcum->aapcs_reg;
1896 void
1897 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1898 const_tree fntype ATTRIBUTE_UNUSED,
1899 rtx libname ATTRIBUTE_UNUSED,
1900 const_tree fndecl ATTRIBUTE_UNUSED,
1901 unsigned n_named ATTRIBUTE_UNUSED)
1903 pcum->aapcs_ncrn = 0;
1904 pcum->aapcs_nvrn = 0;
1905 pcum->aapcs_nextncrn = 0;
1906 pcum->aapcs_nextnvrn = 0;
1907 pcum->pcs_variant = ARM_PCS_AAPCS64;
1908 pcum->aapcs_reg = NULL_RTX;
1909 pcum->aapcs_arg_processed = false;
1910 pcum->aapcs_stack_words = 0;
1911 pcum->aapcs_stack_size = 0;
1913 if (!TARGET_FLOAT
1914 && fndecl && TREE_PUBLIC (fndecl)
1915 && fntype && fntype != error_mark_node)
1917 const_tree type = TREE_TYPE (fntype);
1918 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
1919 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
1920 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
1921 &mode, &nregs, NULL))
1922 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
1924 return;
1927 static void
1928 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1929 machine_mode mode,
1930 const_tree type,
1931 bool named)
1933 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1934 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1936 aarch64_layout_arg (pcum_v, mode, type, named);
1937 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1938 != (pcum->aapcs_stack_words != 0));
1939 pcum->aapcs_arg_processed = false;
1940 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1941 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1942 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1943 pcum->aapcs_stack_words = 0;
1944 pcum->aapcs_reg = NULL_RTX;
1948 bool
1949 aarch64_function_arg_regno_p (unsigned regno)
1951 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1952 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1955 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1956 PARM_BOUNDARY bits of alignment, but will be given anything up
1957 to STACK_BOUNDARY bits if the type requires it. This makes sure
1958 that both before and after the layout of each argument, the Next
1959 Stacked Argument Address (NSAA) will have a minimum alignment of
1960 8 bytes. */
1962 static unsigned int
1963 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1965 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1967 if (alignment < PARM_BOUNDARY)
1968 alignment = PARM_BOUNDARY;
1969 if (alignment > STACK_BOUNDARY)
1970 alignment = STACK_BOUNDARY;
1971 return alignment;
1974 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1976 Return true if an argument passed on the stack should be padded upwards,
1977 i.e. if the least-significant byte of the stack slot has useful data.
1979 Small aggregate types are placed in the lowest memory address.
1981 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1983 bool
1984 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1986 /* On little-endian targets, the least significant byte of every stack
1987 argument is passed at the lowest byte address of the stack slot. */
1988 if (!BYTES_BIG_ENDIAN)
1989 return true;
1991 /* Otherwise, integral, floating-point and pointer types are padded downward:
1992 the least significant byte of a stack argument is passed at the highest
1993 byte address of the stack slot. */
1994 if (type
1995 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1996 || POINTER_TYPE_P (type))
1997 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1998 return false;
2000 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2001 return true;
2004 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
 2006 It specifies the padding for the last (possibly the only)
 2007 element of a block move between registers and memory. Assuming
 2008 the block is in memory, padding upward means that the last element
 2009 is padded after its most significant byte, while with downward
 2010 padding the last element is padded at its least significant byte
 2011 side.
2013 Small aggregates and small complex types are always padded
2014 upwards.
2016 We don't need to worry about homogeneous floating-point or
2017 short-vector aggregates; their move is not affected by the
2018 padding direction determined here. Regardless of endianness,
2019 each element of such an aggregate is put in the least
2020 significant bits of a fp/simd register.
2022 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2023 register has useful data, and return the opposite if the most
2024 significant byte does. */
2026 bool
2027 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2028 bool first ATTRIBUTE_UNUSED)
2031 /* Small composite types are always padded upward. */
2032 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2034 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2035 : GET_MODE_SIZE (mode));
2036 if (size < 2 * UNITS_PER_WORD)
2037 return true;
2040 /* Otherwise, use the default padding. */
2041 return !BYTES_BIG_ENDIAN;
2044 static machine_mode
2045 aarch64_libgcc_cmp_return_mode (void)
2047 return SImode;
2050 static bool
2051 aarch64_frame_pointer_required (void)
2053 /* In aarch64_override_options_after_change
2054 flag_omit_leaf_frame_pointer turns off the frame pointer by
2055 default. Turn it back on now if we've not got a leaf
2056 function. */
2057 if (flag_omit_leaf_frame_pointer
2058 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2059 return true;
2061 return false;
2064 /* Mark the registers that need to be saved by the callee and calculate
2065 the size of the callee-saved registers area and frame record (both FP
2066 and LR may be omitted). */
2067 static void
2068 aarch64_layout_frame (void)
2070 HOST_WIDE_INT offset = 0;
2071 int regno;
2073 if (reload_completed && cfun->machine->frame.laid_out)
2074 return;
2076 #define SLOT_NOT_REQUIRED (-2)
2077 #define SLOT_REQUIRED (-1)
2079 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2080 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2082 /* First mark all the registers that really need to be saved... */
2083 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2084 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2086 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2087 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2089 /* ... that includes the eh data registers (if needed)... */
2090 if (crtl->calls_eh_return)
2091 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2092 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2093 = SLOT_REQUIRED;
2095 /* ... and any callee saved register that dataflow says is live. */
2096 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2097 if (df_regs_ever_live_p (regno)
2098 && (regno == R30_REGNUM
2099 || !call_used_regs[regno]))
2100 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2102 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2103 if (df_regs_ever_live_p (regno)
2104 && !call_used_regs[regno])
2105 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2107 if (frame_pointer_needed)
2109 /* FP and LR are placed in the linkage record. */
2110 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2111 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2112 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2113 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2114 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2115 offset += 2 * UNITS_PER_WORD;
2118 /* Now assign stack slots for them. */
2119 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2120 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2122 cfun->machine->frame.reg_offset[regno] = offset;
2123 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2124 cfun->machine->frame.wb_candidate1 = regno;
2125 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2126 cfun->machine->frame.wb_candidate2 = regno;
2127 offset += UNITS_PER_WORD;
2130 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2131 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2133 cfun->machine->frame.reg_offset[regno] = offset;
2134 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2135 cfun->machine->frame.wb_candidate1 = regno;
2136 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2137 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2138 cfun->machine->frame.wb_candidate2 = regno;
2139 offset += UNITS_PER_WORD;
2142 cfun->machine->frame.padding0 =
2143 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2144 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2146 cfun->machine->frame.saved_regs_size = offset;
2148 cfun->machine->frame.hard_fp_offset
2149 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2150 + get_frame_size ()
2151 + cfun->machine->frame.saved_regs_size,
2152 STACK_BOUNDARY / BITS_PER_UNIT);
2154 cfun->machine->frame.frame_size
2155 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2156 + crtl->outgoing_args_size,
2157 STACK_BOUNDARY / BITS_PER_UNIT);
2159 cfun->machine->frame.laid_out = true;
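/* A worked example of the layout above, with hypothetical numbers: a
   function that needs a frame pointer, saves x19, x20 and d8, has 16
   bytes of locals, no varargs save area and no outgoing arguments gets:

     reg_offset[x29] = 0,  reg_offset[x30] = 8      (frame record)
     reg_offset[x19] = 16, reg_offset[x20] = 24, reg_offset[d8] = 32
     padding0        = 8                            (round 40 up to 48)
     saved_regs_size = 48
     hard_fp_offset  = ROUND_UP (0 + 16 + 48, 16) = 64
     frame_size      = ROUND_UP (64 + 0, 16)      = 64  */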
2162 static bool
2163 aarch64_register_saved_on_entry (int regno)
2165 return cfun->machine->frame.reg_offset[regno] >= 0;
2168 static unsigned
2169 aarch64_next_callee_save (unsigned regno, unsigned limit)
2171 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2172 regno ++;
2173 return regno;
2176 static void
2177 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2178 HOST_WIDE_INT adjustment)
2180 rtx base_rtx = stack_pointer_rtx;
2181 rtx insn, reg, mem;
2183 reg = gen_rtx_REG (mode, regno);
2184 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2185 plus_constant (Pmode, base_rtx, -adjustment));
2186 mem = gen_rtx_MEM (mode, mem);
2188 insn = emit_move_insn (mem, reg);
2189 RTX_FRAME_RELATED_P (insn) = 1;
2192 static rtx
2193 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2194 HOST_WIDE_INT adjustment)
2196 switch (mode)
2198 case DImode:
2199 return gen_storewb_pairdi_di (base, base, reg, reg2,
2200 GEN_INT (-adjustment),
2201 GEN_INT (UNITS_PER_WORD - adjustment));
2202 case DFmode:
2203 return gen_storewb_pairdf_di (base, base, reg, reg2,
2204 GEN_INT (-adjustment),
2205 GEN_INT (UNITS_PER_WORD - adjustment));
2206 default:
2207 gcc_unreachable ();
2211 static void
2212 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2213 unsigned regno2, HOST_WIDE_INT adjustment)
2215 rtx_insn *insn;
2216 rtx reg1 = gen_rtx_REG (mode, regno1);
2217 rtx reg2 = gen_rtx_REG (mode, regno2);
2219 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2220 reg2, adjustment));
2221 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2222 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2223 RTX_FRAME_RELATED_P (insn) = 1;
2226 static rtx
2227 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2228 HOST_WIDE_INT adjustment)
2230 switch (mode)
2232 case DImode:
2233 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2234 GEN_INT (UNITS_PER_WORD));
2235 case DFmode:
2236 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2237 GEN_INT (UNITS_PER_WORD));
2238 default:
2239 gcc_unreachable ();
2243 static rtx
2244 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2245 rtx reg2)
2247 switch (mode)
2249 case DImode:
2250 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2252 case DFmode:
2253 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2255 default:
2256 gcc_unreachable ();
2260 static rtx
2261 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2262 rtx mem2)
2264 switch (mode)
2266 case DImode:
2267 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2269 case DFmode:
2270 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2272 default:
2273 gcc_unreachable ();
2278 static void
2279 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2280 unsigned start, unsigned limit, bool skip_wb)
2282 rtx_insn *insn;
2283 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2284 ? gen_frame_mem : gen_rtx_MEM);
2285 unsigned regno;
2286 unsigned regno2;
2288 for (regno = aarch64_next_callee_save (start, limit);
2289 regno <= limit;
2290 regno = aarch64_next_callee_save (regno + 1, limit))
2292 rtx reg, mem;
2293 HOST_WIDE_INT offset;
2295 if (skip_wb
2296 && (regno == cfun->machine->frame.wb_candidate1
2297 || regno == cfun->machine->frame.wb_candidate2))
2298 continue;
2300 reg = gen_rtx_REG (mode, regno);
2301 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2302 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2303 offset));
2305 regno2 = aarch64_next_callee_save (regno + 1, limit);
2307 if (regno2 <= limit
2308 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2309 == cfun->machine->frame.reg_offset[regno2]))
2312 rtx reg2 = gen_rtx_REG (mode, regno2);
2313 rtx mem2;
2315 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2316 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2317 offset));
2318 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2319 reg2));
2321 /* The first part of a frame-related parallel insn is
2322 always assumed to be relevant to the frame
2323 calculations; subsequent parts, are only
2324 frame-related if explicitly marked. */
2325 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2326 regno = regno2;
2328 else
2329 insn = emit_move_insn (mem, reg);
2331 RTX_FRAME_RELATED_P (insn) = 1;
2335 static void
2336 aarch64_restore_callee_saves (machine_mode mode,
2337 HOST_WIDE_INT start_offset, unsigned start,
2338 unsigned limit, bool skip_wb, rtx *cfi_ops)
2340 rtx base_rtx = stack_pointer_rtx;
2341 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2342 ? gen_frame_mem : gen_rtx_MEM);
2343 unsigned regno;
2344 unsigned regno2;
2345 HOST_WIDE_INT offset;
2347 for (regno = aarch64_next_callee_save (start, limit);
2348 regno <= limit;
2349 regno = aarch64_next_callee_save (regno + 1, limit))
2351 rtx reg, mem;
2353 if (skip_wb
2354 && (regno == cfun->machine->frame.wb_candidate1
2355 || regno == cfun->machine->frame.wb_candidate2))
2356 continue;
2358 reg = gen_rtx_REG (mode, regno);
2359 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2360 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2362 regno2 = aarch64_next_callee_save (regno + 1, limit);
2364 if (regno2 <= limit
2365 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2366 == cfun->machine->frame.reg_offset[regno2]))
2368 rtx reg2 = gen_rtx_REG (mode, regno2);
2369 rtx mem2;
2371 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2372 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2373 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2375 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2376 regno = regno2;
2378 else
2379 emit_move_insn (reg, mem);
2380 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2384 /* AArch64 stack frames generated by this compiler look like:
2386 +-------------------------------+
2388 | incoming stack arguments |
2390 +-------------------------------+
2391 | | <-- incoming stack pointer (aligned)
2392 | callee-allocated save area |
2393 | for register varargs |
2395 +-------------------------------+
2396 | local variables | <-- frame_pointer_rtx
2398 +-------------------------------+
2399 | padding0 | \
2400 +-------------------------------+ |
2401 | callee-saved registers | | frame.saved_regs_size
2402 +-------------------------------+ |
2403 | LR' | |
2404 +-------------------------------+ |
2405 | FP' | / <- hard_frame_pointer_rtx (aligned)
2406 +-------------------------------+
2407 | dynamic allocation |
2408 +-------------------------------+
2409 | padding |
2410 +-------------------------------+
2411 | outgoing stack arguments | <-- arg_pointer
2413 +-------------------------------+
2414 | | <-- stack_pointer_rtx (aligned)
2416 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2417 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2418 unchanged. */
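/* As a concrete illustration of the picture above (hypothetical
   function: frame_size of 64 bytes, a frame record, x19/x20 and d8 to
   save, no outgoing arguments), the prologue typically materializes
   this layout as:

     stp  x29, x30, [sp, #-64]!    allocate frame, store frame record
     add  x29, sp, #0              set up the frame pointer
     stp  x19, x20, [sp, #16]      callee-saved general registers
     str  d8, [sp, #32]            callee-saved FP/SIMD register  */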
2420 /* Generate the prologue instructions for entry into a function.
2421 Establish the stack frame by decreasing the stack pointer with a
2422 properly calculated size and, if necessary, create a frame record
2423 filled with the values of LR and previous frame pointer. The
2424 current FP is also set up if it is in use. */
2426 void
2427 aarch64_expand_prologue (void)
2429 /* sub sp, sp, #<frame_size>
2430 stp {fp, lr}, [sp, #<frame_size> - 16]
2431 add fp, sp, #<frame_size> - hardfp_offset
2432 stp {cs_reg}, [fp, #-16] etc.
2434 sub sp, sp, <final_adjustment_if_any>
2436 HOST_WIDE_INT frame_size, offset;
2437 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2438 HOST_WIDE_INT hard_fp_offset;
2439 rtx_insn *insn;
2441 aarch64_layout_frame ();
2443 offset = frame_size = cfun->machine->frame.frame_size;
2444 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2445 fp_offset = frame_size - hard_fp_offset;
2447 if (flag_stack_usage_info)
2448 current_function_static_stack_size = frame_size;
2450 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2451 if (offset >= 512)
2453 /* When the frame has a large size, an initial decrease is done on
2454 the stack pointer to jump over the callee-allocated save area for
2455 register varargs, the local variable area and/or the callee-saved
2456 register area. This will allow the pre-index write-back
2457 store pair instructions to be used for setting up the stack frame
2458 efficiently. */
2459 offset = hard_fp_offset;
2460 if (offset >= 512)
2461 offset = cfun->machine->frame.saved_regs_size;
2463 frame_size -= (offset + crtl->outgoing_args_size);
2464 fp_offset = 0;
2466 if (frame_size >= 0x1000000)
2468 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2469 emit_move_insn (op0, GEN_INT (-frame_size));
2470 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2472 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2473 gen_rtx_SET (stack_pointer_rtx,
2474 plus_constant (Pmode, stack_pointer_rtx,
2475 -frame_size)));
2476 RTX_FRAME_RELATED_P (insn) = 1;
2478 else if (frame_size > 0)
2480 int hi_ofs = frame_size & 0xfff000;
2481 int lo_ofs = frame_size & 0x000fff;
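/* For example (hypothetical value): frame_size = 0x12345 gives
   hi_ofs = 0x12000 and lo_ofs = 0x345, so the adjustment is emitted as
   two subtractions whose immediates each fit the 12-bit, optionally
   12-bit-shifted, ADD/SUB encoding:

     sub  sp, sp, #0x12000
     sub  sp, sp, #0x345  */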
2483 if (hi_ofs)
2485 insn = emit_insn (gen_add2_insn
2486 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2487 RTX_FRAME_RELATED_P (insn) = 1;
2489 if (lo_ofs)
2491 insn = emit_insn (gen_add2_insn
2492 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2493 RTX_FRAME_RELATED_P (insn) = 1;
2497 else
2498 frame_size = -1;
2500 if (offset > 0)
2502 bool skip_wb = false;
2504 if (frame_pointer_needed)
2506 skip_wb = true;
2508 if (fp_offset)
2510 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2511 GEN_INT (-offset)));
2512 RTX_FRAME_RELATED_P (insn) = 1;
2514 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2515 R30_REGNUM, false);
2517 else
2518 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2520 /* Set up frame pointer to point to the location of the
2521 previous frame pointer on the stack. */
2522 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2523 stack_pointer_rtx,
2524 GEN_INT (fp_offset)));
2525 RTX_FRAME_RELATED_P (insn) = 1;
2526 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2528 else
2530 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2531 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2533 if (fp_offset
2534 || reg1 == FIRST_PSEUDO_REGISTER
2535 || (reg2 == FIRST_PSEUDO_REGISTER
2536 && offset >= 256))
2538 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2539 GEN_INT (-offset)));
2540 RTX_FRAME_RELATED_P (insn) = 1;
2542 else
2544 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2546 skip_wb = true;
2548 if (reg2 == FIRST_PSEUDO_REGISTER)
2549 aarch64_pushwb_single_reg (mode1, reg1, offset);
2550 else
2551 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2555 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2556 skip_wb);
2557 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2558 skip_wb);
2561 /* when offset >= 512,
2562 sub sp, sp, #<outgoing_args_size> */
2563 if (frame_size > -1)
2565 if (crtl->outgoing_args_size > 0)
2567 insn = emit_insn (gen_add2_insn
2568 (stack_pointer_rtx,
2569 GEN_INT (- crtl->outgoing_args_size)));
2570 RTX_FRAME_RELATED_P (insn) = 1;
2575 /* Return TRUE if we can use a simple_return insn.
2577 This function checks whether the callee-saved register area is empty,
2578 which means no restore actions are needed. The pro_and_epilogue pass
2579 uses this to check whether the shrink-wrapping optimization is feasible. */
2581 bool
2582 aarch64_use_return_insn_p (void)
2584 if (!reload_completed)
2585 return false;
2587 if (crtl->profile)
2588 return false;
2590 aarch64_layout_frame ();
2592 return cfun->machine->frame.frame_size == 0;
2595 /* Generate the epilogue instructions for returning from a function. */
2596 void
2597 aarch64_expand_epilogue (bool for_sibcall)
2599 HOST_WIDE_INT frame_size, offset;
2600 HOST_WIDE_INT fp_offset;
2601 HOST_WIDE_INT hard_fp_offset;
2602 rtx_insn *insn;
2603 /* We need a memory barrier to prevent reads from the deallocated stack. */
2604 bool need_barrier_p = (get_frame_size () != 0
2605 || cfun->machine->frame.saved_varargs_size);
2607 aarch64_layout_frame ();
2609 offset = frame_size = cfun->machine->frame.frame_size;
2610 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2611 fp_offset = frame_size - hard_fp_offset;
2613 /* Store pairs and load pairs have an offset range of only -512 to 504. */
2614 if (offset >= 512)
2616 offset = hard_fp_offset;
2617 if (offset >= 512)
2618 offset = cfun->machine->frame.saved_regs_size;
2620 frame_size -= (offset + crtl->outgoing_args_size);
2621 fp_offset = 0;
2622 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2624 insn = emit_insn (gen_add2_insn
2625 (stack_pointer_rtx,
2626 GEN_INT (crtl->outgoing_args_size)));
2627 RTX_FRAME_RELATED_P (insn) = 1;
2630 else
2631 frame_size = -1;
2633 /* If there were outgoing arguments or we've done dynamic stack
2634 allocation, then restore the stack pointer from the frame
2635 pointer. This is at most one insn and more efficient than using
2636 GCC's internal mechanism. */
2637 if (frame_pointer_needed
2638 && (crtl->outgoing_args_size || cfun->calls_alloca))
2640 if (cfun->calls_alloca)
2641 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2643 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2644 hard_frame_pointer_rtx,
2645 GEN_INT (0)));
2646 offset = offset - fp_offset;
2649 if (offset > 0)
2651 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2652 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2653 bool skip_wb = true;
2654 rtx cfi_ops = NULL;
2656 if (frame_pointer_needed)
2657 fp_offset = 0;
2658 else if (fp_offset
2659 || reg1 == FIRST_PSEUDO_REGISTER
2660 || (reg2 == FIRST_PSEUDO_REGISTER
2661 && offset >= 256))
2662 skip_wb = false;
2664 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2665 skip_wb, &cfi_ops);
2666 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2667 skip_wb, &cfi_ops);
2669 if (need_barrier_p)
2670 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2672 if (skip_wb)
2674 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2675 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2677 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2678 if (reg2 == FIRST_PSEUDO_REGISTER)
2680 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2681 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2682 mem = gen_rtx_MEM (mode1, mem);
2683 insn = emit_move_insn (rreg1, mem);
2685 else
2687 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2689 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2690 insn = emit_insn (aarch64_gen_loadwb_pair
2691 (mode1, stack_pointer_rtx, rreg1,
2692 rreg2, offset));
2695 else
2697 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2698 GEN_INT (offset)));
2701 /* Reset the CFA to be SP + FRAME_SIZE. */
2702 rtx new_cfa = stack_pointer_rtx;
2703 if (frame_size > 0)
2704 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2705 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2706 REG_NOTES (insn) = cfi_ops;
2707 RTX_FRAME_RELATED_P (insn) = 1;
2710 if (frame_size > 0)
2712 if (need_barrier_p)
2713 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2715 if (frame_size >= 0x1000000)
2717 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2718 emit_move_insn (op0, GEN_INT (frame_size));
2719 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2721 else
2723 int hi_ofs = frame_size & 0xfff000;
2724 int lo_ofs = frame_size & 0x000fff;
2726 if (hi_ofs && lo_ofs)
2728 insn = emit_insn (gen_add2_insn
2729 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2730 RTX_FRAME_RELATED_P (insn) = 1;
2731 frame_size = lo_ofs;
2733 insn = emit_insn (gen_add2_insn
2734 (stack_pointer_rtx, GEN_INT (frame_size)));
2737 /* Reset the CFA to be SP + 0. */
2738 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2739 RTX_FRAME_RELATED_P (insn) = 1;
2742 /* Stack adjustment for exception handler. */
2743 if (crtl->calls_eh_return)
2745 /* We need to unwind the stack by the offset computed by
2746 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2747 to be SP; letting the CFA move during this adjustment
2748 is just as correct as retaining the CFA from the body
2749 of the function. Therefore, do nothing special. */
2750 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2753 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2754 if (!for_sibcall)
2755 emit_jump_insn (ret_rtx);
2758 /* Return the place to copy the exception unwinding return address to.
2759 This will probably be a stack slot, but could (in theory) be the
2760 return register. */
2762 aarch64_final_eh_return_addr (void)
2764 HOST_WIDE_INT fp_offset;
2766 aarch64_layout_frame ();
2768 fp_offset = cfun->machine->frame.frame_size
2769 - cfun->machine->frame.hard_fp_offset;
2771 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2772 return gen_rtx_REG (DImode, LR_REGNUM);
2774 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2775 result in a store to save LR introduced by builtin_eh_return () being
2776 incorrectly deleted because the alias is not detected.
2777 So in the calculation of the address to copy the exception unwinding
2778 return address to, we note 2 cases.
2779 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2780 we return a SP-relative location since all the addresses are SP-relative
2781 in this case. This prevents the store from being optimized away.
2782 If the fp_offset is not 0, then the addresses will be FP-relative and
2783 therefore we return a FP-relative location. */
2785 if (frame_pointer_needed)
2787 if (fp_offset)
2788 return gen_frame_mem (DImode,
2789 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2790 else
2791 return gen_frame_mem (DImode,
2792 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2795 /* If FP is not needed, we calculate the location of LR, which would be
2796 at the top of the saved registers block. */
2798 return gen_frame_mem (DImode,
2799 plus_constant (Pmode,
2800 stack_pointer_rtx,
2801 fp_offset
2802 + cfun->machine->frame.saved_regs_size
2803 - 2 * UNITS_PER_WORD));
2806 /* Possibly output code to build up a constant in a register. For
2807 the benefit of the costs infrastructure, returns the number of
2808 instructions which would be emitted. GENERATE inhibits or
2809 enables code generation. */
2811 static int
2812 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2814 int insns = 0;
2816 if (aarch64_bitmask_imm (val, DImode))
2818 if (generate)
2819 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2820 insns = 1;
2822 else
2824 int i;
2825 int ncount = 0;
2826 int zcount = 0;
2827 HOST_WIDE_INT valp = val >> 16;
2828 HOST_WIDE_INT valm;
2829 HOST_WIDE_INT tval;
2831 for (i = 16; i < 64; i += 16)
2833 valm = (valp & 0xffff);
2835 if (valm != 0)
2836 ++ zcount;
2838 if (valm != 0xffff)
2839 ++ ncount;
2841 valp >>= 16;
2844 /* zcount contains the number of additional MOVK instructions
2845 required if the constant is built up with an initial MOVZ instruction,
2846 while ncount is the number of MOVK instructions required if starting
2847 with a MOVN instruction. Choose the sequence that yields the fewest
2848 number of instructions, preferring MOVZ instructions when they are both
2849 the same. */
2850 if (ncount < zcount)
2852 if (generate)
2853 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2854 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2855 tval = 0xffff;
2856 insns++;
2858 else
2860 if (generate)
2861 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2862 GEN_INT (val & 0xffff));
2863 tval = 0;
2864 insns++;
2867 val >>= 16;
2869 for (i = 16; i < 64; i += 16)
2871 if ((val & 0xffff) != tval)
2873 if (generate)
2874 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2875 GEN_INT (i),
2876 GEN_INT (val & 0xffff)));
2877 insns++;
2879 val >>= 16;
2882 return insns;
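/* Illustrative standalone sketch, not used by the compiler: count the
   MOV instructions the scheme above needs for a 64-bit constant,
   ignoring the single-instruction bitmask-immediate case.  One MOVZ (or
   MOVN) sets the low 16 bits; every higher 16-bit chunk that differs
   from the background value (0 for MOVZ, 0xffff for MOVN) then costs
   one MOVK.  The function name is hypothetical.  */

static int
example_mov_sequence_length (unsigned long long val)
{
  int zcount = 0, ncount = 0;
  int i;

  for (i = 16; i < 64; i += 16)
    {
      unsigned int chunk = (val >> i) & 0xffff;

      if (chunk != 0)
        zcount++;   /* MOVK needed after an initial MOVZ.  */
      if (chunk != 0xffff)
        ncount++;   /* MOVK needed after an initial MOVN.  */
    }

  /* One MOVZ/MOVN plus the cheaper set of MOVKs; ties prefer MOVZ,
     as in aarch64_build_constant above.  */
  return 1 + (ncount < zcount ? ncount : zcount);
}

/* For example, 0x0000ffffffff0000 needs two instructions: a MOVN that
   yields 0xffffffffffff0000 followed by one MOVK that zeroes the top
   16-bit chunk.  */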
2885 static void
2886 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2888 HOST_WIDE_INT mdelta = delta;
2889 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2890 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2892 if (mdelta < 0)
2893 mdelta = -mdelta;
2895 if (mdelta >= 4096 * 4096)
2897 (void) aarch64_build_constant (scratchreg, delta, true);
2898 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2900 else if (mdelta > 0)
2902 if (mdelta >= 4096)
2904 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
2905 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2906 if (delta < 0)
2907 emit_insn (gen_rtx_SET (this_rtx,
2908 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2909 else
2910 emit_insn (gen_rtx_SET (this_rtx,
2911 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2913 if (mdelta % 4096 != 0)
2915 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2916 emit_insn (gen_rtx_SET (this_rtx,
2917 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
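/* As an illustration with hypothetical arguments:
   aarch64_add_constant (R0_REGNUM, IP0_REGNUM, 20000) splits the
   constant around the 4096 boundary (20000 = 4 * 4096 + 3616) and emits
   roughly:

     mov  x16, #4
     add  x0, x0, x16, lsl #12
     add  x0, x0, #3616  */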
2922 /* Output code to add DELTA to the first argument, and then jump
2923 to FUNCTION. Used for C++ multiple inheritance. */
2924 static void
2925 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2926 HOST_WIDE_INT delta,
2927 HOST_WIDE_INT vcall_offset,
2928 tree function)
2930 /* The this pointer is always in x0. Note that this differs from
2931 Arm, where the this pointer may be bumped to r1 if r0 is required
2932 to return a pointer to an aggregate. On AArch64 a result value
2933 pointer will be in x8. */
2934 int this_regno = R0_REGNUM;
2935 rtx this_rtx, temp0, temp1, addr, funexp;
2936 rtx_insn *insn;
2938 reload_completed = 1;
2939 emit_note (NOTE_INSN_PROLOGUE_END);
2941 if (vcall_offset == 0)
2942 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2943 else
2945 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2947 this_rtx = gen_rtx_REG (Pmode, this_regno);
2948 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2949 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2951 addr = this_rtx;
2952 if (delta != 0)
2954 if (delta >= -256 && delta < 256)
2955 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2956 plus_constant (Pmode, this_rtx, delta));
2957 else
2958 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2961 if (Pmode == ptr_mode)
2962 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2963 else
2964 aarch64_emit_move (temp0,
2965 gen_rtx_ZERO_EXTEND (Pmode,
2966 gen_rtx_MEM (ptr_mode, addr)));
2968 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2969 addr = plus_constant (Pmode, temp0, vcall_offset);
2970 else
2972 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2973 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2976 if (Pmode == ptr_mode)
2977 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
2978 else
2979 aarch64_emit_move (temp1,
2980 gen_rtx_SIGN_EXTEND (Pmode,
2981 gen_rtx_MEM (ptr_mode, addr)));
2983 emit_insn (gen_add2_insn (this_rtx, temp1));
2986 /* Generate a tail call to the target function. */
2987 if (!TREE_USED (function))
2989 assemble_external (function);
2990 TREE_USED (function) = 1;
2992 funexp = XEXP (DECL_RTL (function), 0);
2993 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2994 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2995 SIBLING_CALL_P (insn) = 1;
2997 insn = get_insns ();
2998 shorten_branches (insn);
2999 final_start_function (insn, file, 1);
3000 final (insn, file, 1);
3001 final_end_function ();
3003 /* Stop pretending to be a post-reload pass. */
3004 reload_completed = 0;
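/* As an illustration: for a simple non-virtual thunk (vcall_offset of 0)
   with a small positive delta such as 16, the code above reduces to an
   adjustment of the incoming this pointer followed by a tail call:

     add  x0, x0, #16
     b    <function>  */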
3007 static bool
3008 aarch64_tls_referenced_p (rtx x)
3010 if (!TARGET_HAVE_TLS)
3011 return false;
3012 subrtx_iterator::array_type array;
3013 FOR_EACH_SUBRTX (iter, array, x, ALL)
3015 const_rtx x = *iter;
3016 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3017 return true;
3018 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3019 TLS offsets, not real symbol references. */
3020 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3021 iter.skip_subrtxes ();
3023 return false;
3027 static int
3028 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3030 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3031 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3033 if (*imm1 < *imm2)
3034 return -1;
3035 if (*imm1 > *imm2)
3036 return +1;
3037 return 0;
3041 static void
3042 aarch64_build_bitmask_table (void)
3044 unsigned HOST_WIDE_INT mask, imm;
3045 unsigned int log_e, e, s, r;
3046 unsigned int nimms = 0;
3048 for (log_e = 1; log_e <= 6; log_e++)
3050 e = 1 << log_e;
3051 if (e == 64)
3052 mask = ~(HOST_WIDE_INT) 0;
3053 else
3054 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3055 for (s = 1; s < e; s++)
3057 for (r = 0; r < e; r++)
3059 /* set s consecutive bits to 1 (s < 64) */
3060 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3061 /* rotate right by r */
3062 if (r != 0)
3063 imm = ((imm >> r) | (imm << (e - r))) & mask;
3064 /* replicate the constant depending on SIMD size */
3065 switch (log_e) {
3066 case 1: imm |= (imm << 2);
3067 case 2: imm |= (imm << 4);
3068 case 3: imm |= (imm << 8);
3069 case 4: imm |= (imm << 16);
3070 case 5: imm |= (imm << 32);
3071 case 6:
3072 break;
3073 default:
3074 gcc_unreachable ();
3076 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3077 aarch64_bitmasks[nimms++] = imm;
3082 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3083 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3084 aarch64_bitmasks_cmp);
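/* Illustrative standalone sketch, not used by the compiler: test whether
   VAL is a valid 64-bit logical (bitmask) immediate by regenerating
   candidates exactly as the table above is built -- S consecutive ones,
   rotated right by R within an element of size E, then replicated to 64
   bits.  Narrower modes would first be replicated to 64 bits, as
   aarch64_bitmask_imm does below.  The function name is hypothetical.  */

static int
example_is_bitmask_imm (unsigned long long val)
{
  unsigned int e, s, r;

  for (e = 2; e <= 64; e *= 2)
    {
      unsigned long long mask = (e == 64) ? ~0ull : (1ull << e) - 1;

      for (s = 1; s < e; s++)
        for (r = 0; r < e; r++)
          {
            /* S consecutive ones...  */
            unsigned long long imm = (1ull << s) - 1;
            unsigned int rep;

            /* ...rotated right by R within the element...  */
            if (r != 0)
              imm = ((imm >> r) | (imm << (e - r))) & mask;

            /* ...and replicated to fill the 64-bit register.  */
            for (rep = e; rep < 64; rep *= 2)
              imm |= imm << rep;

            if (imm == val)
              return 1;
          }
    }
  return 0;
}

/* For example, 0x00ff00ff00ff00ff (eight ones replicated at element
   size 16) is accepted, whereas 0x1234567812345678 is not.  */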
3088 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3089 a left shift of 0 or 12 bits. */
3090 bool
3091 aarch64_uimm12_shift (HOST_WIDE_INT val)
3093 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3094 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
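/* For example, 0xabc and 0xfff are accepted with a shift of 0, and
   0xabc000 with a shift of 12, while 0x1001 is rejected because its set
   bits straddle the two 12-bit fields.  */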
3099 /* Return true if val is an immediate that can be loaded into a
3100 register by a MOVZ instruction. */
3101 static bool
3102 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3104 if (GET_MODE_SIZE (mode) > 4)
3106 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3107 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3108 return 1;
3110 else
3112 /* Ignore sign extension. */
3113 val &= (HOST_WIDE_INT) 0xffffffff;
3115 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3116 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
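/* For example, 0x12340000 can be loaded with a single
   "movz w0, #0x1234, lsl #16" and so is accepted for SImode, whereas
   0x12340001 would also need a MOVK and is rejected here (it can still
   be synthesized by aarch64_expand_mov_immediate).  */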
3120 /* Return true if val is a valid bitmask immediate. */
3121 bool
3122 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3124 if (GET_MODE_SIZE (mode) < 8)
3126 /* Replicate bit pattern. */
3127 val &= (HOST_WIDE_INT) 0xffffffff;
3128 val |= val << 32;
3130 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3131 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3135 /* Return true if val is an immediate that can be loaded into a
3136 register in a single instruction. */
3137 bool
3138 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3140 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3141 return 1;
3142 return aarch64_bitmask_imm (val, mode);
3145 static bool
3146 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3148 rtx base, offset;
3150 if (GET_CODE (x) == HIGH)
3151 return true;
3153 split_const (x, &base, &offset);
3154 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3156 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3157 != SYMBOL_FORCE_TO_MEM)
3158 return true;
3159 else
3160 /* Avoid generating a 64-bit relocation in ILP32; leave it
3161 to aarch64_expand_mov_immediate to handle properly. */
3162 return mode != ptr_mode;
3165 return aarch64_tls_referenced_p (x);
3168 /* Return true if register REGNO is a valid index register.
3169 STRICT_P is true if REG_OK_STRICT is in effect. */
3171 bool
3172 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3174 if (!HARD_REGISTER_NUM_P (regno))
3176 if (!strict_p)
3177 return true;
3179 if (!reg_renumber)
3180 return false;
3182 regno = reg_renumber[regno];
3184 return GP_REGNUM_P (regno);
3187 /* Return true if register REGNO is a valid base register for mode MODE.
3188 STRICT_P is true if REG_OK_STRICT is in effect. */
3190 bool
3191 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3193 if (!HARD_REGISTER_NUM_P (regno))
3195 if (!strict_p)
3196 return true;
3198 if (!reg_renumber)
3199 return false;
3201 regno = reg_renumber[regno];
3204 /* The fake registers will be eliminated to either the stack or
3205 hard frame pointer, both of which are usually valid base registers.
3206 Reload deals with the cases where the eliminated form isn't valid. */
3207 return (GP_REGNUM_P (regno)
3208 || regno == SP_REGNUM
3209 || regno == FRAME_POINTER_REGNUM
3210 || regno == ARG_POINTER_REGNUM);
3213 /* Return true if X is a valid base register for mode MODE.
3214 STRICT_P is true if REG_OK_STRICT is in effect. */
3216 static bool
3217 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3219 if (!strict_p && GET_CODE (x) == SUBREG)
3220 x = SUBREG_REG (x);
3222 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3225 /* Return true if address offset is a valid index. If it is, fill in INFO
3226 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3228 static bool
3229 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3230 machine_mode mode, bool strict_p)
3232 enum aarch64_address_type type;
3233 rtx index;
3234 int shift;
3236 /* (reg:P) */
3237 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3238 && GET_MODE (x) == Pmode)
3240 type = ADDRESS_REG_REG;
3241 index = x;
3242 shift = 0;
3244 /* (sign_extend:DI (reg:SI)) */
3245 else if ((GET_CODE (x) == SIGN_EXTEND
3246 || GET_CODE (x) == ZERO_EXTEND)
3247 && GET_MODE (x) == DImode
3248 && GET_MODE (XEXP (x, 0)) == SImode)
3250 type = (GET_CODE (x) == SIGN_EXTEND)
3251 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3252 index = XEXP (x, 0);
3253 shift = 0;
3255 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3256 else if (GET_CODE (x) == MULT
3257 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3258 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3259 && GET_MODE (XEXP (x, 0)) == DImode
3260 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3261 && CONST_INT_P (XEXP (x, 1)))
3263 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3264 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3265 index = XEXP (XEXP (x, 0), 0);
3266 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3268 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3269 else if (GET_CODE (x) == ASHIFT
3270 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3271 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3272 && GET_MODE (XEXP (x, 0)) == DImode
3273 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3274 && CONST_INT_P (XEXP (x, 1)))
3276 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3277 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3278 index = XEXP (XEXP (x, 0), 0);
3279 shift = INTVAL (XEXP (x, 1));
3281 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3282 else if ((GET_CODE (x) == SIGN_EXTRACT
3283 || GET_CODE (x) == ZERO_EXTRACT)
3284 && GET_MODE (x) == DImode
3285 && GET_CODE (XEXP (x, 0)) == MULT
3286 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3287 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3289 type = (GET_CODE (x) == SIGN_EXTRACT)
3290 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3291 index = XEXP (XEXP (x, 0), 0);
3292 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3293 if (INTVAL (XEXP (x, 1)) != 32 + shift
3294 || INTVAL (XEXP (x, 2)) != 0)
3295 shift = -1;
3297 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3298 (const_int 0xffffffff<<shift)) */
3299 else if (GET_CODE (x) == AND
3300 && GET_MODE (x) == DImode
3301 && GET_CODE (XEXP (x, 0)) == MULT
3302 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3303 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3304 && CONST_INT_P (XEXP (x, 1)))
3306 type = ADDRESS_REG_UXTW;
3307 index = XEXP (XEXP (x, 0), 0);
3308 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3309 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3310 shift = -1;
3312 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3313 else if ((GET_CODE (x) == SIGN_EXTRACT
3314 || GET_CODE (x) == ZERO_EXTRACT)
3315 && GET_MODE (x) == DImode
3316 && GET_CODE (XEXP (x, 0)) == ASHIFT
3317 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3318 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3320 type = (GET_CODE (x) == SIGN_EXTRACT)
3321 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3322 index = XEXP (XEXP (x, 0), 0);
3323 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3324 if (INTVAL (XEXP (x, 1)) != 32 + shift
3325 || INTVAL (XEXP (x, 2)) != 0)
3326 shift = -1;
3328 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3329 (const_int 0xffffffff<<shift)) */
3330 else if (GET_CODE (x) == AND
3331 && GET_MODE (x) == DImode
3332 && GET_CODE (XEXP (x, 0)) == ASHIFT
3333 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3334 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3335 && CONST_INT_P (XEXP (x, 1)))
3337 type = ADDRESS_REG_UXTW;
3338 index = XEXP (XEXP (x, 0), 0);
3339 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3340 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3341 shift = -1;
3343 /* (mult:P (reg:P) (const_int scale)) */
3344 else if (GET_CODE (x) == MULT
3345 && GET_MODE (x) == Pmode
3346 && GET_MODE (XEXP (x, 0)) == Pmode
3347 && CONST_INT_P (XEXP (x, 1)))
3349 type = ADDRESS_REG_REG;
3350 index = XEXP (x, 0);
3351 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3353 /* (ashift:P (reg:P) (const_int shift)) */
3354 else if (GET_CODE (x) == ASHIFT
3355 && GET_MODE (x) == Pmode
3356 && GET_MODE (XEXP (x, 0)) == Pmode
3357 && CONST_INT_P (XEXP (x, 1)))
3359 type = ADDRESS_REG_REG;
3360 index = XEXP (x, 0);
3361 shift = INTVAL (XEXP (x, 1));
3363 else
3364 return false;
3366 if (GET_CODE (index) == SUBREG)
3367 index = SUBREG_REG (index);
3369 if ((shift == 0 ||
3370 (shift > 0 && shift <= 3
3371 && (1 << shift) == GET_MODE_SIZE (mode)))
3372 && REG_P (index)
3373 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3375 info->type = type;
3376 info->offset = index;
3377 info->shift = shift;
3378 return true;
3381 return false;
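/* As an illustration, the index expression

     (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 8))

   is classified as ADDRESS_REG_SXTW with shift 3; combined with a base
   register it corresponds to an address operand such as
   [x0, w1, sxtw #3] for an 8-byte access.  */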
3384 bool
3385 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3387 return (offset >= -64 * GET_MODE_SIZE (mode)
3388 && offset < 64 * GET_MODE_SIZE (mode)
3389 && offset % GET_MODE_SIZE (mode) == 0);
3392 static inline bool
3393 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3394 HOST_WIDE_INT offset)
3396 return offset >= -256 && offset < 256;
3399 static inline bool
3400 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3402 return (offset >= 0
3403 && offset < 4096 * GET_MODE_SIZE (mode)
3404 && offset % GET_MODE_SIZE (mode) == 0);
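/* Concretely, for an 8-byte (DImode) access the three predicates above
   accept, respectively:

     7-bit signed scaled:    multiples of 8 in [-512, 504]
     9-bit signed unscaled:  any offset in [-256, 255]
     12-bit unsigned scaled: multiples of 8 in [0, 32760]  */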
3407 /* Return true if X is a valid address for machine mode MODE. If it is,
3408 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3409 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3411 static bool
3412 aarch64_classify_address (struct aarch64_address_info *info,
3413 rtx x, machine_mode mode,
3414 RTX_CODE outer_code, bool strict_p)
3416 enum rtx_code code = GET_CODE (x);
3417 rtx op0, op1;
3419 /* On BE, we use load/store pair for all large int mode load/stores. */
3420 bool load_store_pair_p = (outer_code == PARALLEL
3421 || (BYTES_BIG_ENDIAN
3422 && aarch64_vect_struct_mode_p (mode)));
3424 bool allow_reg_index_p =
3425 !load_store_pair_p
3426 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3427 && !aarch64_vect_struct_mode_p (mode);
3429 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3430 REG addressing. */
3431 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3432 && (code != POST_INC && code != REG))
3433 return false;
3435 switch (code)
3437 case REG:
3438 case SUBREG:
3439 info->type = ADDRESS_REG_IMM;
3440 info->base = x;
3441 info->offset = const0_rtx;
3442 return aarch64_base_register_rtx_p (x, strict_p);
3444 case PLUS:
3445 op0 = XEXP (x, 0);
3446 op1 = XEXP (x, 1);
3448 if (! strict_p
3449 && REG_P (op0)
3450 && (op0 == virtual_stack_vars_rtx
3451 || op0 == frame_pointer_rtx
3452 || op0 == arg_pointer_rtx)
3453 && CONST_INT_P (op1))
3455 info->type = ADDRESS_REG_IMM;
3456 info->base = op0;
3457 info->offset = op1;
3459 return true;
3462 if (GET_MODE_SIZE (mode) != 0
3463 && CONST_INT_P (op1)
3464 && aarch64_base_register_rtx_p (op0, strict_p))
3466 HOST_WIDE_INT offset = INTVAL (op1);
3468 info->type = ADDRESS_REG_IMM;
3469 info->base = op0;
3470 info->offset = op1;
3472 /* TImode and TFmode values are allowed in both pairs of X
3473 registers and individual Q registers. The available
3474 address modes are:
3475 X,X: 7-bit signed scaled offset
3476 Q: 9-bit signed offset
3477 We conservatively require an offset representable in either mode.
3479 if (mode == TImode || mode == TFmode)
3480 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3481 && offset_9bit_signed_unscaled_p (mode, offset));
3483 /* A 7-bit offset check because OImode will emit an ldp/stp
3484 instruction (only big endian will get here).
3485 For ldp/stp instructions, the offset is scaled for the size of a
3486 single element of the pair. */
3487 if (mode == OImode)
3488 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3490 /* Three 9/12-bit offset checks because CImode will emit three
3491 ldr/str instructions (only big endian will get here). */
3492 if (mode == CImode)
3493 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3494 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3495 || offset_12bit_unsigned_scaled_p (V16QImode,
3496 offset + 32)));
3498 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3499 instructions (only big endian will get here). */
3500 if (mode == XImode)
3501 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3502 && aarch64_offset_7bit_signed_scaled_p (TImode,
3503 offset + 32));
3505 if (load_store_pair_p)
3506 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3507 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3508 else
3509 return (offset_9bit_signed_unscaled_p (mode, offset)
3510 || offset_12bit_unsigned_scaled_p (mode, offset));
3513 if (allow_reg_index_p)
3515 /* Look for base + (scaled/extended) index register. */
3516 if (aarch64_base_register_rtx_p (op0, strict_p)
3517 && aarch64_classify_index (info, op1, mode, strict_p))
3519 info->base = op0;
3520 return true;
3522 if (aarch64_base_register_rtx_p (op1, strict_p)
3523 && aarch64_classify_index (info, op0, mode, strict_p))
3525 info->base = op1;
3526 return true;
3530 return false;
3532 case POST_INC:
3533 case POST_DEC:
3534 case PRE_INC:
3535 case PRE_DEC:
3536 info->type = ADDRESS_REG_WB;
3537 info->base = XEXP (x, 0);
3538 info->offset = NULL_RTX;
3539 return aarch64_base_register_rtx_p (info->base, strict_p);
3541 case POST_MODIFY:
3542 case PRE_MODIFY:
3543 info->type = ADDRESS_REG_WB;
3544 info->base = XEXP (x, 0);
3545 if (GET_CODE (XEXP (x, 1)) == PLUS
3546 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3547 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3548 && aarch64_base_register_rtx_p (info->base, strict_p))
3550 HOST_WIDE_INT offset;
3551 info->offset = XEXP (XEXP (x, 1), 1);
3552 offset = INTVAL (info->offset);
3554 /* TImode and TFmode values are allowed in both pairs of X
3555 registers and individual Q registers. The available
3556 address modes are:
3557 X,X: 7-bit signed scaled offset
3558 Q: 9-bit signed offset
3559 We conservatively require an offset representable in either mode.
3561 if (mode == TImode || mode == TFmode)
3562 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3563 && offset_9bit_signed_unscaled_p (mode, offset));
3565 if (load_store_pair_p)
3566 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3567 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3568 else
3569 return offset_9bit_signed_unscaled_p (mode, offset);
3571 return false;
3573 case CONST:
3574 case SYMBOL_REF:
3575 case LABEL_REF:
3576 /* load literal: pc-relative constant pool entry. Only supported
3577 for SI mode or larger. */
3578 info->type = ADDRESS_SYMBOLIC;
3580 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3582 rtx sym, addend;
3584 split_const (x, &sym, &addend);
3585 return (GET_CODE (sym) == LABEL_REF
3586 || (GET_CODE (sym) == SYMBOL_REF
3587 && CONSTANT_POOL_ADDRESS_P (sym)));
3589 return false;
3591 case LO_SUM:
3592 info->type = ADDRESS_LO_SUM;
3593 info->base = XEXP (x, 0);
3594 info->offset = XEXP (x, 1);
3595 if (allow_reg_index_p
3596 && aarch64_base_register_rtx_p (info->base, strict_p))
3598 rtx sym, offs;
3599 split_const (info->offset, &sym, &offs);
3600 if (GET_CODE (sym) == SYMBOL_REF
3601 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3602 == SYMBOL_SMALL_ABSOLUTE))
3604 /* The symbol and offset must be aligned to the access size. */
3605 unsigned int align;
3606 unsigned int ref_size;
3608 if (CONSTANT_POOL_ADDRESS_P (sym))
3609 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3610 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3612 tree exp = SYMBOL_REF_DECL (sym);
3613 align = TYPE_ALIGN (TREE_TYPE (exp));
3614 align = CONSTANT_ALIGNMENT (exp, align);
3616 else if (SYMBOL_REF_DECL (sym))
3617 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3618 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3619 && SYMBOL_REF_BLOCK (sym) != NULL)
3620 align = SYMBOL_REF_BLOCK (sym)->alignment;
3621 else
3622 align = BITS_PER_UNIT;
3624 ref_size = GET_MODE_SIZE (mode);
3625 if (ref_size == 0)
3626 ref_size = GET_MODE_SIZE (DImode);
3628 return ((INTVAL (offs) & (ref_size - 1)) == 0
3629 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3632 return false;
3634 default:
3635 return false;
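/* As an illustration, for a DImode access the classification above
   accepts address forms such as:

     [x0]                      ADDRESS_REG_IMM, offset 0
     [x0, #504]                ADDRESS_REG_IMM, 12-bit scaled offset
     [x0, x1, lsl #3]          ADDRESS_REG_REG, shift 3
     [x0, w1, sxtw #3]         ADDRESS_REG_SXTW
     [x0, #16]! and [x0], #16  ADDRESS_REG_WB (pre/post-modify)
     a constant-pool literal   ADDRESS_SYMBOLIC  */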
3639 bool
3640 aarch64_symbolic_address_p (rtx x)
3642 rtx offset;
3644 split_const (x, &x, &offset);
3645 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3648 /* Classify the base of symbolic expression X, given that X appears in
3649 context CONTEXT. */
3651 enum aarch64_symbol_type
3652 aarch64_classify_symbolic_expression (rtx x,
3653 enum aarch64_symbol_context context)
3655 rtx offset;
3657 split_const (x, &x, &offset);
3658 return aarch64_classify_symbol (x, offset, context);
3662 /* Return TRUE if X is a legitimate address for accessing memory in
3663 mode MODE. */
3664 static bool
3665 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3667 struct aarch64_address_info addr;
3669 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3672 /* Return TRUE if X is a legitimate address for accessing memory in
3673 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3674 pair operation. */
3675 bool
3676 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3677 RTX_CODE outer_code, bool strict_p)
3679 struct aarch64_address_info addr;
3681 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3684 /* Return TRUE if rtx X is immediate constant 0.0 */
3685 bool
3686 aarch64_float_const_zero_rtx_p (rtx x)
3688 REAL_VALUE_TYPE r;
3690 if (GET_MODE (x) == VOIDmode)
3691 return false;
3693 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3694 if (REAL_VALUE_MINUS_ZERO (r))
3695 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3696 return REAL_VALUES_EQUAL (r, dconst0);
3699 /* Return the fixed registers used for condition codes. */
3701 static bool
3702 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3704 *p1 = CC_REGNUM;
3705 *p2 = INVALID_REGNUM;
3706 return true;
3709 /* Emit call insn with PAT and do aarch64-specific handling. */
3711 void
3712 aarch64_emit_call_insn (rtx pat)
3714 rtx insn = emit_call_insn (pat);
3716 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3717 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3718 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3721 machine_mode
3722 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3724 /* All floating point compares return CCFP if it is an equality
3725 comparison, and CCFPE otherwise. */
3726 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3728 switch (code)
3730 case EQ:
3731 case NE:
3732 case UNORDERED:
3733 case ORDERED:
3734 case UNLT:
3735 case UNLE:
3736 case UNGT:
3737 case UNGE:
3738 case UNEQ:
3739 case LTGT:
3740 return CCFPmode;
3742 case LT:
3743 case LE:
3744 case GT:
3745 case GE:
3746 return CCFPEmode;
3748 default:
3749 gcc_unreachable ();
3753 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3754 && y == const0_rtx
3755 && (code == EQ || code == NE || code == LT || code == GE)
3756 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3757 || GET_CODE (x) == NEG))
3758 return CC_NZmode;
3760 /* A compare with a shifted operand. Because of canonicalization,
3761 the comparison will have to be swapped when we emit the assembly
3762 code. */
3763 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3764 && (REG_P (y) || GET_CODE (y) == SUBREG)
3765 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3766 || GET_CODE (x) == LSHIFTRT
3767 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3768 return CC_SWPmode;
3770 /* Similarly for a negated operand, but we can only do this for
3771 equalities. */
3772 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3773 && (REG_P (y) || GET_CODE (y) == SUBREG)
3774 && (code == EQ || code == NE)
3775 && GET_CODE (x) == NEG)
3776 return CC_Zmode;
3778 /* A compare of a mode narrower than SI mode against zero can be done
3779 by extending the value in the comparison. */
3780 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3781 && y == const0_rtx)
3782 /* Only use sign-extension if we really need it. */
3783 return ((code == GT || code == GE || code == LE || code == LT)
3784 ? CC_SESWPmode : CC_ZESWPmode);
3786 /* For everything else, return CCmode. */
3787 return CCmode;
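/* As an illustration: comparing (plus:DI x y) against zero for EQ, NE,
   LT or GE selects CC_NZmode, since only the N and Z flags of an ADDS
   are needed and the compare can be folded into the addition.  A
   compare whose first operand is a shift or extension of a register
   selects CC_SWPmode, recording that canonicalization put the operands
   the "wrong" way round and the condition must be swapped when the
   instruction is output.  */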
3790 static int
3791 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3794 aarch64_get_condition_code (rtx x)
3796 machine_mode mode = GET_MODE (XEXP (x, 0));
3797 enum rtx_code comp_code = GET_CODE (x);
3799 if (GET_MODE_CLASS (mode) != MODE_CC)
3800 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3801 return aarch64_get_condition_code_1 (mode, comp_code);
3804 static int
3805 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3807 int ne = -1, eq = -1;
3808 switch (mode)
3810 case CCFPmode:
3811 case CCFPEmode:
3812 switch (comp_code)
3814 case GE: return AARCH64_GE;
3815 case GT: return AARCH64_GT;
3816 case LE: return AARCH64_LS;
3817 case LT: return AARCH64_MI;
3818 case NE: return AARCH64_NE;
3819 case EQ: return AARCH64_EQ;
3820 case ORDERED: return AARCH64_VC;
3821 case UNORDERED: return AARCH64_VS;
3822 case UNLT: return AARCH64_LT;
3823 case UNLE: return AARCH64_LE;
3824 case UNGT: return AARCH64_HI;
3825 case UNGE: return AARCH64_PL;
3826 default: return -1;
3828 break;
3830 case CC_DNEmode:
3831 ne = AARCH64_NE;
3832 eq = AARCH64_EQ;
3833 break;
3835 case CC_DEQmode:
3836 ne = AARCH64_EQ;
3837 eq = AARCH64_NE;
3838 break;
3840 case CC_DGEmode:
3841 ne = AARCH64_GE;
3842 eq = AARCH64_LT;
3843 break;
3845 case CC_DLTmode:
3846 ne = AARCH64_LT;
3847 eq = AARCH64_GE;
3848 break;
3850 case CC_DGTmode:
3851 ne = AARCH64_GT;
3852 eq = AARCH64_LE;
3853 break;
3855 case CC_DLEmode:
3856 ne = AARCH64_LE;
3857 eq = AARCH64_GT;
3858 break;
3860 case CC_DGEUmode:
3861 ne = AARCH64_CS;
3862 eq = AARCH64_CC;
3863 break;
3865 case CC_DLTUmode:
3866 ne = AARCH64_CC;
3867 eq = AARCH64_CS;
3868 break;
3870 case CC_DGTUmode:
3871 ne = AARCH64_HI;
3872 eq = AARCH64_LS;
3873 break;
3875 case CC_DLEUmode:
3876 ne = AARCH64_LS;
3877 eq = AARCH64_HI;
3878 break;
3880 case CCmode:
3881 switch (comp_code)
3883 case NE: return AARCH64_NE;
3884 case EQ: return AARCH64_EQ;
3885 case GE: return AARCH64_GE;
3886 case GT: return AARCH64_GT;
3887 case LE: return AARCH64_LE;
3888 case LT: return AARCH64_LT;
3889 case GEU: return AARCH64_CS;
3890 case GTU: return AARCH64_HI;
3891 case LEU: return AARCH64_LS;
3892 case LTU: return AARCH64_CC;
3893 default: return -1;
3895 break;
3897 case CC_SWPmode:
3898 case CC_ZESWPmode:
3899 case CC_SESWPmode:
3900 switch (comp_code)
3902 case NE: return AARCH64_NE;
3903 case EQ: return AARCH64_EQ;
3904 case GE: return AARCH64_LE;
3905 case GT: return AARCH64_LT;
3906 case LE: return AARCH64_GE;
3907 case LT: return AARCH64_GT;
3908 case GEU: return AARCH64_LS;
3909 case GTU: return AARCH64_CC;
3910 case LEU: return AARCH64_CS;
3911 case LTU: return AARCH64_HI;
3912 default: return -1;
3914 break;
3916 case CC_NZmode:
3917 switch (comp_code)
3919 case NE: return AARCH64_NE;
3920 case EQ: return AARCH64_EQ;
3921 case GE: return AARCH64_PL;
3922 case LT: return AARCH64_MI;
3923 default: return -1;
3925 break;
3927 case CC_Zmode:
3928 switch (comp_code)
3930 case NE: return AARCH64_NE;
3931 case EQ: return AARCH64_EQ;
3932 default: return -1;
3934 break;
3936 default:
3937 return -1;
3938 break;
3941 if (comp_code == NE)
3942 return ne;
3944 if (comp_code == EQ)
3945 return eq;
3947 return -1;
3950 bool
3951 aarch64_const_vec_all_same_in_range_p (rtx x,
3952 HOST_WIDE_INT minval,
3953 HOST_WIDE_INT maxval)
3955 HOST_WIDE_INT firstval;
3956 int count, i;
3958 if (GET_CODE (x) != CONST_VECTOR
3959 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3960 return false;
3962 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3963 if (firstval < minval || firstval > maxval)
3964 return false;
3966 count = CONST_VECTOR_NUNITS (x);
3967 for (i = 1; i < count; i++)
3968 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3969 return false;
3971 return true;
3974 bool
3975 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3977 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3980 static unsigned
3981 bit_count (unsigned HOST_WIDE_INT value)
3983 unsigned count = 0;
3985 while (value)
3987 count++;
3988 value &= value - 1;
3991 return count;
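/* The loop above is the classic trick: "value &= value - 1" clears the
   lowest set bit on each iteration, so e.g. 0b101100 takes exactly
   three iterations (101100 -> 101000 -> 100000 -> 0).  */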
3994 /* N Z C V. */
3995 #define AARCH64_CC_V 1
3996 #define AARCH64_CC_C (1 << 1)
3997 #define AARCH64_CC_Z (1 << 2)
3998 #define AARCH64_CC_N (1 << 3)
4000 /* N Z C V flags for ccmp. The first code is for AND op and the other
4001 is for IOR op. Indexed by AARCH64_COND_CODE. */
4002 static const int aarch64_nzcv_codes[][2] =
4004 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4005 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4006 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4007 {0, AARCH64_CC_C}, /* CC, C == 0. */
4008 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4009 {0, AARCH64_CC_N}, /* PL, N == 0. */
4010 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4011 {0, AARCH64_CC_V}, /* VC, V == 0. */
4012 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4013 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4014 {0, AARCH64_CC_V}, /* GE, N == V. */
4015 {AARCH64_CC_V, 0}, /* LT, N != V. */
4016 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4017 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4018 {0, 0}, /* AL, Any. */
4019 {0, 0}, /* NV, Any. */
4023 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4025 switch (mode)
4027 case CC_DNEmode:
4028 return NE;
4030 case CC_DEQmode:
4031 return EQ;
4033 case CC_DLEmode:
4034 return LE;
4036 case CC_DGTmode:
4037 return GT;
4039 case CC_DLTmode:
4040 return LT;
4042 case CC_DGEmode:
4043 return GE;
4045 case CC_DLEUmode:
4046 return LEU;
4048 case CC_DGTUmode:
4049 return GTU;
4051 case CC_DLTUmode:
4052 return LTU;
4054 case CC_DGEUmode:
4055 return GEU;
4057 default:
4058 gcc_unreachable ();
4063 void
4064 aarch64_print_operand (FILE *f, rtx x, char code)
4066 switch (code)
4068 /* An integer or symbol address without a preceding # sign. */
4069 case 'c':
4070 switch (GET_CODE (x))
4072 case CONST_INT:
4073 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4074 break;
4076 case SYMBOL_REF:
4077 output_addr_const (f, x);
4078 break;
4080 case CONST:
4081 if (GET_CODE (XEXP (x, 0)) == PLUS
4082 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4084 output_addr_const (f, x);
4085 break;
4087 /* Fall through. */
4089 default:
4090 output_operand_lossage ("Unsupported operand for code '%c'", code);
4092 break;
4094 case 'e':
4095 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4097 int n;
4099 if (!CONST_INT_P (x)
4100 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4102 output_operand_lossage ("invalid operand for '%%%c'", code);
4103 return;
4106 switch (n)
4108 case 3:
4109 fputc ('b', f);
4110 break;
4111 case 4:
4112 fputc ('h', f);
4113 break;
4114 case 5:
4115 fputc ('w', f);
4116 break;
4117 default:
4118 output_operand_lossage ("invalid operand for '%%%c'", code);
4119 return;
4122 break;
4124 case 'p':
4126 int n;
4128 /* Print N such that 2^N == X. */
4129 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4131 output_operand_lossage ("invalid operand for '%%%c'", code);
4132 return;
4135 asm_fprintf (f, "%d", n);
4137 break;
4139 case 'P':
4140 /* Print the number of non-zero bits in X (a const_int). */
4141 if (!CONST_INT_P (x))
4143 output_operand_lossage ("invalid operand for '%%%c'", code);
4144 return;
4147 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4148 break;
4150 case 'H':
4151 /* Print the higher numbered register of a pair (TImode) of regs. */
4152 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4154 output_operand_lossage ("invalid operand for '%%%c'", code);
4155 return;
4158 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4159 break;
4161 case 'm':
4163 int cond_code;
4164 /* Print a condition (eq, ne, etc). */
4166 /* CONST_TRUE_RTX means always -- that's the default. */
4167 if (x == const_true_rtx)
4168 return;
4170 if (!COMPARISON_P (x))
4172 output_operand_lossage ("invalid operand for '%%%c'", code);
4173 return;
4176 cond_code = aarch64_get_condition_code (x);
4177 gcc_assert (cond_code >= 0);
4178 fputs (aarch64_condition_codes[cond_code], f);
4180 break;
4182 case 'M':
4184 int cond_code;
4185 /* Print the inverse of a condition (eq <-> ne, etc). */
4187 /* CONST_TRUE_RTX means never -- that's the default. */
4188 if (x == const_true_rtx)
4190 fputs ("nv", f);
4191 return;
4194 if (!COMPARISON_P (x))
4196 output_operand_lossage ("invalid operand for '%%%c'", code);
4197 return;
4199 cond_code = aarch64_get_condition_code (x);
4200 gcc_assert (cond_code >= 0);
4201 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4202 (cond_code)], f);
4204 break;
4206 case 'b':
4207 case 'h':
4208 case 's':
4209 case 'd':
4210 case 'q':
4211 /* Print a scalar FP/SIMD register name. */
4212 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4214 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4215 return;
4217 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4218 break;
4220 case 'S':
4221 case 'T':
4222 case 'U':
4223 case 'V':
4224 /* Print the first FP/SIMD register name in a list. */
4225 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4227 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4228 return;
4230 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4231 break;
4233 case 'R':
4234 /* Print a scalar FP/SIMD register name + 1. */
4235 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4237 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4238 return;
4240 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4241 break;
4243 case 'X':
4244 /* Print bottom 16 bits of integer constant in hex. */
4245 if (!CONST_INT_P (x))
4247 output_operand_lossage ("invalid operand for '%%%c'", code);
4248 return;
4250 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4251 break;
4253 case 'w':
4254 case 'x':
4255 /* Print a general register name or the zero register (32-bit or
4256 64-bit). */
4257 if (x == const0_rtx
4258 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4260 asm_fprintf (f, "%czr", code);
4261 break;
4264 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4266 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4267 break;
4270 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4272 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4273 break;
4276 /* Fall through */
4278 case 0:
4279 /* Print a normal operand, if it's a general register, then we
4280 assume DImode. */
4281 if (x == NULL)
4283 output_operand_lossage ("missing operand");
4284 return;
4287 switch (GET_CODE (x))
4289 case REG:
4290 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4291 break;
4293 case MEM:
4294 aarch64_memory_reference_mode = GET_MODE (x);
4295 output_address (XEXP (x, 0));
4296 break;
4298 case LABEL_REF:
4299 case SYMBOL_REF:
4300 output_addr_const (asm_out_file, x);
4301 break;
4303 case CONST_INT:
4304 asm_fprintf (f, "%wd", INTVAL (x));
4305 break;
4307 case CONST_VECTOR:
4308 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4310 gcc_assert (
4311 aarch64_const_vec_all_same_in_range_p (x,
4312 HOST_WIDE_INT_MIN,
4313 HOST_WIDE_INT_MAX));
4314 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4316 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4318 fputc ('0', f);
4320 else
4321 gcc_unreachable ();
4322 break;
4324 case CONST_DOUBLE:
4325 /* CONST_DOUBLE can represent a double-width integer.
4326 In this case, the mode of x is VOIDmode. */
4327 if (GET_MODE (x) == VOIDmode)
4328 ; /* Do Nothing. */
4329 else if (aarch64_float_const_zero_rtx_p (x))
4331 fputc ('0', f);
4332 break;
4334 else if (aarch64_float_const_representable_p (x))
4336 #define buf_size 20
4337 char float_buf[buf_size] = {'\0'};
4338 REAL_VALUE_TYPE r;
4339 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4340 real_to_decimal_for_mode (float_buf, &r,
4341 buf_size, buf_size,
4342 1, GET_MODE (x));
4343 asm_fprintf (asm_out_file, "%s", float_buf);
4344 break;
4345 #undef buf_size
4347 output_operand_lossage ("invalid constant");
4348 return;
4349 default:
4350 output_operand_lossage ("invalid operand");
4351 return;
4353 break;
4355 case 'A':
4356 if (GET_CODE (x) == HIGH)
4357 x = XEXP (x, 0);
4359 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4361 case SYMBOL_SMALL_GOT:
4362 asm_fprintf (asm_out_file, ":got:");
4363 break;
4365 case SYMBOL_SMALL_TLSGD:
4366 asm_fprintf (asm_out_file, ":tlsgd:");
4367 break;
4369 case SYMBOL_SMALL_TLSDESC:
4370 asm_fprintf (asm_out_file, ":tlsdesc:");
4371 break;
4373 case SYMBOL_SMALL_GOTTPREL:
4374 asm_fprintf (asm_out_file, ":gottprel:");
4375 break;
4377 case SYMBOL_SMALL_TPREL:
4378 asm_fprintf (asm_out_file, ":tprel:");
4379 break;
4381 case SYMBOL_TINY_GOT:
4382 gcc_unreachable ();
4383 break;
4385 default:
4386 break;
4388 output_addr_const (asm_out_file, x);
4389 break;
4391 case 'L':
4392 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4394 case SYMBOL_SMALL_GOT:
4395 asm_fprintf (asm_out_file, ":lo12:");
4396 break;
4398 case SYMBOL_SMALL_TLSGD:
4399 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4400 break;
4402 case SYMBOL_SMALL_TLSDESC:
4403 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4404 break;
4406 case SYMBOL_SMALL_GOTTPREL:
4407 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4408 break;
4410 case SYMBOL_SMALL_TPREL:
4411 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4412 break;
4414 case SYMBOL_TINY_GOT:
4415 asm_fprintf (asm_out_file, ":got:");
4416 break;
4418 default:
4419 break;
4421 output_addr_const (asm_out_file, x);
4422 break;
4424 case 'G':
4426 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4428 case SYMBOL_SMALL_TPREL:
4429 asm_fprintf (asm_out_file, ":tprel_hi12:");
4430 break;
4431 default:
4432 break;
4434 output_addr_const (asm_out_file, x);
4435 break;
4437 case 'K':
4439 int cond_code;
4440 /* Print nzcv. */
4442 if (!COMPARISON_P (x))
4444 output_operand_lossage ("invalid operand for '%%%c'", code);
4445 return;
4448 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4449 gcc_assert (cond_code >= 0);
4450 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4452 break;
4454 case 'k':
4456 int cond_code;
4457 /* Print nzcv. */
4459 if (!COMPARISON_P (x))
4461 output_operand_lossage ("invalid operand for '%%%c'", code);
4462 return;
4465 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4466 gcc_assert (cond_code >= 0);
4467 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4469 break;
4471 default:
4472 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4473 return;
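/* For reference, the address syntaxes printed below look like the
   following (base/index registers and offsets are arbitrary examples):

     ADDRESS_REG_IMM    [x0]             [x0, 16]
     ADDRESS_REG_REG    [x0, x1]         [x0, x1, lsl 3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw]   [x0, w1, uxtw 2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw]   [x0, w1, sxtw 2]
     ADDRESS_REG_WB     [x0, 16]!        [x0], 16
     ADDRESS_LO_SUM     [x0, #:lo12:symbol]  */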
4477 void
4478 aarch64_print_operand_address (FILE *f, rtx x)
4480 struct aarch64_address_info addr;
4482 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4483 MEM, true))
4484 switch (addr.type)
4486 case ADDRESS_REG_IMM:
4487 if (addr.offset == const0_rtx)
4488 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4489 else
4490 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4491 INTVAL (addr.offset));
4492 return;
4494 case ADDRESS_REG_REG:
4495 if (addr.shift == 0)
4496 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4497 reg_names [REGNO (addr.offset)]);
4498 else
4499 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4500 reg_names [REGNO (addr.offset)], addr.shift);
4501 return;
4503 case ADDRESS_REG_UXTW:
4504 if (addr.shift == 0)
4505 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4506 REGNO (addr.offset) - R0_REGNUM);
4507 else
4508 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4509 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4510 return;
4512 case ADDRESS_REG_SXTW:
4513 if (addr.shift == 0)
4514 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4515 REGNO (addr.offset) - R0_REGNUM);
4516 else
4517 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4518 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4519 return;
4521 case ADDRESS_REG_WB:
4522 switch (GET_CODE (x))
4524 case PRE_INC:
4525 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4526 GET_MODE_SIZE (aarch64_memory_reference_mode));
4527 return;
4528 case POST_INC:
4529 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4530 GET_MODE_SIZE (aarch64_memory_reference_mode));
4531 return;
4532 case PRE_DEC:
4533 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4534 GET_MODE_SIZE (aarch64_memory_reference_mode));
4535 return;
4536 case POST_DEC:
4537 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4538 GET_MODE_SIZE (aarch64_memory_reference_mode));
4539 return;
4540 case PRE_MODIFY:
4541 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4542 INTVAL (addr.offset));
4543 return;
4544 case POST_MODIFY:
4545 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4546 INTVAL (addr.offset));
4547 return;
4548 default:
4549 break;
4551 break;
4553 case ADDRESS_LO_SUM:
4554 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4555 output_addr_const (f, addr.offset);
4556 asm_fprintf (f, "]");
4557 return;
4559 case ADDRESS_SYMBOLIC:
4560 break;
4563 output_addr_const (f, x);
4566 bool
4567 aarch64_label_mentioned_p (rtx x)
4569 const char *fmt;
4570 int i;
4572 if (GET_CODE (x) == LABEL_REF)
4573 return true;
4575 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4576 referencing instruction, but they are constant offsets, not
4577 symbols. */
4578 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4579 return false;
4581 fmt = GET_RTX_FORMAT (GET_CODE (x));
4582 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4584 if (fmt[i] == 'E')
4586 int j;
4588 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4589 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4590 return 1;
4592 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4593 return 1;
4596 return 0;
4599 /* Implement REGNO_REG_CLASS. */
4601 enum reg_class
4602 aarch64_regno_regclass (unsigned regno)
4604 if (GP_REGNUM_P (regno))
4605 return GENERAL_REGS;
4607 if (regno == SP_REGNUM)
4608 return STACK_REG;
4610 if (regno == FRAME_POINTER_REGNUM
4611 || regno == ARG_POINTER_REGNUM)
4612 return POINTER_REGS;
4614 if (FP_REGNUM_P (regno))
4615 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4617 return NO_REGS;
4620 static rtx
4621 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4623 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4624 where mask is selected by alignment and size of the offset.
4625 We try to pick as large a range for the offset as possible to
4626 maximize the chance of a CSE. However, for aligned addresses
4627 we limit the range to 4k so that structures with different sized
4628 elements are likely to use the same base. */
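/* As a worked example (the constants are illustrative): an aligned SImode
   access to X + 0x13004 is split as

     base = X + 0x13000    (offset & ~0xfff)
     addr = base + 0x4

   so that other accesses in the same 4k window, X + 0x13000 to
   X + 0x13fff, can CSE the base computation.  */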
4630 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4632 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4633 HOST_WIDE_INT base_offset;
4635 /* Does it look like we'll need a load/store-pair operation? */
4636 if (GET_MODE_SIZE (mode) > 16
4637 || mode == TImode)
4638 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4639 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4640 /* For offsets that aren't a multiple of the access size, the limit is
4641 -256...255. */
4642 else if (offset & (GET_MODE_SIZE (mode) - 1))
4643 base_offset = (offset + 0x100) & ~0x1ff;
4644 else
4645 base_offset = offset & ~0xfff;
4647 if (base_offset == 0)
4648 return x;
4650 offset -= base_offset;
4651 rtx base_reg = gen_reg_rtx (Pmode);
4652 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4653 NULL_RTX);
4654 emit_move_insn (base_reg, val);
4655 x = plus_constant (Pmode, base_reg, offset);
4658 return x;
4661 /* Try a machine-dependent way of reloading an illegitimate address
4662 operand. If we find one, push the reload and return the new rtx. */
4665 aarch64_legitimize_reload_address (rtx *x_p,
4666 machine_mode mode,
4667 int opnum, int type,
4668 int ind_levels ATTRIBUTE_UNUSED)
4670 rtx x = *x_p;
4672 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4673 if (aarch64_vect_struct_mode_p (mode)
4674 && GET_CODE (x) == PLUS
4675 && REG_P (XEXP (x, 0))
4676 && CONST_INT_P (XEXP (x, 1)))
4678 rtx orig_rtx = x;
4679 x = copy_rtx (x);
4680 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4681 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4682 opnum, (enum reload_type) type);
4683 return x;
4686 /* We must recognize output that we have already generated ourselves. */
4687 if (GET_CODE (x) == PLUS
4688 && GET_CODE (XEXP (x, 0)) == PLUS
4689 && REG_P (XEXP (XEXP (x, 0), 0))
4690 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4691 && CONST_INT_P (XEXP (x, 1)))
4693 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4694 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4695 opnum, (enum reload_type) type);
4696 return x;
4699 /* We wish to handle large displacements off a base register by splitting
4700 the addend across an add and the mem insn. This can cut the number of
4701 extra insns needed from 3 to 1. It is only useful for load/store of a
4702 single register with 12 bit offset field. */
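/* As a rough sketch of the transformation (register names are
   placeholders): for an SImode access at BASE + 0x13004 we have
   val = 0x13004, low = 0x004, high = 0x13000.  HIGH fits a shifted
   12-bit immediate, so reload ends up emitting approximately

     add  scratch, base, #0x13000
     ldr  w0, [scratch, #4]

   i.e. a single extra ADD instead of materialising the whole constant.  */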
4703 if (GET_CODE (x) == PLUS
4704 && REG_P (XEXP (x, 0))
4705 && CONST_INT_P (XEXP (x, 1))
4706 && HARD_REGISTER_P (XEXP (x, 0))
4707 && mode != TImode
4708 && mode != TFmode
4709 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4711 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4712 HOST_WIDE_INT low = val & 0xfff;
4713 HOST_WIDE_INT high = val - low;
4714 HOST_WIDE_INT offs;
4715 rtx cst;
4716 machine_mode xmode = GET_MODE (x);
4718 /* In ILP32, xmode can be either DImode or SImode. */
4719 gcc_assert (xmode == DImode || xmode == SImode);
4721 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4722 BLKmode alignment. */
4723 if (GET_MODE_SIZE (mode) == 0)
4724 return NULL_RTX;
4726 offs = low % GET_MODE_SIZE (mode);
4728 /* Align misaligned offset by adjusting high part to compensate. */
4729 if (offs != 0)
4731 if (aarch64_uimm12_shift (high + offs))
4733 /* Align down. */
4734 low = low - offs;
4735 high = high + offs;
4737 else
4739 /* Align up. */
4740 offs = GET_MODE_SIZE (mode) - offs;
4741 low = low + offs;
4742 high = high + (low & 0x1000) - offs;
4743 low &= 0xfff;
4747 /* Check for overflow. */
4748 if (high + low != val)
4749 return NULL_RTX;
4751 cst = GEN_INT (high);
4752 if (!aarch64_uimm12_shift (high))
4753 cst = force_const_mem (xmode, cst);
4755 /* Reload high part into base reg, leaving the low part
4756 in the mem instruction.
4757 Note that replacing this gen_rtx_PLUS with plus_constant is
4758 wrong in this case because we rely on the
4759 (plus (plus reg c1) c2) structure being preserved so that
4760 XEXP (*p, 0) in push_reload below uses the correct term. */
4761 x = gen_rtx_PLUS (xmode,
4762 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4763 GEN_INT (low));
4765 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4766 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4767 opnum, (enum reload_type) type);
4768 return x;
4771 return NULL_RTX;
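/* As an illustration of the secondary-reload case handled first below:
   without TARGET_SIMD there is no direct instruction to copy a TImode or
   TFmode value between two FP/SIMD (Q) registers, so we return NO_REGS
   and point reload at the aarch64_reload_movti / aarch64_reload_movtf
   expanders, which perform the copy via a scratch register.  */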
4775 static reg_class_t
4776 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4777 reg_class_t rclass,
4778 machine_mode mode,
4779 secondary_reload_info *sri)
4781 /* Without the TARGET_SIMD instructions we cannot move a Q register
4782 to a Q register directly. We need a scratch. */
4783 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4784 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4785 && reg_class_subset_p (rclass, FP_REGS))
4787 if (mode == TFmode)
4788 sri->icode = CODE_FOR_aarch64_reload_movtf;
4789 else if (mode == TImode)
4790 sri->icode = CODE_FOR_aarch64_reload_movti;
4791 return NO_REGS;
4794 /* A TFmode or TImode memory access should be handled via an FP_REGS
4795 because AArch64 has richer addressing modes for LDR/STR instructions
4796 than LDP/STP instructions. */
4797 if (TARGET_FLOAT && rclass == GENERAL_REGS
4798 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4799 return FP_REGS;
4801 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4802 return GENERAL_REGS;
4804 return NO_REGS;
4807 static bool
4808 aarch64_can_eliminate (const int from, const int to)
4810 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4811 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4813 if (frame_pointer_needed)
4815 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4816 return true;
4817 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4818 return false;
4819 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4820 && !cfun->calls_alloca)
4821 return true;
4822 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4823 return true;
4825 return false;
4827 else
4829 /* If we decided that we didn't need a leaf frame pointer but then used
4830 LR in the function, then we'll want a frame pointer after all, so
4831 prevent this elimination to ensure a frame pointer is used. */
4832 if (to == STACK_POINTER_REGNUM
4833 && flag_omit_leaf_frame_pointer
4834 && df_regs_ever_live_p (LR_REGNUM))
4835 return false;
4838 return true;
4841 HOST_WIDE_INT
4842 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4844 aarch64_layout_frame ();
4846 if (to == HARD_FRAME_POINTER_REGNUM)
4848 if (from == ARG_POINTER_REGNUM)
4849 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4851 if (from == FRAME_POINTER_REGNUM)
4852 return (cfun->machine->frame.hard_fp_offset
4853 - cfun->machine->frame.saved_varargs_size);
4856 if (to == STACK_POINTER_REGNUM)
4858 if (from == FRAME_POINTER_REGNUM)
4859 return (cfun->machine->frame.frame_size
4860 - cfun->machine->frame.saved_varargs_size);
4863 return cfun->machine->frame.frame_size;
4866 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4867 previous frame. */
4870 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4872 if (count != 0)
4873 return const0_rtx;
4874 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
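/* For reference, the LP64 trampoline emitted below is laid out as

      0:  ldr  x17, .+16     load the target function address
      4:  ldr  x18, .+20     load the static chain value
      8:  br   x17
     12:  <4 bytes of padding>
     16:  <target function address>
     24:  <static chain value>

   aarch64_trampoline_init copies the 16 code bytes and then stores the two
   data words at offsets 16 and 16 + POINTER_BYTES; the ILP32 variant loads
   w registers and uses 4-byte data slots.  */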
4878 static void
4879 aarch64_asm_trampoline_template (FILE *f)
4881 if (TARGET_ILP32)
4883 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4884 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4886 else
4888 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4889 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4891 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4892 assemble_aligned_integer (4, const0_rtx);
4893 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4894 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4897 static void
4898 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4900 rtx fnaddr, mem, a_tramp;
4901 const int tramp_code_sz = 16;
4903 /* Don't need to copy the trailing D-words; we fill those in below. */
4904 emit_block_move (m_tramp, assemble_trampoline_template (),
4905 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4906 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4907 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4908 if (GET_MODE (fnaddr) != ptr_mode)
4909 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4910 emit_move_insn (mem, fnaddr);
4912 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4913 emit_move_insn (mem, chain_value);
4915 /* XXX We should really define a "clear_cache" pattern and use
4916 gen_clear_cache(). */
4917 a_tramp = XEXP (m_tramp, 0);
4918 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4919 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4920 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4921 ptr_mode);
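/* As illustrative examples of the counts computed below: a 16-byte TImode
   value occupies two registers in any of the classes in the first group
   (GET_MODE_SIZE / UNITS_PER_WORD), whereas a 16-byte V4SImode vector
   occupies a single FP/SIMD register (GET_MODE_SIZE / UNITS_PER_VREG),
   and STACK_REG always counts as one.  */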
4924 static unsigned char
4925 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4927 switch (regclass)
4929 case CALLER_SAVE_REGS:
4930 case POINTER_REGS:
4931 case GENERAL_REGS:
4932 case ALL_REGS:
4933 case FP_REGS:
4934 case FP_LO_REGS:
4935 return
4936 aarch64_vector_mode_p (mode)
4937 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
4938 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
4939 case STACK_REG:
4940 return 1;
4942 case NO_REGS:
4943 return 0;
4945 default:
4946 break;
4948 gcc_unreachable ();
4951 static reg_class_t
4952 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4954 if (regclass == POINTER_REGS)
4955 return GENERAL_REGS;
4957 if (regclass == STACK_REG)
4959 if (REG_P(x)
4960 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4961 return regclass;
4963 return NO_REGS;
4966 /* If it's an integer immediate that MOVI can't handle, then
4967 FP_REGS is not an option, so we return NO_REGS instead. */
4968 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4969 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4970 return NO_REGS;
4972 /* Register elimination can result in a request for
4973 SP+constant->FP_REGS. We cannot support such operations, which
4974 use SP as source and an FP_REG as destination, so reject them
4975 right away. */
4976 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4978 rtx lhs = XEXP (x, 0);
4980 /* Look through a possible SUBREG introduced by ILP32. */
4981 if (GET_CODE (lhs) == SUBREG)
4982 lhs = SUBREG_REG (lhs);
4984 gcc_assert (REG_P (lhs));
4985 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4986 POINTER_REGS));
4987 return NO_REGS;
4990 return regclass;
4993 void
4994 aarch64_asm_output_labelref (FILE* f, const char *name)
4996 asm_fprintf (f, "%U%s", name);
4999 static void
5000 aarch64_elf_asm_constructor (rtx symbol, int priority)
5002 if (priority == DEFAULT_INIT_PRIORITY)
5003 default_ctor_section_asm_out_constructor (symbol, priority);
5004 else
5006 section *s;
5007 char buf[18];
5008 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5009 s = get_section (buf, SECTION_WRITE, NULL);
5010 switch_to_section (s);
5011 assemble_align (POINTER_SIZE);
5012 assemble_aligned_integer (POINTER_BYTES, symbol);
5016 static void
5017 aarch64_elf_asm_destructor (rtx symbol, int priority)
5019 if (priority == DEFAULT_INIT_PRIORITY)
5020 default_dtor_section_asm_out_destructor (symbol, priority);
5021 else
5023 section *s;
5024 char buf[18];
5025 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5026 s = get_section (buf, SECTION_WRITE, NULL);
5027 switch_to_section (s);
5028 assemble_align (POINTER_SIZE);
5029 assemble_aligned_integer (POINTER_BYTES, symbol);
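/* For illustration, with a 2-byte (HImode) dispatch table the sequence
   emitted by aarch64_output_casesi is approximately

     ldrh  %w3, [%0, %w1, uxtw #1]
     adr   %4, .Lrtx<N>
     add   %3, %4, %w3, sxth #2
     br    %3
   .Lrtx<N>:

   where <N> is the label number of operands[2] and the table entries hold
   each target's offset from .Lrtx<N>, pre-scaled to match the final left
   shift by 2.  */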
5033 const char*
5034 aarch64_output_casesi (rtx *operands)
5036 char buf[100];
5037 char label[100];
5038 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5039 int index;
5040 static const char *const patterns[4][2] =
5043 "ldrb\t%w3, [%0,%w1,uxtw]",
5044 "add\t%3, %4, %w3, sxtb #2"
5047 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5048 "add\t%3, %4, %w3, sxth #2"
5051 "ldr\t%w3, [%0,%w1,uxtw #2]",
5052 "add\t%3, %4, %w3, sxtw #2"
5054 /* We assume that DImode is only generated when not optimizing and
5055 that we don't really need 64-bit address offsets. That would
5056 imply an object file with 8GB of code in a single function! */
5058 "ldr\t%w3, [%0,%w1,uxtw #2]",
5059 "add\t%3, %4, %w3, sxtw #2"
5063 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5065 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5067 gcc_assert (index >= 0 && index <= 3);
5069 /* Need to implement table size reduction, by changing the code below. */
5070 output_asm_insn (patterns[index][0], operands);
5071 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5072 snprintf (buf, sizeof (buf),
5073 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5074 output_asm_insn (buf, operands);
5075 output_asm_insn (patterns[index][1], operands);
5076 output_asm_insn ("br\t%3", operands);
5077 assemble_label (asm_out_file, label);
5078 return "";
5082 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5083 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5084 operator. */
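/* For example, aarch64_uxt_size (1, 0x1fe) is 8: the mask is 0xff shifted
   left by one, i.e. an operand zero-extended from 8 bits and then shifted
   (uxtb #1).  Similarly aarch64_uxt_size (0, 0xffffffff) is 32 (a plain
   uxtw), and any mask that is not a shifted 8/16/32-bit block gives 0.  */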
5087 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5089 if (shift >= 0 && shift <= 3)
5091 int size;
5092 for (size = 8; size <= 32; size *= 2)
5094 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5095 if (mask == bits << shift)
5096 return size;
5099 return 0;
5102 static bool
5103 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5104 const_rtx x ATTRIBUTE_UNUSED)
5106 /* We can't use blocks for constants when we're using a per-function
5107 constant pool. */
5108 return false;
5111 static section *
5112 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5113 rtx x ATTRIBUTE_UNUSED,
5114 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5116 /* Force all constant pool entries into the current function section. */
5117 return function_section (current_function_decl);
5121 /* Costs. */
5123 /* Helper function for rtx cost calculation. Strip a shift expression
5124 from X. Returns the inner operand if successful, or the original
5125 expression on failure. */
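/* For example, both (ashift (reg X) (const_int 3)) and its canonical
   multiply form (mult (reg X) (const_int 8)) strip to (reg X), since
   either can be folded into the shifted-operand field of an ALU
   instruction.  */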
5126 static rtx
5127 aarch64_strip_shift (rtx x)
5129 rtx op = x;
5131 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5132 we can convert both to ROR during final output. */
5133 if ((GET_CODE (op) == ASHIFT
5134 || GET_CODE (op) == ASHIFTRT
5135 || GET_CODE (op) == LSHIFTRT
5136 || GET_CODE (op) == ROTATERT
5137 || GET_CODE (op) == ROTATE)
5138 && CONST_INT_P (XEXP (op, 1)))
5139 return XEXP (op, 0);
5141 if (GET_CODE (op) == MULT
5142 && CONST_INT_P (XEXP (op, 1))
5143 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5144 return XEXP (op, 0);
5146 return x;
5149 /* Helper function for rtx cost calculation. Strip an extend
5150 expression from X. Returns the inner operand if successful, or the
5151 original expression on failure. We deal with a number of possible
5152 canonicalization variations here. */
5153 static rtx
5154 aarch64_strip_extend (rtx x)
5156 rtx op = x;
5158 /* Zero and sign extraction of a widened value. */
5159 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5160 && XEXP (op, 2) == const0_rtx
5161 && GET_CODE (XEXP (op, 0)) == MULT
5162 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5163 XEXP (op, 1)))
5164 return XEXP (XEXP (op, 0), 0);
5166 /* It can also be represented (for zero-extend) as an AND with an
5167 immediate. */
5168 if (GET_CODE (op) == AND
5169 && GET_CODE (XEXP (op, 0)) == MULT
5170 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5171 && CONST_INT_P (XEXP (op, 1))
5172 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5173 INTVAL (XEXP (op, 1))) != 0)
5174 return XEXP (XEXP (op, 0), 0);
5176 /* Now handle extended register, as this may also have an optional
5177 left shift by 1..4. */
5178 if (GET_CODE (op) == ASHIFT
5179 && CONST_INT_P (XEXP (op, 1))
5180 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5181 op = XEXP (op, 0);
5183 if (GET_CODE (op) == ZERO_EXTEND
5184 || GET_CODE (op) == SIGN_EXTEND)
5185 op = XEXP (op, 0);
5187 if (op != x)
5188 return op;
5190 return x;
5193 /* Return true iff CODE is a shift supported in combination
5194 with arithmetic instructions. */
5196 static bool
5197 aarch64_shift_p (enum rtx_code code)
5199 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5202 /* Helper function for rtx cost calculation. Calculate the cost of
5203 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5204 Return the calculated cost of the expression, recursing manually in to
5205 operands where needed. */
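/* For example, (plus (mult (reg) (const_int 4)) (reg)) reaches this
   function with OUTER == PLUS, so the multiply is costed as the
   shift-by-immediate part of a single ADD-with-shift (alu.arith_shift)
   rather than as a separate MUL plus an ADD.  */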
5207 static int
5208 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5210 rtx op0, op1;
5211 const struct cpu_cost_table *extra_cost
5212 = aarch64_tune_params->insn_extra_cost;
5213 int cost = 0;
5214 bool compound_p = (outer == PLUS || outer == MINUS);
5215 machine_mode mode = GET_MODE (x);
5217 gcc_checking_assert (code == MULT);
5219 op0 = XEXP (x, 0);
5220 op1 = XEXP (x, 1);
5222 if (VECTOR_MODE_P (mode))
5223 mode = GET_MODE_INNER (mode);
5225 /* Integer multiply/fma. */
5226 if (GET_MODE_CLASS (mode) == MODE_INT)
5228 /* The multiply will be canonicalized as a shift, cost it as such. */
5229 if (aarch64_shift_p (GET_CODE (x))
5230 || (CONST_INT_P (op1)
5231 && exact_log2 (INTVAL (op1)) > 0))
5233 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5234 || GET_CODE (op0) == SIGN_EXTEND;
5235 if (speed)
5237 if (compound_p)
5239 if (REG_P (op1))
5240 /* ARITH + shift-by-register. */
5241 cost += extra_cost->alu.arith_shift_reg;
5242 else if (is_extend)
5243 /* ARITH + extended register. We don't have a cost field
5244 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5245 cost += extra_cost->alu.extend_arith;
5246 else
5247 /* ARITH + shift-by-immediate. */
5248 cost += extra_cost->alu.arith_shift;
5250 else
5251 /* LSL (immediate). */
5252 cost += extra_cost->alu.shift;
5255 /* Strip extends as we will have costed them in the case above. */
5256 if (is_extend)
5257 op0 = aarch64_strip_extend (op0);
5259 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5261 return cost;
5264 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5265 compound and let the below cases handle it. After all, MNEG is a
5266 special-case alias of MSUB. */
5267 if (GET_CODE (op0) == NEG)
5269 op0 = XEXP (op0, 0);
5270 compound_p = true;
5273 /* Integer multiplies or FMAs have zero/sign extending variants. */
5274 if ((GET_CODE (op0) == ZERO_EXTEND
5275 && GET_CODE (op1) == ZERO_EXTEND)
5276 || (GET_CODE (op0) == SIGN_EXTEND
5277 && GET_CODE (op1) == SIGN_EXTEND))
5279 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5280 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5282 if (speed)
5284 if (compound_p)
5285 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5286 cost += extra_cost->mult[0].extend_add;
5287 else
5288 /* MUL/SMULL/UMULL. */
5289 cost += extra_cost->mult[0].extend;
5292 return cost;
5295 /* This is either an integer multiply or a MADD. In both cases
5296 we want to recurse and cost the operands. */
5297 cost += rtx_cost (op0, MULT, 0, speed)
5298 + rtx_cost (op1, MULT, 1, speed);
5300 if (speed)
5302 if (compound_p)
5303 /* MADD/MSUB. */
5304 cost += extra_cost->mult[mode == DImode].add;
5305 else
5306 /* MUL. */
5307 cost += extra_cost->mult[mode == DImode].simple;
5310 return cost;
5312 else
5314 if (speed)
5316 /* Floating-point FMA/FMUL can also support negations of the
5317 operands. */
5318 if (GET_CODE (op0) == NEG)
5319 op0 = XEXP (op0, 0);
5320 if (GET_CODE (op1) == NEG)
5321 op1 = XEXP (op1, 0);
5323 if (compound_p)
5324 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5325 cost += extra_cost->fp[mode == DFmode].fma;
5326 else
5327 /* FMUL/FNMUL. */
5328 cost += extra_cost->fp[mode == DFmode].mult;
5331 cost += rtx_cost (op0, MULT, 0, speed)
5332 + rtx_cost (op1, MULT, 1, speed);
5333 return cost;
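/* Compute the cost of an address.  As an illustration of the cases below,
   an immediate-offset address such as (plus (reg) (const_int 16)) costs
   imm_offset, while a scaled register index such as [base, index, lsl #3]
   used for a DImode access costs register_offset plus
   addr_cost->addr_scale_costs.di.  */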
5337 static int
5338 aarch64_address_cost (rtx x,
5339 machine_mode mode,
5340 addr_space_t as ATTRIBUTE_UNUSED,
5341 bool speed)
5343 enum rtx_code c = GET_CODE (x);
5344 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5345 struct aarch64_address_info info;
5346 int cost = 0;
5347 info.shift = 0;
5349 if (!aarch64_classify_address (&info, x, mode, c, false))
5351 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5353 /* This is a CONST or SYMBOL ref which will be split
5354 in a different way depending on the code model in use.
5355 Cost it through the generic infrastructure. */
5356 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5357 /* Divide through by the cost of one instruction to
5358 bring it to the same units as the address costs. */
5359 cost_symbol_ref /= COSTS_N_INSNS (1);
5360 /* The cost is then the cost of preparing the address,
5361 followed by an immediate (possibly 0) offset. */
5362 return cost_symbol_ref + addr_cost->imm_offset;
5364 else
5366 /* This is most likely a jump table from a case
5367 statement. */
5368 return addr_cost->register_offset;
5372 switch (info.type)
5374 case ADDRESS_LO_SUM:
5375 case ADDRESS_SYMBOLIC:
5376 case ADDRESS_REG_IMM:
5377 cost += addr_cost->imm_offset;
5378 break;
5380 case ADDRESS_REG_WB:
5381 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5382 cost += addr_cost->pre_modify;
5383 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5384 cost += addr_cost->post_modify;
5385 else
5386 gcc_unreachable ();
5388 break;
5390 case ADDRESS_REG_REG:
5391 cost += addr_cost->register_offset;
5392 break;
5394 case ADDRESS_REG_UXTW:
5395 case ADDRESS_REG_SXTW:
5396 cost += addr_cost->register_extend;
5397 break;
5399 default:
5400 gcc_unreachable ();
5404 if (info.shift > 0)
5406 /* For the sake of calculating the cost of the shifted register
5407 component, we can treat same sized modes in the same way. */
5408 switch (GET_MODE_BITSIZE (mode))
5410 case 16:
5411 cost += addr_cost->addr_scale_costs.hi;
5412 break;
5414 case 32:
5415 cost += addr_cost->addr_scale_costs.si;
5416 break;
5418 case 64:
5419 cost += addr_cost->addr_scale_costs.di;
5420 break;
5422 /* We can't tell, or this is a 128-bit vector. */
5423 default:
5424 cost += addr_cost->addr_scale_costs.ti;
5425 break;
5429 return cost;
5432 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5433 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5434 to be taken. */
5437 aarch64_branch_cost (bool speed_p, bool predictable_p)
5439 /* When optimizing for speed, use the cost of unpredictable branches. */
5440 const struct cpu_branch_cost *branch_costs =
5441 aarch64_tune_params->branch_costs;
5443 if (!speed_p || predictable_p)
5444 return branch_costs->predictable;
5445 else
5446 return branch_costs->unpredictable;
5449 /* Return true if the RTX X in mode MODE is a zero or sign extract
5450 usable in an ADD or SUB (extended register) instruction. */
5451 static bool
5452 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5454 /* Catch add with a sign extract.
5455 This is add_<optab><mode>_multp2. */
5456 if (GET_CODE (x) == SIGN_EXTRACT
5457 || GET_CODE (x) == ZERO_EXTRACT)
5459 rtx op0 = XEXP (x, 0);
5460 rtx op1 = XEXP (x, 1);
5461 rtx op2 = XEXP (x, 2);
5463 if (GET_CODE (op0) == MULT
5464 && CONST_INT_P (op1)
5465 && op2 == const0_rtx
5466 && CONST_INT_P (XEXP (op0, 1))
5467 && aarch64_is_extend_from_extract (mode,
5468 XEXP (op0, 1),
5469 op1))
5471 return true;
5475 return false;
5478 static bool
5479 aarch64_frint_unspec_p (unsigned int u)
5481 switch (u)
5483 case UNSPEC_FRINTZ:
5484 case UNSPEC_FRINTP:
5485 case UNSPEC_FRINTM:
5486 case UNSPEC_FRINTA:
5487 case UNSPEC_FRINTN:
5488 case UNSPEC_FRINTX:
5489 case UNSPEC_FRINTI:
5490 return true;
5492 default:
5493 return false;
5497 /* Return true iff X is an rtx that will match an extr instruction,
5498 i.e. as described in the *extr<mode>5_insn family of patterns.
5499 RES_OP0 and RES_OP1 will be set to the operands of the shifts involved
5500 on success and will be NULL_RTX otherwise. */
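/* For example, in DImode (ior (ashift (reg A) (const_int 48))
   (lshiftrt (reg B) (const_int 16))) matches, setting *RES_OP0 to A and
   *RES_OP1 to B; the shift amounts must sum to the mode width, and the
   whole expression corresponds to an EXTR with a shift immediate of 16.  */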
5502 static bool
5503 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5505 rtx op0, op1;
5506 machine_mode mode = GET_MODE (x);
5508 *res_op0 = NULL_RTX;
5509 *res_op1 = NULL_RTX;
5511 if (GET_CODE (x) != IOR)
5512 return false;
5514 op0 = XEXP (x, 0);
5515 op1 = XEXP (x, 1);
5517 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5518 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5520 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5521 if (GET_CODE (op1) == ASHIFT)
5522 std::swap (op0, op1);
5524 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5525 return false;
5527 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5528 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5530 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5531 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5533 *res_op0 = XEXP (op0, 0);
5534 *res_op1 = XEXP (op1, 0);
5535 return true;
5539 return false;
5542 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5543 storing it in *COST. Result is true if the total cost of the operation
5544 has now been calculated. */
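/* For instance, a branch such as (if_then_else (eq (reg) (const_int 0))
   (label_ref ...) (pc)) can be implemented as a single CBZ, so only the
   inner register operand is costed; a single-bit test expressed with
   ZERO_EXTRACT maps to TBZ/TBNZ in the same way.  */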
5545 static bool
5546 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5548 rtx inner;
5549 rtx comparator;
5550 enum rtx_code cmpcode;
5552 if (COMPARISON_P (op0))
5554 inner = XEXP (op0, 0);
5555 comparator = XEXP (op0, 1);
5556 cmpcode = GET_CODE (op0);
5558 else
5560 inner = op0;
5561 comparator = const0_rtx;
5562 cmpcode = NE;
5565 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5567 /* Conditional branch. */
5568 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5569 return true;
5570 else
5572 if (cmpcode == NE || cmpcode == EQ)
5574 if (comparator == const0_rtx)
5576 /* TBZ/TBNZ/CBZ/CBNZ. */
5577 if (GET_CODE (inner) == ZERO_EXTRACT)
5578 /* TBZ/TBNZ. */
5579 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5580 0, speed);
5581 else
5582 /* CBZ/CBNZ. */
5583 *cost += rtx_cost (inner, cmpcode, 0, speed);
5585 return true;
5588 else if (cmpcode == LT || cmpcode == GE)
5590 /* TBZ/TBNZ. */
5591 if (comparator == const0_rtx)
5592 return true;
5596 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5598 /* It's a conditional operation based on the status flags,
5599 so it must be some flavor of CSEL. */
5601 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5602 if (GET_CODE (op1) == NEG
5603 || GET_CODE (op1) == NOT
5604 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5605 op1 = XEXP (op1, 0);
5607 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5608 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5609 return true;
5612 /* We don't know what this is, cost all operands. */
5613 return false;
5616 /* Calculate the cost of calculating X, storing it in *COST. Result
5617 is true if the total cost of the operation has now been calculated. */
5618 static bool
5619 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5620 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5622 rtx op0, op1, op2;
5623 const struct cpu_cost_table *extra_cost
5624 = aarch64_tune_params->insn_extra_cost;
5625 machine_mode mode = GET_MODE (x);
5627 /* By default, assume that everything has equivalent cost to the
5628 cheapest instruction. Any additional costs are applied as a delta
5629 above this default. */
5630 *cost = COSTS_N_INSNS (1);
5632 switch (code)
5634 case SET:
5635 /* The cost depends entirely on the operands to SET. */
5636 *cost = 0;
5637 op0 = SET_DEST (x);
5638 op1 = SET_SRC (x);
5640 switch (GET_CODE (op0))
5642 case MEM:
5643 if (speed)
5645 rtx address = XEXP (op0, 0);
5646 if (VECTOR_MODE_P (mode))
5647 *cost += extra_cost->ldst.storev;
5648 else if (GET_MODE_CLASS (mode) == MODE_INT)
5649 *cost += extra_cost->ldst.store;
5650 else if (mode == SFmode)
5651 *cost += extra_cost->ldst.storef;
5652 else if (mode == DFmode)
5653 *cost += extra_cost->ldst.stored;
5655 *cost +=
5656 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5657 0, speed));
5660 *cost += rtx_cost (op1, SET, 1, speed);
5661 return true;
5663 case SUBREG:
5664 if (! REG_P (SUBREG_REG (op0)))
5665 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5667 /* Fall through. */
5668 case REG:
5669 /* The cost is one per vector-register copied. */
5670 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5672 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5673 / GET_MODE_SIZE (V4SImode);
5674 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5676 /* const0_rtx is in general free, but we will use an
5677 instruction to set a register to 0. */
5678 else if (REG_P (op1) || op1 == const0_rtx)
5680 /* The cost is 1 per register copied. */
5681 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5682 / UNITS_PER_WORD;
5683 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5685 else
5686 /* Cost is just the cost of the RHS of the set. */
5687 *cost += rtx_cost (op1, SET, 1, speed);
5688 return true;
5690 case ZERO_EXTRACT:
5691 case SIGN_EXTRACT:
5692 /* Bit-field insertion. Strip any redundant widening of
5693 the RHS to meet the width of the target. */
5694 if (GET_CODE (op1) == SUBREG)
5695 op1 = SUBREG_REG (op1);
5696 if ((GET_CODE (op1) == ZERO_EXTEND
5697 || GET_CODE (op1) == SIGN_EXTEND)
5698 && CONST_INT_P (XEXP (op0, 1))
5699 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5700 >= INTVAL (XEXP (op0, 1))))
5701 op1 = XEXP (op1, 0);
5703 if (CONST_INT_P (op1))
5705 /* MOV immediate is assumed to always be cheap. */
5706 *cost = COSTS_N_INSNS (1);
5708 else
5710 /* BFM. */
5711 if (speed)
5712 *cost += extra_cost->alu.bfi;
5713 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5716 return true;
5718 default:
5719 /* We can't make sense of this, assume default cost. */
5720 *cost = COSTS_N_INSNS (1);
5721 return false;
5723 return false;
5725 case CONST_INT:
5726 /* If an instruction can incorporate a constant within the
5727 instruction, the instruction's expression avoids calling
5728 rtx_cost() on the constant. If rtx_cost() is called on a
5729 constant, then it is usually because the constant must be
5730 moved into a register by one or more instructions.
5732 The exception is constant 0, which can be expressed
5733 as XZR/WZR and is therefore free. The one caveat is
5734 (set (reg) (const0_rtx)), in which case we must cost
5735 the move. However, we can catch that when we cost the SET, so
5736 we don't need to consider it here. */
5737 if (x == const0_rtx)
5738 *cost = 0;
5739 else
5741 /* To an approximation, building any other constant is
5742 proportionally expensive to the number of instructions
5743 required to build that constant. This is true whether we
5744 are compiling for SPEED or otherwise. */
5745 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5746 (NULL_RTX, x, false, mode));
5748 return true;
5750 case CONST_DOUBLE:
5751 if (speed)
5753 /* mov[df,sf]_aarch64. */
5754 if (aarch64_float_const_representable_p (x))
5755 /* FMOV (scalar immediate). */
5756 *cost += extra_cost->fp[mode == DFmode].fpconst;
5757 else if (!aarch64_float_const_zero_rtx_p (x))
5759 /* This will be a load from memory. */
5760 if (mode == DFmode)
5761 *cost += extra_cost->ldst.loadd;
5762 else
5763 *cost += extra_cost->ldst.loadf;
5765 else
5766 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5767 or MOV v0.s[0], wzr - neither of which is modeled by the
5768 cost tables. Just use the default cost. */
5773 return true;
5775 case MEM:
5776 if (speed)
5778 /* For loads we want the base cost of a load, plus an
5779 approximation for the additional cost of the addressing
5780 mode. */
5781 rtx address = XEXP (x, 0);
5782 if (VECTOR_MODE_P (mode))
5783 *cost += extra_cost->ldst.loadv;
5784 else if (GET_MODE_CLASS (mode) == MODE_INT)
5785 *cost += extra_cost->ldst.load;
5786 else if (mode == SFmode)
5787 *cost += extra_cost->ldst.loadf;
5788 else if (mode == DFmode)
5789 *cost += extra_cost->ldst.loadd;
5791 *cost +=
5792 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5793 0, speed));
5796 return true;
5798 case NEG:
5799 op0 = XEXP (x, 0);
5801 if (VECTOR_MODE_P (mode))
5803 if (speed)
5805 /* FNEG. */
5806 *cost += extra_cost->vect.alu;
5808 return false;
5811 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5813 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5814 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5816 /* CSETM. */
5817 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5818 return true;
5821 /* Cost this as SUB wzr, X. */
5822 op0 = CONST0_RTX (GET_MODE (x));
5823 op1 = XEXP (x, 0);
5824 goto cost_minus;
5827 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5829 /* Support (neg(fma...)) as a single instruction only if
5830 sign of zeros is unimportant. This matches the decision
5831 making in aarch64.md. */
5832 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5834 /* FNMADD. */
5835 *cost = rtx_cost (op0, NEG, 0, speed);
5836 return true;
5838 if (speed)
5839 /* FNEG. */
5840 *cost += extra_cost->fp[mode == DFmode].neg;
5841 return false;
5844 return false;
5846 case CLRSB:
5847 case CLZ:
5848 if (speed)
5850 if (VECTOR_MODE_P (mode))
5851 *cost += extra_cost->vect.alu;
5852 else
5853 *cost += extra_cost->alu.clz;
5856 return false;
5858 case COMPARE:
5859 op0 = XEXP (x, 0);
5860 op1 = XEXP (x, 1);
5862 if (op1 == const0_rtx
5863 && GET_CODE (op0) == AND)
5865 x = op0;
5866 goto cost_logic;
5869 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5871 /* TODO: A write to the CC flags possibly costs extra, this
5872 needs encoding in the cost tables. */
5874 /* CC_ZESWPmode supports zero extend for free. */
5875 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5876 op0 = XEXP (op0, 0);
5878 /* ANDS. */
5879 if (GET_CODE (op0) == AND)
5881 x = op0;
5882 goto cost_logic;
5885 if (GET_CODE (op0) == PLUS)
5887 /* ADDS (and CMN alias). */
5888 x = op0;
5889 goto cost_plus;
5892 if (GET_CODE (op0) == MINUS)
5894 /* SUBS. */
5895 x = op0;
5896 goto cost_minus;
5899 if (GET_CODE (op1) == NEG)
5901 /* CMN. */
5902 if (speed)
5903 *cost += extra_cost->alu.arith;
5905 *cost += rtx_cost (op0, COMPARE, 0, speed);
5906 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5907 return true;
5910 /* CMP.
5912 Compare can freely swap the order of operands, and
5913 canonicalization puts the more complex operation first.
5914 But the integer MINUS logic expects the shift/extend
5915 operation in op1. */
5916 if (! (REG_P (op0)
5917 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5919 op0 = XEXP (x, 1);
5920 op1 = XEXP (x, 0);
5922 goto cost_minus;
5925 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5927 /* FCMP. */
5928 if (speed)
5929 *cost += extra_cost->fp[mode == DFmode].compare;
5931 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5933 *cost += rtx_cost (op0, COMPARE, 0, speed);
5934 /* FCMP supports constant 0.0 for no extra cost. */
5935 return true;
5937 return false;
5940 if (VECTOR_MODE_P (mode))
5942 /* Vector compare. */
5943 if (speed)
5944 *cost += extra_cost->vect.alu;
5946 if (aarch64_float_const_zero_rtx_p (op1))
5948 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
5949 cost. */
5950 return true;
5952 return false;
5954 return false;
5956 case MINUS:
5958 op0 = XEXP (x, 0);
5959 op1 = XEXP (x, 1);
5961 cost_minus:
5962 *cost += rtx_cost (op0, MINUS, 0, speed);
5964 /* Detect valid immediates. */
5965 if ((GET_MODE_CLASS (mode) == MODE_INT
5966 || (GET_MODE_CLASS (mode) == MODE_CC
5967 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5968 && CONST_INT_P (op1)
5969 && aarch64_uimm12_shift (INTVAL (op1)))
5971 if (speed)
5972 /* SUB(S) (immediate). */
5973 *cost += extra_cost->alu.arith;
5974 return true;
5977 /* Look for SUB (extended register). */
5978 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5980 if (speed)
5981 *cost += extra_cost->alu.extend_arith;
5983 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5984 (enum rtx_code) GET_CODE (op1),
5985 0, speed);
5986 return true;
5989 rtx new_op1 = aarch64_strip_extend (op1);
5991 /* Cost this as an FMA-alike operation. */
5992 if ((GET_CODE (new_op1) == MULT
5993 || aarch64_shift_p (GET_CODE (new_op1)))
5994 && code != COMPARE)
5996 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5997 (enum rtx_code) code,
5998 speed);
5999 return true;
6002 *cost += rtx_cost (new_op1, MINUS, 1, speed);
6004 if (speed)
6006 if (VECTOR_MODE_P (mode))
6008 /* Vector SUB. */
6009 *cost += extra_cost->vect.alu;
6011 else if (GET_MODE_CLASS (mode) == MODE_INT)
6013 /* SUB(S). */
6014 *cost += extra_cost->alu.arith;
6016 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6018 /* FSUB. */
6019 *cost += extra_cost->fp[mode == DFmode].addsub;
6022 return true;
6025 case PLUS:
6027 rtx new_op0;
6029 op0 = XEXP (x, 0);
6030 op1 = XEXP (x, 1);
6032 cost_plus:
6033 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6034 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6036 /* CSINC. */
6037 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
6038 *cost += rtx_cost (op1, PLUS, 1, speed);
6039 return true;
6042 if (GET_MODE_CLASS (mode) == MODE_INT
6043 && CONST_INT_P (op1)
6044 && aarch64_uimm12_shift (INTVAL (op1)))
6046 *cost += rtx_cost (op0, PLUS, 0, speed);
6048 if (speed)
6049 /* ADD (immediate). */
6050 *cost += extra_cost->alu.arith;
6051 return true;
6054 *cost += rtx_cost (op1, PLUS, 1, speed);
6056 /* Look for ADD (extended register). */
6057 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6059 if (speed)
6060 *cost += extra_cost->alu.extend_arith;
6062 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
6063 (enum rtx_code) GET_CODE (op0),
6064 0, speed);
6065 return true;
6068 /* Strip any extend, leave shifts behind as we will
6069 cost them through mult_cost. */
6070 new_op0 = aarch64_strip_extend (op0);
6072 if (GET_CODE (new_op0) == MULT
6073 || aarch64_shift_p (GET_CODE (new_op0)))
6075 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6076 speed);
6077 return true;
6080 *cost += rtx_cost (new_op0, PLUS, 0, speed);
6082 if (speed)
6084 if (VECTOR_MODE_P (mode))
6086 /* Vector ADD. */
6087 *cost += extra_cost->vect.alu;
6089 else if (GET_MODE_CLASS (mode) == MODE_INT)
6091 /* ADD. */
6092 *cost += extra_cost->alu.arith;
6094 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6096 /* FADD. */
6097 *cost += extra_cost->fp[mode == DFmode].addsub;
6100 return true;
6103 case BSWAP:
6104 *cost = COSTS_N_INSNS (1);
6106 if (speed)
6108 if (VECTOR_MODE_P (mode))
6109 *cost += extra_cost->vect.alu;
6110 else
6111 *cost += extra_cost->alu.rev;
6113 return false;
6115 case IOR:
6116 if (aarch_rev16_p (x))
6118 *cost = COSTS_N_INSNS (1);
6120 if (speed)
6122 if (VECTOR_MODE_P (mode))
6123 *cost += extra_cost->vect.alu;
6124 else
6125 *cost += extra_cost->alu.rev;
6127 return true;
6130 if (aarch64_extr_rtx_p (x, &op0, &op1))
6132 *cost += rtx_cost (op0, IOR, 0, speed)
6133 + rtx_cost (op1, IOR, 1, speed);
6134 if (speed)
6135 *cost += extra_cost->alu.shift;
6137 return true;
6139 /* Fall through. */
6140 case XOR:
6141 case AND:
6142 cost_logic:
6143 op0 = XEXP (x, 0);
6144 op1 = XEXP (x, 1);
6146 if (VECTOR_MODE_P (mode))
6148 if (speed)
6149 *cost += extra_cost->vect.alu;
6150 return true;
6153 if (code == AND
6154 && GET_CODE (op0) == MULT
6155 && CONST_INT_P (XEXP (op0, 1))
6156 && CONST_INT_P (op1)
6157 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6158 INTVAL (op1)) != 0)
6160 /* This is a UBFM/SBFM. */
6161 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6162 if (speed)
6163 *cost += extra_cost->alu.bfx;
6164 return true;
6167 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6169 /* We possibly get the immediate for free; this is not
6170 modelled. */
6171 if (CONST_INT_P (op1)
6172 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6174 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6176 if (speed)
6177 *cost += extra_cost->alu.logical;
6179 return true;
6181 else
6183 rtx new_op0 = op0;
6185 /* Handle ORN, EON, or BIC. */
6186 if (GET_CODE (op0) == NOT)
6187 op0 = XEXP (op0, 0);
6189 new_op0 = aarch64_strip_shift (op0);
6191 /* If we had a shift on op0 then this is a logical-shift-
6192 by-register/immediate operation. Otherwise, this is just
6193 a logical operation. */
6194 if (speed)
6196 if (new_op0 != op0)
6198 /* Shift by immediate. */
6199 if (CONST_INT_P (XEXP (op0, 1)))
6200 *cost += extra_cost->alu.log_shift;
6201 else
6202 *cost += extra_cost->alu.log_shift_reg;
6204 else
6205 *cost += extra_cost->alu.logical;
6208 /* In both cases we want to cost both operands. */
6209 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6210 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6212 return true;
6215 return false;
6217 case NOT:
6218 x = XEXP (x, 0);
6219 op0 = aarch64_strip_shift (x);
6221 if (VECTOR_MODE_P (mode))
6223 /* Vector NOT. */
6224 *cost += extra_cost->vect.alu;
6225 return false;
6228 /* MVN-shifted-reg. */
6229 if (op0 != x)
6231 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6233 if (speed)
6234 *cost += extra_cost->alu.log_shift;
6236 return true;
6238 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6239 Handle the second form here taking care that 'a' in the above can
6240 be a shift. */
6241 else if (GET_CODE (op0) == XOR)
6243 rtx newop0 = XEXP (op0, 0);
6244 rtx newop1 = XEXP (op0, 1);
6245 rtx op0_stripped = aarch64_strip_shift (newop0);
6247 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6248 + rtx_cost (op0_stripped, XOR, 0, speed);
6250 if (speed)
6252 if (op0_stripped != newop0)
6253 *cost += extra_cost->alu.log_shift;
6254 else
6255 *cost += extra_cost->alu.logical;
6258 return true;
6260 /* MVN. */
6261 if (speed)
6262 *cost += extra_cost->alu.logical;
6264 return false;
6266 case ZERO_EXTEND:
6268 op0 = XEXP (x, 0);
6269 /* If a value is written in SI mode, then zero extended to DI
6270 mode, the operation will in general be free as a write to
6271 a 'w' register implicitly zeroes the upper bits of an 'x'
6272 register. However, if this is
6274 (set (reg) (zero_extend (reg)))
6276 we must cost the explicit register move. */
6277 if (mode == DImode
6278 && GET_MODE (op0) == SImode
6279 && outer == SET)
6281 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6283 if (!op_cost && speed)
6284 /* MOV. */
6285 *cost += extra_cost->alu.extend;
6286 else
6287 /* Free, the cost is that of the SI mode operation. */
6288 *cost = op_cost;
6290 return true;
6292 else if (MEM_P (XEXP (x, 0)))
6294 /* All loads can zero extend to any size for free. */
6295 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6296 return true;
6299 if (speed)
6301 if (VECTOR_MODE_P (mode))
6303 /* UMOV. */
6304 *cost += extra_cost->vect.alu;
6306 else
6308 /* UXTB/UXTH. */
6309 *cost += extra_cost->alu.extend;
6312 return false;
6314 case SIGN_EXTEND:
6315 if (MEM_P (XEXP (x, 0)))
6317 /* LDRSH. */
6318 if (speed)
6320 rtx address = XEXP (XEXP (x, 0), 0);
6321 *cost += extra_cost->ldst.load_sign_extend;
6323 *cost +=
6324 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6325 0, speed));
6327 return true;
6330 if (speed)
6332 if (VECTOR_MODE_P (mode))
6333 *cost += extra_cost->vect.alu;
6334 else
6335 *cost += extra_cost->alu.extend;
6337 return false;
6339 case ASHIFT:
6340 op0 = XEXP (x, 0);
6341 op1 = XEXP (x, 1);
6343 if (CONST_INT_P (op1))
6345 if (speed)
6347 if (VECTOR_MODE_P (mode))
6349 /* Vector shift (immediate). */
6350 *cost += extra_cost->vect.alu;
6352 else
6354 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6355 aliases. */
6356 *cost += extra_cost->alu.shift;
6360 /* We can incorporate zero/sign extend for free. */
6361 if (GET_CODE (op0) == ZERO_EXTEND
6362 || GET_CODE (op0) == SIGN_EXTEND)
6363 op0 = XEXP (op0, 0);
6365 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6366 return true;
6368 else
6370 if (speed)
6372 if (VECTOR_MODE_P (mode))
6374 /* Vector shift (register). */
6375 *cost += extra_cost->vect.alu;
6377 else
6379 /* LSLV. */
6380 *cost += extra_cost->alu.shift_reg;
6383 return false; /* All arguments need to be in registers. */
6386 case ROTATE:
6387 case ROTATERT:
6388 case LSHIFTRT:
6389 case ASHIFTRT:
6390 op0 = XEXP (x, 0);
6391 op1 = XEXP (x, 1);
6393 if (CONST_INT_P (op1))
6395 /* ASR (immediate) and friends. */
6396 if (speed)
6398 if (VECTOR_MODE_P (mode))
6399 *cost += extra_cost->vect.alu;
6400 else
6401 *cost += extra_cost->alu.shift;
6404 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6405 return true;
6407 else
6410 /* ASR (register) and friends. */
6411 if (speed)
6413 if (VECTOR_MODE_P (mode))
6414 *cost += extra_cost->vect.alu;
6415 else
6416 *cost += extra_cost->alu.shift_reg;
6418 return false; /* All arguments need to be in registers. */
6421 case SYMBOL_REF:
6423 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6425 /* LDR. */
6426 if (speed)
6427 *cost += extra_cost->ldst.load;
6429 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6430 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6432 /* ADRP, followed by ADD. */
6433 *cost += COSTS_N_INSNS (1);
6434 if (speed)
6435 *cost += 2 * extra_cost->alu.arith;
6437 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6438 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6440 /* ADR. */
6441 if (speed)
6442 *cost += extra_cost->alu.arith;
6445 if (flag_pic)
6447 /* One extra load instruction, after accessing the GOT. */
6448 *cost += COSTS_N_INSNS (1);
6449 if (speed)
6450 *cost += extra_cost->ldst.load;
6452 return true;
6454 case HIGH:
6455 case LO_SUM:
6456 /* ADRP/ADD (immediate). */
6457 if (speed)
6458 *cost += extra_cost->alu.arith;
6459 return true;
6461 case ZERO_EXTRACT:
6462 case SIGN_EXTRACT:
6463 /* UBFX/SBFX. */
6464 if (speed)
6466 if (VECTOR_MODE_P (mode))
6467 *cost += extra_cost->vect.alu;
6468 else
6469 *cost += extra_cost->alu.bfx;
6472 /* We can trust that the immediates used will be correct (there
6473 are no by-register forms), so we need only cost op0. */
6474 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6475 return true;
6477 case MULT:
6478 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6479 /* aarch64_rtx_mult_cost always handles recursion to its
6480 operands. */
6481 return true;
6483 case MOD:
6484 case UMOD:
6485 if (speed)
6487 if (VECTOR_MODE_P (mode))
6488 *cost += extra_cost->vect.alu;
6489 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6490 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6491 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6492 else if (GET_MODE (x) == DFmode)
6493 *cost += (extra_cost->fp[1].mult
6494 + extra_cost->fp[1].div);
6495 else if (GET_MODE (x) == SFmode)
6496 *cost += (extra_cost->fp[0].mult
6497 + extra_cost->fp[0].div);
6499 return false; /* All arguments need to be in registers. */
6501 case DIV:
6502 case UDIV:
6503 case SQRT:
6504 if (speed)
6506 if (VECTOR_MODE_P (mode))
6507 *cost += extra_cost->vect.alu;
6508 else if (GET_MODE_CLASS (mode) == MODE_INT)
6509 /* There is no integer SQRT, so only DIV and UDIV can get
6510 here. */
6511 *cost += extra_cost->mult[mode == DImode].idiv;
6512 else
6513 *cost += extra_cost->fp[mode == DFmode].div;
6515 return false; /* All arguments need to be in registers. */
6517 case IF_THEN_ELSE:
6518 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6519 XEXP (x, 2), cost, speed);
6521 case EQ:
6522 case NE:
6523 case GT:
6524 case GTU:
6525 case LT:
6526 case LTU:
6527 case GE:
6528 case GEU:
6529 case LE:
6530 case LEU:
6532 return false; /* All arguments must be in registers. */
6534 case FMA:
6535 op0 = XEXP (x, 0);
6536 op1 = XEXP (x, 1);
6537 op2 = XEXP (x, 2);
6539 if (speed)
6541 if (VECTOR_MODE_P (mode))
6542 *cost += extra_cost->vect.alu;
6543 else
6544 *cost += extra_cost->fp[mode == DFmode].fma;
6547 /* FMSUB, FNMADD, and FNMSUB are free. */
6548 if (GET_CODE (op0) == NEG)
6549 op0 = XEXP (op0, 0);
6551 if (GET_CODE (op2) == NEG)
6552 op2 = XEXP (op2, 0);
6554 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6555 and the by-element operand as operand 0. */
6556 if (GET_CODE (op1) == NEG)
6557 op1 = XEXP (op1, 0);
6559 /* Catch vector-by-element operations. The by-element operand can
6560 either be (vec_duplicate (vec_select (x))) or just
6561 (vec_select (x)), depending on whether we are multiplying by
6562 a vector or a scalar.
6564 Canonicalization is not very good in these cases: FMA4 will put the
6565 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
6566 if (GET_CODE (op0) == VEC_DUPLICATE)
6567 op0 = XEXP (op0, 0);
6568 else if (GET_CODE (op1) == VEC_DUPLICATE)
6569 op1 = XEXP (op1, 0);
6571 if (GET_CODE (op0) == VEC_SELECT)
6572 op0 = XEXP (op0, 0);
6573 else if (GET_CODE (op1) == VEC_SELECT)
6574 op1 = XEXP (op1, 0);
6576 /* If the remaining parameters are not registers,
6577 get the cost to put them into registers. */
6578 *cost += rtx_cost (op0, FMA, 0, speed);
6579 *cost += rtx_cost (op1, FMA, 1, speed);
6580 *cost += rtx_cost (op2, FMA, 2, speed);
6581 return true;
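/* Illustrative example (not part of the original source): an rtx such as
   (fma:DF (neg:DF a) b (neg:DF c)) is covered by the FMSUB/FNMADD/FNMSUB
   forms, so the NEGs stripped above add no extra cost; only the remaining
   operands need to be costed into registers.  */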
6583 case FLOAT:
6584 case UNSIGNED_FLOAT:
6585 if (speed)
6586 *cost += extra_cost->fp[mode == DFmode].fromint;
6587 return false;
6589 case FLOAT_EXTEND:
6590 if (speed)
6592 if (VECTOR_MODE_P (mode))
6594 /* Vector widening conversion. */
6595 *cost += extra_cost->vect.alu;
6597 else
6598 *cost += extra_cost->fp[mode == DFmode].widen;
6600 return false;
6602 case FLOAT_TRUNCATE:
6603 if (speed)
6605 if (VECTOR_MODE_P (mode))
6607 /* Vector narrowing conversion. */
6608 *cost += extra_cost->vect.alu;
6610 else
6611 *cost += extra_cost->fp[mode == DFmode].narrow;
6613 return false;
6615 case FIX:
6616 case UNSIGNED_FIX:
6617 x = XEXP (x, 0);
6618 /* Strip the rounding part. They will all be implemented
6619 by the fcvt* family of instructions anyway. */
6620 if (GET_CODE (x) == UNSPEC)
6622 unsigned int uns_code = XINT (x, 1);
6624 if (uns_code == UNSPEC_FRINTA
6625 || uns_code == UNSPEC_FRINTM
6626 || uns_code == UNSPEC_FRINTN
6627 || uns_code == UNSPEC_FRINTP
6628 || uns_code == UNSPEC_FRINTZ)
6629 x = XVECEXP (x, 0, 0);
6632 if (speed)
6634 if (VECTOR_MODE_P (mode))
6635 *cost += extra_cost->vect.alu;
6636 else
6637 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6639 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6640 return true;
6642 case ABS:
6643 if (VECTOR_MODE_P (mode))
6645 /* ABS (vector). */
6646 if (speed)
6647 *cost += extra_cost->vect.alu;
6649 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6651 op0 = XEXP (x, 0);
6653 /* FABD, which is analogous to FADD. */
6654 if (GET_CODE (op0) == MINUS)
6656 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
6657 + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6658 if (speed)
6659 *cost += extra_cost->fp[mode == DFmode].addsub;
6661 return true;
6663 /* Simple FABS is analogous to FNEG. */
6664 if (speed)
6665 *cost += extra_cost->fp[mode == DFmode].neg;
6667 else
6669 /* Integer ABS will either be split to
6670 two arithmetic instructions, or will be an ABS
6671 (scalar), which we don't model. */
6672 *cost = COSTS_N_INSNS (2);
6673 if (speed)
6674 *cost += 2 * extra_cost->alu.arith;
6676 return false;
6678 case SMAX:
6679 case SMIN:
6680 if (speed)
6682 if (VECTOR_MODE_P (mode))
6683 *cost += extra_cost->vect.alu;
6684 else
6686 /* FMAXNM/FMINNM/FMAX/FMIN.
6687 TODO: This may not be accurate for all implementations, but
6688 we do not model this in the cost tables. */
6689 *cost += extra_cost->fp[mode == DFmode].addsub;
6692 return false;
6694 case UNSPEC:
6695 /* The floating point round to integer frint* instructions. */
6696 if (aarch64_frint_unspec_p (XINT (x, 1)))
6698 if (speed)
6699 *cost += extra_cost->fp[mode == DFmode].roundint;
6701 return false;
6704 if (XINT (x, 1) == UNSPEC_RBIT)
6706 if (speed)
6707 *cost += extra_cost->alu.rev;
6709 return false;
6711 break;
6713 case TRUNCATE:
6715 /* Decompose <su>muldi3_highpart. */
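/* Illustrative example (not part of the original source): with int64_t
   operands a and b, C code such as
       int64_t hi = (int64_t) (((__int128) a * b) >> 64);
   expands to exactly the pattern matched below and is implemented with a
   single SMULH (UMULH for the unsigned variant).  */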
6716 if (/* (truncate:DI */
6717 mode == DImode
6718 /* (lshiftrt:TI */
6719 && GET_MODE (XEXP (x, 0)) == TImode
6720 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6721 /* (mult:TI */
6722 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6723 /* (ANY_EXTEND:TI (reg:DI))
6724 (ANY_EXTEND:TI (reg:DI))) */
6725 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6726 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6727 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6728 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6729 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6730 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6731 /* (const_int 64) */
6732 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6733 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6735 /* UMULH/SMULH. */
6736 if (speed)
6737 *cost += extra_cost->mult[mode == DImode].extend;
6738 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6739 MULT, 0, speed);
6740 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6741 MULT, 1, speed);
6742 return true;
6745 /* Fall through. */
6746 default:
6747 break;
6750 if (dump_file && (dump_flags & TDF_DETAILS))
6751 fprintf (dump_file,
6752 "\nFailed to cost RTX. Assuming default cost.\n");
6754 return true;
6757 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
6758 calculated for X. This cost is stored in *COST. Returns true
6759 if the total cost of X was calculated. */
6760 static bool
6761 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6762 int param, int *cost, bool speed)
6764 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6766 if (dump_file && (dump_flags & TDF_DETAILS))
6768 print_rtl_single (dump_file, x);
6769 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6770 speed ? "Hot" : "Cold",
6771 *cost, result ? "final" : "partial");
6774 return result;
6777 static int
6778 aarch64_register_move_cost (machine_mode mode,
6779 reg_class_t from_i, reg_class_t to_i)
6781 enum reg_class from = (enum reg_class) from_i;
6782 enum reg_class to = (enum reg_class) to_i;
6783 const struct cpu_regmove_cost *regmove_cost
6784 = aarch64_tune_params->regmove_cost;
6786 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6787 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6788 to = GENERAL_REGS;
6790 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6791 from = GENERAL_REGS;
6793 /* Moving between GPR and stack cost is the same as GP2GP. */
6794 if ((from == GENERAL_REGS && to == STACK_REG)
6795 || (to == GENERAL_REGS && from == STACK_REG))
6796 return regmove_cost->GP2GP;
6798 /* To/From the stack register, we move via the gprs. */
6799 if (to == STACK_REG || from == STACK_REG)
6800 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6801 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6803 if (GET_MODE_SIZE (mode) == 16)
6805 /* 128-bit operations on general registers require 2 instructions. */
6806 if (from == GENERAL_REGS && to == GENERAL_REGS)
6807 return regmove_cost->GP2GP * 2;
6808 else if (from == GENERAL_REGS)
6809 return regmove_cost->GP2FP * 2;
6810 else if (to == GENERAL_REGS)
6811 return regmove_cost->FP2GP * 2;
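/* Worked example (illustrative, not part of the original source): a TImode
   move from GENERAL_REGS to FP_REGS is costed as GP2FP * 2 above,
   reflecting the two 64-bit transfers needed for the low and high
   halves.  */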
6813 /* When AdvSIMD instructions are disabled it is not possible to move
6814 a 128-bit value directly between Q registers. This is handled in
6815 secondary reload. A general register is used as a scratch to move
6816 the upper DI value and the lower DI value is moved directly,
6817 hence the cost is the sum of three moves. */
6818 if (! TARGET_SIMD)
6819 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6821 return regmove_cost->FP2FP;
6824 if (from == GENERAL_REGS && to == GENERAL_REGS)
6825 return regmove_cost->GP2GP;
6826 else if (from == GENERAL_REGS)
6827 return regmove_cost->GP2FP;
6828 else if (to == GENERAL_REGS)
6829 return regmove_cost->FP2GP;
6831 return regmove_cost->FP2FP;
6834 static int
6835 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6836 reg_class_t rclass ATTRIBUTE_UNUSED,
6837 bool in ATTRIBUTE_UNUSED)
6839 return aarch64_tune_params->memmov_cost;
6842 /* Return the number of instructions that can be issued per cycle. */
6843 static int
6844 aarch64_sched_issue_rate (void)
6846 return aarch64_tune_params->issue_rate;
6849 static int
6850 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6852 int issue_rate = aarch64_sched_issue_rate ();
6854 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6857 /* Vectorizer cost model target hooks. */
6859 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6860 static int
6861 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6862 tree vectype,
6863 int misalign ATTRIBUTE_UNUSED)
6865 unsigned elements;
6867 switch (type_of_cost)
6869 case scalar_stmt:
6870 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6872 case scalar_load:
6873 return aarch64_tune_params->vec_costs->scalar_load_cost;
6875 case scalar_store:
6876 return aarch64_tune_params->vec_costs->scalar_store_cost;
6878 case vector_stmt:
6879 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6881 case vector_load:
6882 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6884 case vector_store:
6885 return aarch64_tune_params->vec_costs->vec_store_cost;
6887 case vec_to_scalar:
6888 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6890 case scalar_to_vec:
6891 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6893 case unaligned_load:
6894 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6896 case unaligned_store:
6897 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6899 case cond_branch_taken:
6900 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6902 case cond_branch_not_taken:
6903 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6905 case vec_perm:
6906 case vec_promote_demote:
6907 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6909 case vec_construct:
6910 elements = TYPE_VECTOR_SUBPARTS (vectype);
6911 return elements / 2 + 1;
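/* Illustrative example (not part of the original source): constructing a
   4-element vector from scalars is costed as 4 / 2 + 1 == 3 units,
   approximating the insert sequence needed to assemble it.  */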
6913 default:
6914 gcc_unreachable ();
6918 /* Implement targetm.vectorize.add_stmt_cost. */
6919 static unsigned
6920 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6921 struct _stmt_vec_info *stmt_info, int misalign,
6922 enum vect_cost_model_location where)
6924 unsigned *cost = (unsigned *) data;
6925 unsigned retval = 0;
6927 if (flag_vect_cost_model)
6929 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6930 int stmt_cost =
6931 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6933 /* Statements in an inner loop relative to the loop being
6934 vectorized are weighted more heavily. The value here is
6935 a function (linear for now) of the loop nest level. */
6936 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6938 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6939 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6940 unsigned nest_level = loop_depth (loop);
6942 count *= nest_level;
6945 retval = (unsigned) (count * stmt_cost);
6946 cost[where] += retval;
6949 return retval;
6952 static void initialize_aarch64_code_model (void);
6954 /* Parse the architecture extension string. */
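/* Illustrative example (not part of the original source): given
   "-march=armv8-a+crc+nocrypto", this routine is handed the substring
   "+crc+nocrypto" and consumes one "+ext" or "+noext" token per
   iteration, setting or clearing the corresponding bits in
   aarch64_isa_flags.  */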
6956 static void
6957 aarch64_parse_extension (char *str)
6959 /* The extension string is parsed left to right. */
6960 const struct aarch64_option_extension *opt = NULL;
6962 /* Flag to say whether we are adding or removing an extension. */
6963 int adding_ext = -1;
6965 while (str != NULL && *str != 0)
6967 char *ext;
6968 size_t len;
6970 str++;
6971 ext = strchr (str, '+');
6973 if (ext != NULL)
6974 len = ext - str;
6975 else
6976 len = strlen (str);
6978 if (len >= 2 && strncmp (str, "no", 2) == 0)
6980 adding_ext = 0;
6981 len -= 2;
6982 str += 2;
6984 else if (len > 0)
6985 adding_ext = 1;
6987 if (len == 0)
6989 error ("missing feature modifier after %qs", adding_ext ? "+"
6990 : "+no");
6991 return;
6994 /* Scan over the extensions table trying to find an exact match. */
6995 for (opt = all_extensions; opt->name != NULL; opt++)
6997 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6999 /* Add or remove the extension. */
7000 if (adding_ext)
7001 aarch64_isa_flags |= opt->flags_on;
7002 else
7003 aarch64_isa_flags &= ~(opt->flags_off);
7004 break;
7008 if (opt->name == NULL)
7010 /* Extension not found in list. */
7011 error ("unknown feature modifier %qs", str);
7012 return;
7015 str = ext;
7018 return;
7021 /* Parse the ARCH string. */
7023 static void
7024 aarch64_parse_arch (void)
7026 char *ext;
7027 const struct processor *arch;
7028 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
7029 size_t len;
7031 strcpy (str, aarch64_arch_string);
7033 ext = strchr (str, '+');
7035 if (ext != NULL)
7036 len = ext - str;
7037 else
7038 len = strlen (str);
7040 if (len == 0)
7042 error ("missing arch name in -march=%qs", str);
7043 return;
7046 /* Loop through the list of supported ARCHs to find a match. */
7047 for (arch = all_architectures; arch->name != NULL; arch++)
7049 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7051 selected_arch = arch;
7052 aarch64_isa_flags = selected_arch->flags;
7054 if (!selected_cpu)
7055 selected_cpu = &all_cores[selected_arch->core];
7057 if (ext != NULL)
7059 /* ARCH string contains at least one extension. */
7060 aarch64_parse_extension (ext);
7063 if (strcmp (selected_arch->arch, selected_cpu->arch))
7065 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7066 selected_cpu->name, selected_arch->name);
7069 return;
7073 /* ARCH name not found in list. */
7074 error ("unknown value %qs for -march", str);
7075 return;
7078 /* Parse the CPU string. */
7080 static void
7081 aarch64_parse_cpu (void)
7083 char *ext;
7084 const struct processor *cpu;
7085 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
7086 size_t len;
7088 strcpy (str, aarch64_cpu_string);
7090 ext = strchr (str, '+');
7092 if (ext != NULL)
7093 len = ext - str;
7094 else
7095 len = strlen (str);
7097 if (len == 0)
7099 error ("missing cpu name in -mcpu=%qs", str);
7100 return;
7103 /* Loop through the list of supported CPUs to find a match. */
7104 for (cpu = all_cores; cpu->name != NULL; cpu++)
7106 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7108 selected_cpu = cpu;
7109 aarch64_isa_flags = selected_cpu->flags;
7111 if (ext != NULL)
7113 /* CPU string contains at least one extension. */
7114 aarch64_parse_extension (ext);
7117 return;
7121 /* CPU name not found in list. */
7122 error ("unknown value %qs for -mcpu", str);
7123 return;
7126 /* Parse the TUNE string. */
7128 static void
7129 aarch64_parse_tune (void)
7131 const struct processor *cpu;
7132 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
7133 strcpy (str, aarch64_tune_string);
7135 /* Loop through the list of supported CPUs to find a match. */
7136 for (cpu = all_cores; cpu->name != NULL; cpu++)
7138 if (strcmp (cpu->name, str) == 0)
7140 selected_tune = cpu;
7141 return;
7145 /* CPU name not found in list. */
7146 error ("unknown value %qs for -mtune", str);
7147 return;
7151 /* Implement TARGET_OPTION_OVERRIDE. */
7153 static void
7154 aarch64_override_options (void)
7156 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7157 If either of -march or -mtune is given, they override their
7158 respective component of -mcpu.
7160 So, first parse AARCH64_CPU_STRING, then the others. Be careful
7161 with -march: if -mcpu is not present on the command line, -march
7162 must set a sensible default CPU. */
7163 if (aarch64_cpu_string)
7165 aarch64_parse_cpu ();
7168 if (aarch64_arch_string)
7170 aarch64_parse_arch ();
7173 if (aarch64_tune_string)
7175 aarch64_parse_tune ();
7178 #ifndef HAVE_AS_MABI_OPTION
7179 /* The compiler may have been configured with 2.23.* binutils, which does
7180 not have support for ILP32. */
7181 if (TARGET_ILP32)
7182 error ("Assembler does not support -mabi=ilp32");
7183 #endif
7185 initialize_aarch64_code_model ();
7187 aarch64_build_bitmask_table ();
7189 /* This target defaults to strict volatile bitfields. */
7190 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7191 flag_strict_volatile_bitfields = 1;
7193 /* If the user did not specify a processor, choose the default
7194 one for them. This will be the CPU set during configuration using
7195 --with-cpu, otherwise it is "generic". */
7196 if (!selected_cpu)
7198 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7199 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7202 gcc_assert (selected_cpu);
7204 if (!selected_tune)
7205 selected_tune = selected_cpu;
7207 aarch64_tune_flags = selected_tune->flags;
7208 aarch64_tune = selected_tune->core;
7209 aarch64_tune_params = selected_tune->tune;
7210 aarch64_architecture_version = selected_cpu->architecture_version;
7212 if (aarch64_fix_a53_err835769 == 2)
7214 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7215 aarch64_fix_a53_err835769 = 1;
7216 #else
7217 aarch64_fix_a53_err835769 = 0;
7218 #endif
7221 aarch64_register_fma_steering ();
7223 aarch64_override_options_after_change ();
7226 /* Implement targetm.override_options_after_change. */
7228 static void
7229 aarch64_override_options_after_change (void)
7231 if (flag_omit_frame_pointer)
7232 flag_omit_leaf_frame_pointer = false;
7233 else if (flag_omit_leaf_frame_pointer)
7234 flag_omit_frame_pointer = true;
7236 /* If not optimizing for size, set the default
7237 alignment to what the target wants. */
7238 if (!optimize_size)
7240 if (align_loops <= 0)
7241 align_loops = aarch64_tune_params->loop_align;
7242 if (align_jumps <= 0)
7243 align_jumps = aarch64_tune_params->jump_align;
7244 if (align_functions <= 0)
7245 align_functions = aarch64_tune_params->function_align;
7249 static struct machine_function *
7250 aarch64_init_machine_status (void)
7252 struct machine_function *machine;
7253 machine = ggc_cleared_alloc<machine_function> ();
7254 return machine;
7257 void
7258 aarch64_init_expanders (void)
7260 init_machine_status = aarch64_init_machine_status;
7263 /* A checking mechanism for the implementation of the various code models. */
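/* Illustrative mapping (not part of the original source): -mcmodel=small
   with -fpic selects AARCH64_CMODEL_SMALL_PIC, -mcmodel=tiny with -fpic
   selects AARCH64_CMODEL_TINY_PIC, and -mcmodel=large together with
   -fpic/-fPIC is rejected with a "sorry" diagnostic; without -fpic the
   user's choice is used unchanged.  */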
7264 static void
7265 initialize_aarch64_code_model (void)
7267 if (flag_pic)
7269 switch (aarch64_cmodel_var)
7271 case AARCH64_CMODEL_TINY:
7272 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7273 break;
7274 case AARCH64_CMODEL_SMALL:
7275 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7276 break;
7277 case AARCH64_CMODEL_LARGE:
7278 sorry ("code model %qs with -f%s", "large",
7279 flag_pic > 1 ? "PIC" : "pic");
break;
7280 default:
7281 gcc_unreachable ();
7284 else
7285 aarch64_cmodel = aarch64_cmodel_var;
7288 /* Return true if SYMBOL_REF X binds locally. */
7290 static bool
7291 aarch64_symbol_binds_local_p (const_rtx x)
7293 return (SYMBOL_REF_DECL (x)
7294 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7295 : SYMBOL_REF_LOCAL_P (x));
7298 /* Return true if SYMBOL_REF X is thread local */
7299 static bool
7300 aarch64_tls_symbol_p (rtx x)
7302 if (! TARGET_HAVE_TLS)
7303 return false;
7305 if (GET_CODE (x) != SYMBOL_REF)
7306 return false;
7308 return SYMBOL_REF_TLS_MODEL (x) != 0;
7311 /* Classify a TLS symbol into one of the TLS kinds. */
7312 enum aarch64_symbol_type
7313 aarch64_classify_tls_symbol (rtx x)
7315 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7317 switch (tls_kind)
7319 case TLS_MODEL_GLOBAL_DYNAMIC:
7320 case TLS_MODEL_LOCAL_DYNAMIC:
7321 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7323 case TLS_MODEL_INITIAL_EXEC:
7324 return SYMBOL_SMALL_GOTTPREL;
7326 case TLS_MODEL_LOCAL_EXEC:
7327 return SYMBOL_SMALL_TPREL;
7329 case TLS_MODEL_EMULATED:
7330 case TLS_MODEL_NONE:
7331 return SYMBOL_FORCE_TO_MEM;
7333 default:
7334 gcc_unreachable ();
7338 /* Return the method that should be used to access SYMBOL_REF or
7339 LABEL_REF X in context CONTEXT. */
7341 enum aarch64_symbol_type
7342 aarch64_classify_symbol (rtx x, rtx offset,
7343 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7345 if (GET_CODE (x) == LABEL_REF)
7347 switch (aarch64_cmodel)
7349 case AARCH64_CMODEL_LARGE:
7350 return SYMBOL_FORCE_TO_MEM;
7352 case AARCH64_CMODEL_TINY_PIC:
7353 case AARCH64_CMODEL_TINY:
7354 return SYMBOL_TINY_ABSOLUTE;
7356 case AARCH64_CMODEL_SMALL_PIC:
7357 case AARCH64_CMODEL_SMALL:
7358 return SYMBOL_SMALL_ABSOLUTE;
7360 default:
7361 gcc_unreachable ();
7365 if (GET_CODE (x) == SYMBOL_REF)
7367 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7368 return SYMBOL_FORCE_TO_MEM;
7370 if (aarch64_tls_symbol_p (x))
7371 return aarch64_classify_tls_symbol (x);
7373 switch (aarch64_cmodel)
7375 case AARCH64_CMODEL_TINY:
7376 /* When we retrieve a symbol + offset address, we have to make sure
7377 the offset does not cause overflow of the final address. But
7378 we have no way of knowing the address of symbol at compile time
7379 so we can't accurately say if the distance between the PC and
7380 symbol + offset is outside the addressable range of +/-1M in the
7381 TINY code model. So we rely on images not being greater than
7382 1M, cap the offset at 1M, and anything beyond 1M will have to
7383 be loaded using an alternative mechanism. */
7384 if (SYMBOL_REF_WEAK (x)
7385 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7386 return SYMBOL_FORCE_TO_MEM;
7387 return SYMBOL_TINY_ABSOLUTE;
7389 case AARCH64_CMODEL_SMALL:
7390 /* Same reasoning as the tiny code model, but the offset cap here is
7391 4G. */
7392 if (SYMBOL_REF_WEAK (x)
7393 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7394 HOST_WIDE_INT_C (4294967264)))
7395 return SYMBOL_FORCE_TO_MEM;
7396 return SYMBOL_SMALL_ABSOLUTE;
7398 case AARCH64_CMODEL_TINY_PIC:
7399 if (!aarch64_symbol_binds_local_p (x))
7400 return SYMBOL_TINY_GOT;
7401 return SYMBOL_TINY_ABSOLUTE;
7403 case AARCH64_CMODEL_SMALL_PIC:
7404 if (!aarch64_symbol_binds_local_p (x))
7405 return SYMBOL_SMALL_GOT;
7406 return SYMBOL_SMALL_ABSOLUTE;
7408 default:
7409 gcc_unreachable ();
7413 /* By default push everything into the constant pool. */
7414 return SYMBOL_FORCE_TO_MEM;
7417 bool
7418 aarch64_constant_address_p (rtx x)
7420 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7423 bool
7424 aarch64_legitimate_pic_operand_p (rtx x)
7426 if (GET_CODE (x) == SYMBOL_REF
7427 || (GET_CODE (x) == CONST
7428 && GET_CODE (XEXP (x, 0)) == PLUS
7429 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7430 return false;
7432 return true;
7435 /* Return true if X holds either a quarter-precision or
7436 floating-point +0.0 constant. */
7437 static bool
7438 aarch64_valid_floating_const (machine_mode mode, rtx x)
7440 if (!CONST_DOUBLE_P (x))
7441 return false;
7443 if (aarch64_float_const_zero_rtx_p (x))
7444 return true;
7446 /* TFmode is only handled for 0.0, which was accepted above; otherwise require SFmode or DFmode. */
7447 if (!(mode == SFmode || mode == DFmode))
7448 return false;
7450 return aarch64_float_const_representable_p (x);
7453 static bool
7454 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7456 /* Do not allow vector struct mode constants. We could support
7457 0 and -1 easily, but they need support in aarch64-simd.md. */
7458 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7459 return false;
7461 /* This could probably go away because
7462 we now decompose CONST_INTs according to expand_mov_immediate. */
7463 if ((GET_CODE (x) == CONST_VECTOR
7464 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7465 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7466 return !targetm.cannot_force_const_mem (mode, x);
7468 if (GET_CODE (x) == HIGH
7469 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7470 return true;
7472 return aarch64_constant_address_p (x);
7476 aarch64_load_tp (rtx target)
7478 if (!target
7479 || GET_MODE (target) != Pmode
7480 || !register_operand (target, Pmode))
7481 target = gen_reg_rtx (Pmode);
7483 /* Can return in any reg. */
7484 emit_insn (gen_aarch64_load_tp_hard (target));
7485 return target;
7488 /* On AAPCS systems, this is the "struct __va_list". */
7489 static GTY(()) tree va_list_type;
7491 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7492 Return the type to use as __builtin_va_list.
7494 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7496 struct __va_list
7498 void *__stack;
7499 void *__gr_top;
7500 void *__vr_top;
7501 int __gr_offs;
7502 int __vr_offs;
7503 }; */
7505 static tree
7506 aarch64_build_builtin_va_list (void)
7508 tree va_list_name;
7509 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7511 /* Create the type. */
7512 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7513 /* Give it the required name. */
7514 va_list_name = build_decl (BUILTINS_LOCATION,
7515 TYPE_DECL,
7516 get_identifier ("__va_list"),
7517 va_list_type);
7518 DECL_ARTIFICIAL (va_list_name) = 1;
7519 TYPE_NAME (va_list_type) = va_list_name;
7520 TYPE_STUB_DECL (va_list_type) = va_list_name;
7522 /* Create the fields. */
7523 f_stack = build_decl (BUILTINS_LOCATION,
7524 FIELD_DECL, get_identifier ("__stack"),
7525 ptr_type_node);
7526 f_grtop = build_decl (BUILTINS_LOCATION,
7527 FIELD_DECL, get_identifier ("__gr_top"),
7528 ptr_type_node);
7529 f_vrtop = build_decl (BUILTINS_LOCATION,
7530 FIELD_DECL, get_identifier ("__vr_top"),
7531 ptr_type_node);
7532 f_groff = build_decl (BUILTINS_LOCATION,
7533 FIELD_DECL, get_identifier ("__gr_offs"),
7534 integer_type_node);
7535 f_vroff = build_decl (BUILTINS_LOCATION,
7536 FIELD_DECL, get_identifier ("__vr_offs"),
7537 integer_type_node);
7539 DECL_ARTIFICIAL (f_stack) = 1;
7540 DECL_ARTIFICIAL (f_grtop) = 1;
7541 DECL_ARTIFICIAL (f_vrtop) = 1;
7542 DECL_ARTIFICIAL (f_groff) = 1;
7543 DECL_ARTIFICIAL (f_vroff) = 1;
7545 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7546 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7547 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7548 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7549 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7551 TYPE_FIELDS (va_list_type) = f_stack;
7552 DECL_CHAIN (f_stack) = f_grtop;
7553 DECL_CHAIN (f_grtop) = f_vrtop;
7554 DECL_CHAIN (f_vrtop) = f_groff;
7555 DECL_CHAIN (f_groff) = f_vroff;
7557 /* Compute its layout. */
7558 layout_type (va_list_type);
7560 return va_list_type;
7563 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
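/* Sketch of the effect (illustrative, not part of the original source),
   assuming the AAPCS64 values NUM_ARG_REGS == 8, NUM_FP_ARG_REGS == 8,
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16: for a callee whose named
   arguments consumed 2 general registers and 1 vector register, va_start
   leaves
       __gr_offs = -(8 - 2) * 8  ==  -48
       __vr_offs = -(8 - 1) * 16 == -112
   with __gr_top/__vr_top pointing just past the respective register save
   areas and __stack at the first stack-passed vararg.  */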
7564 static void
7565 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7567 const CUMULATIVE_ARGS *cum;
7568 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7569 tree stack, grtop, vrtop, groff, vroff;
7570 tree t;
7571 int gr_save_area_size;
7572 int vr_save_area_size;
7573 int vr_offset;
7575 cum = &crtl->args.info;
7576 gr_save_area_size
7577 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7578 vr_save_area_size
7579 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7581 if (!TARGET_FLOAT)
7583 gcc_assert (cum->aapcs_nvrn == 0);
7584 vr_save_area_size = 0;
7587 f_stack = TYPE_FIELDS (va_list_type_node);
7588 f_grtop = DECL_CHAIN (f_stack);
7589 f_vrtop = DECL_CHAIN (f_grtop);
7590 f_groff = DECL_CHAIN (f_vrtop);
7591 f_vroff = DECL_CHAIN (f_groff);
7593 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7594 NULL_TREE);
7595 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7596 NULL_TREE);
7597 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7598 NULL_TREE);
7599 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7600 NULL_TREE);
7601 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7602 NULL_TREE);
7604 /* Emit code to initialize STACK, which points to the next varargs stack
7605 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7606 by named arguments. STACK is 8-byte aligned. */
7607 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7608 if (cum->aapcs_stack_size > 0)
7609 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7610 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7611 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7613 /* Emit code to initialize GRTOP, the top of the GR save area.
7614 virtual_incoming_args_rtx should have been 16-byte aligned. */
7615 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7616 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7617 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7619 /* Emit code to initialize VRTOP, the top of the VR save area.
7620 This address is gr_save_area_bytes below GRTOP, rounded
7621 down to the next 16-byte boundary. */
7622 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7623 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7624 STACK_BOUNDARY / BITS_PER_UNIT);
7626 if (vr_offset)
7627 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7628 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7629 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7631 /* Emit code to initialize GROFF, the offset from GRTOP of the
7632 next GPR argument. */
7633 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7634 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7635 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7637 /* Likewise emit code to initialize VROFF, the offset from FTOP
7638 of the next VR argument. */
7639 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7640 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7641 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7644 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7646 static tree
7647 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7648 gimple_seq *post_p ATTRIBUTE_UNUSED)
7650 tree addr;
7651 bool indirect_p;
7652 bool is_ha; /* is HFA or HVA. */
7653 bool dw_align; /* double-word align. */
7654 machine_mode ag_mode = VOIDmode;
7655 int nregs;
7656 machine_mode mode;
7658 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7659 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7660 HOST_WIDE_INT size, rsize, adjust, align;
7661 tree t, u, cond1, cond2;
7663 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7664 if (indirect_p)
7665 type = build_pointer_type (type);
7667 mode = TYPE_MODE (type);
7669 f_stack = TYPE_FIELDS (va_list_type_node);
7670 f_grtop = DECL_CHAIN (f_stack);
7671 f_vrtop = DECL_CHAIN (f_grtop);
7672 f_groff = DECL_CHAIN (f_vrtop);
7673 f_vroff = DECL_CHAIN (f_groff);
7675 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7676 f_stack, NULL_TREE);
7677 size = int_size_in_bytes (type);
7678 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7680 dw_align = false;
7681 adjust = 0;
7682 if (aarch64_vfp_is_call_or_return_candidate (mode,
7683 type,
7684 &ag_mode,
7685 &nregs,
7686 &is_ha))
7688 /* TYPE passed in fp/simd registers. */
7689 if (!TARGET_FLOAT)
7690 aarch64_err_no_fpadvsimd (mode, "varargs");
7692 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7693 unshare_expr (valist), f_vrtop, NULL_TREE);
7694 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7695 unshare_expr (valist), f_vroff, NULL_TREE);
7697 rsize = nregs * UNITS_PER_VREG;
7699 if (is_ha)
7701 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7702 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7704 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7705 && size < UNITS_PER_VREG)
7707 adjust = UNITS_PER_VREG - size;
7710 else
7712 /* TYPE passed in general registers. */
7713 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7714 unshare_expr (valist), f_grtop, NULL_TREE);
7715 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7716 unshare_expr (valist), f_groff, NULL_TREE);
7717 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7718 nregs = rsize / UNITS_PER_WORD;
7720 if (align > 8)
7721 dw_align = true;
7723 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7724 && size < UNITS_PER_WORD)
7726 adjust = UNITS_PER_WORD - size;
7730 /* Get a local temporary for the field value. */
7731 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7733 /* Emit code to branch if off >= 0. */
7734 t = build2 (GE_EXPR, boolean_type_node, off,
7735 build_int_cst (TREE_TYPE (off), 0));
7736 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7738 if (dw_align)
7740 /* Emit: offs = (offs + 15) & -16. */
7741 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7742 build_int_cst (TREE_TYPE (off), 15));
7743 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7744 build_int_cst (TREE_TYPE (off), -16));
7745 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7747 else
7748 roundup = NULL;
7750 /* Update ap.__[g|v]r_offs */
7751 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7752 build_int_cst (TREE_TYPE (off), rsize));
7753 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7755 /* String up. */
7756 if (roundup)
7757 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7759 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7760 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7761 build_int_cst (TREE_TYPE (f_off), 0));
7762 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7764 /* String up: make sure the assignment happens before the use. */
7765 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7766 COND_EXPR_ELSE (cond1) = t;
7768 /* Prepare the trees handling the argument that is passed on the stack;
7769 the top level node will be stored in ON_STACK. */
7770 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7771 if (align > 8)
7773 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7774 t = fold_convert (intDI_type_node, arg);
7775 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7776 build_int_cst (TREE_TYPE (t), 15));
7777 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7778 build_int_cst (TREE_TYPE (t), -16));
7779 t = fold_convert (TREE_TYPE (arg), t);
7780 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7782 else
7783 roundup = NULL;
7784 /* Advance ap.__stack */
7785 t = fold_convert (intDI_type_node, arg);
7786 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7787 build_int_cst (TREE_TYPE (t), size + 7));
7788 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7789 build_int_cst (TREE_TYPE (t), -8));
7790 t = fold_convert (TREE_TYPE (arg), t);
7791 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7792 /* String up roundup and advance. */
7793 if (roundup)
7794 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7795 /* String up with arg */
7796 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7797 /* Big-endianness related address adjustment. */
7798 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7799 && size < UNITS_PER_WORD)
7801 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7802 size_int (UNITS_PER_WORD - size));
7803 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7806 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7807 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7809 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7810 t = off;
7811 if (adjust)
7812 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7813 build_int_cst (TREE_TYPE (off), adjust));
7815 t = fold_convert (sizetype, t);
7816 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7818 if (is_ha)
7820 /* type ha; // treat as "struct {ftype field[n];}"
7821 ... [computing offs]
7822 for (i = 0; i < nregs; ++i, offs += 16)
7823 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7824 return ha; */
7825 int i;
7826 tree tmp_ha, field_t, field_ptr_t;
7828 /* Declare a local variable. */
7829 tmp_ha = create_tmp_var_raw (type, "ha");
7830 gimple_add_tmp_var (tmp_ha);
7832 /* Establish the base type. */
7833 switch (ag_mode)
7835 case SFmode:
7836 field_t = float_type_node;
7837 field_ptr_t = float_ptr_type_node;
7838 break;
7839 case DFmode:
7840 field_t = double_type_node;
7841 field_ptr_t = double_ptr_type_node;
7842 break;
7843 case TFmode:
7844 field_t = long_double_type_node;
7845 field_ptr_t = long_double_ptr_type_node;
7846 break;
7847 /* Half-precision and quad-precision are not fully supported yet. Enable
7848 the following code once support is complete; the correct type node
7849 for __fp16 * still needs to be found. */
7850 #if 0
7851 case HFmode:
7852 field_t = float_type_node;
7853 field_ptr_t = float_ptr_type_node;
7854 break;
7855 #endif
7856 case V2SImode:
7857 case V4SImode:
7859 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7860 field_t = build_vector_type_for_mode (innertype, ag_mode);
7861 field_ptr_t = build_pointer_type (field_t);
7863 break;
7864 default:
7865 gcc_assert (0);
7868 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7869 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7870 addr = t;
7871 t = fold_convert (field_ptr_t, addr);
7872 t = build2 (MODIFY_EXPR, field_t,
7873 build1 (INDIRECT_REF, field_t, tmp_ha),
7874 build1 (INDIRECT_REF, field_t, t));
7876 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7877 for (i = 1; i < nregs; ++i)
7879 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7880 u = fold_convert (field_ptr_t, addr);
7881 u = build2 (MODIFY_EXPR, field_t,
7882 build2 (MEM_REF, field_t, tmp_ha,
7883 build_int_cst (field_ptr_t,
7884 (i *
7885 int_size_in_bytes (field_t)))),
7886 build1 (INDIRECT_REF, field_t, u));
7887 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7890 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7891 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7894 COND_EXPR_ELSE (cond2) = t;
7895 addr = fold_convert (build_pointer_type (type), cond1);
7896 addr = build_va_arg_indirect_ref (addr);
7898 if (indirect_p)
7899 addr = build_va_arg_indirect_ref (addr);
7901 return addr;
7904 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7906 static void
7907 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7908 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7909 int no_rtl)
7911 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7912 CUMULATIVE_ARGS local_cum;
7913 int gr_saved, vr_saved;
7915 /* The caller has advanced CUM up to, but not beyond, the last named
7916 argument. Advance a local copy of CUM past the last "real" named
7917 argument, to find out how many registers are left over. */
7918 local_cum = *cum;
7919 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7921 /* Find out how many registers we need to save. */
7922 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7923 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7925 if (!TARGET_FLOAT)
7927 gcc_assert (local_cum.aapcs_nvrn == 0);
7928 vr_saved = 0;
7931 if (!no_rtl)
7933 if (gr_saved > 0)
7935 rtx ptr, mem;
7937 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7938 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7939 - gr_saved * UNITS_PER_WORD);
7940 mem = gen_frame_mem (BLKmode, ptr);
7941 set_mem_alias_set (mem, get_varargs_alias_set ());
7943 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7944 mem, gr_saved);
7946 if (vr_saved > 0)
7948 /* We can't use move_block_from_reg, because it will use
7949 the wrong mode, storing D regs only. */
7950 machine_mode mode = TImode;
7951 int off, i;
7953 /* Set OFF to the offset from virtual_incoming_args_rtx of
7954 the first vector register. The VR save area lies below
7955 the GR one, and is aligned to 16 bytes. */
7956 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7957 STACK_BOUNDARY / BITS_PER_UNIT);
7958 off -= vr_saved * UNITS_PER_VREG;
7960 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7962 rtx ptr, mem;
7964 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7965 mem = gen_frame_mem (mode, ptr);
7966 set_mem_alias_set (mem, get_varargs_alias_set ());
7967 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7968 off += UNITS_PER_VREG;
7973 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7974 any complication of having crtl->args.pretend_args_size changed. */
7975 cfun->machine->frame.saved_varargs_size
7976 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7977 STACK_BOUNDARY / BITS_PER_UNIT)
7978 + vr_saved * UNITS_PER_VREG);
7981 static void
7982 aarch64_conditional_register_usage (void)
7984 int i;
7985 if (!TARGET_FLOAT)
7987 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7989 fixed_regs[i] = 1;
7990 call_used_regs[i] = 1;
7995 /* Walk down the type tree of TYPE counting consecutive base elements.
7996 If *MODEP is VOIDmode, then set it to the first valid floating point
7997 type. If a non-floating point type is found, or if a floating point
7998 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7999 otherwise return the count in the sub-tree. */
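/* Illustrative example (not part of the original source): for
       struct { double x; double y; double z; };
   this returns 3 with *MODEP set to DFmode, i.e. a homogeneous
   floating-point aggregate of three elements; adding a float member would
   make the element modes mismatch and the function would return -1.  */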
8000 static int
8001 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
8003 machine_mode mode;
8004 HOST_WIDE_INT size;
8006 switch (TREE_CODE (type))
8008 case REAL_TYPE:
8009 mode = TYPE_MODE (type);
8010 if (mode != DFmode && mode != SFmode && mode != TFmode)
8011 return -1;
8013 if (*modep == VOIDmode)
8014 *modep = mode;
8016 if (*modep == mode)
8017 return 1;
8019 break;
8021 case COMPLEX_TYPE:
8022 mode = TYPE_MODE (TREE_TYPE (type));
8023 if (mode != DFmode && mode != SFmode && mode != TFmode)
8024 return -1;
8026 if (*modep == VOIDmode)
8027 *modep = mode;
8029 if (*modep == mode)
8030 return 2;
8032 break;
8034 case VECTOR_TYPE:
8035 /* Use V2SImode and V4SImode as representatives of all 64-bit
8036 and 128-bit vector types. */
8037 size = int_size_in_bytes (type);
8038 switch (size)
8040 case 8:
8041 mode = V2SImode;
8042 break;
8043 case 16:
8044 mode = V4SImode;
8045 break;
8046 default:
8047 return -1;
8050 if (*modep == VOIDmode)
8051 *modep = mode;
8053 /* Vector modes are considered to be opaque: two vectors are
8054 equivalent for the purposes of being homogeneous aggregates
8055 if they are the same size. */
8056 if (*modep == mode)
8057 return 1;
8059 break;
8061 case ARRAY_TYPE:
8063 int count;
8064 tree index = TYPE_DOMAIN (type);
8066 /* Can't handle incomplete types nor sizes that are not
8067 fixed. */
8068 if (!COMPLETE_TYPE_P (type)
8069 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8070 return -1;
8072 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
8073 if (count == -1
8074 || !index
8075 || !TYPE_MAX_VALUE (index)
8076 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
8077 || !TYPE_MIN_VALUE (index)
8078 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
8079 || count < 0)
8080 return -1;
8082 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
8083 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
8085 /* There must be no padding. */
8086 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8087 return -1;
8089 return count;
8092 case RECORD_TYPE:
8094 int count = 0;
8095 int sub_count;
8096 tree field;
8098 /* Can't handle incomplete types nor sizes that are not
8099 fixed. */
8100 if (!COMPLETE_TYPE_P (type)
8101 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8102 return -1;
8104 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8106 if (TREE_CODE (field) != FIELD_DECL)
8107 continue;
8109 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8110 if (sub_count < 0)
8111 return -1;
8112 count += sub_count;
8115 /* There must be no padding. */
8116 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8117 return -1;
8119 return count;
8122 case UNION_TYPE:
8123 case QUAL_UNION_TYPE:
8125 /* These aren't very interesting except in a degenerate case. */
8126 int count = 0;
8127 int sub_count;
8128 tree field;
8130 /* Can't handle incomplete types nor sizes that are not
8131 fixed. */
8132 if (!COMPLETE_TYPE_P (type)
8133 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8134 return -1;
8136 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8138 if (TREE_CODE (field) != FIELD_DECL)
8139 continue;
8141 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8142 if (sub_count < 0)
8143 return -1;
8144 count = count > sub_count ? count : sub_count;
8147 /* There must be no padding. */
8148 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8149 return -1;
8151 return count;
8154 default:
8155 break;
8158 return -1;
8161 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
8162 type as described in AAPCS64 \S 4.1.2.
8164 See the comment above aarch64_composite_type_p for the notes on MODE. */
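/* Illustrative example (not part of the original source): the ACLE types
   int32x2_t (8 bytes) and int32x4_t (16 bytes) are short vectors in this
   sense, whereas a 32-byte GNU vector type is not.  */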
8166 static bool
8167 aarch64_short_vector_p (const_tree type,
8168 machine_mode mode)
8170 HOST_WIDE_INT size = -1;
8172 if (type && TREE_CODE (type) == VECTOR_TYPE)
8173 size = int_size_in_bytes (type);
8174 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8175 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8176 size = GET_MODE_SIZE (mode);
8178 return (size == 8 || size == 16);
8181 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
8182 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
8183 array types. The C99 floating-point complex types are also considered
8184 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
8185 types, which are GCC extensions and out of the scope of AAPCS64, are
8186 treated as composite types here as well.
8188 Note that MODE itself is not sufficient in determining whether a type
8189 is such a composite type or not. This is because
8190 stor-layout.c:compute_record_mode may have already changed the MODE
8191 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
8192 structure with only one field may have its MODE set to the mode of the
8193 field. Also an integer mode whose size matches the size of the
8194 RECORD_TYPE type may be used to substitute the original mode
8195 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
8196 solely relied on. */
8198 static bool
8199 aarch64_composite_type_p (const_tree type,
8200 machine_mode mode)
8202 if (aarch64_short_vector_p (type, mode))
8203 return false;
8205 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
8206 return true;
8208 if (mode == BLKmode
8209 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
8210 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
8211 return true;
8213 return false;
8216 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8217 shall be passed or returned in simd/fp register(s) (providing these
8218 parameter passing registers are available).
8220 Upon successful return, *COUNT returns the number of needed registers,
8221 *BASE_MODE returns the mode of the individual register and when IS_HA
8222 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8223 floating-point aggregate or a homogeneous short-vector aggregate. */
8225 static bool
8226 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8227 const_tree type,
8228 machine_mode *base_mode,
8229 int *count,
8230 bool *is_ha)
8232 machine_mode new_mode = VOIDmode;
8233 bool composite_p = aarch64_composite_type_p (type, mode);
8235 if (is_ha != NULL) *is_ha = false;
8237 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8238 || aarch64_short_vector_p (type, mode))
8240 *count = 1;
8241 new_mode = mode;
8243 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8245 if (is_ha != NULL) *is_ha = true;
8246 *count = 2;
8247 new_mode = GET_MODE_INNER (mode);
8249 else if (type && composite_p)
8251 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8253 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8255 if (is_ha != NULL) *is_ha = true;
8256 *count = ag_count;
8258 else
8259 return false;
8261 else
8262 return false;
8264 *base_mode = new_mode;
8265 return true;
8268 /* Implement TARGET_STRUCT_VALUE_RTX. */
8270 static rtx
8271 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8272 int incoming ATTRIBUTE_UNUSED)
8274 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8277 /* Implements target hook vector_mode_supported_p. */
8278 static bool
8279 aarch64_vector_mode_supported_p (machine_mode mode)
8281 if (TARGET_SIMD
8282 && (mode == V4SImode || mode == V8HImode
8283 || mode == V16QImode || mode == V2DImode
8284 || mode == V2SImode || mode == V4HImode
8285 || mode == V8QImode || mode == V2SFmode
8286 || mode == V4SFmode || mode == V2DFmode
8287 || mode == V1DFmode))
8288 return true;
8290 return false;
8293 /* Return appropriate SIMD container
8294 for MODE within a vector of WIDTH bits. */
8295 static machine_mode
8296 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8298 gcc_assert (width == 64 || width == 128);
8299 if (TARGET_SIMD)
8301 if (width == 128)
8302 switch (mode)
8304 case DFmode:
8305 return V2DFmode;
8306 case SFmode:
8307 return V4SFmode;
8308 case SImode:
8309 return V4SImode;
8310 case HImode:
8311 return V8HImode;
8312 case QImode:
8313 return V16QImode;
8314 case DImode:
8315 return V2DImode;
8316 default:
8317 break;
8319 else
8320 switch (mode)
8322 case SFmode:
8323 return V2SFmode;
8324 case SImode:
8325 return V2SImode;
8326 case HImode:
8327 return V4HImode;
8328 case QImode:
8329 return V8QImode;
8330 default:
8331 break;
8334 return word_mode;
8337 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8338 static machine_mode
8339 aarch64_preferred_simd_mode (machine_mode mode)
8341 return aarch64_simd_container_mode (mode, 128);
8344 /* Return the bitmask of possible vector sizes for the vectorizer
8345 to iterate over. */
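/* Illustrative note (not part of the original source): the bitmask below,
   16 | 8, advertises 128-bit and 64-bit vectors; together with
   aarch64_preferred_simd_mode above, this lets the vectorizer try Q
   registers first and fall back to D registers.  */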
8346 static unsigned int
8347 aarch64_autovectorize_vector_sizes (void)
8349 return (16 | 8);
8352 /* Implement TARGET_MANGLE_TYPE. */
8354 static const char *
8355 aarch64_mangle_type (const_tree type)
8357 /* The AArch64 ABI documents say that "__va_list" has to be
8358 mangled as if it is in the "std" namespace. */
8359 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8360 return "St9__va_list";
8362 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8363 builtin types. */
8364 if (TYPE_NAME (type) != NULL)
8365 return aarch64_mangle_builtin_type (type);
8367 /* Use the default mangling. */
8368 return NULL;
8372 /* Return true if the rtx_insn contains a MEM RTX somewhere
8373 in it. */
8375 static bool
8376 has_memory_op (rtx_insn *mem_insn)
8378 subrtx_iterator::array_type array;
8379 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8380 if (MEM_P (*iter))
8381 return true;
8383 return false;
8386 /* Find the first rtx_insn before insn that will generate an assembly
8387 instruction. */
8389 static rtx_insn *
8390 aarch64_prev_real_insn (rtx_insn *insn)
8392 if (!insn)
8393 return NULL;
8397 insn = prev_real_insn (insn);
8399 while (insn && recog_memoized (insn) < 0);
8401 return insn;
8404 static bool
8405 is_madd_op (enum attr_type t1)
8407 unsigned int i;
8408 /* A number of these may be AArch32 only. */
8409 enum attr_type mlatypes[] = {
8410 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8411 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8412 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8415 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8417 if (t1 == mlatypes[i])
8418 return true;
8421 return false;
8424 /* Check if there is a register dependency between a load and the insn
8425 for which we hold recog_data. */
8427 static bool
8428 dep_between_memop_and_curr (rtx memop)
8430 rtx load_reg;
8431 int opno;
8433 gcc_assert (GET_CODE (memop) == SET);
8435 if (!REG_P (SET_DEST (memop)))
8436 return false;
8438 load_reg = SET_DEST (memop);
8439 for (opno = 1; opno < recog_data.n_operands; opno++)
8441 rtx operand = recog_data.operand[opno];
8442 if (REG_P (operand)
8443 && reg_overlap_mentioned_p (load_reg, operand))
8444 return true;
8447 return false;
8451 /* When working around the Cortex-A53 erratum 835769,
8452 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8453 instruction and has a preceding memory instruction such that a NOP
8454 should be inserted between them. */
8456 bool
8457 aarch64_madd_needs_nop (rtx_insn* insn)
8459 enum attr_type attr_type;
8460 rtx_insn *prev;
8461 rtx body;
8463 if (!aarch64_fix_a53_err835769)
8464 return false;
8466 if (recog_memoized (insn) < 0)
8467 return false;
8469 attr_type = get_attr_type (insn);
8470 if (!is_madd_op (attr_type))
8471 return false;
8473 prev = aarch64_prev_real_insn (insn);
8474 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8475 Restore recog state to INSN to avoid state corruption. */
8476 extract_constrain_insn_cached (insn);
8478 if (!prev || !has_memory_op (prev))
8479 return false;
8481 body = single_set (prev);
8483 /* If the previous insn is a memory op and there is no dependency between
8484 it and the DImode madd, emit a NOP between them. If body is NULL then we
8485 have a complex memory operation, probably a load/store pair.
8486 Be conservative for now and emit a NOP. */
8487 if (GET_MODE (recog_data.operand[0]) == DImode
8488 && (!body || !dep_between_memop_and_curr (body)))
8489 return true;
8491 return false;
8496 /* Implement FINAL_PRESCAN_INSN. */
8498 void
8499 aarch64_final_prescan_insn (rtx_insn *insn)
8501 if (aarch64_madd_needs_nop (insn))
8502 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8506 /* Return the equivalent letter for size. */
8507 static char
8508 sizetochar (int size)
8510 switch (size)
8512 case 64: return 'd';
8513 case 32: return 's';
8514 case 16: return 'h';
8515 case 8 : return 'b';
8516 default: gcc_unreachable ();
8520 /* Return true iff x is a uniform vector of floating-point
8521 constants, and the constant can be represented in
8522 quarter-precision form. Note, as aarch64_float_const_representable
8523 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8524 static bool
8525 aarch64_vect_float_const_representable_p (rtx x)
8527 int i = 0;
8528 REAL_VALUE_TYPE r0, ri;
8529 rtx x0, xi;
8531 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8532 return false;
8534 x0 = CONST_VECTOR_ELT (x, 0);
8535 if (!CONST_DOUBLE_P (x0))
8536 return false;
8538 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8540 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8542 xi = CONST_VECTOR_ELT (x, i);
8543 if (!CONST_DOUBLE_P (xi))
8544 return false;
8546 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8547 if (!REAL_VALUES_EQUAL (r0, ri))
8548 return false;
8551 return aarch64_float_const_representable_p (x0);
8554 /* Return true for valid and false for invalid. */
8555 bool
8556 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8557 struct simd_immediate_info *info)
8559 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8560 matches = 1; \
8561 for (i = 0; i < idx; i += (STRIDE)) \
8562 if (!(TEST)) \
8563 matches = 0; \
8564 if (matches) \
8566 immtype = (CLASS); \
8567 elsize = (ELSIZE); \
8568 eshift = (SHIFT); \
8569 emvn = (NEG); \
8570 break; \
8573 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8574 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8575 unsigned char bytes[16];
8576 int immtype = -1, matches;
8577 unsigned int invmask = inverse ? 0xff : 0;
8578 int eshift, emvn;
8580 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8582 if (! (aarch64_simd_imm_zero_p (op, mode)
8583 || aarch64_vect_float_const_representable_p (op)))
8584 return false;
8586 if (info)
8588 info->value = CONST_VECTOR_ELT (op, 0);
8589 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8590 info->mvn = false;
8591 info->shift = 0;
8594 return true;
8597 /* Splat vector constant out into a byte vector. */
8598 for (i = 0; i < n_elts; i++)
8600 /* The vector is provided in GCC's endian-neutral fashion. For aarch64_be,
8601 it must be laid out in the vector register in reverse order. */
8602 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8603 unsigned HOST_WIDE_INT elpart;
8604 unsigned int part, parts;
8606 if (CONST_INT_P (el))
8608 elpart = INTVAL (el);
8609 parts = 1;
8611 else if (GET_CODE (el) == CONST_DOUBLE)
8613 elpart = CONST_DOUBLE_LOW (el);
8614 parts = 2;
8616 else
8617 gcc_unreachable ();
8619 for (part = 0; part < parts; part++)
8621 unsigned int byte;
8622 for (byte = 0; byte < innersize; byte++)
8624 bytes[idx++] = (elpart & 0xff) ^ invmask;
8625 elpart >>= BITS_PER_UNIT;
8627 if (GET_CODE (el) == CONST_DOUBLE)
8628 elpart = CONST_DOUBLE_HIGH (el);
8632 /* Sanity check. */
8633 gcc_assert (idx == GET_MODE_SIZE (mode));
8637 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8638 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8640 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8641 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8643 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8644 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8646 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8647 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8649 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8651 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8653 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8654 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8656 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8657 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8659 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8660 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8662 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8663 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8665 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8667 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8669 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8670 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8672 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8673 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8675 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8676 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8678 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8679 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8681 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8683 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8684 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8686 while (0);
8688 if (immtype == -1)
8689 return false;
8691 if (info)
8693 info->element_width = elsize;
8694 info->mvn = emvn != 0;
8695 info->shift = eshift;
8697 unsigned HOST_WIDE_INT imm = 0;
8699 if (immtype >= 12 && immtype <= 15)
8700 info->msl = true;
8702 /* Un-invert bytes of recognized vector, if necessary. */
8703 if (invmask != 0)
8704 for (i = 0; i < idx; i++)
8705 bytes[i] ^= invmask;
8707 if (immtype == 17)
8709 /* FIXME: Broken on 32-bit H_W_I hosts. */
8710 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8712 for (i = 0; i < 8; i++)
8713 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8714 << (i * BITS_PER_UNIT);
8717 info->value = GEN_INT (imm);
8719 else
8721 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8722 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8724 /* Construct 'abcdefgh' because the assembler cannot handle
8725 generic constants. */
8726 if (info->mvn)
8727 imm = ~imm;
8728 imm = (imm >> info->shift) & 0xff;
8729 info->value = GEN_INT (imm);
8733 return true;
8734 #undef CHECK
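/* Worked example for the CHECK table above (values invented for
   illustration): a V4SImode vector whose four lanes all hold 0x00AB0000
   splats to the byte pattern { 00, 00, AB, 00, ... }, which matches the
   class-2 test (32-bit element, non-zero byte in position 2).  INFO is
   then filled in with element_width = 32, shift = 16, mvn = false and
   value = 0xAB, i.e. the immediate the assembler sees in
   "movi v0.4s, 0xab, lsl 16".  */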
8737 /* Check if immediate shift constants are within range. */
8738 bool
8739 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8741 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8742 if (left)
8743 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8744 else
8745 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8748 /* Return true if X is a uniform vector where all elements
8749 are either the floating-point constant 0.0 or the
8750 integer constant 0. */
8751 bool
8752 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8754 return x == CONST0_RTX (mode);
8757 bool
8758 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8760 HOST_WIDE_INT imm = INTVAL (x);
8761 int i;
8763 for (i = 0; i < 8; i++)
8765 unsigned int byte = imm & 0xff;
8766 if (byte != 0xff && byte != 0)
8767 return false;
8768 imm >>= 8;
8771 return true;
8774 bool
8775 aarch64_mov_operand_p (rtx x,
8776 enum aarch64_symbol_context context,
8777 machine_mode mode)
8779 if (GET_CODE (x) == HIGH
8780 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8781 return true;
8783 if (CONST_INT_P (x))
8784 return true;
8786 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8787 return true;
8789 return aarch64_classify_symbolic_expression (x, context)
8790 == SYMBOL_TINY_ABSOLUTE;
8793 /* Return a const_int vector of VAL. */
8795 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8797 int nunits = GET_MODE_NUNITS (mode);
8798 rtvec v = rtvec_alloc (nunits);
8799 int i;
8801 for (i=0; i < nunits; i++)
8802 RTVEC_ELT (v, i) = GEN_INT (val);
8804 return gen_rtx_CONST_VECTOR (mode, v);
8807 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8809 bool
8810 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8812 machine_mode vmode;
8814 gcc_assert (!VECTOR_MODE_P (mode));
8815 vmode = aarch64_preferred_simd_mode (mode);
8816 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8817 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8820 /* Construct and return a PARALLEL RTX vector with elements numbering the
8821 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8822 the vector - from the perspective of the architecture. This does not
8823 line up with GCC's perspective on lane numbers, so we end up with
8824 different masks depending on our target endianness. The diagram
8825 below may help. We must draw the distinction when building masks
8826 which select one half of the vector. An instruction selecting
8827 architectural low-lanes for a big-endian target must be described using
8828 a mask selecting GCC high-lanes.
8830 Big-Endian Little-Endian
8832 GCC 0 1 2 3 3 2 1 0
8833 | x | x | x | x | | x | x | x | x |
8834 Architecture 3 2 1 0 3 2 1 0
8836 Low Mask: { 2, 3 } { 0, 1 }
8837 High Mask: { 0, 1 } { 2, 3 }
8841 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8843 int nunits = GET_MODE_NUNITS (mode);
8844 rtvec v = rtvec_alloc (nunits / 2);
8845 int high_base = nunits / 2;
8846 int low_base = 0;
8847 int base;
8848 rtx t1;
8849 int i;
8851 if (BYTES_BIG_ENDIAN)
8852 base = high ? low_base : high_base;
8853 else
8854 base = high ? high_base : low_base;
8856 for (i = 0; i < nunits / 2; i++)
8857 RTVEC_ELT (v, i) = GEN_INT (base + i);
8859 t1 = gen_rtx_PARALLEL (mode, v);
8860 return t1;
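/* For instance (V4SImode, so four lanes): asking for the high half
   yields the PARALLEL [2 3] on little-endian but [0 1] on big-endian,
   matching the "High Mask" row of the diagram above.  */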
8863 /* Check OP for validity as a PARALLEL RTX vector with elements
8864 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8865 from the perspective of the architecture. See the diagram above
8866 aarch64_simd_vect_par_cnst_half for more details. */
8868 bool
8869 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8870 bool high)
8872 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8873 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8874 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8875 int i = 0;
8877 if (!VECTOR_MODE_P (mode))
8878 return false;
8880 if (count_op != count_ideal)
8881 return false;
8883 for (i = 0; i < count_ideal; i++)
8885 rtx elt_op = XVECEXP (op, 0, i);
8886 rtx elt_ideal = XVECEXP (ideal, 0, i);
8888 if (!CONST_INT_P (elt_op)
8889 || INTVAL (elt_ideal) != INTVAL (elt_op))
8890 return false;
8892 return true;
8895 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8896 HIGH (exclusive). */
8897 void
8898 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8899 const_tree exp)
8901 HOST_WIDE_INT lane;
8902 gcc_assert (CONST_INT_P (operand));
8903 lane = INTVAL (operand);
8905 if (lane < low || lane >= high)
8907 if (exp)
8908 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
8909 else
8910 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
8914 /* Return TRUE if OP is a valid vector addressing mode. */
8915 bool
8916 aarch64_simd_mem_operand_p (rtx op)
8918 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8919 || REG_P (XEXP (op, 0)));
8922 /* Emit a register copy from operand to operand, taking care not to
8923 early-clobber source registers in the process.
8925 COUNT is the number of components into which the copy needs to be
8926 decomposed. */
8927 void
8928 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8929 unsigned int count)
8931 unsigned int i;
8932 int rdest = REGNO (operands[0]);
8933 int rsrc = REGNO (operands[1]);
8935 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8936 || rdest < rsrc)
8937 for (i = 0; i < count; i++)
8938 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8939 gen_rtx_REG (mode, rsrc + i));
8940 else
8941 for (i = 0; i < count; i++)
8942 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8943 gen_rtx_REG (mode, rsrc + count - i - 1));
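/* Example of the ordering rule above (register numbers invented):
   copying a two-component value from {v1,v2} to {v2,v3} overlaps with
   rdest > rsrc, so the copy is emitted highest-numbered component first
   (v3 <- v2, then v2 <- v1); copying from {v2,v3} to {v1,v2} can safely
   proceed lowest component first.  */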
8946 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8947 one of VSTRUCT modes: OI, CI or XI. */
8949 aarch64_simd_attr_length_move (rtx_insn *insn)
8951 machine_mode mode;
8953 extract_insn_cached (insn);
8955 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8957 mode = GET_MODE (recog_data.operand[0]);
8958 switch (mode)
8960 case OImode:
8961 return 8;
8962 case CImode:
8963 return 12;
8964 case XImode:
8965 return 16;
8966 default:
8967 gcc_unreachable ();
8970 return 4;
8973 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8974 one of VSTRUCT modes: OI, CI, EI, or XI. */
8976 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8978 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8981 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8982 alignment of a vector to 128 bits. */
8983 static HOST_WIDE_INT
8984 aarch64_simd_vector_alignment (const_tree type)
8986 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8987 return MIN (align, 128);
8990 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8991 static bool
8992 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8994 if (is_packed)
8995 return false;
8997 /* We guarantee alignment for vectors up to 128-bits. */
8998 if (tree_int_cst_compare (TYPE_SIZE (type),
8999 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
9000 return false;
9002 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
9003 return true;
9006 /* If VALS is a vector constant that can be loaded into a register
9007 using DUP, generate instructions to do so and return an RTX to
9008 assign to the register. Otherwise return NULL_RTX. */
9009 static rtx
9010 aarch64_simd_dup_constant (rtx vals)
9012 machine_mode mode = GET_MODE (vals);
9013 machine_mode inner_mode = GET_MODE_INNER (mode);
9014 int n_elts = GET_MODE_NUNITS (mode);
9015 bool all_same = true;
9016 rtx x;
9017 int i;
9019 if (GET_CODE (vals) != CONST_VECTOR)
9020 return NULL_RTX;
9022 for (i = 1; i < n_elts; ++i)
9024 x = CONST_VECTOR_ELT (vals, i);
9025 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
9026 all_same = false;
9029 if (!all_same)
9030 return NULL_RTX;
9032 /* We can load this constant by using DUP and a constant in a
9033 single general-purpose register. This will be cheaper than a vector
9034 load. */
9035 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
9036 return gen_rtx_VEC_DUPLICATE (mode, x);
9040 /* Generate code to load VALS, which is a PARALLEL containing only
9041 constants (for vec_init) or CONST_VECTOR, efficiently into a
9042 register. Returns an RTX to copy into the register, or NULL_RTX
9043 for a PARALLEL that can not be converted into a CONST_VECTOR. */
9044 static rtx
9045 aarch64_simd_make_constant (rtx vals)
9047 machine_mode mode = GET_MODE (vals);
9048 rtx const_dup;
9049 rtx const_vec = NULL_RTX;
9050 int n_elts = GET_MODE_NUNITS (mode);
9051 int n_const = 0;
9052 int i;
9054 if (GET_CODE (vals) == CONST_VECTOR)
9055 const_vec = vals;
9056 else if (GET_CODE (vals) == PARALLEL)
9058 /* A CONST_VECTOR must contain only CONST_INTs and
9059 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
9060 Only store valid constants in a CONST_VECTOR. */
9061 for (i = 0; i < n_elts; ++i)
9063 rtx x = XVECEXP (vals, 0, i);
9064 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9065 n_const++;
9067 if (n_const == n_elts)
9068 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
9070 else
9071 gcc_unreachable ();
9073 if (const_vec != NULL_RTX
9074 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
9075 /* Load using MOVI/MVNI. */
9076 return const_vec;
9077 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
9078 /* Loaded using DUP. */
9079 return const_dup;
9080 else if (const_vec != NULL_RTX)
9081 /* Load from constant pool. We can not take advantage of single-cycle
9082 LD1 because we need a PC-relative addressing mode. */
9083 return const_vec;
9084 else
9085 /* A PARALLEL containing something not valid inside CONST_VECTOR.
9086 We can not construct an initializer. */
9087 return NULL_RTX;
9090 void
9091 aarch64_expand_vector_init (rtx target, rtx vals)
9093 machine_mode mode = GET_MODE (target);
9094 machine_mode inner_mode = GET_MODE_INNER (mode);
9095 int n_elts = GET_MODE_NUNITS (mode);
9096 int n_var = 0;
9097 rtx any_const = NULL_RTX;
9098 bool all_same = true;
9100 for (int i = 0; i < n_elts; ++i)
9102 rtx x = XVECEXP (vals, 0, i);
9103 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
9104 ++n_var;
9105 else
9106 any_const = x;
9108 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
9109 all_same = false;
9112 if (n_var == 0)
9114 rtx constant = aarch64_simd_make_constant (vals);
9115 if (constant != NULL_RTX)
9117 emit_move_insn (target, constant);
9118 return;
9122 /* Splat a single non-constant element if we can. */
9123 if (all_same)
9125 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
9126 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
9127 return;
9130 /* Half the fields (or fewer) are non-constant. Load constant then overwrite
9131 varying fields. Hope that this is more efficient than using the stack. */
9132 if (n_var <= n_elts/2)
9134 rtx copy = copy_rtx (vals);
9136 /* Load constant part of vector. We really don't care what goes into the
9137 parts we will overwrite, but we're more likely to be able to load the
9138 constant efficiently if it has fewer, larger, repeating parts
9139 (see aarch64_simd_valid_immediate). */
9140 for (int i = 0; i < n_elts; i++)
9142 rtx x = XVECEXP (vals, 0, i);
9143 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9144 continue;
9145 rtx subst = any_const;
9146 for (int bit = n_elts / 2; bit > 0; bit /= 2)
9148 /* Look in the copied vector, as more elements are const. */
9149 rtx test = XVECEXP (copy, 0, i ^ bit);
9150 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
9152 subst = test;
9153 break;
9156 XVECEXP (copy, 0, i) = subst;
9158 aarch64_expand_vector_init (target, copy);
9160 /* Insert variables. */
9161 enum insn_code icode = optab_handler (vec_set_optab, mode);
9162 gcc_assert (icode != CODE_FOR_nothing);
9164 for (int i = 0; i < n_elts; i++)
9166 rtx x = XVECEXP (vals, 0, i);
9167 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9168 continue;
9169 x = copy_to_mode_reg (inner_mode, x);
9170 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
9172 return;
9175 /* Construct the vector in memory one field at a time
9176 and load the whole vector. */
9177 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
9178 for (int i = 0; i < n_elts; i++)
9179 emit_move_insn (adjust_address_nv (mem, inner_mode,
9180 i * GET_MODE_SIZE (inner_mode)),
9181 XVECEXP (vals, 0, i));
9182 emit_move_insn (target, mem);
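/* A small walk-through of the strategy above (lane values invented for
   illustration): initialising a V4SImode vector with { x, 1, 2, 3 },
   where only x is non-constant, first loads the constant vector
   { 2, 1, 2, 3 } (the variable lane borrows a nearby constant so the
   immediate keeps repeating parts), then overwrites lane 0 with x via
   the vec_set pattern.  Fully variable vectors fall back to building
   the value in a stack temporary and loading it whole.  */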
9186 static unsigned HOST_WIDE_INT
9187 aarch64_shift_truncation_mask (machine_mode mode)
9189 return
9190 (aarch64_vector_mode_supported_p (mode)
9191 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
9194 #ifndef TLS_SECTION_ASM_FLAG
9195 #define TLS_SECTION_ASM_FLAG 'T'
9196 #endif
9198 void
9199 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
9200 tree decl ATTRIBUTE_UNUSED)
9202 char flagchars[10], *f = flagchars;
9204 /* If we have already declared this section, we can use an
9205 abbreviated form to switch back to it -- unless this section is
9206 part of a COMDAT group, in which case GAS requires the full
9207 declaration every time. */
9208 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9209 && (flags & SECTION_DECLARED))
9211 fprintf (asm_out_file, "\t.section\t%s\n", name);
9212 return;
9215 if (!(flags & SECTION_DEBUG))
9216 *f++ = 'a';
9217 if (flags & SECTION_WRITE)
9218 *f++ = 'w';
9219 if (flags & SECTION_CODE)
9220 *f++ = 'x';
9221 if (flags & SECTION_SMALL)
9222 *f++ = 's';
9223 if (flags & SECTION_MERGE)
9224 *f++ = 'M';
9225 if (flags & SECTION_STRINGS)
9226 *f++ = 'S';
9227 if (flags & SECTION_TLS)
9228 *f++ = TLS_SECTION_ASM_FLAG;
9229 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9230 *f++ = 'G';
9231 *f = '\0';
9233 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9235 if (!(flags & SECTION_NOTYPE))
9237 const char *type;
9238 const char *format;
9240 if (flags & SECTION_BSS)
9241 type = "nobits";
9242 else
9243 type = "progbits";
9245 #ifdef TYPE_OPERAND_FMT
9246 format = "," TYPE_OPERAND_FMT;
9247 #else
9248 format = ",@%s";
9249 #endif
9251 fprintf (asm_out_file, format, type);
9253 if (flags & SECTION_ENTSIZE)
9254 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9255 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9257 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9258 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9259 else
9260 fprintf (asm_out_file, ",%s,comdat",
9261 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9265 putc ('\n', asm_out_file);
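/* For a COMDAT text section this ends up emitting something like
   (section and group names invented for the example):

       .section .text._Z3foov,"axG",@progbits,_Z3foov,comdat

   i.e. flags 'a', 'x' and 'G' from the loop above, the default type
   operand, and the comdat group name appended.  */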
9268 /* Select a format to encode pointers in exception handling data. */
9270 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9272 int type;
9273 switch (aarch64_cmodel)
9275 case AARCH64_CMODEL_TINY:
9276 case AARCH64_CMODEL_TINY_PIC:
9277 case AARCH64_CMODEL_SMALL:
9278 case AARCH64_CMODEL_SMALL_PIC:
9279 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9280 for everything. */
9281 type = DW_EH_PE_sdata4;
9282 break;
9283 default:
9284 /* No assumptions here. 8-byte relocs required. */
9285 type = DW_EH_PE_sdata8;
9286 break;
9288 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9291 /* Emit load exclusive. */
9293 static void
9294 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9295 rtx mem, rtx model_rtx)
9297 rtx (*gen) (rtx, rtx, rtx);
9299 switch (mode)
9301 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9302 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9303 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9304 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9305 default:
9306 gcc_unreachable ();
9309 emit_insn (gen (rval, mem, model_rtx));
9312 /* Emit store exclusive. */
9314 static void
9315 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9316 rtx rval, rtx mem, rtx model_rtx)
9318 rtx (*gen) (rtx, rtx, rtx, rtx);
9320 switch (mode)
9322 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9323 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9324 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9325 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9326 default:
9327 gcc_unreachable ();
9330 emit_insn (gen (bval, rval, mem, model_rtx));
9333 /* Emit a jump instruction and mark it as unlikely to be taken. */
9335 static void
9336 aarch64_emit_unlikely_jump (rtx insn)
9338 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9340 insn = emit_jump_insn (insn);
9341 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9344 /* Expand a compare and swap pattern. */
9346 void
9347 aarch64_expand_compare_and_swap (rtx operands[])
9349 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9350 machine_mode mode, cmp_mode;
9351 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9353 bval = operands[0];
9354 rval = operands[1];
9355 mem = operands[2];
9356 oldval = operands[3];
9357 newval = operands[4];
9358 is_weak = operands[5];
9359 mod_s = operands[6];
9360 mod_f = operands[7];
9361 mode = GET_MODE (mem);
9362 cmp_mode = mode;
9364 /* Normally the succ memory model must be stronger than fail, but in the
9365 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9366 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9368 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
9369 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
9370 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9372 switch (mode)
9374 case QImode:
9375 case HImode:
9376 /* For short modes, we're going to perform the comparison in SImode,
9377 so do the zero-extension now. */
9378 cmp_mode = SImode;
9379 rval = gen_reg_rtx (SImode);
9380 oldval = convert_modes (SImode, mode, oldval, true);
9381 /* Fall through. */
9383 case SImode:
9384 case DImode:
9385 /* Force the value into a register if needed. */
9386 if (!aarch64_plus_operand (oldval, mode))
9387 oldval = force_reg (cmp_mode, oldval);
9388 break;
9390 default:
9391 gcc_unreachable ();
9394 switch (mode)
9396 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9397 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9398 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9399 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9400 default:
9401 gcc_unreachable ();
9404 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9406 if (mode == QImode || mode == HImode)
9407 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9409 x = gen_rtx_REG (CCmode, CC_REGNUM);
9410 x = gen_rtx_EQ (SImode, x, const0_rtx);
9411 emit_insn (gen_rtx_SET (bval, x));
9414 /* Emit a barrier appropriate for memory model MODEL at the end of a
9415 sequence implementing an atomic operation. */
9417 static void
9418 aarch64_emit_post_barrier (enum memmodel model)
9420 const enum memmodel base_model = memmodel_base (model);
9422 if (is_mm_sync (model)
9423 && (base_model == MEMMODEL_ACQUIRE
9424 || base_model == MEMMODEL_ACQ_REL
9425 || base_model == MEMMODEL_SEQ_CST))
9427 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
9431 /* Split a compare and swap pattern. */
9433 void
9434 aarch64_split_compare_and_swap (rtx operands[])
9436 rtx rval, mem, oldval, newval, scratch;
9437 machine_mode mode;
9438 bool is_weak;
9439 rtx_code_label *label1, *label2;
9440 rtx x, cond;
9441 enum memmodel model;
9442 rtx model_rtx;
9444 rval = operands[0];
9445 mem = operands[1];
9446 oldval = operands[2];
9447 newval = operands[3];
9448 is_weak = (operands[4] != const0_rtx);
9449 model_rtx = operands[5];
9450 scratch = operands[7];
9451 mode = GET_MODE (mem);
9452 model = memmodel_from_int (INTVAL (model_rtx));
9454 label1 = NULL;
9455 if (!is_weak)
9457 label1 = gen_label_rtx ();
9458 emit_label (label1);
9460 label2 = gen_label_rtx ();
9462 /* The initial load can be relaxed for a __sync operation since a final
9463 barrier will be emitted to stop code hoisting. */
9464 if (is_mm_sync (model))
9465 aarch64_emit_load_exclusive (mode, rval, mem,
9466 GEN_INT (MEMMODEL_RELAXED));
9467 else
9468 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
9470 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9471 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9472 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9473 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9474 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9476 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
9478 if (!is_weak)
9480 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9481 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9482 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9483 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9485 else
9487 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9488 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9489 emit_insn (gen_rtx_SET (cond, x));
9492 emit_label (label2);
9494 /* Emit any final barrier needed for a __sync operation. */
9495 if (is_mm_sync (model))
9496 aarch64_emit_post_barrier (model);
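/* Shape of the sequence produced above for a strong compare-and-swap
   (illustrative only; register names are invented and the exact
   acquire/release load/store-exclusive variants depend on the memory
   model):

   retry:
       ldaxr  w0, [x_mem]
       cmp    w0, w_expected
       b.ne   done
       stlxr  w_tmp, w_desired, [x_mem]
       cbnz   w_tmp, retry
   done:

   For a weak CAS the backward branch is dropped and the store-exclusive
   status register is instead just compared against zero.  */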
9499 /* Split an atomic operation. */
9501 void
9502 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9503 rtx value, rtx model_rtx, rtx cond)
9505 machine_mode mode = GET_MODE (mem);
9506 machine_mode wmode = (mode == DImode ? DImode : SImode);
9507 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
9508 const bool is_sync = is_mm_sync (model);
9509 rtx_code_label *label;
9510 rtx x;
9512 label = gen_label_rtx ();
9513 emit_label (label);
9515 if (new_out)
9516 new_out = gen_lowpart (wmode, new_out);
9517 if (old_out)
9518 old_out = gen_lowpart (wmode, old_out);
9519 else
9520 old_out = new_out;
9521 value = simplify_gen_subreg (wmode, value, mode, 0);
9523 /* The initial load can be relaxed for a __sync operation since a final
9524 barrier will be emitted to stop code hoisting. */
9525 if (is_sync)
9526 aarch64_emit_load_exclusive (mode, old_out, mem,
9527 GEN_INT (MEMMODEL_RELAXED));
9528 else
9529 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9531 switch (code)
9533 case SET:
9534 new_out = value;
9535 break;
9537 case NOT:
9538 x = gen_rtx_AND (wmode, old_out, value);
9539 emit_insn (gen_rtx_SET (new_out, x));
9540 x = gen_rtx_NOT (wmode, new_out);
9541 emit_insn (gen_rtx_SET (new_out, x));
9542 break;
9544 case MINUS:
9545 if (CONST_INT_P (value))
9547 value = GEN_INT (-INTVAL (value));
9548 code = PLUS;
9550 /* Fall through. */
9552 default:
9553 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9554 emit_insn (gen_rtx_SET (new_out, x));
9555 break;
9558 aarch64_emit_store_exclusive (mode, cond, mem,
9559 gen_lowpart (mode, new_out), model_rtx);
9561 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9562 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9563 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9564 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9566 /* Emit any final barrier needed for a __sync operation. */
9567 if (is_sync)
9568 aarch64_emit_post_barrier (model);
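/* The loop generated above, e.g. for an atomic fetch-and-add (register
   names invented; acquire/release variants again depend on the model):

   retry:
       ldxr   w_old, [x_mem]
       add    w_new, w_old, w_val
       stxr   w_cond, w_new, [x_mem]
       cbnz   w_cond, retry
*/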
9571 static void
9572 aarch64_print_extension (void)
9574 const struct aarch64_option_extension *opt = NULL;
9576 for (opt = all_extensions; opt->name != NULL; opt++)
9577 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9578 asm_fprintf (asm_out_file, "+%s", opt->name);
9580 asm_fprintf (asm_out_file, "\n");
9583 static void
9584 aarch64_start_file (void)
9586 if (selected_arch)
9588 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9589 aarch64_print_extension ();
9591 else if (selected_cpu)
9593 const char *truncated_name
9594 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9595 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9596 aarch64_print_extension ();
9598 default_file_start();
9601 /* Target hook for c_mode_for_suffix. */
9602 static machine_mode
9603 aarch64_c_mode_for_suffix (char suffix)
9605 if (suffix == 'q')
9606 return TFmode;
9608 return VOIDmode;
9611 /* We can only represent floating point constants which will fit in
9612 "quarter-precision" values. These values are characterised by
9613 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9616 (-1)^s * (n/16) * 2^r
9618 Where:
9619 's' is the sign bit.
9620 'n' is an integer in the range 16 <= n <= 31.
9621 'r' is an integer in the range -3 <= r <= 4. */
9623 /* Return true iff X can be represented by a quarter-precision
9624 floating point immediate operand. Note, we cannot represent 0.0. */
9625 bool
9626 aarch64_float_const_representable_p (rtx x)
9628 /* This represents our current view of how many bits
9629 make up the mantissa. */
9630 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9631 int exponent;
9632 unsigned HOST_WIDE_INT mantissa, mask;
9633 REAL_VALUE_TYPE r, m;
9634 bool fail;
9636 if (!CONST_DOUBLE_P (x))
9637 return false;
9639 if (GET_MODE (x) == VOIDmode)
9640 return false;
9642 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9644 /* We cannot represent infinities, NaNs or +/-zero. We won't
9645 know if we have +zero until we analyse the mantissa, but we
9646 can reject the other invalid values. */
9647 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9648 || REAL_VALUE_MINUS_ZERO (r))
9649 return false;
9651 /* Extract exponent. */
9652 r = real_value_abs (&r);
9653 exponent = REAL_EXP (&r);
9655 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9656 highest (sign) bit, with a fixed binary point at bit point_pos.
9657 m1 holds the low part of the mantissa, m2 the high part.
9658 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9659 bits for the mantissa, this can fail (low bits will be lost). */
9660 real_ldexp (&m, &r, point_pos - exponent);
9661 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9663 /* If the low part of the mantissa has bits set we cannot represent
9664 the value. */
9665 if (w.elt (0) != 0)
9666 return false;
9667 /* We have rejected the lower HOST_WIDE_INT, so update our
9668 understanding of how many bits lie in the mantissa and
9669 look only at the high HOST_WIDE_INT. */
9670 mantissa = w.elt (1);
9671 point_pos -= HOST_BITS_PER_WIDE_INT;
9673 /* We can only represent values with a mantissa of the form 1.xxxx. */
9674 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9675 if ((mantissa & mask) != 0)
9676 return false;
9678 /* Having filtered unrepresentable values, we may now remove all
9679 but the highest 5 bits. */
9680 mantissa >>= point_pos - 5;
9682 /* We cannot represent the value 0.0, so reject it. This is handled
9683 elsewhere. */
9684 if (mantissa == 0)
9685 return false;
9687 /* Then, as bit 4 is always set, we can mask it off, leaving
9688 the mantissa in the range [0, 15]. */
9689 mantissa &= ~(1 << 4);
9690 gcc_assert (mantissa <= 15);
9692 /* GCC internally does not use IEEE754-like encoding (where normalized
9693 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9694 Our mantissa values are shifted 4 places to the left relative to
9695 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9696 by 5 places to correct for GCC's representation. */
9697 exponent = 5 - exponent;
9699 return (exponent >= 0 && exponent <= 7);
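/* A few data points for the (-1)^s * (n/16) * 2^r form checked above:
   1.0 (n = 16, r = 0), 0.125 (n = 16, r = -3, the smallest positive
   value) and 31.0 (n = 31, r = 4, the largest) are representable,
   whereas 0.2 and any zero, NaN or infinity are not.  */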
9702 char*
9703 aarch64_output_simd_mov_immediate (rtx const_vector,
9704 machine_mode mode,
9705 unsigned width)
9707 bool is_valid;
9708 static char templ[40];
9709 const char *mnemonic;
9710 const char *shift_op;
9711 unsigned int lane_count = 0;
9712 char element_char;
9714 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9716 /* This will return true to show const_vector is legal for use as either
9717 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9718 also update INFO to show how the immediate should be generated. */
9719 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9720 gcc_assert (is_valid);
9722 element_char = sizetochar (info.element_width);
9723 lane_count = width / info.element_width;
9725 mode = GET_MODE_INNER (mode);
9726 if (mode == SFmode || mode == DFmode)
9728 gcc_assert (info.shift == 0 && ! info.mvn);
9729 if (aarch64_float_const_zero_rtx_p (info.value))
9730 info.value = GEN_INT (0);
9731 else
9733 #define buf_size 20
9734 REAL_VALUE_TYPE r;
9735 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9736 char float_buf[buf_size] = {'\0'};
9737 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9738 #undef buf_size
9740 if (lane_count == 1)
9741 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9742 else
9743 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9744 lane_count, element_char, float_buf);
9745 return templ;
9749 mnemonic = info.mvn ? "mvni" : "movi";
9750 shift_op = info.msl ? "msl" : "lsl";
9752 if (lane_count == 1)
9753 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9754 mnemonic, UINTVAL (info.value));
9755 else if (info.shift)
9756 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9757 ", %s %d", mnemonic, lane_count, element_char,
9758 UINTVAL (info.value), shift_op, info.shift);
9759 else
9760 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9761 mnemonic, lane_count, element_char, UINTVAL (info.value));
9762 return templ;
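/* Sample strings this returns (operand numbers as in the patterns that
   use it):  "movi\t%0.4s, 0xab, lsl 16" for a V4SI vector of 0x00AB0000,
   "mvni\t%0.4s, 0xff" for a V4SI vector of 0xFFFFFF00, and on the
   floating-point path something like "fmov\t%0.2d, 1.0e+0" for a V2DF
   vector of 1.0.  */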
9765 char*
9766 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9767 machine_mode mode)
9769 machine_mode vmode;
9771 gcc_assert (!VECTOR_MODE_P (mode));
9772 vmode = aarch64_simd_container_mode (mode, 64);
9773 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9774 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9777 /* Split operands into moves from op[1] + op[2] into op[0]. */
9779 void
9780 aarch64_split_combinev16qi (rtx operands[3])
9782 unsigned int dest = REGNO (operands[0]);
9783 unsigned int src1 = REGNO (operands[1]);
9784 unsigned int src2 = REGNO (operands[2]);
9785 machine_mode halfmode = GET_MODE (operands[1]);
9786 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9787 rtx destlo, desthi;
9789 gcc_assert (halfmode == V16QImode);
9791 if (src1 == dest && src2 == dest + halfregs)
9793 /* No-op move. Can't split to nothing; emit something. */
9794 emit_note (NOTE_INSN_DELETED);
9795 return;
9798 /* Preserve register attributes for variable tracking. */
9799 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9800 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9801 GET_MODE_SIZE (halfmode));
9803 /* Special case of reversed high/low parts. */
9804 if (reg_overlap_mentioned_p (operands[2], destlo)
9805 && reg_overlap_mentioned_p (operands[1], desthi))
9807 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9808 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9809 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9811 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9813 /* Try to avoid unnecessary moves if part of the result
9814 is in the right place already. */
9815 if (src1 != dest)
9816 emit_move_insn (destlo, operands[1]);
9817 if (src2 != dest + halfregs)
9818 emit_move_insn (desthi, operands[2]);
9820 else
9822 if (src2 != dest + halfregs)
9823 emit_move_insn (desthi, operands[2]);
9824 if (src1 != dest)
9825 emit_move_insn (destlo, operands[1]);
9829 /* vec_perm support. */
9831 #define MAX_VECT_LEN 16
9833 struct expand_vec_perm_d
9835 rtx target, op0, op1;
9836 unsigned char perm[MAX_VECT_LEN];
9837 machine_mode vmode;
9838 unsigned char nelt;
9839 bool one_vector_p;
9840 bool testing_p;
9843 /* Generate a variable permutation. */
9845 static void
9846 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9848 machine_mode vmode = GET_MODE (target);
9849 bool one_vector_p = rtx_equal_p (op0, op1);
9851 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9852 gcc_checking_assert (GET_MODE (op0) == vmode);
9853 gcc_checking_assert (GET_MODE (op1) == vmode);
9854 gcc_checking_assert (GET_MODE (sel) == vmode);
9855 gcc_checking_assert (TARGET_SIMD);
9857 if (one_vector_p)
9859 if (vmode == V8QImode)
9861 /* Expand the argument to a V16QI mode by duplicating it. */
9862 rtx pair = gen_reg_rtx (V16QImode);
9863 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9864 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9866 else
9868 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9871 else
9873 rtx pair;
9875 if (vmode == V8QImode)
9877 pair = gen_reg_rtx (V16QImode);
9878 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9879 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9881 else
9883 pair = gen_reg_rtx (OImode);
9884 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9885 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9890 void
9891 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9893 machine_mode vmode = GET_MODE (target);
9894 unsigned int nelt = GET_MODE_NUNITS (vmode);
9895 bool one_vector_p = rtx_equal_p (op0, op1);
9896 rtx mask;
9898 /* The TBL instruction does not use a modulo index, so we must take care
9899 of that ourselves. */
9900 mask = aarch64_simd_gen_const_vector_dup (vmode,
9901 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9902 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9904 /* For big-endian, we also need to reverse the index within the vector
9905 (but not which vector). */
9906 if (BYTES_BIG_ENDIAN)
9908 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9909 if (!one_vector_p)
9910 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9911 sel = expand_simple_binop (vmode, XOR, sel, mask,
9912 NULL, 0, OPTAB_LIB_WIDEN);
9914 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9917 /* Recognize patterns suitable for the TRN instructions. */
9918 static bool
9919 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9921 unsigned int i, odd, mask, nelt = d->nelt;
9922 rtx out, in0, in1, x;
9923 rtx (*gen) (rtx, rtx, rtx);
9924 machine_mode vmode = d->vmode;
9926 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9927 return false;
9929 /* Note that these are little-endian tests.
9930 We correct for big-endian later. */
9931 if (d->perm[0] == 0)
9932 odd = 0;
9933 else if (d->perm[0] == 1)
9934 odd = 1;
9935 else
9936 return false;
9937 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9939 for (i = 0; i < nelt; i += 2)
9941 if (d->perm[i] != i + odd)
9942 return false;
9943 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9944 return false;
9947 /* Success! */
9948 if (d->testing_p)
9949 return true;
9951 in0 = d->op0;
9952 in1 = d->op1;
9953 if (BYTES_BIG_ENDIAN)
9955 x = in0, in0 = in1, in1 = x;
9956 odd = !odd;
9958 out = d->target;
9960 if (odd)
9962 switch (vmode)
9964 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9965 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9966 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9967 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9968 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9969 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9970 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9971 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9972 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9973 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9974 default:
9975 return false;
9978 else
9980 switch (vmode)
9982 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9983 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9984 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9985 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9986 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9987 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9988 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9989 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9990 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9991 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9992 default:
9993 return false;
9997 emit_insn (gen (out, in0, in1));
9998 return true;
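/* Example of a permutation this accepts (indices illustrative): for
   V4SImode the selector { 0, 4, 2, 6 } (odd == 0) becomes
   "trn1 v_out.4s, v_a.4s, v_b.4s", while { 1, 5, 3, 7 } selects TRN2.
   The UZP and ZIP recognizers below follow the same structure with
   different index tests.  */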
10001 /* Recognize patterns suitable for the UZP instructions. */
10002 static bool
10003 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
10005 unsigned int i, odd, mask, nelt = d->nelt;
10006 rtx out, in0, in1, x;
10007 rtx (*gen) (rtx, rtx, rtx);
10008 machine_mode vmode = d->vmode;
10010 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10011 return false;
10013 /* Note that these are little-endian tests.
10014 We correct for big-endian later. */
10015 if (d->perm[0] == 0)
10016 odd = 0;
10017 else if (d->perm[0] == 1)
10018 odd = 1;
10019 else
10020 return false;
10021 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10023 for (i = 0; i < nelt; i++)
10025 unsigned elt = (i * 2 + odd) & mask;
10026 if (d->perm[i] != elt)
10027 return false;
10030 /* Success! */
10031 if (d->testing_p)
10032 return true;
10034 in0 = d->op0;
10035 in1 = d->op1;
10036 if (BYTES_BIG_ENDIAN)
10038 x = in0, in0 = in1, in1 = x;
10039 odd = !odd;
10041 out = d->target;
10043 if (odd)
10045 switch (vmode)
10047 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
10048 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
10049 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
10050 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
10051 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
10052 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
10053 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
10054 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
10055 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
10056 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
10057 default:
10058 return false;
10061 else
10063 switch (vmode)
10065 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
10066 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
10067 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
10068 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
10069 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
10070 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
10071 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
10072 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
10073 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
10074 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
10075 default:
10076 return false;
10080 emit_insn (gen (out, in0, in1));
10081 return true;
10084 /* Recognize patterns suitable for the ZIP instructions. */
10085 static bool
10086 aarch64_evpc_zip (struct expand_vec_perm_d *d)
10088 unsigned int i, high, mask, nelt = d->nelt;
10089 rtx out, in0, in1, x;
10090 rtx (*gen) (rtx, rtx, rtx);
10091 machine_mode vmode = d->vmode;
10093 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10094 return false;
10096 /* Note that these are little-endian tests.
10097 We correct for big-endian later. */
10098 high = nelt / 2;
10099 if (d->perm[0] == high)
10100 /* Do Nothing. */
10102 else if (d->perm[0] == 0)
10103 high = 0;
10104 else
10105 return false;
10106 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10108 for (i = 0; i < nelt / 2; i++)
10110 unsigned elt = (i + high) & mask;
10111 if (d->perm[i * 2] != elt)
10112 return false;
10113 elt = (elt + nelt) & mask;
10114 if (d->perm[i * 2 + 1] != elt)
10115 return false;
10118 /* Success! */
10119 if (d->testing_p)
10120 return true;
10122 in0 = d->op0;
10123 in1 = d->op1;
10124 if (BYTES_BIG_ENDIAN)
10126 x = in0, in0 = in1, in1 = x;
10127 high = !high;
10129 out = d->target;
10131 if (high)
10133 switch (vmode)
10135 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
10136 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
10137 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
10138 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
10139 case V4SImode: gen = gen_aarch64_zip2v4si; break;
10140 case V2SImode: gen = gen_aarch64_zip2v2si; break;
10141 case V2DImode: gen = gen_aarch64_zip2v2di; break;
10142 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
10143 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
10144 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
10145 default:
10146 return false;
10149 else
10151 switch (vmode)
10153 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
10154 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
10155 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
10156 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
10157 case V4SImode: gen = gen_aarch64_zip1v4si; break;
10158 case V2SImode: gen = gen_aarch64_zip1v2si; break;
10159 case V2DImode: gen = gen_aarch64_zip1v2di; break;
10160 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
10161 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
10162 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
10163 default:
10164 return false;
10168 emit_insn (gen (out, in0, in1));
10169 return true;
10172 /* Recognize patterns for the EXT insn. */
10174 static bool
10175 aarch64_evpc_ext (struct expand_vec_perm_d *d)
10177 unsigned int i, nelt = d->nelt;
10178 rtx (*gen) (rtx, rtx, rtx, rtx);
10179 rtx offset;
10181 unsigned int location = d->perm[0]; /* Always < nelt. */
10183 /* Check if the extracted indices are increasing by one. */
10184 for (i = 1; i < nelt; i++)
10186 unsigned int required = location + i;
10187 if (d->one_vector_p)
10189 /* We'll pass the same vector in twice, so allow indices to wrap. */
10190 required &= (nelt - 1);
10192 if (d->perm[i] != required)
10193 return false;
10196 switch (d->vmode)
10198 case V16QImode: gen = gen_aarch64_extv16qi; break;
10199 case V8QImode: gen = gen_aarch64_extv8qi; break;
10200 case V4HImode: gen = gen_aarch64_extv4hi; break;
10201 case V8HImode: gen = gen_aarch64_extv8hi; break;
10202 case V2SImode: gen = gen_aarch64_extv2si; break;
10203 case V4SImode: gen = gen_aarch64_extv4si; break;
10204 case V2SFmode: gen = gen_aarch64_extv2sf; break;
10205 case V4SFmode: gen = gen_aarch64_extv4sf; break;
10206 case V2DImode: gen = gen_aarch64_extv2di; break;
10207 case V2DFmode: gen = gen_aarch64_extv2df; break;
10208 default:
10209 return false;
10212 /* Success! */
10213 if (d->testing_p)
10214 return true;
10216 /* The case where (location == 0) is a no-op for both big- and little-endian,
10217 and is removed by the mid-end at optimization levels -O1 and higher. */
10219 if (BYTES_BIG_ENDIAN && (location != 0))
10221 /* After setup, we want the high elements of the first vector (stored
10222 at the LSB end of the register), and the low elements of the second
10223 vector (stored at the MSB end of the register). So swap. */
10224 std::swap (d->op0, d->op1);
10225 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
10226 location = nelt - location;
10229 offset = GEN_INT (location);
10230 emit_insn (gen (d->target, d->op0, d->op1, offset));
10231 return true;
10234 /* Recognize patterns for the REV insns. */
10236 static bool
10237 aarch64_evpc_rev (struct expand_vec_perm_d *d)
10239 unsigned int i, j, diff, nelt = d->nelt;
10240 rtx (*gen) (rtx, rtx);
10242 if (!d->one_vector_p)
10243 return false;
10245 diff = d->perm[0];
10246 switch (diff)
10248 case 7:
10249 switch (d->vmode)
10251 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10252 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10253 default:
10254 return false;
10256 break;
10257 case 3:
10258 switch (d->vmode)
10260 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10261 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10262 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10263 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10264 default:
10265 return false;
10267 break;
10268 case 1:
10269 switch (d->vmode)
10271 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10272 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10273 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10274 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10275 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10276 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10277 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10278 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10279 default:
10280 return false;
10282 break;
10283 default:
10284 return false;
10287 for (i = 0; i < nelt ; i += diff + 1)
10288 for (j = 0; j <= diff; j += 1)
10290 /* This is guaranteed to be true as the value of diff
10291 is 7, 3 or 1 and we should have enough elements in the
10292 queue to generate this. Getting a vector mask with a
10293 value of diff other than these values implies that
10294 something is wrong by the time we get here. */
10295 gcc_assert (i + j < nelt);
10296 if (d->perm[i + j] != i + diff - j)
10297 return false;
10300 /* Success! */
10301 if (d->testing_p)
10302 return true;
10304 emit_insn (gen (d->target, d->op0));
10305 return true;
10308 static bool
10309 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10311 rtx (*gen) (rtx, rtx, rtx);
10312 rtx out = d->target;
10313 rtx in0;
10314 machine_mode vmode = d->vmode;
10315 unsigned int i, elt, nelt = d->nelt;
10316 rtx lane;
10318 elt = d->perm[0];
10319 for (i = 1; i < nelt; i++)
10321 if (elt != d->perm[i])
10322 return false;
10325 /* The generic preparation in aarch64_expand_vec_perm_const_1
10326 swaps the operand order and the permute indices if it finds
10327 d->perm[0] to be in the second operand. Thus, we can always
10328 use d->op0 and need not do any extra arithmetic to get the
10329 correct lane number. */
10330 in0 = d->op0;
10331 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10333 switch (vmode)
10335 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10336 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10337 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10338 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10339 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10340 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10341 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10342 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10343 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10344 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10345 default:
10346 return false;
10349 emit_insn (gen (out, in0, lane));
10350 return true;
10353 static bool
10354 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10356 rtx rperm[MAX_VECT_LEN], sel;
10357 machine_mode vmode = d->vmode;
10358 unsigned int i, nelt = d->nelt;
10360 if (d->testing_p)
10361 return true;
10363 /* Generic code will try constant permutation twice. Once with the
10364 original mode and again with the elements lowered to QImode.
10365 So wait and don't do the selector expansion ourselves. */
10366 if (vmode != V8QImode && vmode != V16QImode)
10367 return false;
10369 for (i = 0; i < nelt; ++i)
10371 int nunits = GET_MODE_NUNITS (vmode);
10373 /* If big-endian and two vectors, we end up with a weird mixed-endian
10374 mode on NEON. Reverse the index within each word but not the word
10375 itself. */
10376 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10377 : d->perm[i]);
10379 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10380 sel = force_reg (vmode, sel);
10382 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10383 return true;
10386 static bool
10387 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10389 /* The pattern matching functions above are written to look for a small
10390 number to begin the sequence (0, 1, N/2). If we begin with an index
10391 from the second operand, we can swap the operands. */
10392 if (d->perm[0] >= d->nelt)
10394 unsigned i, nelt = d->nelt;
10396 gcc_assert (nelt == (nelt & -nelt));
10397 for (i = 0; i < nelt; ++i)
10398 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10400 std::swap (d->op0, d->op1);
10403 if (TARGET_SIMD)
10405 if (aarch64_evpc_rev (d))
10406 return true;
10407 else if (aarch64_evpc_ext (d))
10408 return true;
10409 else if (aarch64_evpc_dup (d))
10410 return true;
10411 else if (aarch64_evpc_zip (d))
10412 return true;
10413 else if (aarch64_evpc_uzp (d))
10414 return true;
10415 else if (aarch64_evpc_trn (d))
10416 return true;
10417 return aarch64_evpc_tbl (d);
10419 return false;
10422 /* Expand a vec_perm_const pattern. */
10424 bool
10425 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10427 struct expand_vec_perm_d d;
10428 int i, nelt, which;
10430 d.target = target;
10431 d.op0 = op0;
10432 d.op1 = op1;
10434 d.vmode = GET_MODE (target);
10435 gcc_assert (VECTOR_MODE_P (d.vmode));
10436 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10437 d.testing_p = false;
10439 for (i = which = 0; i < nelt; ++i)
10441 rtx e = XVECEXP (sel, 0, i);
10442 int ei = INTVAL (e) & (2 * nelt - 1);
10443 which |= (ei < nelt ? 1 : 2);
10444 d.perm[i] = ei;
10447 switch (which)
10449 default:
10450 gcc_unreachable ();
10452 case 3:
10453 d.one_vector_p = false;
10454 if (!rtx_equal_p (op0, op1))
10455 break;
10457 /* The elements of PERM do not suggest that only the first operand
10458 is used, but both operands are identical. Allow easier matching
10459 of the permutation by folding the permutation into the single
10460 input vector. */
10461 /* Fall Through. */
10462 case 2:
10463 for (i = 0; i < nelt; ++i)
10464 d.perm[i] &= nelt - 1;
10465 d.op0 = op1;
10466 d.one_vector_p = true;
10467 break;
10469 case 1:
10470 d.op1 = op0;
10471 d.one_vector_p = true;
10472 break;
10475 return aarch64_expand_vec_perm_const_1 (&d);
10478 static bool
10479 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10480 const unsigned char *sel)
10482 struct expand_vec_perm_d d;
10483 unsigned int i, nelt, which;
10484 bool ret;
10486 d.vmode = vmode;
10487 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10488 d.testing_p = true;
10489 memcpy (d.perm, sel, nelt);
10491 /* Calculate whether all elements are in one vector. */
10492 for (i = which = 0; i < nelt; ++i)
10494 unsigned char e = d.perm[i];
10495 gcc_assert (e < 2 * nelt);
10496 which |= (e < nelt ? 1 : 2);
10499 /* If all elements are from the second vector, reindex as if from the
10500 first vector. */
10501 if (which == 2)
10502 for (i = 0; i < nelt; ++i)
10503 d.perm[i] -= nelt;
10505 /* Check whether the mask can be applied to a single vector. */
10506 d.one_vector_p = (which != 3);
10508 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10509 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10510 if (!d.one_vector_p)
10511 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10513 start_sequence ();
10514 ret = aarch64_expand_vec_perm_const_1 (&d);
10515 end_sequence ();
10517 return ret;
10521 aarch64_reverse_mask (enum machine_mode mode)
10523 /* We have to reverse each vector because we don't have
10524 a permuted load that can reverse-load according to ABI rules. */
10525 rtx mask;
10526 rtvec v = rtvec_alloc (16);
10527 int i, j;
10528 int nunits = GET_MODE_NUNITS (mode);
10529 int usize = GET_MODE_UNIT_SIZE (mode);
10531 gcc_assert (BYTES_BIG_ENDIAN);
10532 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10534 for (i = 0; i < nunits; i++)
10535 for (j = 0; j < usize; j++)
10536 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10537 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10538 return force_reg (V16QImode, mask);
10541 /* Implement MODES_TIEABLE_P. */
10543 bool
10544 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10546 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10547 return true;
10549 /* We specifically want to allow elements of "structure" modes to
10550 be tieable to the structure. This more general condition allows
10551 other rarer situations too. */
10552 if (TARGET_SIMD
10553 && aarch64_vector_mode_p (mode1)
10554 && aarch64_vector_mode_p (mode2))
10555 return true;
10557 return false;
10560 /* Return a new RTX holding the result of moving POINTER forward by
10561 AMOUNT bytes. */
10563 static rtx
10564 aarch64_move_pointer (rtx pointer, int amount)
10566 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10568 return adjust_automodify_address (pointer, GET_MODE (pointer),
10569 next, amount);
10572 /* Return a new RTX holding the result of moving POINTER forward by the
10573 size of the mode it points to. */
10575 static rtx
10576 aarch64_progress_pointer (rtx pointer)
10578 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10580 return aarch64_move_pointer (pointer, amount);
10583 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10584 MODE bytes. */
10586 static void
10587 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10588 machine_mode mode)
10590 rtx reg = gen_reg_rtx (mode);
10592 /* "Cast" the pointers to the correct mode. */
10593 *src = adjust_address (*src, mode, 0);
10594 *dst = adjust_address (*dst, mode, 0);
10595 /* Emit the memcpy. */
10596 emit_move_insn (reg, *src);
10597 emit_move_insn (*dst, reg);
10598 /* Move the pointers forward. */
10599 *src = aarch64_progress_pointer (*src);
10600 *dst = aarch64_progress_pointer (*dst);
10603 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10604 we succeed, otherwise return false. */
10606 bool
10607 aarch64_expand_movmem (rtx *operands)
10609 unsigned int n;
10610 rtx dst = operands[0];
10611 rtx src = operands[1];
10612 rtx base;
10613 bool speed_p = !optimize_function_for_size_p (cfun);
10615 /* When optimizing for size, give a better estimate of the length of a
10616 memcpy call, but use the default otherwise. */
10617 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10619 /* We can't do anything smart if the amount to copy is not constant. */
10620 if (!CONST_INT_P (operands[2]))
10621 return false;
10623 n = UINTVAL (operands[2]);
10625 /* Try to keep the number of instructions low. For cases below 16 bytes we
10626 need to make at most two moves. For cases above 16 bytes it will be one
10627 move for each 16 byte chunk, then at most two additional moves. */
10628 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10629 return false;
10631 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10632 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10634 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10635 src = adjust_automodify_address (src, VOIDmode, base, 0);
10637 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10638 1-byte chunk. */
10639 if (n < 4)
10641 if (n >= 2)
10643 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10644 n -= 2;
10647 if (n == 1)
10648 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10650 return true;
10653 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10654 4-byte chunk, partially overlapping with the previously copied chunk. */
10655 if (n < 8)
10657 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10658 n -= 4;
10659 if (n > 0)
10661 int move = n - 4;
10663 src = aarch64_move_pointer (src, move);
10664 dst = aarch64_move_pointer (dst, move);
10665 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10667 return true;
10670 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10671 them, then (if applicable) an 8-byte chunk. */
10672 while (n >= 8)
10674 if (n / 16)
10676 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10677 n -= 16;
10679 else
10681 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10682 n -= 8;
10686 /* Finish the final bytes of the copy. We can always do this in one
10687 instruction. We either copy the exact amount we need, or partially
10688 overlap with the previous chunk we copied and copy 8 bytes (a worked example follows this function). */
10689 if (n == 0)
10690 return true;
10691 else if (n == 1)
10692 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10693 else if (n == 2)
10694 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10695 else if (n == 4)
10696 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10697 else
10699 if (n == 3)
10701 src = aarch64_move_pointer (src, -1);
10702 dst = aarch64_move_pointer (dst, -1);
10703 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10705 else
10707 int move = n - 8;
10709 src = aarch64_move_pointer (src, move);
10710 dst = aarch64_move_pointer (dst, move);
10711 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10715 return true;
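/* Worked example of the expansion above (illustrative only, not actual
   generated output): for a 23-byte copy the main loop emits one 16-byte
   TImode block move at offset 0, leaving n == 7; the tail code then
   rewinds both pointers by one byte and emits an 8-byte DImode block
   move at offset 15, overlapping the previous chunk by one byte.  The
   whole copy is therefore just two load/store pairs.  */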
10718 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10720 static unsigned HOST_WIDE_INT
10721 aarch64_asan_shadow_offset (void)
10723 return (HOST_WIDE_INT_1 << 36);
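/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.  */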
10726 static bool
10727 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10728 unsigned int align,
10729 enum by_pieces_operation op,
10730 bool speed_p)
10732 /* STORE_BY_PIECES can be used when copying a constant string, but
10733 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10734 For now we always fail this and let the move_by_pieces code copy
10735 the string from read-only memory. */
10736 if (op == STORE_BY_PIECES)
10737 return false;
10739 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
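/* Map comparison CODE onto the corresponding CC_D mode used for AArch64
   conditional compares, or return CCmode if there is no direct mapping;
   callers treat a CCmode result as "not supported".  */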
10742 static enum machine_mode
10743 aarch64_code_to_ccmode (enum rtx_code code)
10745 switch (code)
10747 case NE:
10748 return CC_DNEmode;
10750 case EQ:
10751 return CC_DEQmode;
10753 case LE:
10754 return CC_DLEmode;
10756 case LT:
10757 return CC_DLTmode;
10759 case GE:
10760 return CC_DGEmode;
10762 case GT:
10763 return CC_DGTmode;
10765 case LEU:
10766 return CC_DLEUmode;
10768 case LTU:
10769 return CC_DLTUmode;
10771 case GEU:
10772 return CC_DGEUmode;
10774 case GTU:
10775 return CC_DGTUmode;
10777 default:
10778 return CCmode;
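/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first compare of a
   conditional-compare chain: the insns that prepare the operands are
   returned in *PREP_SEQ and the compare itself in *GEN_SEQ.  Return the
   CC register holding the result, or NULL_RTX if the comparison cannot
   be handled.  */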
10782 static rtx
10783 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10784 int code, tree treeop0, tree treeop1)
10786 enum machine_mode op_mode, cmp_mode, cc_mode;
10787 rtx op0, op1, cmp, target;
10788 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10789 enum insn_code icode;
10790 struct expand_operand ops[4];
10792 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10793 if (cc_mode == CCmode)
10794 return NULL_RTX;
10796 start_sequence ();
10797 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10799 op_mode = GET_MODE (op0);
10800 if (op_mode == VOIDmode)
10801 op_mode = GET_MODE (op1);
10803 switch (op_mode)
10805 case QImode:
10806 case HImode:
10807 case SImode:
10808 cmp_mode = SImode;
10809 icode = CODE_FOR_cmpsi;
10810 break;
10812 case DImode:
10813 cmp_mode = DImode;
10814 icode = CODE_FOR_cmpdi;
10815 break;
10817 default:
10818 end_sequence ();
10819 return NULL_RTX;
10822 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10823 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10824 if (!op0 || !op1)
10826 end_sequence ();
10827 return NULL_RTX;
10829 *prep_seq = get_insns ();
10830 end_sequence ();
10832 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10833 target = gen_rtx_REG (CCmode, CC_REGNUM);
10835 create_output_operand (&ops[0], target, CCmode);
10836 create_fixed_operand (&ops[1], cmp);
10837 create_fixed_operand (&ops[2], op0);
10838 create_fixed_operand (&ops[3], op1);
10840 start_sequence ();
10841 if (!maybe_expand_insn (icode, 4, ops))
10843 end_sequence ();
10844 return NULL_RTX;
10846 *gen_seq = get_insns ();
10847 end_sequence ();
10849 return gen_rtx_REG (cc_mode, CC_REGNUM);
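/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent compare in a
   conditional-compare chain, combining it with the CC result PREV
   according to BIT_CODE (AND or IOR).  The preparation insns are
   appended to *PREP_SEQ and the conditional compare itself to *GEN_SEQ.
   Return the CC register holding the combined result, or NULL_RTX on
   failure.  */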
10852 static rtx
10853 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10854 tree treeop0, tree treeop1, int bit_code)
10856 rtx op0, op1, cmp0, cmp1, target;
10857 enum machine_mode op_mode, cmp_mode, cc_mode;
10858 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10859 enum insn_code icode = CODE_FOR_ccmp_andsi;
10860 struct expand_operand ops[6];
10862 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10863 if (cc_mode == CCmode)
10864 return NULL_RTX;
10866 push_to_sequence ((rtx_insn*) *prep_seq);
10867 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10869 op_mode = GET_MODE (op0);
10870 if (op_mode == VOIDmode)
10871 op_mode = GET_MODE (op1);
10873 switch (op_mode)
10875 case QImode:
10876 case HImode:
10877 case SImode:
10878 cmp_mode = SImode;
10879 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10880 : CODE_FOR_ccmp_iorsi;
10881 break;
10883 case DImode:
10884 cmp_mode = DImode;
10885 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10886 : CODE_FOR_ccmp_iordi;
10887 break;
10889 default:
10890 end_sequence ();
10891 return NULL_RTX;
10894 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10895 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10896 if (!op0 || !op1)
10898 end_sequence ();
10899 return NULL_RTX;
10901 *prep_seq = get_insns ();
10902 end_sequence ();
10904 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10905 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10906 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10908 create_fixed_operand (&ops[0], prev);
10909 create_fixed_operand (&ops[1], target);
10910 create_fixed_operand (&ops[2], op0);
10911 create_fixed_operand (&ops[3], op1);
10912 create_fixed_operand (&ops[4], cmp0);
10913 create_fixed_operand (&ops[5], cmp1);
10915 push_to_sequence ((rtx_insn*) *gen_seq);
10916 if (!maybe_expand_insn (icode, 6, ops))
10918 end_sequence ();
10919 return NULL_RTX;
10922 *gen_seq = get_insns ();
10923 end_sequence ();
10925 return target;
10928 #undef TARGET_GEN_CCMP_FIRST
10929 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10931 #undef TARGET_GEN_CCMP_NEXT
10932 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10934 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10935 instruction fusion of some sort. */
10937 static bool
10938 aarch64_macro_fusion_p (void)
10940 return aarch64_tune_params->fusible_ops != AARCH64_FUSE_NOTHING;
10944 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10945 should be kept together during scheduling. */
10947 static bool
10948 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10950 rtx set_dest;
10951 rtx prev_set = single_set (prev);
10952 rtx curr_set = single_set (curr);
10953 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10954 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10956 if (!aarch64_macro_fusion_p ())
10957 return false;
10959 if (simple_sets_p
10960 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_MOV_MOVK))
10962 /* We are trying to match:
10963 prev (mov) == (set (reg r0) (const_int imm16))
10964 curr (movk) == (set (zero_extract (reg r0)
10965 (const_int 16)
10966 (const_int 16))
10967 (const_int imm16_1)) */
10969 set_dest = SET_DEST (curr_set);
10971 if (GET_CODE (set_dest) == ZERO_EXTRACT
10972 && CONST_INT_P (SET_SRC (curr_set))
10973 && CONST_INT_P (SET_SRC (prev_set))
10974 && CONST_INT_P (XEXP (set_dest, 2))
10975 && INTVAL (XEXP (set_dest, 2)) == 16
10976 && REG_P (XEXP (set_dest, 0))
10977 && REG_P (SET_DEST (prev_set))
10978 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10980 return true;
10984 if (simple_sets_p
10985 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_ADRP_ADD))
10988 /* We're trying to match:
10989 prev (adrp) == (set (reg r1)
10990 (high (symbol_ref ("SYM"))))
10991 curr (add) == (set (reg r0)
10992 (lo_sum (reg r1)
10993 (symbol_ref ("SYM"))))
10994 Note that r0 need not necessarily be the same as r1, especially
10995 during pre-regalloc scheduling. */
10997 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10998 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11000 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
11001 && REG_P (XEXP (SET_SRC (curr_set), 0))
11002 && REGNO (XEXP (SET_SRC (curr_set), 0))
11003 == REGNO (SET_DEST (prev_set))
11004 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
11005 XEXP (SET_SRC (curr_set), 1)))
11006 return true;
11010 if (simple_sets_p
11011 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_MOVK_MOVK))
11014 /* We're trying to match:
11015 prev (movk) == (set (zero_extract (reg r0)
11016 (const_int 16)
11017 (const_int 32))
11018 (const_int imm16_1))
11019 curr (movk) == (set (zero_extract (reg r0)
11020 (const_int 16)
11021 (const_int 48))
11022 (const_int imm16_2)) */
11024 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
11025 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
11026 && REG_P (XEXP (SET_DEST (prev_set), 0))
11027 && REG_P (XEXP (SET_DEST (curr_set), 0))
11028 && REGNO (XEXP (SET_DEST (prev_set), 0))
11029 == REGNO (XEXP (SET_DEST (curr_set), 0))
11030 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
11031 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
11032 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
11033 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
11034 && CONST_INT_P (SET_SRC (prev_set))
11035 && CONST_INT_P (SET_SRC (curr_set)))
11036 return true;
11039 if (simple_sets_p
11040 && (aarch64_tune_params->fusible_ops & AARCH64_FUSE_ADRP_LDR))
11042 /* We're trying to match:
11043 prev (adrp) == (set (reg r0)
11044 (high (symbol_ref ("SYM"))))
11045 curr (ldr) == (set (reg r1)
11046 (mem (lo_sum (reg r0)
11047 (symbol_ref ("SYM")))))
11049 curr (ldr) == (set (reg r1)
11050 (zero_extend (mem
11051 (lo_sum (reg r0)
11052 (symbol_ref ("SYM")))))) */
11053 if (satisfies_constraint_Ush (SET_SRC (prev_set))
11054 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11056 rtx curr_src = SET_SRC (curr_set);
11058 if (GET_CODE (curr_src) == ZERO_EXTEND)
11059 curr_src = XEXP (curr_src, 0);
11061 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
11062 && REG_P (XEXP (XEXP (curr_src, 0), 0))
11063 && REGNO (XEXP (XEXP (curr_src, 0), 0))
11064 == REGNO (SET_DEST (prev_set))
11065 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
11066 XEXP (SET_SRC (prev_set), 0)))
11067 return true;
11071 if ((aarch64_tune_params->fusible_ops & AARCH64_FUSE_CMP_BRANCH)
11072 && any_condjump_p (curr))
11074 enum attr_type prev_type = get_attr_type (prev);
11076 /* FIXME: this misses some cases that are considered simple
11077 arithmetic instructions for ThunderX. Simple shifts are missed here. */
11078 if (prev_type == TYPE_ALUS_SREG
11079 || prev_type == TYPE_ALUS_IMM
11080 || prev_type == TYPE_LOGICS_REG
11081 || prev_type == TYPE_LOGICS_IMM)
11082 return true;
11085 return false;
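/* For reference, the RTL patterns matched above correspond to fused
   instruction pairs such as the following (illustrative assembly only):

     mov  w0, #0x1234             adrp x1, sym
     movk w0, #0x5678, lsl 16     add  x0, x1, :lo12:sym

     movk x0, #0x1234, lsl 32     adrp x0, sym
     movk x0, #0x5678, lsl 48     ldr  x1, [x0, :lo12:sym]

   plus, for AARCH64_FUSE_CMP_BRANCH, a flag-setting ALU instruction
   followed by a conditional branch.  */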
11088 /* If MEM is in the form of [base+offset], extract the two parts
11089 of the address into BASE and OFFSET and return true; otherwise
11090 return false after clearing BASE and OFFSET. */
11092 bool
11093 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
11095 rtx addr;
11097 gcc_assert (MEM_P (mem));
11099 addr = XEXP (mem, 0);
11101 if (REG_P (addr))
11103 *base = addr;
11104 *offset = const0_rtx;
11105 return true;
11108 if (GET_CODE (addr) == PLUS
11109 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
11111 *base = XEXP (addr, 0);
11112 *offset = XEXP (addr, 1);
11113 return true;
11116 *base = NULL_RTX;
11117 *offset = NULL_RTX;
11119 return false;
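/* For example, a MEM whose address is (plus (reg x1) (const_int 8))
   yields *BASE == (reg x1) and *OFFSET == (const_int 8), while a bare
   (reg x1) address yields *OFFSET == const0_rtx.  */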
11122 /* Types for scheduling fusion. */
11123 enum sched_fusion_type
11125 SCHED_FUSION_NONE = 0,
11126 SCHED_FUSION_LD_SIGN_EXTEND,
11127 SCHED_FUSION_LD_ZERO_EXTEND,
11128 SCHED_FUSION_LD,
11129 SCHED_FUSION_ST,
11130 SCHED_FUSION_NUM
11133 /* If INSN is a load or store whose address is in the form of
11134 [base+offset], extract the two parts into BASE and OFFSET. Return
11135 the scheduling fusion type of this INSN. */
11137 static enum sched_fusion_type
11138 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
11140 rtx x, dest, src;
11141 enum sched_fusion_type fusion = SCHED_FUSION_LD;
11143 gcc_assert (INSN_P (insn));
11144 x = PATTERN (insn);
11145 if (GET_CODE (x) != SET)
11146 return SCHED_FUSION_NONE;
11148 src = SET_SRC (x);
11149 dest = SET_DEST (x);
11151 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
11152 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
11153 return SCHED_FUSION_NONE;
11155 if (GET_CODE (src) == SIGN_EXTEND)
11157 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
11158 src = XEXP (src, 0);
11159 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11160 return SCHED_FUSION_NONE;
11162 else if (GET_CODE (src) == ZERO_EXTEND)
11164 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
11165 src = XEXP (src, 0);
11166 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11167 return SCHED_FUSION_NONE;
11170 if (GET_CODE (src) == MEM && REG_P (dest))
11171 extract_base_offset_in_addr (src, base, offset);
11172 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
11174 fusion = SCHED_FUSION_ST;
11175 extract_base_offset_in_addr (dest, base, offset);
11177 else
11178 return SCHED_FUSION_NONE;
11180 if (*base == NULL_RTX || *offset == NULL_RTX)
11181 fusion = SCHED_FUSION_NONE;
11183 return fusion;
11186 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
11188 Currently we only support fusing ldr and str instructions, so FUSION_PRI
11189 and PRI are only calculated for these instructions. For other instructions,
11190 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
11191 types of instruction fusion can be added by returning different priorities.
11193 It's important that irrelevant instructions get the largest FUSION_PRI. */
11195 static void
11196 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
11197 int *fusion_pri, int *pri)
11199 int tmp, off_val;
11200 rtx base, offset;
11201 enum sched_fusion_type fusion;
11203 gcc_assert (INSN_P (insn));
11205 tmp = max_pri - 1;
11206 fusion = fusion_load_store (insn, &base, &offset);
11207 if (fusion == SCHED_FUSION_NONE)
11209 *pri = tmp;
11210 *fusion_pri = tmp;
11211 return;
11214 /* Set FUSION_PRI according to fusion type and base register. */
11215 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
11217 /* Calculate PRI. */
11218 tmp /= 2;
11220 /* INSN with smaller offset goes first. */
11221 off_val = (int)(INTVAL (offset));
11222 if (off_val >= 0)
11223 tmp -= (off_val & 0xfffff);
11224 else
11225 tmp += ((- off_val) & 0xfffff);
11227 *pri = tmp;
11228 return;
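/* For example, two loads such as "ldr w1, [x3, 4]" and "ldr w2, [x3, 8]"
   receive the same FUSION_PRI (same fusion type and base register), while
   their PRI values differ by the offset difference, so the scheduler keeps
   them adjacent with the smaller offset first.  */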
11231 /* Given OPERANDS of consecutive load/store, check if we can merge
11232 them into ldp/stp. LOAD is true if they are load instructions.
11233 MODE is the mode of memory operands. */
11235 bool
11236 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
11237 enum machine_mode mode)
11239 HOST_WIDE_INT offval_1, offval_2, msize;
11240 enum reg_class rclass_1, rclass_2;
11241 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
11243 if (load)
11245 mem_1 = operands[1];
11246 mem_2 = operands[3];
11247 reg_1 = operands[0];
11248 reg_2 = operands[2];
11249 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11250 if (REGNO (reg_1) == REGNO (reg_2))
11251 return false;
11253 else
11255 mem_1 = operands[0];
11256 mem_2 = operands[2];
11257 reg_1 = operands[1];
11258 reg_2 = operands[3];
11261 /* The mems cannot be volatile. */
11262 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11263 return false;
11265 /* Check if the addresses are in the form of [base+offset]. */
11266 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11267 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11268 return false;
11269 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11270 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11271 return false;
11273 /* Check if the bases are the same. */
11274 if (!rtx_equal_p (base_1, base_2))
11275 return false;
11277 offval_1 = INTVAL (offset_1);
11278 offval_2 = INTVAL (offset_2);
11279 msize = GET_MODE_SIZE (mode);
11280 /* Check if the offsets are consecutive. */
11281 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11282 return false;
11284 /* Check if the addresses are clobbered by load. */
11285 if (load)
11287 if (reg_mentioned_p (reg_1, mem_1))
11288 return false;
11290 /* In increasing order, the last load can clobber the address. */
11291 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11292 return false;
11295 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11296 rclass_1 = FP_REGS;
11297 else
11298 rclass_1 = GENERAL_REGS;
11300 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11301 rclass_2 = FP_REGS;
11302 else
11303 rclass_2 = GENERAL_REGS;
11305 /* Check if the registers are of the same class. */
11306 if (rclass_1 != rclass_2)
11307 return false;
11309 return true;
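/* When the checks above succeed, the ldp/stp peepholes can combine, for
   instance (illustrative assembly):

     ldr  w0, [x2]
     ldr  w1, [x2, 4]

   into a single "ldp w0, w1, [x2]".  */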
11312 /* Given OPERANDS of consecutive load/store, check if we can merge
11313 them into ldp/stp by adjusting the offset. LOAD is true if they
11314 are load instructions. MODE is the mode of memory operands.
11316 Given the consecutive stores below:
11318 str w1, [xb, 0x100]
11319 str w1, [xb, 0x104]
11320 str w1, [xb, 0x108]
11321 str w1, [xb, 0x10c]
11323 Though the offsets are out of the range supported by stp, we can
11324 still pair them after adjusting the offset, like:
11326 add scratch, xb, 0x100
11327 stp w1, w1, [scratch]
11328 stp w1, w1, [scratch, 0x8]
11330 The peephole patterns detecting this opportunity should guarantee
11331 the scratch register is available. */
11333 bool
11334 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11335 enum machine_mode mode)
11337 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11338 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11339 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11340 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11342 if (load)
11344 reg_1 = operands[0];
11345 mem_1 = operands[1];
11346 reg_2 = operands[2];
11347 mem_2 = operands[3];
11348 reg_3 = operands[4];
11349 mem_3 = operands[5];
11350 reg_4 = operands[6];
11351 mem_4 = operands[7];
11352 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11353 && REG_P (reg_3) && REG_P (reg_4));
11354 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11355 return false;
11357 else
11359 mem_1 = operands[0];
11360 reg_1 = operands[1];
11361 mem_2 = operands[2];
11362 reg_2 = operands[3];
11363 mem_3 = operands[4];
11364 reg_3 = operands[5];
11365 mem_4 = operands[6];
11366 reg_4 = operands[7];
11368 /* Skip if the memory operand is by itself valid for ldp/stp. */
11369 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11370 return false;
11372 /* The mems cannot be volatile. */
11373 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11374 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4)
11375 return false;
11377 /* Check if the addresses are in the form of [base+offset]. */
11378 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11379 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11380 return false;
11381 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11382 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11383 return false;
11384 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11385 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11386 return false;
11387 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11388 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11389 return false;
11391 /* Check if the bases are the same. */
11392 if (!rtx_equal_p (base_1, base_2)
11393 || !rtx_equal_p (base_2, base_3)
11394 || !rtx_equal_p (base_3, base_4))
11395 return false;
11397 offval_1 = INTVAL (offset_1);
11398 offval_2 = INTVAL (offset_2);
11399 offval_3 = INTVAL (offset_3);
11400 offval_4 = INTVAL (offset_4);
11401 msize = GET_MODE_SIZE (mode);
11402 /* Check if the offsets are consecutive. */
11403 if ((offval_1 != (offval_2 + msize)
11404 || offval_1 != (offval_3 + msize * 2)
11405 || offval_1 != (offval_4 + msize * 3))
11406 && (offval_4 != (offval_3 + msize)
11407 || offval_4 != (offval_2 + msize * 2)
11408 || offval_4 != (offval_1 + msize * 3)))
11409 return false;
11411 /* Check if the addresses are clobbered by load. */
11412 if (load)
11414 if (reg_mentioned_p (reg_1, mem_1)
11415 || reg_mentioned_p (reg_2, mem_2)
11416 || reg_mentioned_p (reg_3, mem_3))
11417 return false;
11419 /* In increasing order, the last load can clobber the address. */
11420 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11421 return false;
11424 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11425 rclass_1 = FP_REGS;
11426 else
11427 rclass_1 = GENERAL_REGS;
11429 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11430 rclass_2 = FP_REGS;
11431 else
11432 rclass_2 = GENERAL_REGS;
11434 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11435 rclass_3 = FP_REGS;
11436 else
11437 rclass_3 = GENERAL_REGS;
11439 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11440 rclass_4 = FP_REGS;
11441 else
11442 rclass_4 = GENERAL_REGS;
11444 /* Check if the registers are of the same class. */
11445 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11446 return false;
11448 return true;
11451 /* Given OPERANDS of consecutive load/store, this function pairs them
11452 into ldp/stp after adjusting the offset. It depends on the fact
11453 that addresses of load/store instructions are in increasing order.
11454 MODE is the mode of memory operands. CODE is the rtl operator
11455 which should be applied to all memory operands; it is SIGN_EXTEND,
11456 ZERO_EXTEND or UNKNOWN. */
11458 bool
11459 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11460 enum machine_mode mode, RTX_CODE code)
11462 rtx base, offset, t1, t2;
11463 rtx mem_1, mem_2, mem_3, mem_4;
11464 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11466 if (load)
11468 mem_1 = operands[1];
11469 mem_2 = operands[3];
11470 mem_3 = operands[5];
11471 mem_4 = operands[7];
11473 else
11475 mem_1 = operands[0];
11476 mem_2 = operands[2];
11477 mem_3 = operands[4];
11478 mem_4 = operands[6];
11479 gcc_assert (code == UNKNOWN);
11482 extract_base_offset_in_addr (mem_1, &base, &offset);
11483 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11485 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
11486 msize = GET_MODE_SIZE (mode);
11487 stp_off_limit = msize * 0x40;
11488 off_val = INTVAL (offset);
11489 abs_off = (off_val < 0) ? -off_val : off_val;
11490 new_off = abs_off % stp_off_limit;
11491 adj_off = abs_off - new_off;
11493 /* Further adjust to make sure all offsets are OK. */
11494 if ((new_off + msize * 2) >= stp_off_limit)
11496 adj_off += stp_off_limit;
11497 new_off -= stp_off_limit;
11500 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11501 if (adj_off >= 0x1000)
11502 return false;
11504 if (off_val < 0)
11506 adj_off = -adj_off;
11507 new_off = -new_off;
11510 /* Create new memory references. */
11511 mem_1 = change_address (mem_1, VOIDmode,
11512 plus_constant (DImode, operands[8], new_off));
11514 /* Check if the adjusted address is OK for ldp/stp. */
11515 if (!aarch64_mem_pair_operand (mem_1, mode))
11516 return false;
11518 msize = GET_MODE_SIZE (mode);
11519 mem_2 = change_address (mem_2, VOIDmode,
11520 plus_constant (DImode,
11521 operands[8],
11522 new_off + msize));
11523 mem_3 = change_address (mem_3, VOIDmode,
11524 plus_constant (DImode,
11525 operands[8],
11526 new_off + msize * 2));
11527 mem_4 = change_address (mem_4, VOIDmode,
11528 plus_constant (DImode,
11529 operands[8],
11530 new_off + msize * 3));
11532 if (code == ZERO_EXTEND)
11534 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11535 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11536 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11537 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11539 else if (code == SIGN_EXTEND)
11541 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11542 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11543 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11544 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11547 if (load)
11549 operands[1] = mem_1;
11550 operands[3] = mem_2;
11551 operands[5] = mem_3;
11552 operands[7] = mem_4;
11554 else
11556 operands[0] = mem_1;
11557 operands[2] = mem_2;
11558 operands[4] = mem_3;
11559 operands[6] = mem_4;
11562 /* Emit adjusting instruction. */
11563 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
11564 /* Emit ldp/stp instructions. */
11565 t1 = gen_rtx_SET (operands[0], operands[1]);
11566 t2 = gen_rtx_SET (operands[2], operands[3]);
11567 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11568 t1 = gen_rtx_SET (operands[4], operands[5]);
11569 t2 = gen_rtx_SET (operands[6], operands[7]);
11570 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11571 return true;
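/* Worked example of the adjustment above for SImode (msize == 4, so
   stp_off_limit == 256): four stores at offsets 0x104, 0x108, 0x10c and
   0x110 give abs_off == 0x104, new_off == 4 and adj_off == 0x100, so we
   emit "add scratch, base, #0x100" followed by two stp instructions
   addressed at [scratch, 4] and [scratch, 12].  */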
11574 #undef TARGET_ADDRESS_COST
11575 #define TARGET_ADDRESS_COST aarch64_address_cost
11577 /* This hook determines whether unnamed bitfields affect the alignment
11578 of the containing structure. The hook returns true if the structure
11579 should inherit the alignment requirements of an unnamed bitfield's
11580 type. */
11581 #undef TARGET_ALIGN_ANON_BITFIELD
11582 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11584 #undef TARGET_ASM_ALIGNED_DI_OP
11585 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11587 #undef TARGET_ASM_ALIGNED_HI_OP
11588 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11590 #undef TARGET_ASM_ALIGNED_SI_OP
11591 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11593 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11594 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11595 hook_bool_const_tree_hwi_hwi_const_tree_true
11597 #undef TARGET_ASM_FILE_START
11598 #define TARGET_ASM_FILE_START aarch64_start_file
11600 #undef TARGET_ASM_OUTPUT_MI_THUNK
11601 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11603 #undef TARGET_ASM_SELECT_RTX_SECTION
11604 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11606 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11607 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11609 #undef TARGET_BUILD_BUILTIN_VA_LIST
11610 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11612 #undef TARGET_CALLEE_COPIES
11613 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11615 #undef TARGET_CAN_ELIMINATE
11616 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11618 #undef TARGET_CANNOT_FORCE_CONST_MEM
11619 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11621 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11622 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11624 /* Only the least significant bit is used for initialization guard
11625 variables. */
11626 #undef TARGET_CXX_GUARD_MASK_BIT
11627 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11629 #undef TARGET_C_MODE_FOR_SUFFIX
11630 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11632 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11633 #undef TARGET_DEFAULT_TARGET_FLAGS
11634 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11635 #endif
11637 #undef TARGET_CLASS_MAX_NREGS
11638 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11640 #undef TARGET_BUILTIN_DECL
11641 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11643 #undef TARGET_EXPAND_BUILTIN
11644 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11646 #undef TARGET_EXPAND_BUILTIN_VA_START
11647 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11649 #undef TARGET_FOLD_BUILTIN
11650 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11652 #undef TARGET_FUNCTION_ARG
11653 #define TARGET_FUNCTION_ARG aarch64_function_arg
11655 #undef TARGET_FUNCTION_ARG_ADVANCE
11656 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11658 #undef TARGET_FUNCTION_ARG_BOUNDARY
11659 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11661 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11662 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11664 #undef TARGET_FUNCTION_VALUE
11665 #define TARGET_FUNCTION_VALUE aarch64_function_value
11667 #undef TARGET_FUNCTION_VALUE_REGNO_P
11668 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11670 #undef TARGET_FRAME_POINTER_REQUIRED
11671 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11673 #undef TARGET_GIMPLE_FOLD_BUILTIN
11674 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11676 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11677 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11679 #undef TARGET_INIT_BUILTINS
11680 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11682 #undef TARGET_LEGITIMATE_ADDRESS_P
11683 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11685 #undef TARGET_LEGITIMATE_CONSTANT_P
11686 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11688 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11689 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11691 #undef TARGET_LRA_P
11692 #define TARGET_LRA_P hook_bool_void_true
11694 #undef TARGET_MANGLE_TYPE
11695 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11697 #undef TARGET_MEMORY_MOVE_COST
11698 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11700 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11701 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11703 #undef TARGET_MUST_PASS_IN_STACK
11704 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11706 /* This target hook should return true if accesses to volatile bitfields
11707 should use the narrowest mode possible. It should return false if these
11708 accesses should use the bitfield container type. */
11709 #undef TARGET_NARROW_VOLATILE_BITFIELD
11710 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11712 #undef TARGET_OPTION_OVERRIDE
11713 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11715 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11716 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11717 aarch64_override_options_after_change
11719 #undef TARGET_PASS_BY_REFERENCE
11720 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11722 #undef TARGET_PREFERRED_RELOAD_CLASS
11723 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11725 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11726 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11728 #undef TARGET_SECONDARY_RELOAD
11729 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11731 #undef TARGET_SHIFT_TRUNCATION_MASK
11732 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11734 #undef TARGET_SETUP_INCOMING_VARARGS
11735 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11737 #undef TARGET_STRUCT_VALUE_RTX
11738 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11740 #undef TARGET_REGISTER_MOVE_COST
11741 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11743 #undef TARGET_RETURN_IN_MEMORY
11744 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11746 #undef TARGET_RETURN_IN_MSB
11747 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11749 #undef TARGET_RTX_COSTS
11750 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11752 #undef TARGET_SCHED_ISSUE_RATE
11753 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11755 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11756 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11757 aarch64_sched_first_cycle_multipass_dfa_lookahead
11759 #undef TARGET_TRAMPOLINE_INIT
11760 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11762 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11763 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11765 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11766 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11768 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11769 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11771 #undef TARGET_VECTORIZE_ADD_STMT_COST
11772 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11774 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11775 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11776 aarch64_builtin_vectorization_cost
11778 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11779 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11781 #undef TARGET_VECTORIZE_BUILTINS
11782 #define TARGET_VECTORIZE_BUILTINS
11784 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11785 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11786 aarch64_builtin_vectorized_function
11788 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11789 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11790 aarch64_autovectorize_vector_sizes
11792 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11793 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11794 aarch64_atomic_assign_expand_fenv
11796 /* Section anchor support. */
11798 #undef TARGET_MIN_ANCHOR_OFFSET
11799 #define TARGET_MIN_ANCHOR_OFFSET -256
11801 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11802 byte offset; we can do much more for larger data types, but have no way
11803 to determine the size of the access. We assume accesses are aligned. */
11804 #undef TARGET_MAX_ANCHOR_OFFSET
11805 #define TARGET_MAX_ANCHOR_OFFSET 4095
11807 #undef TARGET_VECTOR_ALIGNMENT
11808 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11810 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11811 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11812 aarch64_simd_vector_alignment_reachable
11814 /* vec_perm support. */
11816 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11817 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11818 aarch64_vectorize_vec_perm_const_ok
11821 #undef TARGET_FIXED_CONDITION_CODE_REGS
11822 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11824 #undef TARGET_FLAGS_REGNUM
11825 #define TARGET_FLAGS_REGNUM CC_REGNUM
11827 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11828 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11830 #undef TARGET_ASAN_SHADOW_OFFSET
11831 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11833 #undef TARGET_LEGITIMIZE_ADDRESS
11834 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11836 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11837 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11838 aarch64_use_by_pieces_infrastructure_p
11840 #undef TARGET_CAN_USE_DOLOOP_P
11841 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11843 #undef TARGET_SCHED_MACRO_FUSION_P
11844 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11846 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11847 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11849 #undef TARGET_SCHED_FUSION_PRIORITY
11850 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11852 struct gcc_target targetm = TARGET_INITIALIZER;
11854 #include "gt-aarch64.h"