gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2015 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #include "system.h"
  23 #include "coretypes.h"
  24 #include "tm.h"
  25 #include "insn-codes.h"
  26 #include "rtl.h"
  27 #include "insn-attr.h"
  28 #include "hash-set.h"
  29 #include "machmode.h"
  30 #include "vec.h"
  31 #include "double-int.h"
  32 #include "input.h"
  33 #include "alias.h"
  34 #include "symtab.h"
  35 #include "wide-int.h"
  36 #include "inchash.h"
  37 #include "tree.h"
  38 #include "fold-const.h"
  39 #include "stringpool.h"
  40 #include "stor-layout.h"
  41 #include "calls.h"
  42 #include "varasm.h"
  43 #include "regs.h"
  44 #include "dominance.h"
  45 #include "cfg.h"
  46 #include "cfgrtl.h"
  47 #include "cfganal.h"
  48 #include "lcm.h"
  49 #include "cfgbuild.h"
  50 #include "cfgcleanup.h"
  51 #include "predict.h"
  52 #include "basic-block.h"
  53 #include "df.h"
  54 #include "hard-reg-set.h"
  55 #include "output.h"
  56 #include "hashtab.h"
  57 #include "function.h"
  58 #include "flags.h"
  59 #include "statistics.h"
  60 #include "real.h"
  61 #include "fixed-value.h"
  62 #include "insn-config.h"
  63 #include "expmed.h"
  64 #include "dojump.h"
  65 #include "explow.h"
  66 #include "emit-rtl.h"
  67 #include "stmt.h"
  68 #include "expr.h"
  69 #include "reload.h"
  70 #include "toplev.h"
  71 #include "target.h"
  72 #include "target-def.h"
  73 #include "targhooks.h"
  74 #include "ggc.h"
  75 #include "tm_p.h"
  76 #include "recog.h"
  77 #include "langhooks.h"
  78 #include "diagnostic-core.h"
  79 #include "hash-table.h"
  80 #include "tree-ssa-alias.h"
  81 #include "internal-fn.h"
  82 #include "gimple-fold.h"
  83 #include "tree-eh.h"
  84 #include "gimple-expr.h"
  85 #include "is-a.h"
  86 #include "gimple.h"
  87 #include "gimplify.h"
  88 #include "optabs.h"
  89 #include "dwarf2.h"
  90 #include "cfgloop.h"
  91 #include "tree-vectorizer.h"
  92 #include "aarch64-cost-tables.h"
  93 #include "dumpfile.h"
  94 #include "builtins.h"
  95 #include "rtl-iter.h"
  96 #include "tm-constrs.h"
  97 #include "sched-int.h"
  98 #include "cortex-a57-fma-steering.h"
  99
 100 /* Defined for convenience.  */
 101 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
 102
 103 /* Classifies an address.
 104
 105    ADDRESS_REG_IMM
 106        A simple base register plus immediate offset.
 107
 108    ADDRESS_REG_WB
 109        A base register indexed by immediate offset with writeback.
 110
 111    ADDRESS_REG_REG
 112        A base register indexed by (optionally scaled) register.
 113
 114    ADDRESS_REG_UXTW
 115        A base register indexed by (optionally scaled) zero-extended register.
 116
 117    ADDRESS_REG_SXTW
 118        A base register indexed by (optionally scaled) sign-extended register.
 119
 120    ADDRESS_LO_SUM
 121        A LO_SUM rtx with a base register and "LO12" symbol relocation.
 122
 123    ADDRESS_SYMBOLIC:
 124        A constant symbolic address, in pc-relative literal pool.  */
 125
 126 enum aarch64_address_type {
 127   ADDRESS_REG_IMM,
 128   ADDRESS_REG_WB,
 129   ADDRESS_REG_REG,
 130   ADDRESS_REG_UXTW,
 131   ADDRESS_REG_SXTW,
 132   ADDRESS_LO_SUM,
 133   ADDRESS_SYMBOLIC
 134 };
 135
 136 struct aarch64_address_info {
 137   enum aarch64_address_type type;
 138   rtx base;
 139   rtx offset;
 140   int shift;
 141   enum aarch64_symbol_type symbol_type;
 142 };
 143
 144 struct simd_immediate_info
 145 {
 146   rtx value;
 147   int shift;
 148   int element_width;
 149   bool mvn;
 150   bool msl;
 151 };
 152
 153 /* The current code model.  */
 154 enum aarch64_code_model aarch64_cmodel;
 155
 156 #ifdef HAVE_AS_TLS
 157 #undef TARGET_HAVE_TLS
 158 #define TARGET_HAVE_TLS 1
 159 #endif
 160
 161 static bool aarch64_composite_type_p (const_tree, machine_mode);
 162 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 163                                                      const_tree,
 164                                                      machine_mode *, int *,
 165                                                      bool *);
 166 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 167 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 168 static void aarch64_override_options_after_change (void);
 169 static bool aarch64_vector_mode_supported_p (machine_mode);
 170 static unsigned bit_count (unsigned HOST_WIDE_INT);
 171 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 172                                                  const unsigned char *sel);
 173 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 174
 175 /* Major revision number of the ARM Architecture implemented by the target.  */
 176 unsigned aarch64_architecture_version;
 177
 178 /* The processor for which instructions should be scheduled.  */
 179 enum aarch64_processor aarch64_tune = cortexa53;
 180
 181 /* The current tuning set.  */
 182 const struct tune_params *aarch64_tune_params;
 183
 184 /* Mask to specify which instructions we are allowed to generate.  */
 185 unsigned long aarch64_isa_flags = 0;
 186
 187 /* Mask to specify which instruction scheduling options should be used.  */
 188 unsigned long aarch64_tune_flags = 0;
 189
 190 /* Tuning parameters.  */
 191
 192 static const struct cpu_addrcost_table generic_addrcost_table =
 193 {
 194     {
 195       0, /* hi  */
 196       0, /* si  */
 197       0, /* di  */
 198       0, /* ti  */
 199     },
 200   0, /* pre_modify  */
 201   0, /* post_modify  */
 202   0, /* register_offset  */
 203   0, /* register_extend  */
 204   0 /* imm_offset  */
 205 };
 206
 207 static const struct cpu_addrcost_table cortexa57_addrcost_table =
 208 {
 209     {
 210       1, /* hi  */
 211       0, /* si  */
 212       0, /* di  */
 213       1, /* ti  */
 214     },
 215   0, /* pre_modify  */
 216   0, /* post_modify  */
 217   0, /* register_offset  */
 218   0, /* register_extend  */
 219   0, /* imm_offset  */
 220 };
 221
 222 static const struct cpu_addrcost_table xgene1_addrcost_table =
 223 {
 224     {
 225       1, /* hi  */
 226       0, /* si  */
 227       0, /* di  */
 228       1, /* ti  */
 229     },
 230   1, /* pre_modify  */
 231   0, /* post_modify  */
 232   0, /* register_offset  */
 233   1, /* register_extend  */
 234   0, /* imm_offset  */
 235 };
 236
 237 static const struct cpu_regmove_cost generic_regmove_cost =
 238 {
 239   1, /* GP2GP  */
 240   /* Avoid the use of slow int<->fp moves for spilling by setting
 241      their cost higher than memmov_cost.  */
 242   5, /* GP2FP  */
 243   5, /* FP2GP  */
 244   2 /* FP2FP  */
 245 };
 246
 247 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 248 {
 249   1, /* GP2GP  */
 250   /* Avoid the use of slow int<->fp moves for spilling by setting
 251      their cost higher than memmov_cost.  */
 252   5, /* GP2FP  */
 253   5, /* FP2GP  */
 254   2 /* FP2FP  */
 255 };
 256
 257 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 258 {
 259   1, /* GP2GP  */
 260   /* Avoid the use of slow int<->fp moves for spilling by setting
 261      their cost higher than memmov_cost.  */
 262   5, /* GP2FP  */
 263   5, /* FP2GP  */
 264   2 /* FP2FP  */
 265 };
 266
 267 static const struct cpu_regmove_cost thunderx_regmove_cost =
 268 {
 269   2, /* GP2GP  */
 270   2, /* GP2FP  */
 271   6, /* FP2GP  */
 272   4 /* FP2FP  */
 273 };
 274
 275 static const struct cpu_regmove_cost xgene1_regmove_cost =
 276 {
 277   1, /* GP2GP  */
 278   /* Avoid the use of slow int<->fp moves for spilling by setting
 279      their cost higher than memmov_cost.  */
 280   8, /* GP2FP  */
 281   8, /* FP2GP  */
 282   2 /* FP2FP  */
 283 };
 284
 285 /* Generic costs for vector insn classes.  */
 286 static const struct cpu_vector_cost generic_vector_cost =
 287 {
 288   1, /* scalar_stmt_cost  */
 289   1, /* scalar_load_cost  */
 290   1, /* scalar_store_cost  */
 291   1, /* vec_stmt_cost  */
 292   1, /* vec_to_scalar_cost  */
 293   1, /* scalar_to_vec_cost  */
 294   1, /* vec_align_load_cost  */
 295   1, /* vec_unalign_load_cost  */
 296   1, /* vec_unalign_store_cost  */
 297   1, /* vec_store_cost  */
 298   3, /* cond_taken_branch_cost  */
 299   1 /* cond_not_taken_branch_cost  */
 300 };
 301
 302 /* Generic costs for vector insn classes.  */
 303 static const struct cpu_vector_cost cortexa57_vector_cost =
 304 {
 305   1, /* scalar_stmt_cost  */
 306   4, /* scalar_load_cost  */
 307   1, /* scalar_store_cost  */
 308   3, /* vec_stmt_cost  */
 309   8, /* vec_to_scalar_cost  */
 310   8, /* scalar_to_vec_cost  */
 311   5, /* vec_align_load_cost  */
 312   5, /* vec_unalign_load_cost  */
 313   1, /* vec_unalign_store_cost  */
 314   1, /* vec_store_cost  */
 315   1, /* cond_taken_branch_cost  */
 316   1 /* cond_not_taken_branch_cost  */
 317 };
 318
 319 /* Generic costs for vector insn classes.  */
 320 static const struct cpu_vector_cost xgene1_vector_cost =
 321 {
 322   1, /* scalar_stmt_cost  */
 323   5, /* scalar_load_cost  */
 324   1, /* scalar_store_cost  */
 325   2, /* vec_stmt_cost  */
 326   4, /* vec_to_scalar_cost  */
 327   4, /* scalar_to_vec_cost  */
 328   10, /* vec_align_load_cost  */
 329   10, /* vec_unalign_load_cost  */
 330   2, /* vec_unalign_store_cost  */
 331   2, /* vec_store_cost  */
 332   2, /* cond_taken_branch_cost  */
 333   1 /* cond_not_taken_branch_cost  */
 334 };
 335
 336 #define AARCH64_FUSE_NOTHING    (0)
 337 #define AARCH64_FUSE_MOV_MOVK   (1 << 0)
 338 #define AARCH64_FUSE_ADRP_ADD   (1 << 1)
 339 #define AARCH64_FUSE_MOVK_MOVK  (1 << 2)
 340 #define AARCH64_FUSE_ADRP_LDR   (1 << 3)
 341 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
 342
 343 /* Generic costs for branch instructions.  */
 344 static const struct cpu_branch_cost generic_branch_cost =
 345 {
 346   2,  /* Predictable.  */
 347   2   /* Unpredictable.  */
 348 };
 349
 350 static const struct tune_params generic_tunings =
 351 {
 352   &cortexa57_extra_costs,
 353   &generic_addrcost_table,
 354   &generic_regmove_cost,
 355   &generic_vector_cost,
 356   &generic_branch_cost,
 357   4, /* memmov_cost  */
 358   2, /* issue_rate  */
 359   AARCH64_FUSE_NOTHING, /* fuseable_ops  */
 360   8,    /* function_align.  */
 361   8,    /* jump_align.  */
 362   4,    /* loop_align.  */
 363   2,    /* int_reassoc_width.  */
 364   4,    /* fp_reassoc_width.  */
 365   1,    /* vec_reassoc_width.  */
 366   2,    /* min_div_recip_mul_sf.  */
 367   2     /* min_div_recip_mul_df.  */
 368 };
 369
 370 static const struct tune_params cortexa53_tunings =
 371 {
 372   &cortexa53_extra_costs,
 373   &generic_addrcost_table,
 374   &cortexa53_regmove_cost,
 375   &generic_vector_cost,
 376   &generic_branch_cost,
 377   4, /* memmov_cost  */
 378   2, /* issue_rate  */
 379   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 380    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fuseable_ops  */
 381   8,    /* function_align.  */
 382   8,    /* jump_align.  */
 383   4,    /* loop_align.  */
 384   2,    /* int_reassoc_width.  */
 385   4,    /* fp_reassoc_width.  */
 386   1,    /* vec_reassoc_width.  */
 387   2,    /* min_div_recip_mul_sf.  */
 388   2     /* min_div_recip_mul_df.  */
 389 };
 390
 391 static const struct tune_params cortexa57_tunings =
 392 {
 393   &cortexa57_extra_costs,
 394   &cortexa57_addrcost_table,
 395   &cortexa57_regmove_cost,
 396   &cortexa57_vector_cost,
 397   &generic_branch_cost,
 398   4, /* memmov_cost  */
 399   3, /* issue_rate  */
 400   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 401    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 402   16,   /* function_align.  */
 403   8,    /* jump_align.  */
 404   4,    /* loop_align.  */
 405   2,    /* int_reassoc_width.  */
 406   4,    /* fp_reassoc_width.  */
 407   1,    /* vec_reassoc_width.  */
 408   2,    /* min_div_recip_mul_sf.  */
 409   2     /* min_div_recip_mul_df.  */
 410 };
 411
 412 static const struct tune_params thunderx_tunings =
 413 {
 414   &thunderx_extra_costs,
 415   &generic_addrcost_table,
 416   &thunderx_regmove_cost,
 417   &generic_vector_cost,
 418   &generic_branch_cost,
 419   6, /* memmov_cost  */
 420   2, /* issue_rate  */
 421   AARCH64_FUSE_CMP_BRANCH, /* fuseable_ops  */
 422   8,    /* function_align.  */
 423   8,    /* jump_align.  */
 424   8,    /* loop_align.  */
 425   2,    /* int_reassoc_width.  */
 426   4,    /* fp_reassoc_width.  */
 427   1,    /* vec_reassoc_width.  */
 428   2,    /* min_div_recip_mul_sf.  */
 429   2     /* min_div_recip_mul_df.  */
 430 };
 431
 432 static const struct tune_params xgene1_tunings =
 433 {
 434   &xgene1_extra_costs,
 435   &xgene1_addrcost_table,
 436   &xgene1_regmove_cost,
 437   &xgene1_vector_cost,
 438   &generic_branch_cost,
 439   6, /* memmov_cost  */
 440   4, /* issue_rate  */
 441   AARCH64_FUSE_NOTHING, /* fuseable_ops  */
 442   16,   /* function_align.  */
 443   8,    /* jump_align.  */
 444   16,   /* loop_align.  */
 445   2,    /* int_reassoc_width.  */
 446   4,    /* fp_reassoc_width.  */
 447   1,    /* vec_reassoc_width.  */
 448   2,    /* min_div_recip_mul_sf.  */
 449   2     /* min_div_recip_mul_df.  */
 450 };
 451
 452 /* A processor implementing AArch64.  */
 453 struct processor
 454 {
 455   const char *const name;
 456   enum aarch64_processor core;
 457   const char *arch;
 458   unsigned architecture_version;
 459   const unsigned long flags;
 460   const struct tune_params *const tune;
 461 };
 462
 463 /* Processor cores implementing AArch64.  */
 464 static const struct processor all_cores[] =
 465 {
 466 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
 467   {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
 468 #include "aarch64-cores.def"
 469 #undef AARCH64_CORE
 470   {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
 471   {NULL, aarch64_none, NULL, 0, 0, NULL}
 472 };
 473
 474 /* Architectures implementing AArch64.  */
 475 static const struct processor all_architectures[] =
 476 {
 477 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
 478   {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
 479 #include "aarch64-arches.def"
 480 #undef AARCH64_ARCH
 481   {NULL, aarch64_none, NULL, 0, 0, NULL}
 482 };
 483
 484 /* Target specification.  These are populated as commandline arguments
 485    are processed, or NULL if not specified.  */
 486 static const struct processor *selected_arch;
 487 static const struct processor *selected_cpu;
 488 static const struct processor *selected_tune;
 489
 490 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 491
 492 /* An ISA extension in the co-processor and main instruction set space.  */
 493 struct aarch64_option_extension
 494 {
 495   const char *const name;
 496   const unsigned long flags_on;
 497   const unsigned long flags_off;
 498 };
 499
 500 /* ISA extensions in AArch64.  */
 501 static const struct aarch64_option_extension all_extensions[] =
 502 {
 503 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
 504   {NAME, FLAGS_ON, FLAGS_OFF},
 505 #include "aarch64-option-extensions.def"
 506 #undef AARCH64_OPT_EXTENSION
 507   {NULL, 0, 0}
 508 };
 509
 510 /* Used to track the size of an address when generating a pre/post
 511    increment address.  */
 512 static machine_mode aarch64_memory_reference_mode;
 513
 514 /* A table of valid AArch64 "bitmask immediate" values for
 515    logical instructions.  */
 516
 517 #define AARCH64_NUM_BITMASKS  5334
 518 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
 519
 520 typedef enum aarch64_cond_code
 521 {
 522   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 523   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 524   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 525 }
 526 aarch64_cc;
 527
 528 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 529
 530 /* The condition codes of the processor, and the inverse function.  */
 531 static const char * const aarch64_condition_codes[] =
 532 {
 533   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 534   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 535 };
 536
 537 static unsigned int
 538 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
 539 {
 540   if (GET_MODE_UNIT_SIZE (mode) == 4)
 541     return aarch64_tune_params->min_div_recip_mul_sf;
 542   return aarch64_tune_params->min_div_recip_mul_df;
 543 }
 544
 545 static int
 546 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
 547                              enum machine_mode mode)
 548 {
 549   if (VECTOR_MODE_P (mode))
 550     return aarch64_tune_params->vec_reassoc_width;
 551   if (INTEGRAL_MODE_P (mode))
 552     return aarch64_tune_params->int_reassoc_width;
 553   if (FLOAT_MODE_P (mode))
 554     return aarch64_tune_params->fp_reassoc_width;
 555   return 1;
 556 }
 557
 558 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
 559 unsigned
 560 aarch64_dbx_register_number (unsigned regno)
 561 {
 562    if (GP_REGNUM_P (regno))
 563      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
 564    else if (regno == SP_REGNUM)
 565      return AARCH64_DWARF_SP;
 566    else if (FP_REGNUM_P (regno))
 567      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
 568
 569    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
 570       equivalent DWARF register.  */
 571    return DWARF_FRAME_REGISTERS;
 572 }
 573
 574 /* Return TRUE if MODE is any of the large INT modes.  */
 575 static bool
 576 aarch64_vect_struct_mode_p (machine_mode mode)
 577 {
 578   return mode == OImode || mode == CImode || mode == XImode;
 579 }
 580
 581 /* Return TRUE if MODE is any of the vector modes.  */
 582 static bool
 583 aarch64_vector_mode_p (machine_mode mode)
 584 {
 585   return aarch64_vector_mode_supported_p (mode)
 586          || aarch64_vect_struct_mode_p (mode);
 587 }
 588
 589 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
 590 static bool
 591 aarch64_array_mode_supported_p (machine_mode mode,
 592                                 unsigned HOST_WIDE_INT nelems)
 593 {
 594   if (TARGET_SIMD
 595       && AARCH64_VALID_SIMD_QREG_MODE (mode)
 596       && (nelems >= 2 && nelems <= 4))
 597     return true;
 598
 599   return false;
 600 }
 601
 602 /* Implement HARD_REGNO_NREGS.  */
 603
 604 int
 605 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
 606 {
 607   switch (aarch64_regno_regclass (regno))
 608     {
 609     case FP_REGS:
 610     case FP_LO_REGS:
 611       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
 612     default:
 613       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
 614     }
 615   gcc_unreachable ();
 616 }
 617
 618 /* Implement HARD_REGNO_MODE_OK.  */
 619
 620 int
 621 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
 622 {
 623   if (GET_MODE_CLASS (mode) == MODE_CC)
 624     return regno == CC_REGNUM;
 625
 626   if (regno == SP_REGNUM)
 627     /* The purpose of comparing with ptr_mode is to support the
 628        global register variable associated with the stack pointer
 629        register via the syntax of asm ("wsp") in ILP32.  */
 630     return mode == Pmode || mode == ptr_mode;
 631
 632   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
 633     return mode == Pmode;
 634
 635   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
 636     return 1;
 637
 638   if (FP_REGNUM_P (regno))
 639     {
 640       if (aarch64_vect_struct_mode_p (mode))
 641         return
 642           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
 643       else
 644         return 1;
 645     }
 646
 647   return 0;
 648 }
 649
 650 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
 651 machine_mode
 652 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
 653                                      machine_mode mode)
 654 {
 655   /* Handle modes that fit within single registers.  */
 656   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
 657     {
 658       if (GET_MODE_SIZE (mode) >= 4)
 659         return mode;
 660       else
 661         return SImode;
 662     }
 663   /* Fall back to generic for multi-reg and very large modes.  */
 664   else
 665     return choose_hard_reg_mode (regno, nregs, false);
 666 }
 667
 668 /* Return true if calls to DECL should be treated as
 669    long-calls (ie called via a register).  */
 670 static bool
 671 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
 672 {
 673   return false;
 674 }
 675
 676 /* Return true if calls to symbol-ref SYM should be treated as
 677    long-calls (ie called via a register).  */
 678 bool
 679 aarch64_is_long_call_p (rtx sym)
 680 {
 681   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
 682 }
 683
 684 /* Return true if the offsets to a zero/sign-extract operation
 685    represent an expression that matches an extend operation.  The
 686    operands represent the paramters from
 687
 688    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
 689 bool
 690 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
 691                                 rtx extract_imm)
 692 {
 693   HOST_WIDE_INT mult_val, extract_val;
 694
 695   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
 696     return false;
 697
 698   mult_val = INTVAL (mult_imm);
 699   extract_val = INTVAL (extract_imm);
 700
 701   if (extract_val > 8
 702       && extract_val < GET_MODE_BITSIZE (mode)
 703       && exact_log2 (extract_val & ~7) > 0
 704       && (extract_val & 7) <= 4
 705       && mult_val == (1 << (extract_val & 7)))
 706     return true;
 707
 708   return false;
 709 }
 710
 711 /* Emit an insn that's a simple single-set.  Both the operands must be
 712    known to be valid.  */
 713 inline static rtx
 714 emit_set_insn (rtx x, rtx y)
 715 {
 716   return emit_insn (gen_rtx_SET (VOIDmode, x, y));
 717 }
 718
 719 /* X and Y are two things to compare using CODE.  Emit the compare insn and
 720    return the rtx for register 0 in the proper mode.  */
 721 rtx
 722 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
 723 {
 724   machine_mode mode = SELECT_CC_MODE (code, x, y);
 725   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
 726
 727   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
 728   return cc_reg;
 729 }
 730
 731 /* Build the SYMBOL_REF for __tls_get_addr.  */
 732
 733 static GTY(()) rtx tls_get_addr_libfunc;
 734
 735 rtx
 736 aarch64_tls_get_addr (void)
 737 {
 738   if (!tls_get_addr_libfunc)
 739     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
 740   return tls_get_addr_libfunc;
 741 }
 742
 743 /* Return the TLS model to use for ADDR.  */
 744
 745 static enum tls_model
 746 tls_symbolic_operand_type (rtx addr)
 747 {
 748   enum tls_model tls_kind = TLS_MODEL_NONE;
 749   rtx sym, addend;
 750
 751   if (GET_CODE (addr) == CONST)
 752     {
 753       split_const (addr, &sym, &addend);
 754       if (GET_CODE (sym) == SYMBOL_REF)
 755         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
 756     }
 757   else if (GET_CODE (addr) == SYMBOL_REF)
 758     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
 759
 760   return tls_kind;
 761 }
 762
 763 /* We'll allow lo_sum's in addresses in our legitimate addresses
 764    so that combine would take care of combining addresses where
 765    necessary, but for generation purposes, we'll generate the address
 766    as :
 767    RTL                               Absolute
 768    tmp = hi (symbol_ref);            adrp  x1, foo
 769    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
 770                                      nop
 771
 772    PIC                               TLS
 773    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
 774    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
 775                                      bl   __tls_get_addr
 776                                      nop
 777
 778    Load TLS symbol, depending on TLS mechanism and TLS access model.
 779
 780    Global Dynamic - Traditional TLS:
 781    adrp tmp, :tlsgd:imm
 782    add  dest, tmp, #:tlsgd_lo12:imm
 783    bl   __tls_get_addr
 784
 785    Global Dynamic - TLS Descriptors:
 786    adrp dest, :tlsdesc:imm
 787    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
 788    add  dest, dest, #:tlsdesc_lo12:imm
 789    blr  tmp
 790    mrs  tp, tpidr_el0
 791    add  dest, dest, tp
 792
 793    Initial Exec:
 794    mrs  tp, tpidr_el0
 795    adrp tmp, :gottprel:imm
 796    ldr  dest, [tmp, #:gottprel_lo12:imm]
 797    add  dest, dest, tp
 798
 799    Local Exec:
 800    mrs  tp, tpidr_el0
 801    add  t0, tp, #:tprel_hi12:imm, lsl #12
 802    add  t0, t0, #:tprel_lo12_nc:imm
 803 */
 804
 805 static void
 806 aarch64_load_symref_appropriately (rtx dest, rtx imm,
 807                                    enum aarch64_symbol_type type)
 808 {
 809   switch (type)
 810     {
 811     case SYMBOL_SMALL_ABSOLUTE:
 812       {
 813         /* In ILP32, the mode of dest can be either SImode or DImode.  */
 814         rtx tmp_reg = dest;
 815         machine_mode mode = GET_MODE (dest);
 816
 817         gcc_assert (mode == Pmode || mode == ptr_mode);
 818
 819         if (can_create_pseudo_p ())
 820           tmp_reg = gen_reg_rtx (mode);
 821
 822         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
 823         emit_insn (gen_add_losym (dest, tmp_reg, imm));
 824         return;
 825       }
 826
 827     case SYMBOL_TINY_ABSOLUTE:
 828       emit_insn (gen_rtx_SET (Pmode, dest, imm));
 829       return;
 830
 831     case SYMBOL_SMALL_GOT:
 832       {
 833         /* In ILP32, the mode of dest can be either SImode or DImode,
 834            while the got entry is always of SImode size.  The mode of
 835            dest depends on how dest is used: if dest is assigned to a
 836            pointer (e.g. in the memory), it has SImode; it may have
 837            DImode if dest is dereferenced to access the memeory.
 838            This is why we have to handle three different ldr_got_small
 839            patterns here (two patterns for ILP32).  */
 840         rtx tmp_reg = dest;
 841         machine_mode mode = GET_MODE (dest);
 842
 843         if (can_create_pseudo_p ())
 844           tmp_reg = gen_reg_rtx (mode);
 845
 846         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
 847         if (mode == ptr_mode)
 848           {
 849             if (mode == DImode)
 850               emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
 851             else
 852               emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
 853           }
 854         else
 855           {
 856             gcc_assert (mode == Pmode);
 857             emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
 858           }
 859
 860         return;
 861       }
 862
 863     case SYMBOL_SMALL_TLSGD:
 864       {
 865         rtx_insn *insns;
 866         rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
 867
 868         start_sequence ();
 869         aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
 870         insns = get_insns ();
 871         end_sequence ();
 872
 873         RTL_CONST_CALL_P (insns) = 1;
 874         emit_libcall_block (insns, dest, result, imm);
 875         return;
 876       }
 877
 878     case SYMBOL_SMALL_TLSDESC:
 879       {
 880         machine_mode mode = GET_MODE (dest);
 881         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
 882         rtx tp;
 883
 884         gcc_assert (mode == Pmode || mode == ptr_mode);
 885
 886         /* In ILP32, the got entry is always of SImode size.  Unlike
 887            small GOT, the dest is fixed at reg 0.  */
 888         if (TARGET_ILP32)
 889           emit_insn (gen_tlsdesc_small_si (imm));
 890         else
 891           emit_insn (gen_tlsdesc_small_di (imm));
 892         tp = aarch64_load_tp (NULL);
 893
 894         if (mode != Pmode)
 895           tp = gen_lowpart (mode, tp);
 896
 897         emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
 898         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
 899         return;
 900       }
 901
 902     case SYMBOL_SMALL_GOTTPREL:
 903       {
 904         /* In ILP32, the mode of dest can be either SImode or DImode,
 905            while the got entry is always of SImode size.  The mode of
 906            dest depends on how dest is used: if dest is assigned to a
 907            pointer (e.g. in the memory), it has SImode; it may have
 908            DImode if dest is dereferenced to access the memeory.
 909            This is why we have to handle three different tlsie_small
 910            patterns here (two patterns for ILP32).  */
 911         machine_mode mode = GET_MODE (dest);
 912         rtx tmp_reg = gen_reg_rtx (mode);
 913         rtx tp = aarch64_load_tp (NULL);
 914
 915         if (mode == ptr_mode)
 916           {
 917             if (mode == DImode)
 918               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
 919             else
 920               {
 921                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
 922                 tp = gen_lowpart (mode, tp);
 923               }
 924           }
 925         else
 926           {
 927             gcc_assert (mode == Pmode);
 928             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
 929           }
 930
 931         emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
 932         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
 933         return;
 934       }
 935
 936     case SYMBOL_SMALL_TPREL:
 937       {
 938         rtx tp = aarch64_load_tp (NULL);
 939
 940         if (GET_MODE (dest) != Pmode)
 941           tp = gen_lowpart (GET_MODE (dest), tp);
 942
 943         emit_insn (gen_tlsle_small (dest, tp, imm));
 944         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
 945         return;
 946       }
 947
 948     case SYMBOL_TINY_GOT:
 949       emit_insn (gen_ldr_got_tiny (dest, imm));
 950       return;
 951
 952     default:
 953       gcc_unreachable ();
 954     }
 955 }
 956
 957 /* Emit a move from SRC to DEST.  Assume that the move expanders can
 958    handle all moves if !can_create_pseudo_p ().  The distinction is
 959    important because, unlike emit_move_insn, the move expanders know
 960    how to force Pmode objects into the constant pool even when the
 961    constant pool address is not itself legitimate.  */
 962 static rtx
 963 aarch64_emit_move (rtx dest, rtx src)
 964 {
 965   return (can_create_pseudo_p ()
 966           ? emit_move_insn (dest, src)
 967           : emit_move_insn_1 (dest, src));
 968 }
 969
 970 /* Split a 128-bit move operation into two 64-bit move operations,
 971    taking care to handle partial overlap of register to register
 972    copies.  Special cases are needed when moving between GP regs and
 973    FP regs.  SRC can be a register, constant or memory; DST a register
 974    or memory.  If either operand is memory it must not have any side
 975    effects.  */
 976 void
 977 aarch64_split_128bit_move (rtx dst, rtx src)
 978 {
 979   rtx dst_lo, dst_hi;
 980   rtx src_lo, src_hi;
 981
 982   machine_mode mode = GET_MODE (dst);
 983
 984   gcc_assert (mode == TImode || mode == TFmode);
 985   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
 986   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
 987
 988   if (REG_P (dst) && REG_P (src))
 989     {
 990       int src_regno = REGNO (src);
 991       int dst_regno = REGNO (dst);
 992
 993       /* Handle FP <-> GP regs.  */
 994       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
 995         {
 996           src_lo = gen_lowpart (word_mode, src);
 997           src_hi = gen_highpart (word_mode, src);
 998
 999           if (mode == TImode)
1000             {
1001               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1002               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1003             }
1004           else
1005             {
1006               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1007               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1008             }
1009           return;
1010         }
1011       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1012         {
1013           dst_lo = gen_lowpart (word_mode, dst);
1014           dst_hi = gen_highpart (word_mode, dst);
1015
1016           if (mode == TImode)
1017             {
1018               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1019               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1020             }
1021           else
1022             {
1023               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1024               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1025             }
1026           return;
1027         }
1028     }
1029
1030   dst_lo = gen_lowpart (word_mode, dst);
1031   dst_hi = gen_highpart (word_mode, dst);
1032   src_lo = gen_lowpart (word_mode, src);
1033   src_hi = gen_highpart_mode (word_mode, mode, src);
1034
1035   /* At most one pairing may overlap.  */
1036   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1037     {
1038       aarch64_emit_move (dst_hi, src_hi);
1039       aarch64_emit_move (dst_lo, src_lo);
1040     }
1041   else
1042     {
1043       aarch64_emit_move (dst_lo, src_lo);
1044       aarch64_emit_move (dst_hi, src_hi);
1045     }
1046 }
1047
1048 bool
1049 aarch64_split_128bit_move_p (rtx dst, rtx src)
1050 {
1051   return (! REG_P (src)
1052           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1053 }
1054
1055 /* Split a complex SIMD combine.  */
1056
1057 void
1058 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1059 {
1060   machine_mode src_mode = GET_MODE (src1);
1061   machine_mode dst_mode = GET_MODE (dst);
1062
1063   gcc_assert (VECTOR_MODE_P (dst_mode));
1064
1065   if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1066     {
1067       rtx (*gen) (rtx, rtx, rtx);
1068
1069       switch (src_mode)
1070         {
1071         case V8QImode:
1072           gen = gen_aarch64_simd_combinev8qi;
1073           break;
1074         case V4HImode:
1075           gen = gen_aarch64_simd_combinev4hi;
1076           break;
1077         case V2SImode:
1078           gen = gen_aarch64_simd_combinev2si;
1079           break;
1080         case V2SFmode:
1081           gen = gen_aarch64_simd_combinev2sf;
1082           break;
1083         case DImode:
1084           gen = gen_aarch64_simd_combinedi;
1085           break;
1086         case DFmode:
1087           gen = gen_aarch64_simd_combinedf;
1088           break;
1089         default:
1090           gcc_unreachable ();
1091         }
1092
1093       emit_insn (gen (dst, src1, src2));
1094       return;
1095     }
1096 }
1097
1098 /* Split a complex SIMD move.  */
1099
1100 void
1101 aarch64_split_simd_move (rtx dst, rtx src)
1102 {
1103   machine_mode src_mode = GET_MODE (src);
1104   machine_mode dst_mode = GET_MODE (dst);
1105
1106   gcc_assert (VECTOR_MODE_P (dst_mode));
1107
1108   if (REG_P (dst) && REG_P (src))
1109     {
1110       rtx (*gen) (rtx, rtx);
1111
1112       gcc_assert (VECTOR_MODE_P (src_mode));
1113
1114       switch (src_mode)
1115         {
1116         case V16QImode:
1117           gen = gen_aarch64_split_simd_movv16qi;
1118           break;
1119         case V8HImode:
1120           gen = gen_aarch64_split_simd_movv8hi;
1121           break;
1122         case V4SImode:
1123           gen = gen_aarch64_split_simd_movv4si;
1124           break;
1125         case V2DImode:
1126           gen = gen_aarch64_split_simd_movv2di;
1127           break;
1128         case V4SFmode:
1129           gen = gen_aarch64_split_simd_movv4sf;
1130           break;
1131         case V2DFmode:
1132           gen = gen_aarch64_split_simd_movv2df;
1133           break;
1134         default:
1135           gcc_unreachable ();
1136         }
1137
1138       emit_insn (gen (dst, src));
1139       return;
1140     }
1141 }
1142
1143 static rtx
1144 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1145 {
1146   if (can_create_pseudo_p ())
1147     return force_reg (mode, value);
1148   else
1149     {
1150       x = aarch64_emit_move (x, value);
1151       return x;
1152     }
1153 }
1154
1155
1156 static rtx
1157 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1158 {
1159   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1160     {
1161       rtx high;
1162       /* Load the full offset into a register.  This
1163          might be improvable in the future.  */
1164       high = GEN_INT (offset);
1165       offset = 0;
1166       high = aarch64_force_temporary (mode, temp, high);
1167       reg = aarch64_force_temporary (mode, temp,
1168                                      gen_rtx_PLUS (mode, high, reg));
1169     }
1170   return plus_constant (mode, reg, offset);
1171 }
1172
1173 static int
1174 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1175                                 machine_mode mode)
1176 {
1177   unsigned HOST_WIDE_INT mask;
1178   int i;
1179   bool first;
1180   unsigned HOST_WIDE_INT val;
1181   bool subtargets;
1182   rtx subtarget;
1183   int one_match, zero_match, first_not_ffff_match;
1184   int num_insns = 0;
1185
1186   if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1187     {
1188       if (generate)
1189         emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1190       num_insns++;
1191       return num_insns;
1192     }
1193
1194   if (mode == SImode)
1195     {
1196       /* We know we can't do this in 1 insn, and we must be able to do it
1197          in two; so don't mess around looking for sequences that don't buy
1198          us anything.  */
1199       if (generate)
1200         {
1201           emit_insn (gen_rtx_SET (VOIDmode, dest,
1202                                   GEN_INT (INTVAL (imm) & 0xffff)));
1203           emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1204                                      GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1205         }
1206       num_insns += 2;
1207       return num_insns;
1208     }
1209
1210   /* Remaining cases are all for DImode.  */
1211
1212   val = INTVAL (imm);
1213   subtargets = optimize && can_create_pseudo_p ();
1214
1215   one_match = 0;
1216   zero_match = 0;
1217   mask = 0xffff;
1218   first_not_ffff_match = -1;
1219
1220   for (i = 0; i < 64; i += 16, mask <<= 16)
1221     {
1222       if ((val & mask) == mask)
1223         one_match++;
1224       else
1225         {
1226           if (first_not_ffff_match < 0)
1227             first_not_ffff_match = i;
1228           if ((val & mask) == 0)
1229             zero_match++;
1230         }
1231     }
1232
1233   if (one_match == 2)
1234     {
1235       /* Set one of the quarters and then insert back into result.  */
1236       mask = 0xffffll << first_not_ffff_match;
1237       if (generate)
1238         {
1239           emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1240           emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1241                                      GEN_INT ((val >> first_not_ffff_match)
1242                                               & 0xffff)));
1243         }
1244       num_insns += 2;
1245       return num_insns;
1246     }
1247
1248   if (zero_match == 2)
1249     goto simple_sequence;
1250
1251   mask = 0x0ffff0000UL;
1252   for (i = 16; i < 64; i += 16, mask <<= 16)
1253     {
1254       HOST_WIDE_INT comp = mask & ~(mask - 1);
1255
1256       if (aarch64_uimm12_shift (val - (val & mask)))
1257         {
1258           if (generate)
1259             {
1260               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1261               emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1262                                       GEN_INT (val & mask)));
1263               emit_insn (gen_adddi3 (dest, subtarget,
1264                                      GEN_INT (val - (val & mask))));
1265             }
1266           num_insns += 2;
1267           return num_insns;
1268         }
1269       else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1270         {
1271           if (generate)
1272             {
1273               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1274               emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1275                                       GEN_INT ((val + comp) & mask)));
1276               emit_insn (gen_adddi3 (dest, subtarget,
1277                                      GEN_INT (val - ((val + comp) & mask))));
1278             }
1279           num_insns += 2;
1280           return num_insns;
1281         }
1282       else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1283         {
1284           if (generate)
1285             {
1286               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1287               emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1288                                       GEN_INT ((val - comp) | ~mask)));
1289               emit_insn (gen_adddi3 (dest, subtarget,
1290                                      GEN_INT (val - ((val - comp) | ~mask))));
1291             }
1292           num_insns += 2;
1293           return num_insns;
1294         }
1295       else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1296         {
1297           if (generate)
1298             {
1299               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1300               emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1301                                       GEN_INT (val | ~mask)));
1302               emit_insn (gen_adddi3 (dest, subtarget,
1303                                      GEN_INT (val - (val | ~mask))));
1304             }
1305           num_insns += 2;
1306           return num_insns;
1307         }
1308     }
1309
1310   /* See if we can do it by arithmetically combining two
1311      immediates.  */
1312   for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1313     {
1314       int j;
1315       mask = 0xffff;
1316
1317       if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1318           || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1319         {
1320           if (generate)
1321             {
1322               subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1323               emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1324                                       GEN_INT (aarch64_bitmasks[i])));
1325               emit_insn (gen_adddi3 (dest, subtarget,
1326                                      GEN_INT (val - aarch64_bitmasks[i])));
1327             }
1328           num_insns += 2;
1329           return num_insns;
1330         }
1331
1332       for (j = 0; j < 64; j += 16, mask <<= 16)
1333         {
1334           if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1335             {
1336               if (generate)
1337                 {
1338                   emit_insn (gen_rtx_SET (VOIDmode, dest,
1339                                           GEN_INT (aarch64_bitmasks[i])));
1340                   emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1341                                              GEN_INT ((val >> j) & 0xffff)));
1342                 }
1343               num_insns += 2;
1344               return num_insns;
1345             }
1346         }
1347     }
1348
1349   /* See if we can do it by logically combining two immediates.  */
1350   for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1351     {
1352       if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1353         {
1354           int j;
1355
1356           for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1357             if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1358               {
1359                 if (generate)
1360                   {
1361                     subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1362                     emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1363                                             GEN_INT (aarch64_bitmasks[i])));
1364                     emit_insn (gen_iordi3 (dest, subtarget,
1365                                            GEN_INT (aarch64_bitmasks[j])));
1366                   }
1367                 num_insns += 2;
1368                 return num_insns;
1369               }
1370         }
1371       else if ((val & aarch64_bitmasks[i]) == val)
1372         {
1373           int j;
1374
1375           for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1376             if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1377               {
1378                 if (generate)
1379                   {
1380                     subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1381                     emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1382                                             GEN_INT (aarch64_bitmasks[j])));
1383                     emit_insn (gen_anddi3 (dest, subtarget,
1384                                            GEN_INT (aarch64_bitmasks[i])));
1385                   }
1386                 num_insns += 2;
1387                 return num_insns;
1388               }
1389         }
1390     }
1391
1392   if (one_match > zero_match)
1393     {
1394       /* Set either first three quarters or all but the third.   */
1395       mask = 0xffffll << (16 - first_not_ffff_match);
1396       if (generate)
1397         emit_insn (gen_rtx_SET (VOIDmode, dest,
1398                                 GEN_INT (val | mask | 0xffffffff00000000ull)));
1399       num_insns ++;
1400
1401       /* Now insert other two quarters.  */
1402       for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1403            i < 64; i += 16, mask <<= 16)
1404         {
1405           if ((val & mask) != mask)
1406             {
1407               if (generate)
1408                 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1409                                            GEN_INT ((val >> i) & 0xffff)));
1410               num_insns ++;
1411             }
1412         }
1413       return num_insns;
1414     }
1415
1416  simple_sequence:
1417   first = true;
1418   mask = 0xffff;
1419   for (i = 0; i < 64; i += 16, mask <<= 16)
1420     {
1421       if ((val & mask) != 0)
1422         {
1423           if (first)
1424             {
1425               if (generate)
1426                 emit_insn (gen_rtx_SET (VOIDmode, dest,
1427                                         GEN_INT (val & mask)));
1428               num_insns ++;
1429               first = false;
1430             }
1431           else
1432             {
1433               if (generate)
1434                 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1435                                            GEN_INT ((val >> i) & 0xffff)));
1436               num_insns ++;
1437             }
1438         }
1439     }
1440
1441   return num_insns;
1442 }
1443
1444
1445 void
1446 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1447 {
1448   machine_mode mode = GET_MODE (dest);
1449
1450   gcc_assert (mode == SImode || mode == DImode);
1451
1452   /* Check on what type of symbol it is.  */
1453   if (GET_CODE (imm) == SYMBOL_REF
1454       || GET_CODE (imm) == LABEL_REF
1455       || GET_CODE (imm) == CONST)
1456     {
1457       rtx mem, base, offset;
1458       enum aarch64_symbol_type sty;
1459
1460       /* If we have (const (plus symbol offset)), separate out the offset
1461          before we start classifying the symbol.  */
1462       split_const (imm, &base, &offset);
1463
1464       sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1465       switch (sty)
1466         {
1467         case SYMBOL_FORCE_TO_MEM:
1468           if (offset != const0_rtx
1469               && targetm.cannot_force_const_mem (mode, imm))
1470             {
1471               gcc_assert (can_create_pseudo_p ());
1472               base = aarch64_force_temporary (mode, dest, base);
1473               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1474               aarch64_emit_move (dest, base);
1475               return;
1476             }
1477           mem = force_const_mem (ptr_mode, imm);
1478           gcc_assert (mem);
1479           if (mode != ptr_mode)
1480             mem = gen_rtx_ZERO_EXTEND (mode, mem);
1481           emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1482           return;
1483
1484         case SYMBOL_SMALL_TLSGD:
1485         case SYMBOL_SMALL_TLSDESC:
1486         case SYMBOL_SMALL_GOTTPREL:
1487         case SYMBOL_SMALL_GOT:
1488         case SYMBOL_TINY_GOT:
1489           if (offset != const0_rtx)
1490             {
1491               gcc_assert(can_create_pseudo_p ());
1492               base = aarch64_force_temporary (mode, dest, base);
1493               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1494               aarch64_emit_move (dest, base);
1495               return;
1496             }
1497           /* FALLTHRU */
1498
1499         case SYMBOL_SMALL_TPREL:
1500         case SYMBOL_SMALL_ABSOLUTE:
1501         case SYMBOL_TINY_ABSOLUTE:
1502           aarch64_load_symref_appropriately (dest, imm, sty);
1503           return;
1504
1505         default:
1506           gcc_unreachable ();
1507         }
1508     }
1509
1510   if (!CONST_INT_P (imm))
1511     {
1512       if (GET_CODE (imm) == HIGH)
1513         emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1514       else
1515         {
1516           rtx mem = force_const_mem (mode, imm);
1517           gcc_assert (mem);
1518           emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1519         }
1520
1521       return;
1522     }
1523
1524   aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1525 }
1526
1527 static bool
1528 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1529                                  tree exp ATTRIBUTE_UNUSED)
1530 {
1531   /* Currently, always true.  */
1532   return true;
1533 }
1534
1535 /* Implement TARGET_PASS_BY_REFERENCE.  */
1536
1537 static bool
1538 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1539                            machine_mode mode,
1540                            const_tree type,
1541                            bool named ATTRIBUTE_UNUSED)
1542 {
1543   HOST_WIDE_INT size;
1544   machine_mode dummymode;
1545   int nregs;
1546
1547   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
1548   size = (mode == BLKmode && type)
1549     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1550
1551   /* Aggregates are passed by reference based on their size.  */
1552   if (type && AGGREGATE_TYPE_P (type))
1553     {
1554       size = int_size_in_bytes (type);
1555     }
1556
1557   /* Variable sized arguments are always returned by reference.  */
1558   if (size < 0)
1559     return true;
1560
1561   /* Can this be a candidate to be passed in fp/simd register(s)?  */
1562   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1563                                                &dummymode, &nregs,
1564                                                NULL))
1565     return false;
1566
1567   /* Arguments which are variable sized or larger than 2 registers are
1568      passed by reference unless they are a homogenous floating point
1569      aggregate.  */
1570   return size > 2 * UNITS_PER_WORD;
1571 }
1572
1573 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
1574 static bool
1575 aarch64_return_in_msb (const_tree valtype)
1576 {
1577   machine_mode dummy_mode;
1578   int dummy_int;
1579
1580   /* Never happens in little-endian mode.  */
1581   if (!BYTES_BIG_ENDIAN)
1582     return false;
1583
1584   /* Only composite types smaller than or equal to 16 bytes can
1585      be potentially returned in registers.  */
1586   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1587       || int_size_in_bytes (valtype) <= 0
1588       || int_size_in_bytes (valtype) > 16)
1589     return false;
1590
1591   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1592      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1593      is always passed/returned in the least significant bits of fp/simd
1594      register(s).  */
1595   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1596                                                &dummy_mode, &dummy_int, NULL))
1597     return false;
1598
1599   return true;
1600 }
1601
1602 /* Implement TARGET_FUNCTION_VALUE.
1603    Define how to find the value returned by a function.  */
1604
1605 static rtx
1606 aarch64_function_value (const_tree type, const_tree func,
1607                         bool outgoing ATTRIBUTE_UNUSED)
1608 {
1609   machine_mode mode;
1610   int unsignedp;
1611   int count;
1612   machine_mode ag_mode;
1613
1614   mode = TYPE_MODE (type);
1615   if (INTEGRAL_TYPE_P (type))
1616     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1617
1618   if (aarch64_return_in_msb (type))
1619     {
1620       HOST_WIDE_INT size = int_size_in_bytes (type);
1621
1622       if (size % UNITS_PER_WORD != 0)
1623         {
1624           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1625           mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1626         }
1627     }
1628
1629   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1630                                                &ag_mode, &count, NULL))
1631     {
1632       if (!aarch64_composite_type_p (type, mode))
1633         {
1634           gcc_assert (count == 1 && mode == ag_mode);
1635           return gen_rtx_REG (mode, V0_REGNUM);
1636         }
1637       else
1638         {
1639           int i;
1640           rtx par;
1641
1642           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1643           for (i = 0; i < count; i++)
1644             {
1645               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1646               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1647                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1648               XVECEXP (par, 0, i) = tmp;
1649             }
1650           return par;
1651         }
1652     }
1653   else
1654     return gen_rtx_REG (mode, R0_REGNUM);
1655 }
1656
1657 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1658    Return true if REGNO is the number of a hard register in which the values
1659    of called function may come back.  */
1660
1661 static bool
1662 aarch64_function_value_regno_p (const unsigned int regno)
1663 {
1664   /* Maximum of 16 bytes can be returned in the general registers.  Examples
1665      of 16-byte return values are: 128-bit integers and 16-byte small
1666      structures (excluding homogeneous floating-point aggregates).  */
1667   if (regno == R0_REGNUM || regno == R1_REGNUM)
1668     return true;
1669
1670   /* Up to four fp/simd registers can return a function value, e.g. a
1671      homogeneous floating-point aggregate having four members.  */
1672   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1673     return !TARGET_GENERAL_REGS_ONLY;
1674
1675   return false;
1676 }
1677
1678 /* Implement TARGET_RETURN_IN_MEMORY.
1679
1680    If the type T of the result of a function is such that
1681      void func (T arg)
1682    would require that arg be passed as a value in a register (or set of
1683    registers) according to the parameter passing rules, then the result
1684    is returned in the same registers as would be used for such an
1685    argument.  */
1686
1687 static bool
1688 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1689 {
1690   HOST_WIDE_INT size;
1691   machine_mode ag_mode;
1692   int count;
1693
1694   if (!AGGREGATE_TYPE_P (type)
1695       && TREE_CODE (type) != COMPLEX_TYPE
1696       && TREE_CODE (type) != VECTOR_TYPE)
1697     /* Simple scalar types always returned in registers.  */
1698     return false;
1699
1700   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1701                                                type,
1702                                                &ag_mode,
1703                                                &count,
1704                                                NULL))
1705     return false;
1706
1707   /* Types larger than 2 registers returned in memory.  */
1708   size = int_size_in_bytes (type);
1709   return (size < 0 || size > 2 * UNITS_PER_WORD);
1710 }
1711
1712 static bool
1713 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1714                                const_tree type, int *nregs)
1715 {
1716   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1717   return aarch64_vfp_is_call_or_return_candidate (mode,
1718                                                   type,
1719                                                   &pcum->aapcs_vfp_rmode,
1720                                                   nregs,
1721                                                   NULL);
1722 }
1723
1724 /* Given MODE and TYPE of a function argument, return the alignment in
1725    bits.  The idea is to suppress any stronger alignment requested by
1726    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1727    This is a helper function for local use only.  */
1728
1729 static unsigned int
1730 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1731 {
1732   unsigned int alignment;
1733
1734   if (type)
1735     {
1736       if (!integer_zerop (TYPE_SIZE (type)))
1737         {
1738           if (TYPE_MODE (type) == mode)
1739             alignment = TYPE_ALIGN (type);
1740           else
1741             alignment = GET_MODE_ALIGNMENT (mode);
1742         }
1743       else
1744         alignment = 0;
1745     }
1746   else
1747     alignment = GET_MODE_ALIGNMENT (mode);
1748
1749   return alignment;
1750 }
1751
1752 /* Layout a function argument according to the AAPCS64 rules.  The rule
1753    numbers refer to the rule numbers in the AAPCS64.  */
1754
1755 static void
1756 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1757                     const_tree type,
1758                     bool named ATTRIBUTE_UNUSED)
1759 {
1760   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1761   int ncrn, nvrn, nregs;
1762   bool allocate_ncrn, allocate_nvrn;
1763   HOST_WIDE_INT size;
1764
1765   /* We need to do this once per argument.  */
1766   if (pcum->aapcs_arg_processed)
1767     return;
1768
1769   pcum->aapcs_arg_processed = true;
1770
1771   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
1772   size
1773     = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1774                         UNITS_PER_WORD);
1775
1776   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1777   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1778                                                  mode,
1779                                                  type,
1780                                                  &nregs);
1781
1782   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1783      The following code thus handles passing by SIMD/FP registers first.  */
1784
1785   nvrn = pcum->aapcs_nvrn;
1786
1787   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
1788      and homogenous short-vector aggregates (HVA).  */
1789   if (allocate_nvrn)
1790     {
1791       if (nvrn + nregs <= NUM_FP_ARG_REGS)
1792         {
1793           pcum->aapcs_nextnvrn = nvrn + nregs;
1794           if (!aarch64_composite_type_p (type, mode))
1795             {
1796               gcc_assert (nregs == 1);
1797               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1798             }
1799           else
1800             {
1801               rtx par;
1802               int i;
1803               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1804               for (i = 0; i < nregs; i++)
1805                 {
1806                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1807                                          V0_REGNUM + nvrn + i);
1808                   tmp = gen_rtx_EXPR_LIST
1809                     (VOIDmode, tmp,
1810                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1811                   XVECEXP (par, 0, i) = tmp;
1812                 }
1813               pcum->aapcs_reg = par;
1814             }
1815           return;
1816         }
1817       else
1818         {
1819           /* C.3 NSRN is set to 8.  */
1820           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1821           goto on_stack;
1822         }
1823     }
1824
1825   ncrn = pcum->aapcs_ncrn;
1826   nregs = size / UNITS_PER_WORD;
1827
1828   /* C6 - C9.  though the sign and zero extension semantics are
1829      handled elsewhere.  This is the case where the argument fits
1830      entirely general registers.  */
1831   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1832     {
1833       unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1834
1835       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1836
1837       /* C.8 if the argument has an alignment of 16 then the NGRN is
1838          rounded up to the next even number.  */
1839       if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1840         {
1841           ++ncrn;
1842           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1843         }
1844       /* NREGS can be 0 when e.g. an empty structure is to be passed.
1845          A reg is still generated for it, but the caller should be smart
1846          enough not to use it.  */
1847       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1848         {
1849           pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1850         }
1851       else
1852         {
1853           rtx par;
1854           int i;
1855
1856           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1857           for (i = 0; i < nregs; i++)
1858             {
1859               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1860               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1861                                        GEN_INT (i * UNITS_PER_WORD));
1862               XVECEXP (par, 0, i) = tmp;
1863             }
1864           pcum->aapcs_reg = par;
1865         }
1866
1867       pcum->aapcs_nextncrn = ncrn + nregs;
1868       return;
1869     }
1870
1871   /* C.11  */
1872   pcum->aapcs_nextncrn = NUM_ARG_REGS;
1873
1874   /* The argument is passed on stack; record the needed number of words for
1875      this argument and align the total size if necessary.  */
1876 on_stack:
1877   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1878   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1879     pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1880                                                16 / UNITS_PER_WORD);
1881   return;
1882 }
1883
1884 /* Implement TARGET_FUNCTION_ARG.  */
1885
1886 static rtx
1887 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1888                       const_tree type, bool named)
1889 {
1890   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1891   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1892
1893   if (mode == VOIDmode)
1894     return NULL_RTX;
1895
1896   aarch64_layout_arg (pcum_v, mode, type, named);
1897   return pcum->aapcs_reg;
1898 }
1899
1900 void
1901 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1902                            const_tree fntype ATTRIBUTE_UNUSED,
1903                            rtx libname ATTRIBUTE_UNUSED,
1904                            const_tree fndecl ATTRIBUTE_UNUSED,
1905                            unsigned n_named ATTRIBUTE_UNUSED)
1906 {
1907   pcum->aapcs_ncrn = 0;
1908   pcum->aapcs_nvrn = 0;
1909   pcum->aapcs_nextncrn = 0;
1910   pcum->aapcs_nextnvrn = 0;
1911   pcum->pcs_variant = ARM_PCS_AAPCS64;
1912   pcum->aapcs_reg = NULL_RTX;
1913   pcum->aapcs_arg_processed = false;
1914   pcum->aapcs_stack_words = 0;
1915   pcum->aapcs_stack_size = 0;
1916
1917   return;
1918 }
1919
1920 static void
1921 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1922                               machine_mode mode,
1923                               const_tree type,
1924                               bool named)
1925 {
1926   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1927   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1928     {
1929       aarch64_layout_arg (pcum_v, mode, type, named);
1930       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1931                   != (pcum->aapcs_stack_words != 0));
1932       pcum->aapcs_arg_processed = false;
1933       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1934       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1935       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1936       pcum->aapcs_stack_words = 0;
1937       pcum->aapcs_reg = NULL_RTX;
1938     }
1939 }
1940
1941 bool
1942 aarch64_function_arg_regno_p (unsigned regno)
1943 {
1944   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1945           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1946 }
1947
1948 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
1949    PARM_BOUNDARY bits of alignment, but will be given anything up
1950    to STACK_BOUNDARY bits if the type requires it.  This makes sure
1951    that both before and after the layout of each argument, the Next
1952    Stacked Argument Address (NSAA) will have a minimum alignment of
1953    8 bytes.  */
1954
1955 static unsigned int
1956 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1957 {
1958   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1959
1960   if (alignment < PARM_BOUNDARY)
1961     alignment = PARM_BOUNDARY;
1962   if (alignment > STACK_BOUNDARY)
1963     alignment = STACK_BOUNDARY;
1964   return alignment;
1965 }
1966
1967 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1968
1969    Return true if an argument passed on the stack should be padded upwards,
1970    i.e. if the least-significant byte of the stack slot has useful data.
1971
1972    Small aggregate types are placed in the lowest memory address.
1973
1974    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
1975
1976 bool
1977 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1978 {
1979   /* On little-endian targets, the least significant byte of every stack
1980      argument is passed at the lowest byte address of the stack slot.  */
1981   if (!BYTES_BIG_ENDIAN)
1982     return true;
1983
1984   /* Otherwise, integral, floating-point and pointer types are padded downward:
1985      the least significant byte of a stack argument is passed at the highest
1986      byte address of the stack slot.  */
1987   if (type
1988       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1989          || POINTER_TYPE_P (type))
1990       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1991     return false;
1992
1993   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
1994   return true;
1995 }
1996
1997 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1998
1999    It specifies padding for the last (may also be the only)
2000    element of a block move between registers and memory.  If
2001    assuming the block is in the memory, padding upward means that
2002    the last element is padded after its highest significant byte,
2003    while in downward padding, the last element is padded at the
2004    its least significant byte side.
2005
2006    Small aggregates and small complex types are always padded
2007    upwards.
2008
2009    We don't need to worry about homogeneous floating-point or
2010    short-vector aggregates; their move is not affected by the
2011    padding direction determined here.  Regardless of endianness,
2012    each element of such an aggregate is put in the least
2013    significant bits of a fp/simd register.
2014
2015    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2016    register has useful data, and return the opposite if the most
2017    significant byte does.  */
2018
2019 bool
2020 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2021                      bool first ATTRIBUTE_UNUSED)
2022 {
2023
2024   /* Small composite types are always padded upward.  */
2025   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2026     {
2027       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2028                             : GET_MODE_SIZE (mode));
2029       if (size < 2 * UNITS_PER_WORD)
2030         return true;
2031     }
2032
2033   /* Otherwise, use the default padding.  */
2034   return !BYTES_BIG_ENDIAN;
2035 }
2036
2037 static machine_mode
2038 aarch64_libgcc_cmp_return_mode (void)
2039 {
2040   return SImode;
2041 }
2042
2043 static bool
2044 aarch64_frame_pointer_required (void)
2045 {
2046   /* In aarch64_override_options_after_change
2047      flag_omit_leaf_frame_pointer turns off the frame pointer by
2048      default.  Turn it back on now if we've not got a leaf
2049      function.  */
2050   if (flag_omit_leaf_frame_pointer
2051       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2052     return true;
2053
2054   return false;
2055 }
2056
2057 /* Mark the registers that need to be saved by the callee and calculate
2058    the size of the callee-saved registers area and frame record (both FP
2059    and LR may be omitted).  */
2060 static void
2061 aarch64_layout_frame (void)
2062 {
2063   HOST_WIDE_INT offset = 0;
2064   int regno;
2065
2066   if (reload_completed && cfun->machine->frame.laid_out)
2067     return;
2068
2069 #define SLOT_NOT_REQUIRED (-2)
2070 #define SLOT_REQUIRED     (-1)
2071
2072   cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2073   cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2074
2075   /* First mark all the registers that really need to be saved...  */
2076   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2077     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2078
2079   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2080     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2081
2082   /* ... that includes the eh data registers (if needed)...  */
2083   if (crtl->calls_eh_return)
2084     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2085       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2086         = SLOT_REQUIRED;
2087
2088   /* ... and any callee saved register that dataflow says is live.  */
2089   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2090     if (df_regs_ever_live_p (regno)
2091         && (regno == R30_REGNUM
2092             || !call_used_regs[regno]))
2093       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2094
2095   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2096     if (df_regs_ever_live_p (regno)
2097         && !call_used_regs[regno])
2098       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2099
2100   if (frame_pointer_needed)
2101     {
2102       /* FP and LR are placed in the linkage record.  */
2103       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2104       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2105       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2106       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2107       cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2108       offset += 2 * UNITS_PER_WORD;
2109     }
2110
2111   /* Now assign stack slots for them.  */
2112   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2113     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2114       {
2115         cfun->machine->frame.reg_offset[regno] = offset;
2116         if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2117           cfun->machine->frame.wb_candidate1 = regno;
2118         else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2119           cfun->machine->frame.wb_candidate2 = regno;
2120         offset += UNITS_PER_WORD;
2121       }
2122
2123   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2124     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2125       {
2126         cfun->machine->frame.reg_offset[regno] = offset;
2127         if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2128           cfun->machine->frame.wb_candidate1 = regno;
2129         else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2130                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2131           cfun->machine->frame.wb_candidate2 = regno;
2132         offset += UNITS_PER_WORD;
2133       }
2134
2135   cfun->machine->frame.padding0 =
2136     (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2137   offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2138
2139   cfun->machine->frame.saved_regs_size = offset;
2140
2141   cfun->machine->frame.hard_fp_offset
2142     = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2143                         + get_frame_size ()
2144                         + cfun->machine->frame.saved_regs_size,
2145                         STACK_BOUNDARY / BITS_PER_UNIT);
2146
2147   cfun->machine->frame.frame_size
2148     = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2149                         + crtl->outgoing_args_size,
2150                         STACK_BOUNDARY / BITS_PER_UNIT);
2151
2152   cfun->machine->frame.laid_out = true;
2153 }
2154
2155 static bool
2156 aarch64_register_saved_on_entry (int regno)
2157 {
2158   return cfun->machine->frame.reg_offset[regno] >= 0;
2159 }
2160
2161 static unsigned
2162 aarch64_next_callee_save (unsigned regno, unsigned limit)
2163 {
2164   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2165     regno ++;
2166   return regno;
2167 }
2168
2169 static void
2170 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2171                            HOST_WIDE_INT adjustment)
2172  {
2173   rtx base_rtx = stack_pointer_rtx;
2174   rtx insn, reg, mem;
2175
2176   reg = gen_rtx_REG (mode, regno);
2177   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2178                             plus_constant (Pmode, base_rtx, -adjustment));
2179   mem = gen_rtx_MEM (mode, mem);
2180
2181   insn = emit_move_insn (mem, reg);
2182   RTX_FRAME_RELATED_P (insn) = 1;
2183 }
2184
2185 static rtx
2186 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2187                           HOST_WIDE_INT adjustment)
2188 {
2189   switch (mode)
2190     {
2191     case DImode:
2192       return gen_storewb_pairdi_di (base, base, reg, reg2,
2193                                     GEN_INT (-adjustment),
2194                                     GEN_INT (UNITS_PER_WORD - adjustment));
2195     case DFmode:
2196       return gen_storewb_pairdf_di (base, base, reg, reg2,
2197                                     GEN_INT (-adjustment),
2198                                     GEN_INT (UNITS_PER_WORD - adjustment));
2199     default:
2200       gcc_unreachable ();
2201     }
2202 }
2203
2204 static void
2205 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2206                          unsigned regno2, HOST_WIDE_INT adjustment)
2207 {
2208   rtx_insn *insn;
2209   rtx reg1 = gen_rtx_REG (mode, regno1);
2210   rtx reg2 = gen_rtx_REG (mode, regno2);
2211
2212   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2213                                               reg2, adjustment));
2214   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2215   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2216   RTX_FRAME_RELATED_P (insn) = 1;
2217 }
2218
2219 static rtx
2220 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2221                          HOST_WIDE_INT adjustment)
2222 {
2223   switch (mode)
2224     {
2225     case DImode:
2226       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2227                                    GEN_INT (UNITS_PER_WORD));
2228     case DFmode:
2229       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2230                                    GEN_INT (UNITS_PER_WORD));
2231     default:
2232       gcc_unreachable ();
2233     }
2234 }
2235
2236 static rtx
2237 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2238                         rtx reg2)
2239 {
2240   switch (mode)
2241     {
2242     case DImode:
2243       return gen_store_pairdi (mem1, reg1, mem2, reg2);
2244
2245     case DFmode:
2246       return gen_store_pairdf (mem1, reg1, mem2, reg2);
2247
2248     default:
2249       gcc_unreachable ();
2250     }
2251 }
2252
2253 static rtx
2254 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2255                        rtx mem2)
2256 {
2257   switch (mode)
2258     {
2259     case DImode:
2260       return gen_load_pairdi (reg1, mem1, reg2, mem2);
2261
2262     case DFmode:
2263       return gen_load_pairdf (reg1, mem1, reg2, mem2);
2264
2265     default:
2266       gcc_unreachable ();
2267     }
2268 }
2269
2270
2271 static void
2272 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2273                            unsigned start, unsigned limit, bool skip_wb)
2274 {
2275   rtx_insn *insn;
2276   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2277                                                  ? gen_frame_mem : gen_rtx_MEM);
2278   unsigned regno;
2279   unsigned regno2;
2280
2281   for (regno = aarch64_next_callee_save (start, limit);
2282        regno <= limit;
2283        regno = aarch64_next_callee_save (regno + 1, limit))
2284     {
2285       rtx reg, mem;
2286       HOST_WIDE_INT offset;
2287
2288       if (skip_wb
2289           && (regno == cfun->machine->frame.wb_candidate1
2290               || regno == cfun->machine->frame.wb_candidate2))
2291         continue;
2292
2293       reg = gen_rtx_REG (mode, regno);
2294       offset = start_offset + cfun->machine->frame.reg_offset[regno];
2295       mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2296                                               offset));
2297
2298       regno2 = aarch64_next_callee_save (regno + 1, limit);
2299
2300       if (regno2 <= limit
2301           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2302               == cfun->machine->frame.reg_offset[regno2]))
2303
2304         {
2305           rtx reg2 = gen_rtx_REG (mode, regno2);
2306           rtx mem2;
2307
2308           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2309           mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2310                                                    offset));
2311           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2312                                                     reg2));
2313
2314           /* The first part of a frame-related parallel insn is
2315              always assumed to be relevant to the frame
2316              calculations; subsequent parts, are only
2317              frame-related if explicitly marked.  */
2318           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2319           regno = regno2;
2320         }
2321       else
2322         insn = emit_move_insn (mem, reg);
2323
2324       RTX_FRAME_RELATED_P (insn) = 1;
2325     }
2326 }
2327
2328 static void
2329 aarch64_restore_callee_saves (machine_mode mode,
2330                               HOST_WIDE_INT start_offset, unsigned start,
2331                               unsigned limit, bool skip_wb, rtx *cfi_ops)
2332 {
2333   rtx base_rtx = stack_pointer_rtx;
2334   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2335                                                  ? gen_frame_mem : gen_rtx_MEM);
2336   unsigned regno;
2337   unsigned regno2;
2338   HOST_WIDE_INT offset;
2339
2340   for (regno = aarch64_next_callee_save (start, limit);
2341        regno <= limit;
2342        regno = aarch64_next_callee_save (regno + 1, limit))
2343     {
2344       rtx reg, mem;
2345
2346       if (skip_wb
2347           && (regno == cfun->machine->frame.wb_candidate1
2348               || regno == cfun->machine->frame.wb_candidate2))
2349         continue;
2350
2351       reg = gen_rtx_REG (mode, regno);
2352       offset = start_offset + cfun->machine->frame.reg_offset[regno];
2353       mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2354
2355       regno2 = aarch64_next_callee_save (regno + 1, limit);
2356
2357       if (regno2 <= limit
2358           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2359               == cfun->machine->frame.reg_offset[regno2]))
2360         {
2361           rtx reg2 = gen_rtx_REG (mode, regno2);
2362           rtx mem2;
2363
2364           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2365           mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2366           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2367
2368           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2369           regno = regno2;
2370         }
2371       else
2372         emit_move_insn (reg, mem);
2373       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2374     }
2375 }
2376
2377 /* AArch64 stack frames generated by this compiler look like:
2378
2379         +-------------------------------+
2380         |                               |
2381         |  incoming stack arguments     |
2382         |                               |
2383         +-------------------------------+
2384         |                               | <-- incoming stack pointer (aligned)
2385         |  callee-allocated save area   |
2386         |  for register varargs         |
2387         |                               |
2388         +-------------------------------+
2389         |  local variables              | <-- frame_pointer_rtx
2390         |                               |
2391         +-------------------------------+
2392         |  padding0                     | \
2393         +-------------------------------+  |
2394         |  callee-saved registers       |  | frame.saved_regs_size
2395         +-------------------------------+  |
2396         |  LR'                          |  |
2397         +-------------------------------+  |
2398         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
2399         +-------------------------------+
2400         |  dynamic allocation           |
2401         +-------------------------------+
2402         |  padding                      |
2403         +-------------------------------+
2404         |  outgoing stack arguments     | <-- arg_pointer
2405         |                               |
2406         +-------------------------------+
2407         |                               | <-- stack_pointer_rtx (aligned)
2408
2409    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2410    but leave frame_pointer_rtx and hard_frame_pointer_rtx
2411    unchanged.  */
2412
2413 /* Generate the prologue instructions for entry into a function.
2414    Establish the stack frame by decreasing the stack pointer with a
2415    properly calculated size and, if necessary, create a frame record
2416    filled with the values of LR and previous frame pointer.  The
2417    current FP is also set up if it is in use.  */
2418
2419 void
2420 aarch64_expand_prologue (void)
2421 {
2422   /* sub sp, sp, #<frame_size>
2423      stp {fp, lr}, [sp, #<frame_size> - 16]
2424      add fp, sp, #<frame_size> - hardfp_offset
2425      stp {cs_reg}, [fp, #-16] etc.
2426
2427      sub sp, sp, <final_adjustment_if_any>
2428   */
2429   HOST_WIDE_INT frame_size, offset;
2430   HOST_WIDE_INT fp_offset;              /* Offset from hard FP to SP.  */
2431   HOST_WIDE_INT hard_fp_offset;
2432   rtx_insn *insn;
2433
2434   aarch64_layout_frame ();
2435
2436   offset = frame_size = cfun->machine->frame.frame_size;
2437   hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2438   fp_offset = frame_size - hard_fp_offset;
2439
2440   if (flag_stack_usage_info)
2441     current_function_static_stack_size = frame_size;
2442
2443   /* Store pairs and load pairs have a range only -512 to 504.  */
2444   if (offset >= 512)
2445     {
2446       /* When the frame has a large size, an initial decrease is done on
2447          the stack pointer to jump over the callee-allocated save area for
2448          register varargs, the local variable area and/or the callee-saved
2449          register area.  This will allow the pre-index write-back
2450          store pair instructions to be used for setting up the stack frame
2451          efficiently.  */
2452       offset = hard_fp_offset;
2453       if (offset >= 512)
2454         offset = cfun->machine->frame.saved_regs_size;
2455
2456       frame_size -= (offset + crtl->outgoing_args_size);
2457       fp_offset = 0;
2458
2459       if (frame_size >= 0x1000000)
2460         {
2461           rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2462           emit_move_insn (op0, GEN_INT (-frame_size));
2463           insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2464
2465           add_reg_note (insn, REG_CFA_ADJUST_CFA,
2466                         gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2467                                      plus_constant (Pmode, stack_pointer_rtx,
2468                                                     -frame_size)));
2469           RTX_FRAME_RELATED_P (insn) = 1;
2470         }
2471       else if (frame_size > 0)
2472         {
2473           int hi_ofs = frame_size & 0xfff000;
2474           int lo_ofs = frame_size & 0x000fff;
2475
2476           if (hi_ofs)
2477             {
2478               insn = emit_insn (gen_add2_insn
2479                                 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2480               RTX_FRAME_RELATED_P (insn) = 1;
2481             }
2482           if (lo_ofs)
2483             {
2484               insn = emit_insn (gen_add2_insn
2485                                 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2486               RTX_FRAME_RELATED_P (insn) = 1;
2487             }
2488         }
2489     }
2490   else
2491     frame_size = -1;
2492
2493   if (offset > 0)
2494     {
2495       bool skip_wb = false;
2496
2497       if (frame_pointer_needed)
2498         {
2499           skip_wb = true;
2500
2501           if (fp_offset)
2502             {
2503               insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2504                                                GEN_INT (-offset)));
2505               RTX_FRAME_RELATED_P (insn) = 1;
2506
2507               aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2508                                          R30_REGNUM, false);
2509             }
2510           else
2511             aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2512
2513           /* Set up frame pointer to point to the location of the
2514              previous frame pointer on the stack.  */
2515           insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2516                                            stack_pointer_rtx,
2517                                            GEN_INT (fp_offset)));
2518           RTX_FRAME_RELATED_P (insn) = 1;
2519           emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2520         }
2521       else
2522         {
2523           unsigned reg1 = cfun->machine->frame.wb_candidate1;
2524           unsigned reg2 = cfun->machine->frame.wb_candidate2;
2525
2526           if (fp_offset
2527               || reg1 == FIRST_PSEUDO_REGISTER
2528               || (reg2 == FIRST_PSEUDO_REGISTER
2529                   && offset >= 256))
2530             {
2531               insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2532                                                GEN_INT (-offset)));
2533               RTX_FRAME_RELATED_P (insn) = 1;
2534             }
2535           else
2536             {
2537               machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2538
2539               skip_wb = true;
2540
2541               if (reg2 == FIRST_PSEUDO_REGISTER)
2542                 aarch64_pushwb_single_reg (mode1, reg1, offset);
2543               else
2544                 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2545             }
2546         }
2547
2548       aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2549                                  skip_wb);
2550       aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2551                                  skip_wb);
2552     }
2553
2554   /* when offset >= 512,
2555      sub sp, sp, #<outgoing_args_size> */
2556   if (frame_size > -1)
2557     {
2558       if (crtl->outgoing_args_size > 0)
2559         {
2560           insn = emit_insn (gen_add2_insn
2561                             (stack_pointer_rtx,
2562                              GEN_INT (- crtl->outgoing_args_size)));
2563           RTX_FRAME_RELATED_P (insn) = 1;
2564         }
2565     }
2566 }
2567
2568 /* Return TRUE if we can use a simple_return insn.
2569
2570    This function checks whether the callee saved stack is empty, which
2571    means no restore actions are need. The pro_and_epilogue will use
2572    this to check whether shrink-wrapping opt is feasible.  */
2573
2574 bool
2575 aarch64_use_return_insn_p (void)
2576 {
2577   if (!reload_completed)
2578     return false;
2579
2580   if (crtl->profile)
2581     return false;
2582
2583   aarch64_layout_frame ();
2584
2585   return cfun->machine->frame.frame_size == 0;
2586 }
2587
2588 /* Generate the epilogue instructions for returning from a function.  */
2589 void
2590 aarch64_expand_epilogue (bool for_sibcall)
2591 {
2592   HOST_WIDE_INT frame_size, offset;
2593   HOST_WIDE_INT fp_offset;
2594   HOST_WIDE_INT hard_fp_offset;
2595   rtx_insn *insn;
2596   /* We need to add memory barrier to prevent read from deallocated stack.  */
2597   bool need_barrier_p = (get_frame_size () != 0
2598                          || cfun->machine->frame.saved_varargs_size);
2599
2600   aarch64_layout_frame ();
2601
2602   offset = frame_size = cfun->machine->frame.frame_size;
2603   hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2604   fp_offset = frame_size - hard_fp_offset;
2605
2606   /* Store pairs and load pairs have a range only -512 to 504.  */
2607   if (offset >= 512)
2608     {
2609       offset = hard_fp_offset;
2610       if (offset >= 512)
2611         offset = cfun->machine->frame.saved_regs_size;
2612
2613       frame_size -= (offset + crtl->outgoing_args_size);
2614       fp_offset = 0;
2615       if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2616         {
2617           insn = emit_insn (gen_add2_insn
2618                             (stack_pointer_rtx,
2619                              GEN_INT (crtl->outgoing_args_size)));
2620           RTX_FRAME_RELATED_P (insn) = 1;
2621         }
2622     }
2623   else
2624     frame_size = -1;
2625
2626   /* If there were outgoing arguments or we've done dynamic stack
2627      allocation, then restore the stack pointer from the frame
2628      pointer.  This is at most one insn and more efficient than using
2629      GCC's internal mechanism.  */
2630   if (frame_pointer_needed
2631       && (crtl->outgoing_args_size || cfun->calls_alloca))
2632     {
2633       if (cfun->calls_alloca)
2634         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2635
2636       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2637                                        hard_frame_pointer_rtx,
2638                                        GEN_INT (0)));
2639       offset = offset - fp_offset;
2640     }
2641
2642   if (offset > 0)
2643     {
2644       unsigned reg1 = cfun->machine->frame.wb_candidate1;
2645       unsigned reg2 = cfun->machine->frame.wb_candidate2;
2646       bool skip_wb = true;
2647       rtx cfi_ops = NULL;
2648
2649       if (frame_pointer_needed)
2650         fp_offset = 0;
2651       else if (fp_offset
2652                || reg1 == FIRST_PSEUDO_REGISTER
2653                || (reg2 == FIRST_PSEUDO_REGISTER
2654                    && offset >= 256))
2655         skip_wb = false;
2656
2657       aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2658                                     skip_wb, &cfi_ops);
2659       aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2660                                     skip_wb, &cfi_ops);
2661
2662       if (need_barrier_p)
2663         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2664
2665       if (skip_wb)
2666         {
2667           machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2668           rtx rreg1 = gen_rtx_REG (mode1, reg1);
2669
2670           cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2671           if (reg2 == FIRST_PSEUDO_REGISTER)
2672             {
2673               rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2674               mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2675               mem = gen_rtx_MEM (mode1, mem);
2676               insn = emit_move_insn (rreg1, mem);
2677             }
2678           else
2679             {
2680               rtx rreg2 = gen_rtx_REG (mode1, reg2);
2681
2682               cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2683               insn = emit_insn (aarch64_gen_loadwb_pair
2684                                 (mode1, stack_pointer_rtx, rreg1,
2685                                  rreg2, offset));
2686             }
2687         }
2688       else
2689         {
2690           insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2691                                            GEN_INT (offset)));
2692         }
2693
2694       /* Reset the CFA to be SP + FRAME_SIZE.  */
2695       rtx new_cfa = stack_pointer_rtx;
2696       if (frame_size > 0)
2697         new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2698       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2699       REG_NOTES (insn) = cfi_ops;
2700       RTX_FRAME_RELATED_P (insn) = 1;
2701     }
2702
2703   if (frame_size > 0)
2704     {
2705       if (need_barrier_p)
2706         emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2707
2708       if (frame_size >= 0x1000000)
2709         {
2710           rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2711           emit_move_insn (op0, GEN_INT (frame_size));
2712           insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2713         }
2714       else
2715         {
2716           int hi_ofs = frame_size & 0xfff000;
2717           int lo_ofs = frame_size & 0x000fff;
2718
2719           if (hi_ofs && lo_ofs)
2720             {
2721               insn = emit_insn (gen_add2_insn
2722                                 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2723               RTX_FRAME_RELATED_P (insn) = 1;
2724               frame_size = lo_ofs;
2725             }
2726           insn = emit_insn (gen_add2_insn
2727                             (stack_pointer_rtx, GEN_INT (frame_size)));
2728         }
2729
2730       /* Reset the CFA to be SP + 0.  */
2731       add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2732       RTX_FRAME_RELATED_P (insn) = 1;
2733     }
2734
2735   /* Stack adjustment for exception handler.  */
2736   if (crtl->calls_eh_return)
2737     {
2738       /* We need to unwind the stack by the offset computed by
2739          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
2740          to be SP; letting the CFA move during this adjustment
2741          is just as correct as retaining the CFA from the body
2742          of the function.  Therefore, do nothing special.  */
2743       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2744     }
2745
2746   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2747   if (!for_sibcall)
2748     emit_jump_insn (ret_rtx);
2749 }
2750
2751 /* Return the place to copy the exception unwinding return address to.
2752    This will probably be a stack slot, but could (in theory be the
2753    return register).  */
2754 rtx
2755 aarch64_final_eh_return_addr (void)
2756 {
2757   HOST_WIDE_INT fp_offset;
2758
2759   aarch64_layout_frame ();
2760
2761   fp_offset = cfun->machine->frame.frame_size
2762               - cfun->machine->frame.hard_fp_offset;
2763
2764   if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2765     return gen_rtx_REG (DImode, LR_REGNUM);
2766
2767   /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2.  This can
2768      result in a store to save LR introduced by builtin_eh_return () being
2769      incorrectly deleted because the alias is not detected.
2770      So in the calculation of the address to copy the exception unwinding
2771      return address to, we note 2 cases.
2772      If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2773      we return a SP-relative location since all the addresses are SP-relative
2774      in this case.  This prevents the store from being optimized away.
2775      If the fp_offset is not 0, then the addresses will be FP-relative and
2776      therefore we return a FP-relative location.  */
2777
2778   if (frame_pointer_needed)
2779     {
2780       if (fp_offset)
2781         return gen_frame_mem (DImode,
2782                               plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2783       else
2784         return gen_frame_mem (DImode,
2785                               plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2786     }
2787
2788   /* If FP is not needed, we calculate the location of LR, which would be
2789      at the top of the saved registers block.  */
2790
2791   return gen_frame_mem (DImode,
2792                         plus_constant (Pmode,
2793                                        stack_pointer_rtx,
2794                                        fp_offset
2795                                        + cfun->machine->frame.saved_regs_size
2796                                        - 2 * UNITS_PER_WORD));
2797 }
2798
2799 /* Possibly output code to build up a constant in a register.  For
2800    the benefit of the costs infrastructure, returns the number of
2801    instructions which would be emitted.  GENERATE inhibits or
2802    enables code generation.  */
2803
2804 static int
2805 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2806 {
2807   int insns = 0;
2808
2809   if (aarch64_bitmask_imm (val, DImode))
2810     {
2811       if (generate)
2812         emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2813       insns = 1;
2814     }
2815   else
2816     {
2817       int i;
2818       int ncount = 0;
2819       int zcount = 0;
2820       HOST_WIDE_INT valp = val >> 16;
2821       HOST_WIDE_INT valm;
2822       HOST_WIDE_INT tval;
2823
2824       for (i = 16; i < 64; i += 16)
2825         {
2826           valm = (valp & 0xffff);
2827
2828           if (valm != 0)
2829             ++ zcount;
2830
2831           if (valm != 0xffff)
2832             ++ ncount;
2833
2834           valp >>= 16;
2835         }
2836
2837       /* zcount contains the number of additional MOVK instructions
2838          required if the constant is built up with an initial MOVZ instruction,
2839          while ncount is the number of MOVK instructions required if starting
2840          with a MOVN instruction.  Choose the sequence that yields the fewest
2841          number of instructions, preferring MOVZ instructions when they are both
2842          the same.  */
2843       if (ncount < zcount)
2844         {
2845           if (generate)
2846             emit_move_insn (gen_rtx_REG (Pmode, regnum),
2847                             GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2848           tval = 0xffff;
2849           insns++;
2850         }
2851       else
2852         {
2853           if (generate)
2854             emit_move_insn (gen_rtx_REG (Pmode, regnum),
2855                             GEN_INT (val & 0xffff));
2856           tval = 0;
2857           insns++;
2858         }
2859
2860       val >>= 16;
2861
2862       for (i = 16; i < 64; i += 16)
2863         {
2864           if ((val & 0xffff) != tval)
2865             {
2866               if (generate)
2867                 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2868                                            GEN_INT (i),
2869                                            GEN_INT (val & 0xffff)));
2870               insns++;
2871             }
2872           val >>= 16;
2873         }
2874     }
2875   return insns;
2876 }
2877
2878 static void
2879 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2880 {
2881   HOST_WIDE_INT mdelta = delta;
2882   rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2883   rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2884
2885   if (mdelta < 0)
2886     mdelta = -mdelta;
2887
2888   if (mdelta >= 4096 * 4096)
2889     {
2890       (void) aarch64_build_constant (scratchreg, delta, true);
2891       emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2892     }
2893   else if (mdelta > 0)
2894     {
2895       if (mdelta >= 4096)
2896         {
2897           emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2898           rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2899           if (delta < 0)
2900             emit_insn (gen_rtx_SET (Pmode, this_rtx,
2901                                     gen_rtx_MINUS (Pmode, this_rtx, shift)));
2902           else
2903             emit_insn (gen_rtx_SET (Pmode, this_rtx,
2904                                     gen_rtx_PLUS (Pmode, this_rtx, shift)));
2905         }
2906       if (mdelta % 4096 != 0)
2907         {
2908           scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2909           emit_insn (gen_rtx_SET (Pmode, this_rtx,
2910                                   gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2911         }
2912     }
2913 }
2914
2915 /* Output code to add DELTA to the first argument, and then jump
2916    to FUNCTION.  Used for C++ multiple inheritance.  */
2917 static void
2918 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2919                          HOST_WIDE_INT delta,
2920                          HOST_WIDE_INT vcall_offset,
2921                          tree function)
2922 {
2923   /* The this pointer is always in x0.  Note that this differs from
2924      Arm where the this pointer maybe bumped to r1 if r0 is required
2925      to return a pointer to an aggregate.  On AArch64 a result value
2926      pointer will be in x8.  */
2927   int this_regno = R0_REGNUM;
2928   rtx this_rtx, temp0, temp1, addr, funexp;
2929   rtx_insn *insn;
2930
2931   reload_completed = 1;
2932   emit_note (NOTE_INSN_PROLOGUE_END);
2933
2934   if (vcall_offset == 0)
2935     aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2936   else
2937     {
2938       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2939
2940       this_rtx = gen_rtx_REG (Pmode, this_regno);
2941       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2942       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2943
2944       addr = this_rtx;
2945       if (delta != 0)
2946         {
2947           if (delta >= -256 && delta < 256)
2948             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2949                                        plus_constant (Pmode, this_rtx, delta));
2950           else
2951             aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2952         }
2953
2954       if (Pmode == ptr_mode)
2955         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2956       else
2957         aarch64_emit_move (temp0,
2958                            gen_rtx_ZERO_EXTEND (Pmode,
2959                                                 gen_rtx_MEM (ptr_mode, addr)));
2960
2961       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2962           addr = plus_constant (Pmode, temp0, vcall_offset);
2963       else
2964         {
2965           (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2966           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2967         }
2968
2969       if (Pmode == ptr_mode)
2970         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2971       else
2972         aarch64_emit_move (temp1,
2973                            gen_rtx_SIGN_EXTEND (Pmode,
2974                                                 gen_rtx_MEM (ptr_mode, addr)));
2975
2976       emit_insn (gen_add2_insn (this_rtx, temp1));
2977     }
2978
2979   /* Generate a tail call to the target function.  */
2980   if (!TREE_USED (function))
2981     {
2982       assemble_external (function);
2983       TREE_USED (function) = 1;
2984     }
2985   funexp = XEXP (DECL_RTL (function), 0);
2986   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2987   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2988   SIBLING_CALL_P (insn) = 1;
2989
2990   insn = get_insns ();
2991   shorten_branches (insn);
2992   final_start_function (insn, file, 1);
2993   final (insn, file, 1);
2994   final_end_function ();
2995
2996   /* Stop pretending to be a post-reload pass.  */
2997   reload_completed = 0;
2998 }
2999
3000 static bool
3001 aarch64_tls_referenced_p (rtx x)
3002 {
3003   if (!TARGET_HAVE_TLS)
3004     return false;
3005   subrtx_iterator::array_type array;
3006   FOR_EACH_SUBRTX (iter, array, x, ALL)
3007     {
3008       const_rtx x = *iter;
3009       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3010         return true;
3011       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3012          TLS offsets, not real symbol references.  */
3013       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3014         iter.skip_subrtxes ();
3015     }
3016   return false;
3017 }
3018
3019
3020 static int
3021 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3022 {
3023   const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3024   const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3025
3026   if (*imm1 < *imm2)
3027     return -1;
3028   if (*imm1 > *imm2)
3029     return +1;
3030   return 0;
3031 }
3032
3033
3034 static void
3035 aarch64_build_bitmask_table (void)
3036 {
3037   unsigned HOST_WIDE_INT mask, imm;
3038   unsigned int log_e, e, s, r;
3039   unsigned int nimms = 0;
3040
3041   for (log_e = 1; log_e <= 6; log_e++)
3042     {
3043       e = 1 << log_e;
3044       if (e == 64)
3045         mask = ~(HOST_WIDE_INT) 0;
3046       else
3047         mask = ((HOST_WIDE_INT) 1 << e) - 1;
3048       for (s = 1; s < e; s++)
3049         {
3050           for (r = 0; r < e; r++)
3051             {
3052               /* set s consecutive bits to 1 (s < 64) */
3053               imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3054               /* rotate right by r */
3055               if (r != 0)
3056                 imm = ((imm >> r) | (imm << (e - r))) & mask;
3057               /* replicate the constant depending on SIMD size */
3058               switch (log_e) {
3059               case 1: imm |= (imm <<  2);
3060               case 2: imm |= (imm <<  4);
3061               case 3: imm |= (imm <<  8);
3062               case 4: imm |= (imm << 16);
3063               case 5: imm |= (imm << 32);
3064               case 6:
3065                 break;
3066               default:
3067                 gcc_unreachable ();
3068               }
3069               gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3070               aarch64_bitmasks[nimms++] = imm;
3071             }
3072         }
3073     }
3074
3075   gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3076   qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3077          aarch64_bitmasks_cmp);
3078 }
3079
3080
3081 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3082    a left shift of 0 or 12 bits.  */
3083 bool
3084 aarch64_uimm12_shift (HOST_WIDE_INT val)
3085 {
3086   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3087           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3088           );
3089 }
3090
3091
3092 /* Return true if val is an immediate that can be loaded into a
3093    register by a MOVZ instruction.  */
3094 static bool
3095 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3096 {
3097   if (GET_MODE_SIZE (mode) > 4)
3098     {
3099       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3100           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3101         return 1;
3102     }
3103   else
3104     {
3105       /* Ignore sign extension.  */
3106       val &= (HOST_WIDE_INT) 0xffffffff;
3107     }
3108   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3109           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3110 }
3111
3112
3113 /* Return true if val is a valid bitmask immediate.  */
3114 bool
3115 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3116 {
3117   if (GET_MODE_SIZE (mode) < 8)
3118     {
3119       /* Replicate bit pattern.  */
3120       val &= (HOST_WIDE_INT) 0xffffffff;
3121       val |= val << 32;
3122     }
3123   return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3124                   sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3125 }
3126
3127
3128 /* Return true if val is an immediate that can be loaded into a
3129    register in a single instruction.  */
3130 bool
3131 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3132 {
3133   if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3134     return 1;
3135   return aarch64_bitmask_imm (val, mode);
3136 }
3137
3138 static bool
3139 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3140 {
3141   rtx base, offset;
3142
3143   if (GET_CODE (x) == HIGH)
3144     return true;
3145
3146   split_const (x, &base, &offset);
3147   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3148     {
3149       if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3150           != SYMBOL_FORCE_TO_MEM)
3151         return true;
3152       else
3153         /* Avoid generating a 64-bit relocation in ILP32; leave
3154            to aarch64_expand_mov_immediate to handle it properly.  */
3155         return mode != ptr_mode;
3156     }
3157
3158   return aarch64_tls_referenced_p (x);
3159 }
3160
3161 /* Return true if register REGNO is a valid index register.
3162    STRICT_P is true if REG_OK_STRICT is in effect.  */
3163
3164 bool
3165 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3166 {
3167   if (!HARD_REGISTER_NUM_P (regno))
3168     {
3169       if (!strict_p)
3170         return true;
3171
3172       if (!reg_renumber)
3173         return false;
3174
3175       regno = reg_renumber[regno];
3176     }
3177   return GP_REGNUM_P (regno);
3178 }
3179
3180 /* Return true if register REGNO is a valid base register for mode MODE.
3181    STRICT_P is true if REG_OK_STRICT is in effect.  */
3182
3183 bool
3184 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3185 {
3186   if (!HARD_REGISTER_NUM_P (regno))
3187     {
3188       if (!strict_p)
3189         return true;
3190
3191       if (!reg_renumber)
3192         return false;
3193
3194       regno = reg_renumber[regno];
3195     }
3196
3197   /* The fake registers will be eliminated to either the stack or
3198      hard frame pointer, both of which are usually valid base registers.
3199      Reload deals with the cases where the eliminated form isn't valid.  */
3200   return (GP_REGNUM_P (regno)
3201           || regno == SP_REGNUM
3202           || regno == FRAME_POINTER_REGNUM
3203           || regno == ARG_POINTER_REGNUM);
3204 }
3205
3206 /* Return true if X is a valid base register for mode MODE.
3207    STRICT_P is true if REG_OK_STRICT is in effect.  */
3208
3209 static bool
3210 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3211 {
3212   if (!strict_p && GET_CODE (x) == SUBREG)
3213     x = SUBREG_REG (x);
3214
3215   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3216 }
3217
3218 /* Return true if address offset is a valid index.  If it is, fill in INFO
3219    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
3220
3221 static bool
3222 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3223                         machine_mode mode, bool strict_p)
3224 {
3225   enum aarch64_address_type type;
3226   rtx index;
3227   int shift;
3228
3229   /* (reg:P) */
3230   if ((REG_P (x) || GET_CODE (x) == SUBREG)
3231       && GET_MODE (x) == Pmode)
3232     {
3233       type = ADDRESS_REG_REG;
3234       index = x;
3235       shift = 0;
3236     }
3237   /* (sign_extend:DI (reg:SI)) */
3238   else if ((GET_CODE (x) == SIGN_EXTEND
3239             || GET_CODE (x) == ZERO_EXTEND)
3240            && GET_MODE (x) == DImode
3241            && GET_MODE (XEXP (x, 0)) == SImode)
3242     {
3243       type = (GET_CODE (x) == SIGN_EXTEND)
3244         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3245       index = XEXP (x, 0);
3246       shift = 0;
3247     }
3248   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3249   else if (GET_CODE (x) == MULT
3250            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3251                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3252            && GET_MODE (XEXP (x, 0)) == DImode
3253            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3254            && CONST_INT_P (XEXP (x, 1)))
3255     {
3256       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3257         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3258       index = XEXP (XEXP (x, 0), 0);
3259       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3260     }
3261   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3262   else if (GET_CODE (x) == ASHIFT
3263            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3264                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3265            && GET_MODE (XEXP (x, 0)) == DImode
3266            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3267            && CONST_INT_P (XEXP (x, 1)))
3268     {
3269       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3270         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3271       index = XEXP (XEXP (x, 0), 0);
3272       shift = INTVAL (XEXP (x, 1));
3273     }
3274   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3275   else if ((GET_CODE (x) == SIGN_EXTRACT
3276             || GET_CODE (x) == ZERO_EXTRACT)
3277            && GET_MODE (x) == DImode
3278            && GET_CODE (XEXP (x, 0)) == MULT
3279            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3280            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3281     {
3282       type = (GET_CODE (x) == SIGN_EXTRACT)
3283         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3284       index = XEXP (XEXP (x, 0), 0);
3285       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3286       if (INTVAL (XEXP (x, 1)) != 32 + shift
3287           || INTVAL (XEXP (x, 2)) != 0)
3288         shift = -1;
3289     }
3290   /* (and:DI (mult:DI (reg:DI) (const_int scale))
3291      (const_int 0xffffffff<<shift)) */
3292   else if (GET_CODE (x) == AND
3293            && GET_MODE (x) == DImode
3294            && GET_CODE (XEXP (x, 0)) == MULT
3295            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3296            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3297            && CONST_INT_P (XEXP (x, 1)))
3298     {
3299       type = ADDRESS_REG_UXTW;
3300       index = XEXP (XEXP (x, 0), 0);
3301       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3302       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3303         shift = -1;
3304     }
3305   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3306   else if ((GET_CODE (x) == SIGN_EXTRACT
3307             || GET_CODE (x) == ZERO_EXTRACT)
3308            && GET_MODE (x) == DImode
3309            && GET_CODE (XEXP (x, 0)) == ASHIFT
3310            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3311            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3312     {
3313       type = (GET_CODE (x) == SIGN_EXTRACT)
3314         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3315       index = XEXP (XEXP (x, 0), 0);
3316       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3317       if (INTVAL (XEXP (x, 1)) != 32 + shift
3318           || INTVAL (XEXP (x, 2)) != 0)
3319         shift = -1;
3320     }
3321   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3322      (const_int 0xffffffff<<shift)) */
3323   else if (GET_CODE (x) == AND
3324            && GET_MODE (x) == DImode
3325            && GET_CODE (XEXP (x, 0)) == ASHIFT
3326            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3327            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3328            && CONST_INT_P (XEXP (x, 1)))
3329     {
3330       type = ADDRESS_REG_UXTW;
3331       index = XEXP (XEXP (x, 0), 0);
3332       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3333       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3334         shift = -1;
3335     }
3336   /* (mult:P (reg:P) (const_int scale)) */
3337   else if (GET_CODE (x) == MULT
3338            && GET_MODE (x) == Pmode
3339            && GET_MODE (XEXP (x, 0)) == Pmode
3340            && CONST_INT_P (XEXP (x, 1)))
3341     {
3342       type = ADDRESS_REG_REG;
3343       index = XEXP (x, 0);
3344       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3345     }
3346   /* (ashift:P (reg:P) (const_int shift)) */
3347   else if (GET_CODE (x) == ASHIFT
3348            && GET_MODE (x) == Pmode
3349            && GET_MODE (XEXP (x, 0)) == Pmode
3350            && CONST_INT_P (XEXP (x, 1)))
3351     {
3352       type = ADDRESS_REG_REG;
3353       index = XEXP (x, 0);
3354       shift = INTVAL (XEXP (x, 1));
3355     }
3356   else
3357     return false;
3358
3359   if (GET_CODE (index) == SUBREG)
3360     index = SUBREG_REG (index);
3361
3362   if ((shift == 0 ||
3363        (shift > 0 && shift <= 3
3364         && (1 << shift) == GET_MODE_SIZE (mode)))
3365       && REG_P (index)
3366       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3367     {
3368       info->type = type;
3369       info->offset = index;
3370       info->shift = shift;
3371       return true;
3372     }
3373
3374   return false;
3375 }
3376
3377 bool
3378 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3379 {
3380   return (offset >= -64 * GET_MODE_SIZE (mode)
3381           && offset < 64 * GET_MODE_SIZE (mode)
3382           && offset % GET_MODE_SIZE (mode) == 0);
3383 }
3384
3385 static inline bool
3386 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3387                                HOST_WIDE_INT offset)
3388 {
3389   return offset >= -256 && offset < 256;
3390 }
3391
3392 static inline bool
3393 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3394 {
3395   return (offset >= 0
3396           && offset < 4096 * GET_MODE_SIZE (mode)
3397           && offset % GET_MODE_SIZE (mode) == 0);
3398 }
3399
3400 /* Return true if X is a valid address for machine mode MODE.  If it is,
3401    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
3402    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
3403
3404 static bool
3405 aarch64_classify_address (struct aarch64_address_info *info,
3406                           rtx x, machine_mode mode,
3407                           RTX_CODE outer_code, bool strict_p)
3408 {
3409   enum rtx_code code = GET_CODE (x);
3410   rtx op0, op1;
3411
3412   /* On BE, we use load/store pair for all large int mode load/stores.  */
3413   bool load_store_pair_p = (outer_code == PARALLEL
3414                             || (BYTES_BIG_ENDIAN
3415                                 && aarch64_vect_struct_mode_p (mode)));
3416
3417   bool allow_reg_index_p =
3418     !load_store_pair_p
3419     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3420     && !aarch64_vect_struct_mode_p (mode);
3421
3422   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3423      REG addressing.  */
3424   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3425       && (code != POST_INC && code != REG))
3426     return false;
3427
3428   switch (code)
3429     {
3430     case REG:
3431     case SUBREG:
3432       info->type = ADDRESS_REG_IMM;
3433       info->base = x;
3434       info->offset = const0_rtx;
3435       return aarch64_base_register_rtx_p (x, strict_p);
3436
3437     case PLUS:
3438       op0 = XEXP (x, 0);
3439       op1 = XEXP (x, 1);
3440
3441       if (! strict_p
3442           && REG_P (op0)
3443           && (op0 == virtual_stack_vars_rtx
3444               || op0 == frame_pointer_rtx
3445               || op0 == arg_pointer_rtx)
3446           && CONST_INT_P (op1))
3447         {
3448           info->type = ADDRESS_REG_IMM;
3449           info->base = op0;
3450           info->offset = op1;
3451
3452           return true;
3453         }
3454
3455       if (GET_MODE_SIZE (mode) != 0
3456           && CONST_INT_P (op1)
3457           && aarch64_base_register_rtx_p (op0, strict_p))
3458         {
3459           HOST_WIDE_INT offset = INTVAL (op1);
3460
3461           info->type = ADDRESS_REG_IMM;
3462           info->base = op0;
3463           info->offset = op1;
3464
3465           /* TImode and TFmode values are allowed in both pairs of X
3466              registers and individual Q registers.  The available
3467              address modes are:
3468              X,X: 7-bit signed scaled offset
3469              Q:   9-bit signed offset
3470              We conservatively require an offset representable in either mode.
3471            */
3472           if (mode == TImode || mode == TFmode)
3473             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3474                     && offset_9bit_signed_unscaled_p (mode, offset));
3475
3476           /* A 7bit offset check because OImode will emit a ldp/stp
3477              instruction (only big endian will get here).
3478              For ldp/stp instructions, the offset is scaled for the size of a
3479              single element of the pair.  */
3480           if (mode == OImode)
3481             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3482
3483           /* Three 9/12 bit offsets checks because CImode will emit three
3484              ldr/str instructions (only big endian will get here).  */
3485           if (mode == CImode)
3486             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3487                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3488                         || offset_12bit_unsigned_scaled_p (V16QImode,
3489                                                            offset + 32)));
3490
3491           /* Two 7bit offsets checks because XImode will emit two ldp/stp
3492              instructions (only big endian will get here).  */
3493           if (mode == XImode)
3494             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3495                     && aarch64_offset_7bit_signed_scaled_p (TImode,
3496                                                             offset + 32));
3497
3498           if (load_store_pair_p)
3499             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3500                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3501           else
3502             return (offset_9bit_signed_unscaled_p (mode, offset)
3503                     || offset_12bit_unsigned_scaled_p (mode, offset));
3504         }
3505
3506       if (allow_reg_index_p)
3507         {
3508           /* Look for base + (scaled/extended) index register.  */
3509           if (aarch64_base_register_rtx_p (op0, strict_p)
3510               && aarch64_classify_index (info, op1, mode, strict_p))
3511             {
3512               info->base = op0;
3513               return true;
3514             }
3515           if (aarch64_base_register_rtx_p (op1, strict_p)
3516               && aarch64_classify_index (info, op0, mode, strict_p))
3517             {
3518               info->base = op1;
3519               return true;
3520             }
3521         }
3522
3523       return false;
3524
3525     case POST_INC:
3526     case POST_DEC:
3527     case PRE_INC:
3528     case PRE_DEC:
3529       info->type = ADDRESS_REG_WB;
3530       info->base = XEXP (x, 0);
3531       info->offset = NULL_RTX;
3532       return aarch64_base_register_rtx_p (info->base, strict_p);
3533
3534     case POST_MODIFY:
3535     case PRE_MODIFY:
3536       info->type = ADDRESS_REG_WB;
3537       info->base = XEXP (x, 0);
3538       if (GET_CODE (XEXP (x, 1)) == PLUS
3539           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3540           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3541           && aarch64_base_register_rtx_p (info->base, strict_p))
3542         {
3543           HOST_WIDE_INT offset;
3544           info->offset = XEXP (XEXP (x, 1), 1);
3545           offset = INTVAL (info->offset);
3546
3547           /* TImode and TFmode values are allowed in both pairs of X
3548              registers and individual Q registers.  The available
3549              address modes are:
3550              X,X: 7-bit signed scaled offset
3551              Q:   9-bit signed offset
3552              We conservatively require an offset representable in either mode.
3553            */
3554           if (mode == TImode || mode == TFmode)
3555             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3556                     && offset_9bit_signed_unscaled_p (mode, offset));
3557
3558           if (load_store_pair_p)
3559             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3560                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3561           else
3562             return offset_9bit_signed_unscaled_p (mode, offset);
3563         }
3564       return false;
3565
3566     case CONST:
3567     case SYMBOL_REF:
3568     case LABEL_REF:
3569       /* load literal: pc-relative constant pool entry.  Only supported
3570          for SI mode or larger.  */
3571       info->type = ADDRESS_SYMBOLIC;
3572
3573       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3574         {
3575           rtx sym, addend;
3576
3577           split_const (x, &sym, &addend);
3578           return (GET_CODE (sym) == LABEL_REF
3579                   || (GET_CODE (sym) == SYMBOL_REF
3580                       && CONSTANT_POOL_ADDRESS_P (sym)));
3581         }
3582       return false;
3583
3584     case LO_SUM:
3585       info->type = ADDRESS_LO_SUM;
3586       info->base = XEXP (x, 0);
3587       info->offset = XEXP (x, 1);
3588       if (allow_reg_index_p
3589           && aarch64_base_register_rtx_p (info->base, strict_p))
3590         {
3591           rtx sym, offs;
3592           split_const (info->offset, &sym, &offs);
3593           if (GET_CODE (sym) == SYMBOL_REF
3594               && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3595                   == SYMBOL_SMALL_ABSOLUTE))
3596             {
3597               /* The symbol and offset must be aligned to the access size.  */
3598               unsigned int align;
3599               unsigned int ref_size;
3600
3601               if (CONSTANT_POOL_ADDRESS_P (sym))
3602                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3603               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3604                 {
3605                   tree exp = SYMBOL_REF_DECL (sym);
3606                   align = TYPE_ALIGN (TREE_TYPE (exp));
3607                   align = CONSTANT_ALIGNMENT (exp, align);
3608                 }
3609               else if (SYMBOL_REF_DECL (sym))
3610                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3611               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3612                        && SYMBOL_REF_BLOCK (sym) != NULL)
3613                 align = SYMBOL_REF_BLOCK (sym)->alignment;
3614               else
3615                 align = BITS_PER_UNIT;
3616
3617               ref_size = GET_MODE_SIZE (mode);
3618               if (ref_size == 0)
3619                 ref_size = GET_MODE_SIZE (DImode);
3620
3621               return ((INTVAL (offs) & (ref_size - 1)) == 0
3622                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3623             }
3624         }
3625       return false;
3626
3627     default:
3628       return false;
3629     }
3630 }
3631
3632 bool
3633 aarch64_symbolic_address_p (rtx x)
3634 {
3635   rtx offset;
3636
3637   split_const (x, &x, &offset);
3638   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3639 }
3640
3641 /* Classify the base of symbolic expression X, given that X appears in
3642    context CONTEXT.  */
3643
3644 enum aarch64_symbol_type
3645 aarch64_classify_symbolic_expression (rtx x,
3646                                       enum aarch64_symbol_context context)
3647 {
3648   rtx offset;
3649
3650   split_const (x, &x, &offset);
3651   return aarch64_classify_symbol (x, offset, context);
3652 }
3653
3654
3655 /* Return TRUE if X is a legitimate address for accessing memory in
3656    mode MODE.  */
3657 static bool
3658 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3659 {
3660   struct aarch64_address_info addr;
3661
3662   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3663 }
3664
3665 /* Return TRUE if X is a legitimate address for accessing memory in
3666    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
3667    pair operation.  */
3668 bool
3669 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3670                               RTX_CODE outer_code, bool strict_p)
3671 {
3672   struct aarch64_address_info addr;
3673
3674   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3675 }
3676
3677 /* Return TRUE if rtx X is immediate constant 0.0 */
3678 bool
3679 aarch64_float_const_zero_rtx_p (rtx x)
3680 {
3681   REAL_VALUE_TYPE r;
3682
3683   if (GET_MODE (x) == VOIDmode)
3684     return false;
3685
3686   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3687   if (REAL_VALUE_MINUS_ZERO (r))
3688     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3689   return REAL_VALUES_EQUAL (r, dconst0);
3690 }
3691
3692 /* Return the fixed registers used for condition codes.  */
3693
3694 static bool
3695 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3696 {
3697   *p1 = CC_REGNUM;
3698   *p2 = INVALID_REGNUM;
3699   return true;
3700 }
3701
3702 /* Emit call insn with PAT and do aarch64-specific handling.  */
3703
3704 void
3705 aarch64_emit_call_insn (rtx pat)
3706 {
3707   rtx insn = emit_call_insn (pat);
3708
3709   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3710   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3711   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3712 }
3713
3714 machine_mode
3715 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3716 {
3717   /* All floating point compares return CCFP if it is an equality
3718      comparison, and CCFPE otherwise.  */
3719   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3720     {
3721       switch (code)
3722         {
3723         case EQ:
3724         case NE:
3725         case UNORDERED:
3726         case ORDERED:
3727         case UNLT:
3728         case UNLE:
3729         case UNGT:
3730         case UNGE:
3731         case UNEQ:
3732         case LTGT:
3733           return CCFPmode;
3734
3735         case LT:
3736         case LE:
3737         case GT:
3738         case GE:
3739           return CCFPEmode;
3740
3741         default:
3742           gcc_unreachable ();
3743         }
3744     }
3745
3746   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3747       && y == const0_rtx
3748       && (code == EQ || code == NE || code == LT || code == GE)
3749       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3750           || GET_CODE (x) == NEG))
3751     return CC_NZmode;
3752
3753   /* A compare with a shifted operand.  Because of canonicalization,
3754      the comparison will have to be swapped when we emit the assembly
3755      code.  */
3756   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3757       && (REG_P (y) || GET_CODE (y) == SUBREG)
3758       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3759           || GET_CODE (x) == LSHIFTRT
3760           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3761     return CC_SWPmode;
3762
3763   /* Similarly for a negated operand, but we can only do this for
3764      equalities.  */
3765   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3766       && (REG_P (y) || GET_CODE (y) == SUBREG)
3767       && (code == EQ || code == NE)
3768       && GET_CODE (x) == NEG)
3769     return CC_Zmode;
3770
3771   /* A compare of a mode narrower than SI mode against zero can be done
3772      by extending the value in the comparison.  */
3773   if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3774       && y == const0_rtx)
3775     /* Only use sign-extension if we really need it.  */
3776     return ((code == GT || code == GE || code == LE || code == LT)
3777             ? CC_SESWPmode : CC_ZESWPmode);
3778
3779   /* For everything else, return CCmode.  */
3780   return CCmode;
3781 }
3782
3783 static int
3784 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3785
3786 int
3787 aarch64_get_condition_code (rtx x)
3788 {
3789   machine_mode mode = GET_MODE (XEXP (x, 0));
3790   enum rtx_code comp_code = GET_CODE (x);
3791
3792   if (GET_MODE_CLASS (mode) != MODE_CC)
3793     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3794   return aarch64_get_condition_code_1 (mode, comp_code);
3795 }
3796
3797 static int
3798 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3799 {
3800   int ne = -1, eq = -1;
3801   switch (mode)
3802     {
3803     case CCFPmode:
3804     case CCFPEmode:
3805       switch (comp_code)
3806         {
3807         case GE: return AARCH64_GE;
3808         case GT: return AARCH64_GT;
3809         case LE: return AARCH64_LS;
3810         case LT: return AARCH64_MI;
3811         case NE: return AARCH64_NE;
3812         case EQ: return AARCH64_EQ;
3813         case ORDERED: return AARCH64_VC;
3814         case UNORDERED: return AARCH64_VS;
3815         case UNLT: return AARCH64_LT;
3816         case UNLE: return AARCH64_LE;
3817         case UNGT: return AARCH64_HI;
3818         case UNGE: return AARCH64_PL;
3819         default: return -1;
3820         }
3821       break;
3822
3823     case CC_DNEmode:
3824       ne = AARCH64_NE;
3825       eq = AARCH64_EQ;
3826       break;
3827
3828     case CC_DEQmode:
3829       ne = AARCH64_EQ;
3830       eq = AARCH64_NE;
3831       break;
3832
3833     case CC_DGEmode:
3834       ne = AARCH64_GE;
3835       eq = AARCH64_LT;
3836       break;
3837
3838     case CC_DLTmode:
3839       ne = AARCH64_LT;
3840       eq = AARCH64_GE;
3841       break;
3842
3843     case CC_DGTmode:
3844       ne = AARCH64_GT;
3845       eq = AARCH64_LE;
3846       break;
3847
3848     case CC_DLEmode:
3849       ne = AARCH64_LE;
3850       eq = AARCH64_GT;
3851       break;
3852
3853     case CC_DGEUmode:
3854       ne = AARCH64_CS;
3855       eq = AARCH64_CC;
3856       break;
3857
3858     case CC_DLTUmode:
3859       ne = AARCH64_CC;
3860       eq = AARCH64_CS;
3861       break;
3862
3863     case CC_DGTUmode:
3864       ne = AARCH64_HI;
3865       eq = AARCH64_LS;
3866       break;
3867
3868     case CC_DLEUmode:
3869       ne = AARCH64_LS;
3870       eq = AARCH64_HI;
3871       break;
3872
3873     case CCmode:
3874       switch (comp_code)
3875         {
3876         case NE: return AARCH64_NE;
3877         case EQ: return AARCH64_EQ;
3878         case GE: return AARCH64_GE;
3879         case GT: return AARCH64_GT;
3880         case LE: return AARCH64_LE;
3881         case LT: return AARCH64_LT;
3882         case GEU: return AARCH64_CS;
3883         case GTU: return AARCH64_HI;
3884         case LEU: return AARCH64_LS;
3885         case LTU: return AARCH64_CC;
3886         default: return -1;
3887         }
3888       break;
3889
3890     case CC_SWPmode:
3891     case CC_ZESWPmode:
3892     case CC_SESWPmode:
3893       switch (comp_code)
3894         {
3895         case NE: return AARCH64_NE;
3896         case EQ: return AARCH64_EQ;
3897         case GE: return AARCH64_LE;
3898         case GT: return AARCH64_LT;
3899         case LE: return AARCH64_GE;
3900         case LT: return AARCH64_GT;
3901         case GEU: return AARCH64_LS;
3902         case GTU: return AARCH64_CC;
3903         case LEU: return AARCH64_CS;
3904         case LTU: return AARCH64_HI;
3905         default: return -1;
3906         }
3907       break;
3908
3909     case CC_NZmode:
3910       switch (comp_code)
3911         {
3912         case NE: return AARCH64_NE;
3913         case EQ: return AARCH64_EQ;
3914         case GE: return AARCH64_PL;
3915         case LT: return AARCH64_MI;
3916         default: return -1;
3917         }
3918       break;
3919
3920     case CC_Zmode:
3921       switch (comp_code)
3922         {
3923         case NE: return AARCH64_NE;
3924         case EQ: return AARCH64_EQ;
3925         default: return -1;
3926         }
3927       break;
3928
3929     default:
3930       return -1;
3931       break;
3932     }
3933
3934   if (comp_code == NE)
3935     return ne;
3936
3937   if (comp_code == EQ)
3938     return eq;
3939
3940   return -1;
3941 }
3942
3943 bool
3944 aarch64_const_vec_all_same_in_range_p (rtx x,
3945                                   HOST_WIDE_INT minval,
3946                                   HOST_WIDE_INT maxval)
3947 {
3948   HOST_WIDE_INT firstval;
3949   int count, i;
3950
3951   if (GET_CODE (x) != CONST_VECTOR
3952       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3953     return false;
3954
3955   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3956   if (firstval < minval || firstval > maxval)
3957     return false;
3958
3959   count = CONST_VECTOR_NUNITS (x);
3960   for (i = 1; i < count; i++)
3961     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3962       return false;
3963
3964   return true;
3965 }
3966
3967 bool
3968 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3969 {
3970   return aarch64_const_vec_all_same_in_range_p (x, val, val);
3971 }
3972
3973 static unsigned
3974 bit_count (unsigned HOST_WIDE_INT value)
3975 {
3976   unsigned count = 0;
3977
3978   while (value)
3979     {
3980       count++;
3981       value &= value - 1;
3982     }
3983
3984   return count;
3985 }
3986
3987 /* N Z C V.  */
3988 #define AARCH64_CC_V 1
3989 #define AARCH64_CC_C (1 << 1)
3990 #define AARCH64_CC_Z (1 << 2)
3991 #define AARCH64_CC_N (1 << 3)
3992
3993 /* N Z C V flags for ccmp.  The first code is for AND op and the other
3994    is for IOR op.  Indexed by AARCH64_COND_CODE.  */
3995 static const int aarch64_nzcv_codes[][2] =
3996 {
3997   {AARCH64_CC_Z, 0}, /* EQ, Z == 1.  */
3998   {0, AARCH64_CC_Z}, /* NE, Z == 0.  */
3999   {AARCH64_CC_C, 0}, /* CS, C == 1.  */
4000   {0, AARCH64_CC_C}, /* CC, C == 0.  */
4001   {AARCH64_CC_N, 0}, /* MI, N == 1.  */
4002   {0, AARCH64_CC_N}, /* PL, N == 0.  */
4003   {AARCH64_CC_V, 0}, /* VS, V == 1.  */
4004   {0, AARCH64_CC_V}, /* VC, V == 0.  */
4005   {AARCH64_CC_C, 0}, /* HI, C ==1 && Z == 0.  */
4006   {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0).  */
4007   {0, AARCH64_CC_V}, /* GE, N == V.  */
4008   {AARCH64_CC_V, 0}, /* LT, N != V.  */
4009   {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V.  */
4010   {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V).  */
4011   {0, 0}, /* AL, Any.  */
4012   {0, 0}, /* NV, Any.  */
4013 };
4014
4015 int
4016 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4017 {
4018   switch (mode)
4019     {
4020     case CC_DNEmode:
4021       return NE;
4022
4023     case CC_DEQmode:
4024       return EQ;
4025
4026     case CC_DLEmode:
4027       return LE;
4028
4029     case CC_DGTmode:
4030       return GT;
4031
4032     case CC_DLTmode:
4033       return LT;
4034
4035     case CC_DGEmode:
4036       return GE;
4037
4038     case CC_DLEUmode:
4039       return LEU;
4040
4041     case CC_DGTUmode:
4042       return GTU;
4043
4044     case CC_DLTUmode:
4045       return LTU;
4046
4047     case CC_DGEUmode:
4048       return GEU;
4049
4050     default:
4051       gcc_unreachable ();
4052     }
4053 }
4054
4055
4056 void
4057 aarch64_print_operand (FILE *f, rtx x, char code)
4058 {
4059   switch (code)
4060     {
4061     /* An integer or symbol address without a preceding # sign.  */
4062     case 'c':
4063       switch (GET_CODE (x))
4064         {
4065         case CONST_INT:
4066           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4067           break;
4068
4069         case SYMBOL_REF:
4070           output_addr_const (f, x);
4071           break;
4072
4073         case CONST:
4074           if (GET_CODE (XEXP (x, 0)) == PLUS
4075               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4076             {
4077               output_addr_const (f, x);
4078               break;
4079             }
4080           /* Fall through.  */
4081
4082         default:
4083           output_operand_lossage ("Unsupported operand for code '%c'", code);
4084         }
4085       break;
4086
4087     case 'e':
4088       /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w.  */
4089       {
4090         int n;
4091
4092         if (!CONST_INT_P (x)
4093             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4094           {
4095             output_operand_lossage ("invalid operand for '%%%c'", code);
4096             return;
4097           }
4098
4099         switch (n)
4100           {
4101           case 3:
4102             fputc ('b', f);
4103             break;
4104           case 4:
4105             fputc ('h', f);
4106             break;
4107           case 5:
4108             fputc ('w', f);
4109             break;
4110           default:
4111             output_operand_lossage ("invalid operand for '%%%c'", code);
4112             return;
4113           }
4114       }
4115       break;
4116
4117     case 'p':
4118       {
4119         int n;
4120
4121         /* Print N such that 2^N == X.  */
4122         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4123           {
4124             output_operand_lossage ("invalid operand for '%%%c'", code);
4125             return;
4126           }
4127
4128         asm_fprintf (f, "%d", n);
4129       }
4130       break;
4131
4132     case 'P':
4133       /* Print the number of non-zero bits in X (a const_int).  */
4134       if (!CONST_INT_P (x))
4135         {
4136           output_operand_lossage ("invalid operand for '%%%c'", code);
4137           return;
4138         }
4139
4140       asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4141       break;
4142
4143     case 'H':
4144       /* Print the higher numbered register of a pair (TImode) of regs.  */
4145       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4146         {
4147           output_operand_lossage ("invalid operand for '%%%c'", code);
4148           return;
4149         }
4150
4151       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4152       break;
4153
4154     case 'm':
4155       {
4156         int cond_code;
4157         /* Print a condition (eq, ne, etc).  */
4158
4159         /* CONST_TRUE_RTX means always -- that's the default.  */
4160         if (x == const_true_rtx)
4161           return;
4162
4163         if (!COMPARISON_P (x))
4164           {
4165             output_operand_lossage ("invalid operand for '%%%c'", code);
4166             return;
4167           }
4168
4169         cond_code = aarch64_get_condition_code (x);
4170         gcc_assert (cond_code >= 0);
4171         fputs (aarch64_condition_codes[cond_code], f);
4172       }
4173       break;
4174
4175     case 'M':
4176       {
4177         int cond_code;
4178         /* Print the inverse of a condition (eq <-> ne, etc).  */
4179
4180         /* CONST_TRUE_RTX means never -- that's the default.  */
4181         if (x == const_true_rtx)
4182           {
4183             fputs ("nv", f);
4184             return;
4185           }
4186
4187         if (!COMPARISON_P (x))
4188           {
4189             output_operand_lossage ("invalid operand for '%%%c'", code);
4190             return;
4191           }
4192         cond_code = aarch64_get_condition_code (x);
4193         gcc_assert (cond_code >= 0);
4194         fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4195                                        (cond_code)], f);
4196       }
4197       break;
4198
4199     case 'b':
4200     case 'h':
4201     case 's':
4202     case 'd':
4203     case 'q':
4204       /* Print a scalar FP/SIMD register name.  */
4205       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4206         {
4207           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4208           return;
4209         }
4210       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4211       break;
4212
4213     case 'S':
4214     case 'T':
4215     case 'U':
4216     case 'V':
4217       /* Print the first FP/SIMD register name in a list.  */
4218       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4219         {
4220           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4221           return;
4222         }
4223       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4224       break;
4225
4226     case 'R':
4227       /* Print a scalar FP/SIMD register name + 1.  */
4228       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4229         {
4230           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4231           return;
4232         }
4233       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4234       break;
4235
4236     case 'X':
4237       /* Print bottom 16 bits of integer constant in hex.  */
4238       if (!CONST_INT_P (x))
4239         {
4240           output_operand_lossage ("invalid operand for '%%%c'", code);
4241           return;
4242         }
4243       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4244       break;
4245
4246     case 'w':
4247     case 'x':
4248       /* Print a general register name or the zero register (32-bit or
4249          64-bit).  */
4250       if (x == const0_rtx
4251           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4252         {
4253           asm_fprintf (f, "%czr", code);
4254           break;
4255         }
4256
4257       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4258         {
4259           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4260           break;
4261         }
4262
4263       if (REG_P (x) && REGNO (x) == SP_REGNUM)
4264         {
4265           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4266           break;
4267         }
4268
4269       /* Fall through */
4270
4271     case 0:
4272       /* Print a normal operand, if it's a general register, then we
4273          assume DImode.  */
4274       if (x == NULL)
4275         {
4276           output_operand_lossage ("missing operand");
4277           return;
4278         }
4279
4280       switch (GET_CODE (x))
4281         {
4282         case REG:
4283           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4284           break;
4285
4286         case MEM:
4287           aarch64_memory_reference_mode = GET_MODE (x);
4288           output_address (XEXP (x, 0));
4289           break;
4290
4291         case LABEL_REF:
4292         case SYMBOL_REF:
4293           output_addr_const (asm_out_file, x);
4294           break;
4295
4296         case CONST_INT:
4297           asm_fprintf (f, "%wd", INTVAL (x));
4298           break;
4299
4300         case CONST_VECTOR:
4301           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4302             {
4303               gcc_assert (
4304                   aarch64_const_vec_all_same_in_range_p (x,
4305                                                          HOST_WIDE_INT_MIN,
4306                                                          HOST_WIDE_INT_MAX));
4307               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4308             }
4309           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4310             {
4311               fputc ('0', f);
4312             }
4313           else
4314             gcc_unreachable ();
4315           break;
4316
4317         case CONST_DOUBLE:
4318           /* CONST_DOUBLE can represent a double-width integer.
4319              In this case, the mode of x is VOIDmode.  */
4320           if (GET_MODE (x) == VOIDmode)
4321             ; /* Do Nothing.  */
4322           else if (aarch64_float_const_zero_rtx_p (x))
4323             {
4324               fputc ('0', f);
4325               break;
4326             }
4327           else if (aarch64_float_const_representable_p (x))
4328             {
4329 #define buf_size 20
4330               char float_buf[buf_size] = {'\0'};
4331               REAL_VALUE_TYPE r;
4332               REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4333               real_to_decimal_for_mode (float_buf, &r,
4334                                         buf_size, buf_size,
4335                                         1, GET_MODE (x));
4336               asm_fprintf (asm_out_file, "%s", float_buf);
4337               break;
4338 #undef buf_size
4339             }
4340           output_operand_lossage ("invalid constant");
4341           return;
4342         default:
4343           output_operand_lossage ("invalid operand");
4344           return;
4345         }
4346       break;
4347
4348     case 'A':
4349       if (GET_CODE (x) == HIGH)
4350         x = XEXP (x, 0);
4351
4352       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4353         {
4354         case SYMBOL_SMALL_GOT:
4355           asm_fprintf (asm_out_file, ":got:");
4356           break;
4357
4358         case SYMBOL_SMALL_TLSGD:
4359           asm_fprintf (asm_out_file, ":tlsgd:");
4360           break;
4361
4362         case SYMBOL_SMALL_TLSDESC:
4363           asm_fprintf (asm_out_file, ":tlsdesc:");
4364           break;
4365
4366         case SYMBOL_SMALL_GOTTPREL:
4367           asm_fprintf (asm_out_file, ":gottprel:");
4368           break;
4369
4370         case SYMBOL_SMALL_TPREL:
4371           asm_fprintf (asm_out_file, ":tprel:");
4372           break;
4373
4374         case SYMBOL_TINY_GOT:
4375           gcc_unreachable ();
4376           break;
4377
4378         default:
4379           break;
4380         }
4381       output_addr_const (asm_out_file, x);
4382       break;
4383
4384     case 'L':
4385       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4386         {
4387         case SYMBOL_SMALL_GOT:
4388           asm_fprintf (asm_out_file, ":lo12:");
4389           break;
4390
4391         case SYMBOL_SMALL_TLSGD:
4392           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4393           break;
4394
4395         case SYMBOL_SMALL_TLSDESC:
4396           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4397           break;
4398
4399         case SYMBOL_SMALL_GOTTPREL:
4400           asm_fprintf (asm_out_file, ":gottprel_lo12:");
4401           break;
4402
4403         case SYMBOL_SMALL_TPREL:
4404           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4405           break;
4406
4407         case SYMBOL_TINY_GOT:
4408           asm_fprintf (asm_out_file, ":got:");
4409           break;
4410
4411         default:
4412           break;
4413         }
4414       output_addr_const (asm_out_file, x);
4415       break;
4416
4417     case 'G':
4418
4419       switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4420         {
4421         case SYMBOL_SMALL_TPREL:
4422           asm_fprintf (asm_out_file, ":tprel_hi12:");
4423           break;
4424         default:
4425           break;
4426         }
4427       output_addr_const (asm_out_file, x);
4428       break;
4429
4430     case 'K':
4431       {
4432         int cond_code;
4433         /* Print nzcv.  */
4434
4435         if (!COMPARISON_P (x))
4436           {
4437             output_operand_lossage ("invalid operand for '%%%c'", code);
4438             return;
4439           }
4440
4441         cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4442         gcc_assert (cond_code >= 0);
4443         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4444       }
4445       break;
4446
4447     case 'k':
4448       {
4449         int cond_code;
4450         /* Print nzcv.  */
4451
4452         if (!COMPARISON_P (x))
4453           {
4454             output_operand_lossage ("invalid operand for '%%%c'", code);
4455             return;
4456           }
4457
4458         cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4459         gcc_assert (cond_code >= 0);
4460         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4461       }
4462       break;
4463
4464     default:
4465       output_operand_lossage ("invalid operand prefix '%%%c'", code);
4466       return;
4467     }
4468 }
4469
4470 void
4471 aarch64_print_operand_address (FILE *f, rtx x)
4472 {
4473   struct aarch64_address_info addr;
4474
4475   if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4476                              MEM, true))
4477     switch (addr.type)
4478       {
4479       case ADDRESS_REG_IMM:
4480         if (addr.offset == const0_rtx)
4481           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4482         else
4483           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4484                        INTVAL (addr.offset));
4485         return;
4486
4487       case ADDRESS_REG_REG:
4488         if (addr.shift == 0)
4489           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4490                        reg_names [REGNO (addr.offset)]);
4491         else
4492           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4493                        reg_names [REGNO (addr.offset)], addr.shift);
4494         return;
4495
4496       case ADDRESS_REG_UXTW:
4497         if (addr.shift == 0)
4498           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4499                        REGNO (addr.offset) - R0_REGNUM);
4500         else
4501           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4502                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4503         return;
4504
4505       case ADDRESS_REG_SXTW:
4506         if (addr.shift == 0)
4507           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4508                        REGNO (addr.offset) - R0_REGNUM);
4509         else
4510           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4511                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4512         return;
4513
4514       case ADDRESS_REG_WB:
4515         switch (GET_CODE (x))
4516           {
4517           case PRE_INC:
4518             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4519                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4520             return;
4521           case POST_INC:
4522             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4523                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4524             return;
4525           case PRE_DEC:
4526             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4527                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4528             return;
4529           case POST_DEC:
4530             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4531                          GET_MODE_SIZE (aarch64_memory_reference_mode));
4532             return;
4533           case PRE_MODIFY:
4534             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4535                          INTVAL (addr.offset));
4536             return;
4537           case POST_MODIFY:
4538             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4539                          INTVAL (addr.offset));
4540             return;
4541           default:
4542             break;
4543           }
4544         break;
4545
4546       case ADDRESS_LO_SUM:
4547         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4548         output_addr_const (f, addr.offset);
4549         asm_fprintf (f, "]");
4550         return;
4551
4552       case ADDRESS_SYMBOLIC:
4553         break;
4554       }
4555
4556   output_addr_const (f, x);
4557 }
4558
4559 bool
4560 aarch64_label_mentioned_p (rtx x)
4561 {
4562   const char *fmt;
4563   int i;
4564
4565   if (GET_CODE (x) == LABEL_REF)
4566     return true;
4567
4568   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4569      referencing instruction, but they are constant offsets, not
4570      symbols.  */
4571   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4572     return false;
4573
4574   fmt = GET_RTX_FORMAT (GET_CODE (x));
4575   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4576     {
4577       if (fmt[i] == 'E')
4578         {
4579           int j;
4580
4581           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4582             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4583               return 1;
4584         }
4585       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4586         return 1;
4587     }
4588
4589   return 0;
4590 }
4591
4592 /* Implement REGNO_REG_CLASS.  */
4593
4594 enum reg_class
4595 aarch64_regno_regclass (unsigned regno)
4596 {
4597   if (GP_REGNUM_P (regno))
4598     return GENERAL_REGS;
4599
4600   if (regno == SP_REGNUM)
4601     return STACK_REG;
4602
4603   if (regno == FRAME_POINTER_REGNUM
4604       || regno == ARG_POINTER_REGNUM)
4605     return POINTER_REGS;
4606
4607   if (FP_REGNUM_P (regno))
4608     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
4609
4610   return NO_REGS;
4611 }
4612
4613 static rtx
4614 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
4615 {
4616   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4617      where mask is selected by alignment and size of the offset.
4618      We try to pick as large a range for the offset as possible to
4619      maximize the chance of a CSE.  However, for aligned addresses
4620      we limit the range to 4k so that structures with different sized
4621      elements are likely to use the same base.  */
4622
4623   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4624     {
4625       HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4626       HOST_WIDE_INT base_offset;
4627
4628       /* Does it look like we'll need a load/store-pair operation?  */
4629       if (GET_MODE_SIZE (mode) > 16
4630           || mode == TImode)
4631         base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4632                        & ~((128 * GET_MODE_SIZE (mode)) - 1));
4633       /* For offsets aren't a multiple of the access size, the limit is
4634          -256...255.  */
4635       else if (offset & (GET_MODE_SIZE (mode) - 1))
4636         base_offset = (offset + 0x100) & ~0x1ff;
4637       else
4638         base_offset = offset & ~0xfff;
4639
4640       if (base_offset == 0)
4641         return x;
4642
4643       offset -= base_offset;
4644       rtx base_reg = gen_reg_rtx (Pmode);
4645       rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4646                            NULL_RTX);
4647       emit_move_insn (base_reg, val);
4648       x = plus_constant (Pmode, base_reg, offset);
4649     }
4650
4651   return x;
4652 }
4653
4654 /* Try a machine-dependent way of reloading an illegitimate address
4655    operand.  If we find one, push the reload and return the new rtx.  */
4656
4657 rtx
4658 aarch64_legitimize_reload_address (rtx *x_p,
4659                                    machine_mode mode,
4660                                    int opnum, int type,
4661                                    int ind_levels ATTRIBUTE_UNUSED)
4662 {
4663   rtx x = *x_p;
4664
4665   /* Do not allow mem (plus (reg, const)) if vector struct mode.  */
4666   if (aarch64_vect_struct_mode_p (mode)
4667       && GET_CODE (x) == PLUS
4668       && REG_P (XEXP (x, 0))
4669       && CONST_INT_P (XEXP (x, 1)))
4670     {
4671       rtx orig_rtx = x;
4672       x = copy_rtx (x);
4673       push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4674                    BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4675                    opnum, (enum reload_type) type);
4676       return x;
4677     }
4678
4679   /* We must recognize output that we have already generated ourselves.  */
4680   if (GET_CODE (x) == PLUS
4681       && GET_CODE (XEXP (x, 0)) == PLUS
4682       && REG_P (XEXP (XEXP (x, 0), 0))
4683       && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4684       && CONST_INT_P (XEXP (x, 1)))
4685     {
4686       push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4687                    BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4688                    opnum, (enum reload_type) type);
4689       return x;
4690     }
4691
4692   /* We wish to handle large displacements off a base register by splitting
4693      the addend across an add and the mem insn.  This can cut the number of
4694      extra insns needed from 3 to 1.  It is only useful for load/store of a
4695      single register with 12 bit offset field.  */
4696   if (GET_CODE (x) == PLUS
4697       && REG_P (XEXP (x, 0))
4698       && CONST_INT_P (XEXP (x, 1))
4699       && HARD_REGISTER_P (XEXP (x, 0))
4700       && mode != TImode
4701       && mode != TFmode
4702       && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4703     {
4704       HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4705       HOST_WIDE_INT low = val & 0xfff;
4706       HOST_WIDE_INT high = val - low;
4707       HOST_WIDE_INT offs;
4708       rtx cst;
4709       machine_mode xmode = GET_MODE (x);
4710
4711       /* In ILP32, xmode can be either DImode or SImode.  */
4712       gcc_assert (xmode == DImode || xmode == SImode);
4713
4714       /* Reload non-zero BLKmode offsets.  This is because we cannot ascertain
4715          BLKmode alignment.  */
4716       if (GET_MODE_SIZE (mode) == 0)
4717         return NULL_RTX;
4718
4719       offs = low % GET_MODE_SIZE (mode);
4720
4721       /* Align misaligned offset by adjusting high part to compensate.  */
4722       if (offs != 0)
4723         {
4724           if (aarch64_uimm12_shift (high + offs))
4725             {
4726               /* Align down.  */
4727               low = low - offs;
4728               high = high + offs;
4729             }
4730           else
4731             {
4732               /* Align up.  */
4733               offs = GET_MODE_SIZE (mode) - offs;
4734               low = low + offs;
4735               high = high + (low & 0x1000) - offs;
4736               low &= 0xfff;
4737             }
4738         }
4739
4740       /* Check for overflow.  */
4741       if (high + low != val)
4742         return NULL_RTX;
4743
4744       cst = GEN_INT (high);
4745       if (!aarch64_uimm12_shift (high))
4746         cst = force_const_mem (xmode, cst);
4747
4748       /* Reload high part into base reg, leaving the low part
4749          in the mem instruction.
4750          Note that replacing this gen_rtx_PLUS with plus_constant is
4751          wrong in this case because we rely on the
4752          (plus (plus reg c1) c2) structure being preserved so that
4753          XEXP (*p, 0) in push_reload below uses the correct term.  */
4754       x = gen_rtx_PLUS (xmode,
4755                         gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4756                         GEN_INT (low));
4757
4758       push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4759                    BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4760                    opnum, (enum reload_type) type);
4761       return x;
4762     }
4763
4764   return NULL_RTX;
4765 }
4766
4767
4768 static reg_class_t
4769 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4770                           reg_class_t rclass,
4771                           machine_mode mode,
4772                           secondary_reload_info *sri)
4773 {
4774   /* Without the TARGET_SIMD instructions we cannot move a Q register
4775      to a Q register directly.  We need a scratch.  */
4776   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4777       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4778       && reg_class_subset_p (rclass, FP_REGS))
4779     {
4780       if (mode == TFmode)
4781         sri->icode = CODE_FOR_aarch64_reload_movtf;
4782       else if (mode == TImode)
4783         sri->icode = CODE_FOR_aarch64_reload_movti;
4784       return NO_REGS;
4785     }
4786
4787   /* A TFmode or TImode memory access should be handled via an FP_REGS
4788      because AArch64 has richer addressing modes for LDR/STR instructions
4789      than LDP/STP instructions.  */
4790   if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4791       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4792     return FP_REGS;
4793
4794   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4795       return GENERAL_REGS;
4796
4797   return NO_REGS;
4798 }
4799
4800 static bool
4801 aarch64_can_eliminate (const int from, const int to)
4802 {
4803   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4804      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
4805
4806   if (frame_pointer_needed)
4807     {
4808       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4809         return true;
4810       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4811         return false;
4812       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4813           && !cfun->calls_alloca)
4814         return true;
4815       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4816         return true;
4817
4818       return false;
4819     }
4820   else
4821     {
4822       /* If we decided that we didn't need a leaf frame pointer but then used
4823          LR in the function, then we'll want a frame pointer after all, so
4824          prevent this elimination to ensure a frame pointer is used.  */
4825       if (to == STACK_POINTER_REGNUM
4826           && flag_omit_leaf_frame_pointer
4827           && df_regs_ever_live_p (LR_REGNUM))
4828         return false;
4829     }
4830
4831   return true;
4832 }
4833
4834 HOST_WIDE_INT
4835 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4836 {
4837   aarch64_layout_frame ();
4838
4839   if (to == HARD_FRAME_POINTER_REGNUM)
4840     {
4841       if (from == ARG_POINTER_REGNUM)
4842         return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4843
4844       if (from == FRAME_POINTER_REGNUM)
4845         return (cfun->machine->frame.hard_fp_offset
4846                 - cfun->machine->frame.saved_varargs_size);
4847     }
4848
4849   if (to == STACK_POINTER_REGNUM)
4850     {
4851       if (from == FRAME_POINTER_REGNUM)
4852           return (cfun->machine->frame.frame_size
4853                   - cfun->machine->frame.saved_varargs_size);
4854     }
4855
4856   return cfun->machine->frame.frame_size;
4857 }
4858
4859 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
4860    previous frame.  */
4861
4862 rtx
4863 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4864 {
4865   if (count != 0)
4866     return const0_rtx;
4867   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4868 }
4869
4870
4871 static void
4872 aarch64_asm_trampoline_template (FILE *f)
4873 {
4874   if (TARGET_ILP32)
4875     {
4876       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4877       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4878     }
4879   else
4880     {
4881       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4882       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4883     }
4884   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4885   assemble_aligned_integer (4, const0_rtx);
4886   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4887   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4888 }
4889
4890 static void
4891 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4892 {
4893   rtx fnaddr, mem, a_tramp;
4894   const int tramp_code_sz = 16;
4895
4896   /* Don't need to copy the trailing D-words, we fill those in below.  */
4897   emit_block_move (m_tramp, assemble_trampoline_template (),
4898                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4899   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4900   fnaddr = XEXP (DECL_RTL (fndecl), 0);
4901   if (GET_MODE (fnaddr) != ptr_mode)
4902     fnaddr = convert_memory_address (ptr_mode, fnaddr);
4903   emit_move_insn (mem, fnaddr);
4904
4905   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4906   emit_move_insn (mem, chain_value);
4907
4908   /* XXX We should really define a "clear_cache" pattern and use
4909      gen_clear_cache().  */
4910   a_tramp = XEXP (m_tramp, 0);
4911   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4912                      LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4913                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4914                      ptr_mode);
4915 }
4916
4917 static unsigned char
4918 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4919 {
4920   switch (regclass)
4921     {
4922     case CALLER_SAVE_REGS:
4923     case POINTER_REGS:
4924     case GENERAL_REGS:
4925     case ALL_REGS:
4926     case FP_REGS:
4927     case FP_LO_REGS:
4928       return
4929         aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4930                                        (GET_MODE_SIZE (mode) + 7) / 8;
4931     case STACK_REG:
4932       return 1;
4933
4934     case NO_REGS:
4935       return 0;
4936
4937     default:
4938       break;
4939     }
4940   gcc_unreachable ();
4941 }
4942
4943 static reg_class_t
4944 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4945 {
4946   if (regclass == POINTER_REGS)
4947     return GENERAL_REGS;
4948
4949   if (regclass == STACK_REG)
4950     {
4951       if (REG_P(x)
4952           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4953           return regclass;
4954
4955       return NO_REGS;
4956     }
4957
4958   /* If it's an integer immediate that MOVI can't handle, then
4959      FP_REGS is not an option, so we return NO_REGS instead.  */
4960   if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4961       && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4962     return NO_REGS;
4963
4964   /* Register eliminiation can result in a request for
4965      SP+constant->FP_REGS.  We cannot support such operations which
4966      use SP as source and an FP_REG as destination, so reject out
4967      right now.  */
4968   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4969     {
4970       rtx lhs = XEXP (x, 0);
4971
4972       /* Look through a possible SUBREG introduced by ILP32.  */
4973       if (GET_CODE (lhs) == SUBREG)
4974         lhs = SUBREG_REG (lhs);
4975
4976       gcc_assert (REG_P (lhs));
4977       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4978                                       POINTER_REGS));
4979       return NO_REGS;
4980     }
4981
4982   return regclass;
4983 }
4984
4985 void
4986 aarch64_asm_output_labelref (FILE* f, const char *name)
4987 {
4988   asm_fprintf (f, "%U%s", name);
4989 }
4990
4991 static void
4992 aarch64_elf_asm_constructor (rtx symbol, int priority)
4993 {
4994   if (priority == DEFAULT_INIT_PRIORITY)
4995     default_ctor_section_asm_out_constructor (symbol, priority);
4996   else
4997     {
4998       section *s;
4999       char buf[18];
5000       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5001       s = get_section (buf, SECTION_WRITE, NULL);
5002       switch_to_section (s);
5003       assemble_align (POINTER_SIZE);
5004       assemble_aligned_integer (POINTER_BYTES, symbol);
5005     }
5006 }
5007
5008 static void
5009 aarch64_elf_asm_destructor (rtx symbol, int priority)
5010 {
5011   if (priority == DEFAULT_INIT_PRIORITY)
5012     default_dtor_section_asm_out_destructor (symbol, priority);
5013   else
5014     {
5015       section *s;
5016       char buf[18];
5017       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5018       s = get_section (buf, SECTION_WRITE, NULL);
5019       switch_to_section (s);
5020       assemble_align (POINTER_SIZE);
5021       assemble_aligned_integer (POINTER_BYTES, symbol);
5022     }
5023 }
5024
5025 const char*
5026 aarch64_output_casesi (rtx *operands)
5027 {
5028   char buf[100];
5029   char label[100];
5030   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5031   int index;
5032   static const char *const patterns[4][2] =
5033   {
5034     {
5035       "ldrb\t%w3, [%0,%w1,uxtw]",
5036       "add\t%3, %4, %w3, sxtb #2"
5037     },
5038     {
5039       "ldrh\t%w3, [%0,%w1,uxtw #1]",
5040       "add\t%3, %4, %w3, sxth #2"
5041     },
5042     {
5043       "ldr\t%w3, [%0,%w1,uxtw #2]",
5044       "add\t%3, %4, %w3, sxtw #2"
5045     },
5046     /* We assume that DImode is only generated when not optimizing and
5047        that we don't really need 64-bit address offsets.  That would
5048        imply an object file with 8GB of code in a single function!  */
5049     {
5050       "ldr\t%w3, [%0,%w1,uxtw #2]",
5051       "add\t%3, %4, %w3, sxtw #2"
5052     }
5053   };
5054
5055   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5056
5057   index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5058
5059   gcc_assert (index >= 0 && index <= 3);
5060
5061   /* Need to implement table size reduction, by chaning the code below.  */
5062   output_asm_insn (patterns[index][0], operands);
5063   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5064   snprintf (buf, sizeof (buf),
5065             "adr\t%%4, %s", targetm.strip_name_encoding (label));
5066   output_asm_insn (buf, operands);
5067   output_asm_insn (patterns[index][1], operands);
5068   output_asm_insn ("br\t%3", operands);
5069   assemble_label (asm_out_file, label);
5070   return "";
5071 }
5072
5073
5074 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5075    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5076    operator.  */
5077
5078 int
5079 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5080 {
5081   if (shift >= 0 && shift <= 3)
5082     {
5083       int size;
5084       for (size = 8; size <= 32; size *= 2)
5085         {
5086           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5087           if (mask == bits << shift)
5088             return size;
5089         }
5090     }
5091   return 0;
5092 }
5093
5094 static bool
5095 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5096                                    const_rtx x ATTRIBUTE_UNUSED)
5097 {
5098   /* We can't use blocks for constants when we're using a per-function
5099      constant pool.  */
5100   return false;
5101 }
5102
5103 static section *
5104 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5105                             rtx x ATTRIBUTE_UNUSED,
5106                             unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5107 {
5108   /* Force all constant pool entries into the current function section.  */
5109   return function_section (current_function_decl);
5110 }
5111
5112
5113 /* Costs.  */
5114
5115 /* Helper function for rtx cost calculation.  Strip a shift expression
5116    from X.  Returns the inner operand if successful, or the original
5117    expression on failure.  */
5118 static rtx
5119 aarch64_strip_shift (rtx x)
5120 {
5121   rtx op = x;
5122
5123   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5124      we can convert both to ROR during final output.  */
5125   if ((GET_CODE (op) == ASHIFT
5126        || GET_CODE (op) == ASHIFTRT
5127        || GET_CODE (op) == LSHIFTRT
5128        || GET_CODE (op) == ROTATERT
5129        || GET_CODE (op) == ROTATE)
5130       && CONST_INT_P (XEXP (op, 1)))
5131     return XEXP (op, 0);
5132
5133   if (GET_CODE (op) == MULT
5134       && CONST_INT_P (XEXP (op, 1))
5135       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5136     return XEXP (op, 0);
5137
5138   return x;
5139 }
5140
5141 /* Helper function for rtx cost calculation.  Strip an extend
5142    expression from X.  Returns the inner operand if successful, or the
5143    original expression on failure.  We deal with a number of possible
5144    canonicalization variations here.  */
5145 static rtx
5146 aarch64_strip_extend (rtx x)
5147 {
5148   rtx op = x;
5149
5150   /* Zero and sign extraction of a widened value.  */
5151   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5152       && XEXP (op, 2) == const0_rtx
5153       && GET_CODE (XEXP (op, 0)) == MULT
5154       && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5155                                          XEXP (op, 1)))
5156     return XEXP (XEXP (op, 0), 0);
5157
5158   /* It can also be represented (for zero-extend) as an AND with an
5159      immediate.  */
5160   if (GET_CODE (op) == AND
5161       && GET_CODE (XEXP (op, 0)) == MULT
5162       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5163       && CONST_INT_P (XEXP (op, 1))
5164       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5165                            INTVAL (XEXP (op, 1))) != 0)
5166     return XEXP (XEXP (op, 0), 0);
5167
5168   /* Now handle extended register, as this may also have an optional
5169      left shift by 1..4.  */
5170   if (GET_CODE (op) == ASHIFT
5171       && CONST_INT_P (XEXP (op, 1))
5172       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5173     op = XEXP (op, 0);
5174
5175   if (GET_CODE (op) == ZERO_EXTEND
5176       || GET_CODE (op) == SIGN_EXTEND)
5177     op = XEXP (op, 0);
5178
5179   if (op != x)
5180     return op;
5181
5182   return x;
5183 }
5184
5185 /* Return true iff CODE is a shift supported in combination
5186    with arithmetic instructions.  */
5187
5188 static bool
5189 aarch64_shift_p (enum rtx_code code)
5190 {
5191   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5192 }
5193
5194 /* Helper function for rtx cost calculation.  Calculate the cost of
5195    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5196    Return the calculated cost of the expression, recursing manually in to
5197    operands where needed.  */
5198
5199 static int
5200 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5201 {
5202   rtx op0, op1;
5203   const struct cpu_cost_table *extra_cost
5204     = aarch64_tune_params->insn_extra_cost;
5205   int cost = 0;
5206   bool compound_p = (outer == PLUS || outer == MINUS);
5207   machine_mode mode = GET_MODE (x);
5208
5209   gcc_checking_assert (code == MULT);
5210
5211   op0 = XEXP (x, 0);
5212   op1 = XEXP (x, 1);
5213
5214   if (VECTOR_MODE_P (mode))
5215     mode = GET_MODE_INNER (mode);
5216
5217   /* Integer multiply/fma.  */
5218   if (GET_MODE_CLASS (mode) == MODE_INT)
5219     {
5220       /* The multiply will be canonicalized as a shift, cost it as such.  */
5221       if (aarch64_shift_p (GET_CODE (x))
5222           || (CONST_INT_P (op1)
5223               && exact_log2 (INTVAL (op1)) > 0))
5224         {
5225           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5226                            || GET_CODE (op0) == SIGN_EXTEND;
5227           if (speed)
5228             {
5229               if (compound_p)
5230                 {
5231                   if (REG_P (op1))
5232                     /* ARITH + shift-by-register.  */
5233                     cost += extra_cost->alu.arith_shift_reg;
5234                   else if (is_extend)
5235                     /* ARITH + extended register.  We don't have a cost field
5236                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
5237                     cost += extra_cost->alu.extend_arith;
5238                   else
5239                     /* ARITH + shift-by-immediate.  */
5240                     cost += extra_cost->alu.arith_shift;
5241                 }
5242               else
5243                 /* LSL (immediate).  */
5244                 cost += extra_cost->alu.shift;
5245
5246             }
5247           /* Strip extends as we will have costed them in the case above.  */
5248           if (is_extend)
5249             op0 = aarch64_strip_extend (op0);
5250
5251           cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5252
5253           return cost;
5254         }
5255
5256       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
5257          compound and let the below cases handle it.  After all, MNEG is a
5258          special-case alias of MSUB.  */
5259       if (GET_CODE (op0) == NEG)
5260         {
5261           op0 = XEXP (op0, 0);
5262           compound_p = true;
5263         }
5264
5265       /* Integer multiplies or FMAs have zero/sign extending variants.  */
5266       if ((GET_CODE (op0) == ZERO_EXTEND
5267            && GET_CODE (op1) == ZERO_EXTEND)
5268           || (GET_CODE (op0) == SIGN_EXTEND
5269               && GET_CODE (op1) == SIGN_EXTEND))
5270         {
5271           cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5272                   + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5273
5274           if (speed)
5275             {
5276               if (compound_p)
5277                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
5278                 cost += extra_cost->mult[0].extend_add;
5279               else
5280                 /* MUL/SMULL/UMULL.  */
5281                 cost += extra_cost->mult[0].extend;
5282             }
5283
5284           return cost;
5285         }
5286
5287       /* This is either an integer multiply or a MADD.  In both cases
5288          we want to recurse and cost the operands.  */
5289       cost += rtx_cost (op0, MULT, 0, speed)
5290               + rtx_cost (op1, MULT, 1, speed);
5291
5292       if (speed)
5293         {
5294           if (compound_p)
5295             /* MADD/MSUB.  */
5296             cost += extra_cost->mult[mode == DImode].add;
5297           else
5298             /* MUL.  */
5299             cost += extra_cost->mult[mode == DImode].simple;
5300         }
5301
5302       return cost;
5303     }
5304   else
5305     {
5306       if (speed)
5307         {
5308           /* Floating-point FMA/FMUL can also support negations of the
5309              operands.  */
5310           if (GET_CODE (op0) == NEG)
5311             op0 = XEXP (op0, 0);
5312           if (GET_CODE (op1) == NEG)
5313             op1 = XEXP (op1, 0);
5314
5315           if (compound_p)
5316             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
5317             cost += extra_cost->fp[mode == DFmode].fma;
5318           else
5319             /* FMUL/FNMUL.  */
5320             cost += extra_cost->fp[mode == DFmode].mult;
5321         }
5322
5323       cost += rtx_cost (op0, MULT, 0, speed)
5324               + rtx_cost (op1, MULT, 1, speed);
5325       return cost;
5326     }
5327 }
5328
5329 static int
5330 aarch64_address_cost (rtx x,
5331                       machine_mode mode,
5332                       addr_space_t as ATTRIBUTE_UNUSED,
5333                       bool speed)
5334 {
5335   enum rtx_code c = GET_CODE (x);
5336   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5337   struct aarch64_address_info info;
5338   int cost = 0;
5339   info.shift = 0;
5340
5341   if (!aarch64_classify_address (&info, x, mode, c, false))
5342     {
5343       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5344         {
5345           /* This is a CONST or SYMBOL ref which will be split
5346              in a different way depending on the code model in use.
5347              Cost it through the generic infrastructure.  */
5348           int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5349           /* Divide through by the cost of one instruction to
5350              bring it to the same units as the address costs.  */
5351           cost_symbol_ref /= COSTS_N_INSNS (1);
5352           /* The cost is then the cost of preparing the address,
5353              followed by an immediate (possibly 0) offset.  */
5354           return cost_symbol_ref + addr_cost->imm_offset;
5355         }
5356       else
5357         {
5358           /* This is most likely a jump table from a case
5359              statement.  */
5360           return addr_cost->register_offset;
5361         }
5362     }
5363
5364   switch (info.type)
5365     {
5366       case ADDRESS_LO_SUM:
5367       case ADDRESS_SYMBOLIC:
5368       case ADDRESS_REG_IMM:
5369         cost += addr_cost->imm_offset;
5370         break;
5371
5372       case ADDRESS_REG_WB:
5373         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5374           cost += addr_cost->pre_modify;
5375         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5376           cost += addr_cost->post_modify;
5377         else
5378           gcc_unreachable ();
5379
5380         break;
5381
5382       case ADDRESS_REG_REG:
5383         cost += addr_cost->register_offset;
5384         break;
5385
5386       case ADDRESS_REG_UXTW:
5387       case ADDRESS_REG_SXTW:
5388         cost += addr_cost->register_extend;
5389         break;
5390
5391       default:
5392         gcc_unreachable ();
5393     }
5394
5395
5396   if (info.shift > 0)
5397     {
5398       /* For the sake of calculating the cost of the shifted register
5399          component, we can treat same sized modes in the same way.  */
5400       switch (GET_MODE_BITSIZE (mode))
5401         {
5402           case 16:
5403             cost += addr_cost->addr_scale_costs.hi;
5404             break;
5405
5406           case 32:
5407             cost += addr_cost->addr_scale_costs.si;
5408             break;
5409
5410           case 64:
5411             cost += addr_cost->addr_scale_costs.di;
5412             break;
5413
5414           /* We can't tell, or this is a 128-bit vector.  */
5415           default:
5416             cost += addr_cost->addr_scale_costs.ti;
5417             break;
5418         }
5419     }
5420
5421   return cost;
5422 }
5423
5424 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
5425    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
5426    to be taken.  */
5427
5428 int
5429 aarch64_branch_cost (bool speed_p, bool predictable_p)
5430 {
5431   /* When optimizing for speed, use the cost of unpredictable branches.  */
5432   const struct cpu_branch_cost *branch_costs =
5433     aarch64_tune_params->branch_costs;
5434
5435   if (!speed_p || predictable_p)
5436     return branch_costs->predictable;
5437   else
5438     return branch_costs->unpredictable;
5439 }
5440
5441 /* Return true if the RTX X in mode MODE is a zero or sign extract
5442    usable in an ADD or SUB (extended register) instruction.  */
5443 static bool
5444 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5445 {
5446   /* Catch add with a sign extract.
5447      This is add_<optab><mode>_multp2.  */
5448   if (GET_CODE (x) == SIGN_EXTRACT
5449       || GET_CODE (x) == ZERO_EXTRACT)
5450     {
5451       rtx op0 = XEXP (x, 0);
5452       rtx op1 = XEXP (x, 1);
5453       rtx op2 = XEXP (x, 2);
5454
5455       if (GET_CODE (op0) == MULT
5456           && CONST_INT_P (op1)
5457           && op2 == const0_rtx
5458           && CONST_INT_P (XEXP (op0, 1))
5459           && aarch64_is_extend_from_extract (mode,
5460                                              XEXP (op0, 1),
5461                                              op1))
5462         {
5463           return true;
5464         }
5465     }
5466
5467   return false;
5468 }
5469
5470 static bool
5471 aarch64_frint_unspec_p (unsigned int u)
5472 {
5473   switch (u)
5474     {
5475       case UNSPEC_FRINTZ:
5476       case UNSPEC_FRINTP:
5477       case UNSPEC_FRINTM:
5478       case UNSPEC_FRINTA:
5479       case UNSPEC_FRINTN:
5480       case UNSPEC_FRINTX:
5481       case UNSPEC_FRINTI:
5482         return true;
5483
5484       default:
5485         return false;
5486     }
5487 }
5488
5489 /* Return true iff X is an rtx that will match an extr instruction
5490    i.e. as described in the *extr<mode>5_insn family of patterns.
5491    OP0 and OP1 will be set to the operands of the shifts involved
5492    on success and will be NULL_RTX otherwise.  */
5493
5494 static bool
5495 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5496 {
5497   rtx op0, op1;
5498   machine_mode mode = GET_MODE (x);
5499
5500   *res_op0 = NULL_RTX;
5501   *res_op1 = NULL_RTX;
5502
5503   if (GET_CODE (x) != IOR)
5504     return false;
5505
5506   op0 = XEXP (x, 0);
5507   op1 = XEXP (x, 1);
5508
5509   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5510       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5511     {
5512      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
5513       if (GET_CODE (op1) == ASHIFT)
5514         std::swap (op0, op1);
5515
5516       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5517         return false;
5518
5519       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5520       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5521
5522       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5523           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5524         {
5525           *res_op0 = XEXP (op0, 0);
5526           *res_op1 = XEXP (op1, 0);
5527           return true;
5528         }
5529     }
5530
5531   return false;
5532 }
5533
5534 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5535    storing it in *COST.  Result is true if the total cost of the operation
5536    has now been calculated.  */
5537 static bool
5538 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5539 {
5540   rtx inner;
5541   rtx comparator;
5542   enum rtx_code cmpcode;
5543
5544   if (COMPARISON_P (op0))
5545     {
5546       inner = XEXP (op0, 0);
5547       comparator = XEXP (op0, 1);
5548       cmpcode = GET_CODE (op0);
5549     }
5550   else
5551     {
5552       inner = op0;
5553       comparator = const0_rtx;
5554       cmpcode = NE;
5555     }
5556
5557   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5558     {
5559       /* Conditional branch.  */
5560       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5561         return true;
5562       else
5563         {
5564           if (cmpcode == NE || cmpcode == EQ)
5565             {
5566               if (comparator == const0_rtx)
5567                 {
5568                   /* TBZ/TBNZ/CBZ/CBNZ.  */
5569                   if (GET_CODE (inner) == ZERO_EXTRACT)
5570                     /* TBZ/TBNZ.  */
5571                     *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5572                                        0, speed);
5573                 else
5574                   /* CBZ/CBNZ.  */
5575                   *cost += rtx_cost (inner, cmpcode, 0, speed);
5576
5577                 return true;
5578               }
5579             }
5580           else if (cmpcode == LT || cmpcode == GE)
5581             {
5582               /* TBZ/TBNZ.  */
5583               if (comparator == const0_rtx)
5584                 return true;
5585             }
5586         }
5587     }
5588   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5589     {
5590       /* It's a conditional operation based on the status flags,
5591          so it must be some flavor of CSEL.  */
5592
5593       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
5594       if (GET_CODE (op1) == NEG
5595           || GET_CODE (op1) == NOT
5596           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5597         op1 = XEXP (op1, 0);
5598
5599       *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5600       *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5601       return true;
5602     }
5603
5604   /* We don't know what this is, cost all operands.  */
5605   return false;
5606 }
5607
5608 /* Calculate the cost of calculating X, storing it in *COST.  Result
5609    is true if the total cost of the operation has now been calculated.  */
5610 static bool
5611 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5612                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5613 {
5614   rtx op0, op1, op2;
5615   const struct cpu_cost_table *extra_cost
5616     = aarch64_tune_params->insn_extra_cost;
5617   machine_mode mode = GET_MODE (x);
5618
5619   /* By default, assume that everything has equivalent cost to the
5620      cheapest instruction.  Any additional costs are applied as a delta
5621      above this default.  */
5622   *cost = COSTS_N_INSNS (1);
5623
5624   /* TODO: The cost infrastructure currently does not handle
5625      vector operations.  Assume that all vector operations
5626      are equally expensive.  */
5627   if (VECTOR_MODE_P (mode))
5628     {
5629       if (speed)
5630         *cost += extra_cost->vect.alu;
5631       return true;
5632     }
5633
5634   switch (code)
5635     {
5636     case SET:
5637       /* The cost depends entirely on the operands to SET.  */
5638       *cost = 0;
5639       op0 = SET_DEST (x);
5640       op1 = SET_SRC (x);
5641
5642       switch (GET_CODE (op0))
5643         {
5644         case MEM:
5645           if (speed)
5646             {
5647               rtx address = XEXP (op0, 0);
5648               if (GET_MODE_CLASS (mode) == MODE_INT)
5649                 *cost += extra_cost->ldst.store;
5650               else if (mode == SFmode)
5651                 *cost += extra_cost->ldst.storef;
5652               else if (mode == DFmode)
5653                 *cost += extra_cost->ldst.stored;
5654
5655               *cost +=
5656                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5657                                                      0, speed));
5658             }
5659
5660           *cost += rtx_cost (op1, SET, 1, speed);
5661           return true;
5662
5663         case SUBREG:
5664           if (! REG_P (SUBREG_REG (op0)))
5665             *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5666
5667           /* Fall through.  */
5668         case REG:
5669           /* const0_rtx is in general free, but we will use an
5670              instruction to set a register to 0.  */
5671           if (REG_P (op1) || op1 == const0_rtx)
5672             {
5673               /* The cost is 1 per register copied.  */
5674               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5675                               / UNITS_PER_WORD;
5676               *cost = COSTS_N_INSNS (n_minus_1 + 1);
5677             }
5678           else
5679             /* Cost is just the cost of the RHS of the set.  */
5680             *cost += rtx_cost (op1, SET, 1, speed);
5681           return true;
5682
5683         case ZERO_EXTRACT:
5684         case SIGN_EXTRACT:
5685           /* Bit-field insertion.  Strip any redundant widening of
5686              the RHS to meet the width of the target.  */
5687           if (GET_CODE (op1) == SUBREG)
5688             op1 = SUBREG_REG (op1);
5689           if ((GET_CODE (op1) == ZERO_EXTEND
5690                || GET_CODE (op1) == SIGN_EXTEND)
5691               && CONST_INT_P (XEXP (op0, 1))
5692               && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5693                   >= INTVAL (XEXP (op0, 1))))
5694             op1 = XEXP (op1, 0);
5695
5696           if (CONST_INT_P (op1))
5697             {
5698               /* MOV immediate is assumed to always be cheap.  */
5699               *cost = COSTS_N_INSNS (1);
5700             }
5701           else
5702             {
5703               /* BFM.  */
5704               if (speed)
5705                 *cost += extra_cost->alu.bfi;
5706               *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5707             }
5708
5709           return true;
5710
5711         default:
5712           /* We can't make sense of this, assume default cost.  */
5713           *cost = COSTS_N_INSNS (1);
5714           return false;
5715         }
5716       return false;
5717
5718     case CONST_INT:
5719       /* If an instruction can incorporate a constant within the
5720          instruction, the instruction's expression avoids calling
5721          rtx_cost() on the constant.  If rtx_cost() is called on a
5722          constant, then it is usually because the constant must be
5723          moved into a register by one or more instructions.
5724
5725          The exception is constant 0, which can be expressed
5726          as XZR/WZR and is therefore free.  The exception to this is
5727          if we have (set (reg) (const0_rtx)) in which case we must cost
5728          the move.  However, we can catch that when we cost the SET, so
5729          we don't need to consider that here.  */
5730       if (x == const0_rtx)
5731         *cost = 0;
5732       else
5733         {
5734           /* To an approximation, building any other constant is
5735              proportionally expensive to the number of instructions
5736              required to build that constant.  This is true whether we
5737              are compiling for SPEED or otherwise.  */
5738           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5739                                  (NULL_RTX, x, false, mode));
5740         }
5741       return true;
5742
5743     case CONST_DOUBLE:
5744       if (speed)
5745         {
5746           /* mov[df,sf]_aarch64.  */
5747           if (aarch64_float_const_representable_p (x))
5748             /* FMOV (scalar immediate).  */
5749             *cost += extra_cost->fp[mode == DFmode].fpconst;
5750           else if (!aarch64_float_const_zero_rtx_p (x))
5751             {
5752               /* This will be a load from memory.  */
5753               if (mode == DFmode)
5754                 *cost += extra_cost->ldst.loadd;
5755               else
5756                 *cost += extra_cost->ldst.loadf;
5757             }
5758           else
5759             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
5760                or MOV v0.s[0], wzr - neither of which are modeled by the
5761                cost tables.  Just use the default cost.  */
5762             {
5763             }
5764         }
5765
5766       return true;
5767
5768     case MEM:
5769       if (speed)
5770         {
5771           /* For loads we want the base cost of a load, plus an
5772              approximation for the additional cost of the addressing
5773              mode.  */
5774           rtx address = XEXP (x, 0);
5775           if (GET_MODE_CLASS (mode) == MODE_INT)
5776             *cost += extra_cost->ldst.load;
5777           else if (mode == SFmode)
5778             *cost += extra_cost->ldst.loadf;
5779           else if (mode == DFmode)
5780             *cost += extra_cost->ldst.loadd;
5781
5782           *cost +=
5783                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5784                                                      0, speed));
5785         }
5786
5787       return true;
5788
5789     case NEG:
5790       op0 = XEXP (x, 0);
5791
5792       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5793        {
5794           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5795               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5796             {
5797               /* CSETM.  */
5798               *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5799               return true;
5800             }
5801
5802           /* Cost this as SUB wzr, X.  */
5803           op0 = CONST0_RTX (GET_MODE (x));
5804           op1 = XEXP (x, 0);
5805           goto cost_minus;
5806         }
5807
5808       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5809         {
5810           /* Support (neg(fma...)) as a single instruction only if
5811              sign of zeros is unimportant.  This matches the decision
5812              making in aarch64.md.  */
5813           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5814             {
5815               /* FNMADD.  */
5816               *cost = rtx_cost (op0, NEG, 0, speed);
5817               return true;
5818             }
5819           if (speed)
5820             /* FNEG.  */
5821             *cost += extra_cost->fp[mode == DFmode].neg;
5822           return false;
5823         }
5824
5825       return false;
5826
5827     case CLRSB:
5828     case CLZ:
5829       if (speed)
5830         *cost += extra_cost->alu.clz;
5831
5832       return false;
5833
5834     case COMPARE:
5835       op0 = XEXP (x, 0);
5836       op1 = XEXP (x, 1);
5837
5838       if (op1 == const0_rtx
5839           && GET_CODE (op0) == AND)
5840         {
5841           x = op0;
5842           goto cost_logic;
5843         }
5844
5845       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5846         {
5847           /* TODO: A write to the CC flags possibly costs extra, this
5848              needs encoding in the cost tables.  */
5849
5850           /* CC_ZESWPmode supports zero extend for free.  */
5851           if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5852             op0 = XEXP (op0, 0);
5853
5854           /* ANDS.  */
5855           if (GET_CODE (op0) == AND)
5856             {
5857               x = op0;
5858               goto cost_logic;
5859             }
5860
5861           if (GET_CODE (op0) == PLUS)
5862             {
5863               /* ADDS (and CMN alias).  */
5864               x = op0;
5865               goto cost_plus;
5866             }
5867
5868           if (GET_CODE (op0) == MINUS)
5869             {
5870               /* SUBS.  */
5871               x = op0;
5872               goto cost_minus;
5873             }
5874
5875           if (GET_CODE (op1) == NEG)
5876             {
5877               /* CMN.  */
5878               if (speed)
5879                 *cost += extra_cost->alu.arith;
5880
5881               *cost += rtx_cost (op0, COMPARE, 0, speed);
5882               *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5883               return true;
5884             }
5885
5886           /* CMP.
5887
5888              Compare can freely swap the order of operands, and
5889              canonicalization puts the more complex operation first.
5890              But the integer MINUS logic expects the shift/extend
5891              operation in op1.  */
5892           if (! (REG_P (op0)
5893                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5894           {
5895             op0 = XEXP (x, 1);
5896             op1 = XEXP (x, 0);
5897           }
5898           goto cost_minus;
5899         }
5900
5901       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5902         {
5903           /* FCMP.  */
5904           if (speed)
5905             *cost += extra_cost->fp[mode == DFmode].compare;
5906
5907           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5908             {
5909               *cost += rtx_cost (op0, COMPARE, 0, speed);
5910               /* FCMP supports constant 0.0 for no extra cost. */
5911               return true;
5912             }
5913           return false;
5914         }
5915
5916       return false;
5917
5918     case MINUS:
5919       {
5920         op0 = XEXP (x, 0);
5921         op1 = XEXP (x, 1);
5922
5923 cost_minus:
5924         *cost += rtx_cost (op0, MINUS, 0, speed);
5925
5926         /* Detect valid immediates.  */
5927         if ((GET_MODE_CLASS (mode) == MODE_INT
5928              || (GET_MODE_CLASS (mode) == MODE_CC
5929                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5930             && CONST_INT_P (op1)
5931             && aarch64_uimm12_shift (INTVAL (op1)))
5932           {
5933             if (speed)
5934               /* SUB(S) (immediate).  */
5935               *cost += extra_cost->alu.arith;
5936             return true;
5937           }
5938
5939         /* Look for SUB (extended register).  */
5940         if (aarch64_rtx_arith_op_extract_p (op1, mode))
5941           {
5942             if (speed)
5943               *cost += extra_cost->alu.extend_arith;
5944
5945             *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5946                                (enum rtx_code) GET_CODE (op1),
5947                                0, speed);
5948             return true;
5949           }
5950
5951         rtx new_op1 = aarch64_strip_extend (op1);
5952
5953         /* Cost this as an FMA-alike operation.  */
5954         if ((GET_CODE (new_op1) == MULT
5955              || aarch64_shift_p (GET_CODE (new_op1)))
5956             && code != COMPARE)
5957           {
5958             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5959                                             (enum rtx_code) code,
5960                                             speed);
5961             return true;
5962           }
5963
5964         *cost += rtx_cost (new_op1, MINUS, 1, speed);
5965
5966         if (speed)
5967           {
5968             if (GET_MODE_CLASS (mode) == MODE_INT)
5969               /* SUB(S).  */
5970               *cost += extra_cost->alu.arith;
5971             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5972               /* FSUB.  */
5973               *cost += extra_cost->fp[mode == DFmode].addsub;
5974           }
5975         return true;
5976       }
5977
5978     case PLUS:
5979       {
5980         rtx new_op0;
5981
5982         op0 = XEXP (x, 0);
5983         op1 = XEXP (x, 1);
5984
5985 cost_plus:
5986         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5987             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5988           {
5989             /* CSINC.  */
5990             *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5991             *cost += rtx_cost (op1, PLUS, 1, speed);
5992             return true;
5993           }
5994
5995         if (GET_MODE_CLASS (mode) == MODE_INT
5996             && CONST_INT_P (op1)
5997             && aarch64_uimm12_shift (INTVAL (op1)))
5998           {
5999             *cost += rtx_cost (op0, PLUS, 0, speed);
6000
6001             if (speed)
6002               /* ADD (immediate).  */
6003               *cost += extra_cost->alu.arith;
6004             return true;
6005           }
6006
6007         *cost += rtx_cost (op1, PLUS, 1, speed);
6008
6009         /* Look for ADD (extended register).  */
6010         if (aarch64_rtx_arith_op_extract_p (op0, mode))
6011           {
6012             if (speed)
6013               *cost += extra_cost->alu.extend_arith;
6014
6015             *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
6016                                (enum rtx_code) GET_CODE (op0),
6017                                0, speed);
6018             return true;
6019           }
6020
6021         /* Strip any extend, leave shifts behind as we will
6022            cost them through mult_cost.  */
6023         new_op0 = aarch64_strip_extend (op0);
6024
6025         if (GET_CODE (new_op0) == MULT
6026             || aarch64_shift_p (GET_CODE (new_op0)))
6027           {
6028             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6029                                             speed);
6030             return true;
6031           }
6032
6033         *cost += rtx_cost (new_op0, PLUS, 0, speed);
6034
6035         if (speed)
6036           {
6037             if (GET_MODE_CLASS (mode) == MODE_INT)
6038               /* ADD.  */
6039               *cost += extra_cost->alu.arith;
6040             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6041               /* FADD.  */
6042               *cost += extra_cost->fp[mode == DFmode].addsub;
6043           }
6044         return true;
6045       }
6046
6047     case BSWAP:
6048       *cost = COSTS_N_INSNS (1);
6049
6050       if (speed)
6051         *cost += extra_cost->alu.rev;
6052
6053       return false;
6054
6055     case IOR:
6056       if (aarch_rev16_p (x))
6057         {
6058           *cost = COSTS_N_INSNS (1);
6059
6060           if (speed)
6061             *cost += extra_cost->alu.rev;
6062
6063           return true;
6064         }
6065
6066       if (aarch64_extr_rtx_p (x, &op0, &op1))
6067         {
6068           *cost += rtx_cost (op0, IOR, 0, speed)
6069                    + rtx_cost (op1, IOR, 1, speed);
6070           if (speed)
6071             *cost += extra_cost->alu.shift;
6072
6073           return true;
6074         }
6075     /* Fall through.  */
6076     case XOR:
6077     case AND:
6078     cost_logic:
6079       op0 = XEXP (x, 0);
6080       op1 = XEXP (x, 1);
6081
6082       if (code == AND
6083           && GET_CODE (op0) == MULT
6084           && CONST_INT_P (XEXP (op0, 1))
6085           && CONST_INT_P (op1)
6086           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6087                                INTVAL (op1)) != 0)
6088         {
6089           /* This is a UBFM/SBFM.  */
6090           *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6091           if (speed)
6092             *cost += extra_cost->alu.bfx;
6093           return true;
6094         }
6095
6096       if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6097         {
6098           /* We possibly get the immediate for free, this is not
6099              modelled.  */
6100           if (CONST_INT_P (op1)
6101               && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6102             {
6103               *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6104
6105               if (speed)
6106                 *cost += extra_cost->alu.logical;
6107
6108               return true;
6109             }
6110           else
6111             {
6112               rtx new_op0 = op0;
6113
6114               /* Handle ORN, EON, or BIC.  */
6115               if (GET_CODE (op0) == NOT)
6116                 op0 = XEXP (op0, 0);
6117
6118               new_op0 = aarch64_strip_shift (op0);
6119
6120               /* If we had a shift on op0 then this is a logical-shift-
6121                  by-register/immediate operation.  Otherwise, this is just
6122                  a logical operation.  */
6123               if (speed)
6124                 {
6125                   if (new_op0 != op0)
6126                     {
6127                       /* Shift by immediate.  */
6128                       if (CONST_INT_P (XEXP (op0, 1)))
6129                         *cost += extra_cost->alu.log_shift;
6130                       else
6131                         *cost += extra_cost->alu.log_shift_reg;
6132                     }
6133                   else
6134                     *cost += extra_cost->alu.logical;
6135                 }
6136
6137               /* In both cases we want to cost both operands.  */
6138               *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6139                        + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6140
6141               return true;
6142             }
6143         }
6144       return false;
6145
6146     case NOT:
6147       x = XEXP (x, 0);
6148       op0 = aarch64_strip_shift (x);
6149
6150       /* MVN-shifted-reg.  */
6151       if (op0 != x)
6152         {
6153           *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6154
6155           if (speed)
6156             *cost += extra_cost->alu.log_shift;
6157
6158           return true;
6159         }
6160       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6161          Handle the second form here taking care that 'a' in the above can
6162          be a shift.  */
6163       else if (GET_CODE (op0) == XOR)
6164         {
6165           rtx newop0 = XEXP (op0, 0);
6166           rtx newop1 = XEXP (op0, 1);
6167           rtx op0_stripped = aarch64_strip_shift (newop0);
6168
6169           *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6170                    + rtx_cost (op0_stripped, XOR, 0, speed);
6171
6172           if (speed)
6173             {
6174               if (op0_stripped != newop0)
6175                 *cost += extra_cost->alu.log_shift;
6176               else
6177                 *cost += extra_cost->alu.logical;
6178             }
6179
6180           return true;
6181         }
6182       /* MVN.  */
6183       if (speed)
6184         *cost += extra_cost->alu.logical;
6185
6186       return false;
6187
6188     case ZERO_EXTEND:
6189
6190       op0 = XEXP (x, 0);
6191       /* If a value is written in SI mode, then zero extended to DI
6192          mode, the operation will in general be free as a write to
6193          a 'w' register implicitly zeroes the upper bits of an 'x'
6194          register.  However, if this is
6195
6196            (set (reg) (zero_extend (reg)))
6197
6198          we must cost the explicit register move.  */
6199       if (mode == DImode
6200           && GET_MODE (op0) == SImode
6201           && outer == SET)
6202         {
6203           int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6204
6205           if (!op_cost && speed)
6206             /* MOV.  */
6207             *cost += extra_cost->alu.extend;
6208           else
6209             /* Free, the cost is that of the SI mode operation.  */
6210             *cost = op_cost;
6211
6212           return true;
6213         }
6214       else if (MEM_P (XEXP (x, 0)))
6215         {
6216           /* All loads can zero extend to any size for free.  */
6217           *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6218           return true;
6219         }
6220
6221       /* UXTB/UXTH.  */
6222       if (speed)
6223         *cost += extra_cost->alu.extend;
6224
6225       return false;
6226
6227     case SIGN_EXTEND:
6228       if (MEM_P (XEXP (x, 0)))
6229         {
6230           /* LDRSH.  */
6231           if (speed)
6232             {
6233               rtx address = XEXP (XEXP (x, 0), 0);
6234               *cost += extra_cost->ldst.load_sign_extend;
6235
6236               *cost +=
6237                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6238                                                      0, speed));
6239             }
6240           return true;
6241         }
6242
6243       if (speed)
6244         *cost += extra_cost->alu.extend;
6245       return false;
6246
6247     case ASHIFT:
6248       op0 = XEXP (x, 0);
6249       op1 = XEXP (x, 1);
6250
6251       if (CONST_INT_P (op1))
6252         {
6253           /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
6254              aliases.  */
6255           if (speed)
6256             *cost += extra_cost->alu.shift;
6257
6258           /* We can incorporate zero/sign extend for free.  */
6259           if (GET_CODE (op0) == ZERO_EXTEND
6260               || GET_CODE (op0) == SIGN_EXTEND)
6261             op0 = XEXP (op0, 0);
6262
6263           *cost += rtx_cost (op0, ASHIFT, 0, speed);
6264           return true;
6265         }
6266       else
6267         {
6268           /* LSLV.  */
6269           if (speed)
6270             *cost += extra_cost->alu.shift_reg;
6271
6272           return false;  /* All arguments need to be in registers.  */
6273         }
6274
6275     case ROTATE:
6276     case ROTATERT:
6277     case LSHIFTRT:
6278     case ASHIFTRT:
6279       op0 = XEXP (x, 0);
6280       op1 = XEXP (x, 1);
6281
6282       if (CONST_INT_P (op1))
6283         {
6284           /* ASR (immediate) and friends.  */
6285           if (speed)
6286             *cost += extra_cost->alu.shift;
6287
6288           *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6289           return true;
6290         }
6291       else
6292         {
6293
6294           /* ASR (register) and friends.  */
6295           if (speed)
6296             *cost += extra_cost->alu.shift_reg;
6297
6298           return false;  /* All arguments need to be in registers.  */
6299         }
6300
6301     case SYMBOL_REF:
6302
6303       if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6304         {
6305           /* LDR.  */
6306           if (speed)
6307             *cost += extra_cost->ldst.load;
6308         }
6309       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6310                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6311         {
6312           /* ADRP, followed by ADD.  */
6313           *cost += COSTS_N_INSNS (1);
6314           if (speed)
6315             *cost += 2 * extra_cost->alu.arith;
6316         }
6317       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6318                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6319         {
6320           /* ADR.  */
6321           if (speed)
6322             *cost += extra_cost->alu.arith;
6323         }
6324
6325       if (flag_pic)
6326         {
6327           /* One extra load instruction, after accessing the GOT.  */
6328           *cost += COSTS_N_INSNS (1);
6329           if (speed)
6330             *cost += extra_cost->ldst.load;
6331         }
6332       return true;
6333
6334     case HIGH:
6335     case LO_SUM:
6336       /* ADRP/ADD (immediate).  */
6337       if (speed)
6338         *cost += extra_cost->alu.arith;
6339       return true;
6340
6341     case ZERO_EXTRACT:
6342     case SIGN_EXTRACT:
6343       /* UBFX/SBFX.  */
6344       if (speed)
6345         *cost += extra_cost->alu.bfx;
6346
6347       /* We can trust that the immediates used will be correct (there
6348          are no by-register forms), so we need only cost op0.  */
6349       *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6350       return true;
6351
6352     case MULT:
6353       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6354       /* aarch64_rtx_mult_cost always handles recursion to its
6355          operands.  */
6356       return true;
6357
6358     case MOD:
6359     case UMOD:
6360       if (speed)
6361         {
6362           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6363             *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6364                       + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6365           else if (GET_MODE (x) == DFmode)
6366             *cost += (extra_cost->fp[1].mult
6367                       + extra_cost->fp[1].div);
6368           else if (GET_MODE (x) == SFmode)
6369             *cost += (extra_cost->fp[0].mult
6370                       + extra_cost->fp[0].div);
6371         }
6372       return false;  /* All arguments need to be in registers.  */
6373
6374     case DIV:
6375     case UDIV:
6376     case SQRT:
6377       if (speed)
6378         {
6379           if (GET_MODE_CLASS (mode) == MODE_INT)
6380             /* There is no integer SQRT, so only DIV and UDIV can get
6381                here.  */
6382             *cost += extra_cost->mult[mode == DImode].idiv;
6383           else
6384             *cost += extra_cost->fp[mode == DFmode].div;
6385         }
6386       return false;  /* All arguments need to be in registers.  */
6387
6388     case IF_THEN_ELSE:
6389       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6390                                          XEXP (x, 2), cost, speed);
6391
6392     case EQ:
6393     case NE:
6394     case GT:
6395     case GTU:
6396     case LT:
6397     case LTU:
6398     case GE:
6399     case GEU:
6400     case LE:
6401     case LEU:
6402
6403       return false; /* All arguments must be in registers.  */
6404
6405     case FMA:
6406       op0 = XEXP (x, 0);
6407       op1 = XEXP (x, 1);
6408       op2 = XEXP (x, 2);
6409
6410       if (speed)
6411         *cost += extra_cost->fp[mode == DFmode].fma;
6412
6413       /* FMSUB, FNMADD, and FNMSUB are free.  */
6414       if (GET_CODE (op0) == NEG)
6415         op0 = XEXP (op0, 0);
6416
6417       if (GET_CODE (op2) == NEG)
6418         op2 = XEXP (op2, 0);
6419
6420       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6421          and the by-element operand as operand 0.  */
6422       if (GET_CODE (op1) == NEG)
6423         op1 = XEXP (op1, 0);
6424
6425       /* Catch vector-by-element operations.  The by-element operand can
6426          either be (vec_duplicate (vec_select (x))) or just
6427          (vec_select (x)), depending on whether we are multiplying by
6428          a vector or a scalar.
6429
6430          Canonicalization is not very good in these cases, FMA4 will put the
6431          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
6432       if (GET_CODE (op0) == VEC_DUPLICATE)
6433         op0 = XEXP (op0, 0);
6434       else if (GET_CODE (op1) == VEC_DUPLICATE)
6435         op1 = XEXP (op1, 0);
6436
6437       if (GET_CODE (op0) == VEC_SELECT)
6438         op0 = XEXP (op0, 0);
6439       else if (GET_CODE (op1) == VEC_SELECT)
6440         op1 = XEXP (op1, 0);
6441
6442       /* If the remaining parameters are not registers,
6443          get the cost to put them into registers.  */
6444       *cost += rtx_cost (op0, FMA, 0, speed);
6445       *cost += rtx_cost (op1, FMA, 1, speed);
6446       *cost += rtx_cost (op2, FMA, 2, speed);
6447       return true;
6448
6449     case FLOAT:
6450     case UNSIGNED_FLOAT:
6451       if (speed)
6452         *cost += extra_cost->fp[mode == DFmode].fromint;
6453       return false;
6454
6455     case FLOAT_EXTEND:
6456       if (speed)
6457         *cost += extra_cost->fp[mode == DFmode].widen;
6458       return false;
6459
6460     case FLOAT_TRUNCATE:
6461       if (speed)
6462         *cost += extra_cost->fp[mode == DFmode].narrow;
6463       return false;
6464
6465     case FIX:
6466     case UNSIGNED_FIX:
6467       x = XEXP (x, 0);
6468       /* Strip the rounding part.  They will all be implemented
6469          by the fcvt* family of instructions anyway.  */
6470       if (GET_CODE (x) == UNSPEC)
6471         {
6472           unsigned int uns_code = XINT (x, 1);
6473
6474           if (uns_code == UNSPEC_FRINTA
6475               || uns_code == UNSPEC_FRINTM
6476               || uns_code == UNSPEC_FRINTN
6477               || uns_code == UNSPEC_FRINTP
6478               || uns_code == UNSPEC_FRINTZ)
6479             x = XVECEXP (x, 0, 0);
6480         }
6481
6482       if (speed)
6483         *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6484
6485       *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6486       return true;
6487
6488     case ABS:
6489       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6490         {
6491           op0 = XEXP (x, 0);
6492
6493           /* FABD, which is analogous to FADD.  */
6494           if (GET_CODE (op0) == MINUS)
6495             {
6496               *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed);
6497                         + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6498               if (speed)
6499                 *cost += extra_cost->fp[mode == DFmode].addsub;
6500
6501               return true;
6502             }
6503           /* Simple FABS is analogous to FNEG.  */
6504           if (speed)
6505             *cost += extra_cost->fp[mode == DFmode].neg;
6506         }
6507       else
6508         {
6509           /* Integer ABS will either be split to
6510              two arithmetic instructions, or will be an ABS
6511              (scalar), which we don't model.  */
6512           *cost = COSTS_N_INSNS (2);
6513           if (speed)
6514             *cost += 2 * extra_cost->alu.arith;
6515         }
6516       return false;
6517
6518     case SMAX:
6519     case SMIN:
6520       if (speed)
6521         {
6522           /* FMAXNM/FMINNM/FMAX/FMIN.
6523              TODO: This may not be accurate for all implementations, but
6524              we do not model this in the cost tables.  */
6525           *cost += extra_cost->fp[mode == DFmode].addsub;
6526         }
6527       return false;
6528
6529     case UNSPEC:
6530       /* The floating point round to integer frint* instructions.  */
6531       if (aarch64_frint_unspec_p (XINT (x, 1)))
6532         {
6533           if (speed)
6534             *cost += extra_cost->fp[mode == DFmode].roundint;
6535
6536           return false;
6537         }
6538
6539       if (XINT (x, 1) == UNSPEC_RBIT)
6540         {
6541           if (speed)
6542             *cost += extra_cost->alu.rev;
6543
6544           return false;
6545         }
6546       break;
6547
6548     case TRUNCATE:
6549
6550       /* Decompose <su>muldi3_highpart.  */
6551       if (/* (truncate:DI  */
6552           mode == DImode
6553           /*   (lshiftrt:TI  */
6554           && GET_MODE (XEXP (x, 0)) == TImode
6555           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6556           /*      (mult:TI  */
6557           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6558           /*        (ANY_EXTEND:TI (reg:DI))
6559                     (ANY_EXTEND:TI (reg:DI)))  */
6560           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6561                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6562               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6563                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6564           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6565           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6566           /*     (const_int 64)  */
6567           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6568           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6569         {
6570           /* UMULH/SMULH.  */
6571           if (speed)
6572             *cost += extra_cost->mult[mode == DImode].extend;
6573           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6574                              MULT, 0, speed);
6575           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6576                              MULT, 1, speed);
6577           return true;
6578         }
6579
6580       /* Fall through.  */
6581     default:
6582       break;
6583     }
6584
6585   if (dump_file && (dump_flags & TDF_DETAILS))
6586     fprintf (dump_file,
6587       "\nFailed to cost RTX.  Assuming default cost.\n");
6588
6589   return true;
6590 }
6591
6592 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6593    calculated for X.  This cost is stored in *COST.  Returns true
6594    if the total cost of X was calculated.  */
6595 static bool
6596 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6597                    int param, int *cost, bool speed)
6598 {
6599   bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6600
6601   if (dump_file && (dump_flags & TDF_DETAILS))
6602     {
6603       print_rtl_single (dump_file, x);
6604       fprintf (dump_file, "\n%s cost: %d (%s)\n",
6605                speed ? "Hot" : "Cold",
6606                *cost, result ? "final" : "partial");
6607     }
6608
6609   return result;
6610 }
6611
6612 static int
6613 aarch64_register_move_cost (machine_mode mode,
6614                             reg_class_t from_i, reg_class_t to_i)
6615 {
6616   enum reg_class from = (enum reg_class) from_i;
6617   enum reg_class to = (enum reg_class) to_i;
6618   const struct cpu_regmove_cost *regmove_cost
6619     = aarch64_tune_params->regmove_cost;
6620
6621   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
6622   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6623     to = GENERAL_REGS;
6624
6625   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6626     from = GENERAL_REGS;
6627
6628   /* Moving between GPR and stack cost is the same as GP2GP.  */
6629   if ((from == GENERAL_REGS && to == STACK_REG)
6630       || (to == GENERAL_REGS && from == STACK_REG))
6631     return regmove_cost->GP2GP;
6632
6633   /* To/From the stack register, we move via the gprs.  */
6634   if (to == STACK_REG || from == STACK_REG)
6635     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6636             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6637
6638   if (GET_MODE_SIZE (mode) == 16)
6639     {
6640       /* 128-bit operations on general registers require 2 instructions.  */
6641       if (from == GENERAL_REGS && to == GENERAL_REGS)
6642         return regmove_cost->GP2GP * 2;
6643       else if (from == GENERAL_REGS)
6644         return regmove_cost->GP2FP * 2;
6645       else if (to == GENERAL_REGS)
6646         return regmove_cost->FP2GP * 2;
6647
6648       /* When AdvSIMD instructions are disabled it is not possible to move
6649          a 128-bit value directly between Q registers.  This is handled in
6650          secondary reload.  A general register is used as a scratch to move
6651          the upper DI value and the lower DI value is moved directly,
6652          hence the cost is the sum of three moves. */
6653       if (! TARGET_SIMD)
6654         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6655
6656       return regmove_cost->FP2FP;
6657     }
6658
6659   if (from == GENERAL_REGS && to == GENERAL_REGS)
6660     return regmove_cost->GP2GP;
6661   else if (from == GENERAL_REGS)
6662     return regmove_cost->GP2FP;
6663   else if (to == GENERAL_REGS)
6664     return regmove_cost->FP2GP;
6665
6666   return regmove_cost->FP2FP;
6667 }
6668
6669 static int
6670 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6671                           reg_class_t rclass ATTRIBUTE_UNUSED,
6672                           bool in ATTRIBUTE_UNUSED)
6673 {
6674   return aarch64_tune_params->memmov_cost;
6675 }
6676
6677 /* Return the number of instructions that can be issued per cycle.  */
6678 static int
6679 aarch64_sched_issue_rate (void)
6680 {
6681   return aarch64_tune_params->issue_rate;
6682 }
6683
6684 static int
6685 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6686 {
6687   int issue_rate = aarch64_sched_issue_rate ();
6688
6689   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6690 }
6691
6692 /* Vectorizer cost model target hooks.  */
6693
6694 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
6695 static int
6696 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6697                                     tree vectype,
6698                                     int misalign ATTRIBUTE_UNUSED)
6699 {
6700   unsigned elements;
6701
6702   switch (type_of_cost)
6703     {
6704       case scalar_stmt:
6705         return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6706
6707       case scalar_load:
6708         return aarch64_tune_params->vec_costs->scalar_load_cost;
6709
6710       case scalar_store:
6711         return aarch64_tune_params->vec_costs->scalar_store_cost;
6712
6713       case vector_stmt:
6714         return aarch64_tune_params->vec_costs->vec_stmt_cost;
6715
6716       case vector_load:
6717         return aarch64_tune_params->vec_costs->vec_align_load_cost;
6718
6719       case vector_store:
6720         return aarch64_tune_params->vec_costs->vec_store_cost;
6721
6722       case vec_to_scalar:
6723         return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6724
6725       case scalar_to_vec:
6726         return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6727
6728       case unaligned_load:
6729         return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6730
6731       case unaligned_store:
6732         return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6733
6734       case cond_branch_taken:
6735         return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6736
6737       case cond_branch_not_taken:
6738         return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6739
6740       case vec_perm:
6741       case vec_promote_demote:
6742         return aarch64_tune_params->vec_costs->vec_stmt_cost;
6743
6744       case vec_construct:
6745         elements = TYPE_VECTOR_SUBPARTS (vectype);
6746         return elements / 2 + 1;
6747
6748       default:
6749         gcc_unreachable ();
6750     }
6751 }
6752
6753 /* Implement targetm.vectorize.add_stmt_cost.  */
6754 static unsigned
6755 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6756                        struct _stmt_vec_info *stmt_info, int misalign,
6757                        enum vect_cost_model_location where)
6758 {
6759   unsigned *cost = (unsigned *) data;
6760   unsigned retval = 0;
6761
6762   if (flag_vect_cost_model)
6763     {
6764       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6765       int stmt_cost =
6766             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6767
6768       /* Statements in an inner loop relative to the loop being
6769          vectorized are weighted more heavily.  The value here is
6770          a function (linear for now) of the loop nest level.  */
6771       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6772         {
6773           loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6774           struct loop *loop =  LOOP_VINFO_LOOP (loop_info);
6775           unsigned nest_level = loop_depth (loop);
6776
6777           count *= nest_level;
6778         }
6779
6780       retval = (unsigned) (count * stmt_cost);
6781       cost[where] += retval;
6782     }
6783
6784   return retval;
6785 }
6786
6787 static void initialize_aarch64_code_model (void);
6788
6789 /* Parse the architecture extension string.  */
6790
6791 static void
6792 aarch64_parse_extension (char *str)
6793 {
6794   /* The extension string is parsed left to right.  */
6795   const struct aarch64_option_extension *opt = NULL;
6796
6797   /* Flag to say whether we are adding or removing an extension.  */
6798   int adding_ext = -1;
6799
6800   while (str != NULL && *str != 0)
6801     {
6802       char *ext;
6803       size_t len;
6804
6805       str++;
6806       ext = strchr (str, '+');
6807
6808       if (ext != NULL)
6809         len = ext - str;
6810       else
6811         len = strlen (str);
6812
6813       if (len >= 2 && strncmp (str, "no", 2) == 0)
6814         {
6815           adding_ext = 0;
6816           len -= 2;
6817           str += 2;
6818         }
6819       else if (len > 0)
6820         adding_ext = 1;
6821
6822       if (len == 0)
6823         {
6824           error ("missing feature modifier after %qs", adding_ext ? "+"
6825                                                                   : "+no");
6826           return;
6827         }
6828
6829       /* Scan over the extensions table trying to find an exact match.  */
6830       for (opt = all_extensions; opt->name != NULL; opt++)
6831         {
6832           if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6833             {
6834               /* Add or remove the extension.  */
6835               if (adding_ext)
6836                 aarch64_isa_flags |= opt->flags_on;
6837               else
6838                 aarch64_isa_flags &= ~(opt->flags_off);
6839               break;
6840             }
6841         }
6842
6843       if (opt->name == NULL)
6844         {
6845           /* Extension not found in list.  */
6846           error ("unknown feature modifier %qs", str);
6847           return;
6848         }
6849
6850       str = ext;
6851     };
6852
6853   return;
6854 }
6855
6856 /* Parse the ARCH string.  */
6857
6858 static void
6859 aarch64_parse_arch (void)
6860 {
6861   char *ext;
6862   const struct processor *arch;
6863   char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6864   size_t len;
6865
6866   strcpy (str, aarch64_arch_string);
6867
6868   ext = strchr (str, '+');
6869
6870   if (ext != NULL)
6871     len = ext - str;
6872   else
6873     len = strlen (str);
6874
6875   if (len == 0)
6876     {
6877       error ("missing arch name in -march=%qs", str);
6878       return;
6879     }
6880
6881   /* Loop through the list of supported ARCHs to find a match.  */
6882   for (arch = all_architectures; arch->name != NULL; arch++)
6883     {
6884       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6885         {
6886           selected_arch = arch;
6887           aarch64_isa_flags = selected_arch->flags;
6888
6889           if (!selected_cpu)
6890             selected_cpu = &all_cores[selected_arch->core];
6891
6892           if (ext != NULL)
6893             {
6894               /* ARCH string contains at least one extension.  */
6895               aarch64_parse_extension (ext);
6896             }
6897
6898           if (strcmp (selected_arch->arch, selected_cpu->arch))
6899             {
6900               warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6901                        selected_cpu->name, selected_arch->name);
6902             }
6903
6904           return;
6905         }
6906     }
6907
6908   /* ARCH name not found in list.  */
6909   error ("unknown value %qs for -march", str);
6910   return;
6911 }
6912
6913 /* Parse the CPU string.  */
6914
6915 static void
6916 aarch64_parse_cpu (void)
6917 {
6918   char *ext;
6919   const struct processor *cpu;
6920   char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6921   size_t len;
6922
6923   strcpy (str, aarch64_cpu_string);
6924
6925   ext = strchr (str, '+');
6926
6927   if (ext != NULL)
6928     len = ext - str;
6929   else
6930     len = strlen (str);
6931
6932   if (len == 0)
6933     {
6934       error ("missing cpu name in -mcpu=%qs", str);
6935       return;
6936     }
6937
6938   /* Loop through the list of supported CPUs to find a match.  */
6939   for (cpu = all_cores; cpu->name != NULL; cpu++)
6940     {
6941       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6942         {
6943           selected_cpu = cpu;
6944           aarch64_isa_flags = selected_cpu->flags;
6945
6946           if (ext != NULL)
6947             {
6948               /* CPU string contains at least one extension.  */
6949               aarch64_parse_extension (ext);
6950             }
6951
6952           return;
6953         }
6954     }
6955
6956   /* CPU name not found in list.  */
6957   error ("unknown value %qs for -mcpu", str);
6958   return;
6959 }
6960
6961 /* Parse the TUNE string.  */
6962
6963 static void
6964 aarch64_parse_tune (void)
6965 {
6966   const struct processor *cpu;
6967   char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6968   strcpy (str, aarch64_tune_string);
6969
6970   /* Loop through the list of supported CPUs to find a match.  */
6971   for (cpu = all_cores; cpu->name != NULL; cpu++)
6972     {
6973       if (strcmp (cpu->name, str) == 0)
6974         {
6975           selected_tune = cpu;
6976           return;
6977         }
6978     }
6979
6980   /* CPU name not found in list.  */
6981   error ("unknown value %qs for -mtune", str);
6982   return;
6983 }
6984
6985
6986 /* Implement TARGET_OPTION_OVERRIDE.  */
6987
6988 static void
6989 aarch64_override_options (void)
6990 {
6991   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6992      If either of -march or -mtune is given, they override their
6993      respective component of -mcpu.
6994
6995      So, first parse AARCH64_CPU_STRING, then the others, be careful
6996      with -march as, if -mcpu is not present on the command line, march
6997      must set a sensible default CPU.  */
6998   if (aarch64_cpu_string)
6999     {
7000       aarch64_parse_cpu ();
7001     }
7002
7003   if (aarch64_arch_string)
7004     {
7005       aarch64_parse_arch ();
7006     }
7007
7008   if (aarch64_tune_string)
7009     {
7010       aarch64_parse_tune ();
7011     }
7012
7013 #ifndef HAVE_AS_MABI_OPTION
7014   /* The compiler may have been configured with 2.23.* binutils, which does
7015      not have support for ILP32.  */
7016   if (TARGET_ILP32)
7017     error ("Assembler does not support -mabi=ilp32");
7018 #endif
7019
7020   initialize_aarch64_code_model ();
7021
7022   aarch64_build_bitmask_table ();
7023
7024   /* This target defaults to strict volatile bitfields.  */
7025   if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7026     flag_strict_volatile_bitfields = 1;
7027
7028   /* If the user did not specify a processor, choose the default
7029      one for them.  This will be the CPU set during configuration using
7030      --with-cpu, otherwise it is "generic".  */
7031   if (!selected_cpu)
7032     {
7033       selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7034       aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7035     }
7036
7037   gcc_assert (selected_cpu);
7038
7039   if (!selected_tune)
7040     selected_tune = selected_cpu;
7041
7042   aarch64_tune_flags = selected_tune->flags;
7043   aarch64_tune = selected_tune->core;
7044   aarch64_tune_params = selected_tune->tune;
7045   aarch64_architecture_version = selected_cpu->architecture_version;
7046
7047   if (aarch64_fix_a53_err835769 == 2)
7048     {
7049 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7050       aarch64_fix_a53_err835769 = 1;
7051 #else
7052       aarch64_fix_a53_err835769 = 0;
7053 #endif
7054     }
7055
7056   /* If not opzimizing for size, set the default
7057      alignment to what the target wants */
7058   if (!optimize_size)
7059     {
7060       if (align_loops <= 0)
7061         align_loops = aarch64_tune_params->loop_align;
7062       if (align_jumps <= 0)
7063         align_jumps = aarch64_tune_params->jump_align;
7064       if (align_functions <= 0)
7065         align_functions = aarch64_tune_params->function_align;
7066     }
7067
7068   if (AARCH64_TUNE_FMA_STEERING)
7069     aarch64_register_fma_steering ();
7070
7071   aarch64_override_options_after_change ();
7072 }
7073
7074 /* Implement targetm.override_options_after_change.  */
7075
7076 static void
7077 aarch64_override_options_after_change (void)
7078 {
7079   if (flag_omit_frame_pointer)
7080     flag_omit_leaf_frame_pointer = false;
7081   else if (flag_omit_leaf_frame_pointer)
7082     flag_omit_frame_pointer = true;
7083 }
7084
7085 static struct machine_function *
7086 aarch64_init_machine_status (void)
7087 {
7088   struct machine_function *machine;
7089   machine = ggc_cleared_alloc<machine_function> ();
7090   return machine;
7091 }
7092
7093 void
7094 aarch64_init_expanders (void)
7095 {
7096   init_machine_status = aarch64_init_machine_status;
7097 }
7098
7099 /* A checking mechanism for the implementation of the various code models.  */
7100 static void
7101 initialize_aarch64_code_model (void)
7102 {
7103    if (flag_pic)
7104      {
7105        switch (aarch64_cmodel_var)
7106          {
7107          case AARCH64_CMODEL_TINY:
7108            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7109            break;
7110          case AARCH64_CMODEL_SMALL:
7111            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7112            break;
7113          case AARCH64_CMODEL_LARGE:
7114            sorry ("code model %qs with -f%s", "large",
7115                   flag_pic > 1 ? "PIC" : "pic");
7116          default:
7117            gcc_unreachable ();
7118          }
7119      }
7120    else
7121      aarch64_cmodel = aarch64_cmodel_var;
7122 }
7123
7124 /* Return true if SYMBOL_REF X binds locally.  */
7125
7126 static bool
7127 aarch64_symbol_binds_local_p (const_rtx x)
7128 {
7129   return (SYMBOL_REF_DECL (x)
7130           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7131           : SYMBOL_REF_LOCAL_P (x));
7132 }
7133
7134 /* Return true if SYMBOL_REF X is thread local */
7135 static bool
7136 aarch64_tls_symbol_p (rtx x)
7137 {
7138   if (! TARGET_HAVE_TLS)
7139     return false;
7140
7141   if (GET_CODE (x) != SYMBOL_REF)
7142     return false;
7143
7144   return SYMBOL_REF_TLS_MODEL (x) != 0;
7145 }
7146
7147 /* Classify a TLS symbol into one of the TLS kinds.  */
7148 enum aarch64_symbol_type
7149 aarch64_classify_tls_symbol (rtx x)
7150 {
7151   enum tls_model tls_kind = tls_symbolic_operand_type (x);
7152
7153   switch (tls_kind)
7154     {
7155     case TLS_MODEL_GLOBAL_DYNAMIC:
7156     case TLS_MODEL_LOCAL_DYNAMIC:
7157       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7158
7159     case TLS_MODEL_INITIAL_EXEC:
7160       return SYMBOL_SMALL_GOTTPREL;
7161
7162     case TLS_MODEL_LOCAL_EXEC:
7163       return SYMBOL_SMALL_TPREL;
7164
7165     case TLS_MODEL_EMULATED:
7166     case TLS_MODEL_NONE:
7167       return SYMBOL_FORCE_TO_MEM;
7168
7169     default:
7170       gcc_unreachable ();
7171     }
7172 }
7173
7174 /* Return the method that should be used to access SYMBOL_REF or
7175    LABEL_REF X in context CONTEXT.  */
7176
7177 enum aarch64_symbol_type
7178 aarch64_classify_symbol (rtx x, rtx offset,
7179                          enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7180 {
7181   if (GET_CODE (x) == LABEL_REF)
7182     {
7183       switch (aarch64_cmodel)
7184         {
7185         case AARCH64_CMODEL_LARGE:
7186           return SYMBOL_FORCE_TO_MEM;
7187
7188         case AARCH64_CMODEL_TINY_PIC:
7189         case AARCH64_CMODEL_TINY:
7190           return SYMBOL_TINY_ABSOLUTE;
7191
7192         case AARCH64_CMODEL_SMALL_PIC:
7193         case AARCH64_CMODEL_SMALL:
7194           return SYMBOL_SMALL_ABSOLUTE;
7195
7196         default:
7197           gcc_unreachable ();
7198         }
7199     }
7200
7201   if (GET_CODE (x) == SYMBOL_REF)
7202     {
7203       if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7204           return SYMBOL_FORCE_TO_MEM;
7205
7206       if (aarch64_tls_symbol_p (x))
7207         return aarch64_classify_tls_symbol (x);
7208
7209       switch (aarch64_cmodel)
7210         {
7211         case AARCH64_CMODEL_TINY:
7212           /* When we retreive symbol + offset address, we have to make sure
7213              the offset does not cause overflow of the final address.  But
7214              we have no way of knowing the address of symbol at compile time
7215              so we can't accurately say if the distance between the PC and
7216              symbol + offset is outside the addressible range of +/-1M in the
7217              TINY code model.  So we rely on images not being greater than
7218              1M and cap the offset at 1M and anything beyond 1M will have to
7219              be loaded using an alternative mechanism.  */
7220           if (SYMBOL_REF_WEAK (x)
7221               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7222             return SYMBOL_FORCE_TO_MEM;
7223           return SYMBOL_TINY_ABSOLUTE;
7224
7225         case AARCH64_CMODEL_SMALL:
7226           /* Same reasoning as the tiny code model, but the offset cap here is
7227              4G.  */
7228           if (SYMBOL_REF_WEAK (x)
7229               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7230                             HOST_WIDE_INT_C (4294967264)))
7231             return SYMBOL_FORCE_TO_MEM;
7232           return SYMBOL_SMALL_ABSOLUTE;
7233
7234         case AARCH64_CMODEL_TINY_PIC:
7235           if (!aarch64_symbol_binds_local_p (x))
7236             return SYMBOL_TINY_GOT;
7237           return SYMBOL_TINY_ABSOLUTE;
7238
7239         case AARCH64_CMODEL_SMALL_PIC:
7240           if (!aarch64_symbol_binds_local_p (x))
7241             return SYMBOL_SMALL_GOT;
7242           return SYMBOL_SMALL_ABSOLUTE;
7243
7244         default:
7245           gcc_unreachable ();
7246         }
7247     }
7248
7249   /* By default push everything into the constant pool.  */
7250   return SYMBOL_FORCE_TO_MEM;
7251 }
7252
7253 bool
7254 aarch64_constant_address_p (rtx x)
7255 {
7256   return (CONSTANT_P (x) && memory_address_p (DImode, x));
7257 }
7258
7259 bool
7260 aarch64_legitimate_pic_operand_p (rtx x)
7261 {
7262   if (GET_CODE (x) == SYMBOL_REF
7263       || (GET_CODE (x) == CONST
7264           && GET_CODE (XEXP (x, 0)) == PLUS
7265           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7266      return false;
7267
7268   return true;
7269 }
7270
7271 /* Return true if X holds either a quarter-precision or
7272      floating-point +0.0 constant.  */
7273 static bool
7274 aarch64_valid_floating_const (machine_mode mode, rtx x)
7275 {
7276   if (!CONST_DOUBLE_P (x))
7277     return false;
7278
7279   /* TODO: We could handle moving 0.0 to a TFmode register,
7280      but first we would like to refactor the movtf_aarch64
7281      to be more amicable to split moves properly and
7282      correctly gate on TARGET_SIMD.  For now - reject all
7283      constants which are not to SFmode or DFmode registers.  */
7284   if (!(mode == SFmode || mode == DFmode))
7285     return false;
7286
7287   if (aarch64_float_const_zero_rtx_p (x))
7288     return true;
7289   return aarch64_float_const_representable_p (x);
7290 }
7291
7292 static bool
7293 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7294 {
7295   /* Do not allow vector struct mode constants.  We could support
7296      0 and -1 easily, but they need support in aarch64-simd.md.  */
7297   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7298     return false;
7299
7300   /* This could probably go away because
7301      we now decompose CONST_INTs according to expand_mov_immediate.  */
7302   if ((GET_CODE (x) == CONST_VECTOR
7303        && aarch64_simd_valid_immediate (x, mode, false, NULL))
7304       || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7305         return !targetm.cannot_force_const_mem (mode, x);
7306
7307   if (GET_CODE (x) == HIGH
7308       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7309     return true;
7310
7311   return aarch64_constant_address_p (x);
7312 }
7313
7314 rtx
7315 aarch64_load_tp (rtx target)
7316 {
7317   if (!target
7318       || GET_MODE (target) != Pmode
7319       || !register_operand (target, Pmode))
7320     target = gen_reg_rtx (Pmode);
7321
7322   /* Can return in any reg.  */
7323   emit_insn (gen_aarch64_load_tp_hard (target));
7324   return target;
7325 }
7326
7327 /* On AAPCS systems, this is the "struct __va_list".  */
7328 static GTY(()) tree va_list_type;
7329
7330 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7331    Return the type to use as __builtin_va_list.
7332
7333    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7334
7335    struct __va_list
7336    {
7337      void *__stack;
7338      void *__gr_top;
7339      void *__vr_top;
7340      int   __gr_offs;
7341      int   __vr_offs;
7342    };  */
7343
7344 static tree
7345 aarch64_build_builtin_va_list (void)
7346 {
7347   tree va_list_name;
7348   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7349
7350   /* Create the type.  */
7351   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7352   /* Give it the required name.  */
7353   va_list_name = build_decl (BUILTINS_LOCATION,
7354                              TYPE_DECL,
7355                              get_identifier ("__va_list"),
7356                              va_list_type);
7357   DECL_ARTIFICIAL (va_list_name) = 1;
7358   TYPE_NAME (va_list_type) = va_list_name;
7359   TYPE_STUB_DECL (va_list_type) = va_list_name;
7360
7361   /* Create the fields.  */
7362   f_stack = build_decl (BUILTINS_LOCATION,
7363                         FIELD_DECL, get_identifier ("__stack"),
7364                         ptr_type_node);
7365   f_grtop = build_decl (BUILTINS_LOCATION,
7366                         FIELD_DECL, get_identifier ("__gr_top"),
7367                         ptr_type_node);
7368   f_vrtop = build_decl (BUILTINS_LOCATION,
7369                         FIELD_DECL, get_identifier ("__vr_top"),
7370                         ptr_type_node);
7371   f_groff = build_decl (BUILTINS_LOCATION,
7372                         FIELD_DECL, get_identifier ("__gr_offs"),
7373                         integer_type_node);
7374   f_vroff = build_decl (BUILTINS_LOCATION,
7375                         FIELD_DECL, get_identifier ("__vr_offs"),
7376                         integer_type_node);
7377
7378   DECL_ARTIFICIAL (f_stack) = 1;
7379   DECL_ARTIFICIAL (f_grtop) = 1;
7380   DECL_ARTIFICIAL (f_vrtop) = 1;
7381   DECL_ARTIFICIAL (f_groff) = 1;
7382   DECL_ARTIFICIAL (f_vroff) = 1;
7383
7384   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7385   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7386   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7387   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7388   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7389
7390   TYPE_FIELDS (va_list_type) = f_stack;
7391   DECL_CHAIN (f_stack) = f_grtop;
7392   DECL_CHAIN (f_grtop) = f_vrtop;
7393   DECL_CHAIN (f_vrtop) = f_groff;
7394   DECL_CHAIN (f_groff) = f_vroff;
7395
7396   /* Compute its layout.  */
7397   layout_type (va_list_type);
7398
7399   return va_list_type;
7400 }
7401
7402 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
7403 static void
7404 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7405 {
7406   const CUMULATIVE_ARGS *cum;
7407   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7408   tree stack, grtop, vrtop, groff, vroff;
7409   tree t;
7410   int gr_save_area_size;
7411   int vr_save_area_size;
7412   int vr_offset;
7413
7414   cum = &crtl->args.info;
7415   gr_save_area_size
7416     = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7417   vr_save_area_size
7418     = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7419
7420   if (TARGET_GENERAL_REGS_ONLY)
7421     {
7422       if (cum->aapcs_nvrn > 0)
7423         sorry ("%qs and floating point or vector arguments",
7424                "-mgeneral-regs-only");
7425       vr_save_area_size = 0;
7426     }
7427
7428   f_stack = TYPE_FIELDS (va_list_type_node);
7429   f_grtop = DECL_CHAIN (f_stack);
7430   f_vrtop = DECL_CHAIN (f_grtop);
7431   f_groff = DECL_CHAIN (f_vrtop);
7432   f_vroff = DECL_CHAIN (f_groff);
7433
7434   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7435                   NULL_TREE);
7436   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7437                   NULL_TREE);
7438   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7439                   NULL_TREE);
7440   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7441                   NULL_TREE);
7442   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7443                   NULL_TREE);
7444
7445   /* Emit code to initialize STACK, which points to the next varargs stack
7446      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
7447      by named arguments.  STACK is 8-byte aligned.  */
7448   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7449   if (cum->aapcs_stack_size > 0)
7450     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7451   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7452   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7453
7454   /* Emit code to initialize GRTOP, the top of the GR save area.
7455      virtual_incoming_args_rtx should have been 16 byte aligned.  */
7456   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7457   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7458   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7459
7460   /* Emit code to initialize VRTOP, the top of the VR save area.
7461      This address is gr_save_area_bytes below GRTOP, rounded
7462      down to the next 16-byte boundary.  */
7463   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7464   vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7465                              STACK_BOUNDARY / BITS_PER_UNIT);
7466
7467   if (vr_offset)
7468     t = fold_build_pointer_plus_hwi (t, -vr_offset);
7469   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7470   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7471
7472   /* Emit code to initialize GROFF, the offset from GRTOP of the
7473      next GPR argument.  */
7474   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7475               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7476   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7477
7478   /* Likewise emit code to initialize VROFF, the offset from FTOP
7479      of the next VR argument.  */
7480   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7481               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7482   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7483 }
7484
7485 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
7486
7487 static tree
7488 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7489                               gimple_seq *post_p ATTRIBUTE_UNUSED)
7490 {
7491   tree addr;
7492   bool indirect_p;
7493   bool is_ha;           /* is HFA or HVA.  */
7494   bool dw_align;        /* double-word align.  */
7495   machine_mode ag_mode = VOIDmode;
7496   int nregs;
7497   machine_mode mode;
7498
7499   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7500   tree stack, f_top, f_off, off, arg, roundup, on_stack;
7501   HOST_WIDE_INT size, rsize, adjust, align;
7502   tree t, u, cond1, cond2;
7503
7504   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7505   if (indirect_p)
7506     type = build_pointer_type (type);
7507
7508   mode = TYPE_MODE (type);
7509
7510   f_stack = TYPE_FIELDS (va_list_type_node);
7511   f_grtop = DECL_CHAIN (f_stack);
7512   f_vrtop = DECL_CHAIN (f_grtop);
7513   f_groff = DECL_CHAIN (f_vrtop);
7514   f_vroff = DECL_CHAIN (f_groff);
7515
7516   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7517                   f_stack, NULL_TREE);
7518   size = int_size_in_bytes (type);
7519   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7520
7521   dw_align = false;
7522   adjust = 0;
7523   if (aarch64_vfp_is_call_or_return_candidate (mode,
7524                                                type,
7525                                                &ag_mode,
7526                                                &nregs,
7527                                                &is_ha))
7528     {
7529       /* TYPE passed in fp/simd registers.  */
7530       if (TARGET_GENERAL_REGS_ONLY)
7531         sorry ("%qs and floating point or vector arguments",
7532                "-mgeneral-regs-only");
7533
7534       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7535                       unshare_expr (valist), f_vrtop, NULL_TREE);
7536       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7537                       unshare_expr (valist), f_vroff, NULL_TREE);
7538
7539       rsize = nregs * UNITS_PER_VREG;
7540
7541       if (is_ha)
7542         {
7543           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7544             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7545         }
7546       else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7547                && size < UNITS_PER_VREG)
7548         {
7549           adjust = UNITS_PER_VREG - size;
7550         }
7551     }
7552   else
7553     {
7554       /* TYPE passed in general registers.  */
7555       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7556                       unshare_expr (valist), f_grtop, NULL_TREE);
7557       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7558                       unshare_expr (valist), f_groff, NULL_TREE);
7559       rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7560       nregs = rsize / UNITS_PER_WORD;
7561
7562       if (align > 8)
7563         dw_align = true;
7564
7565       if (BLOCK_REG_PADDING (mode, type, 1) == downward
7566           && size < UNITS_PER_WORD)
7567         {
7568           adjust = UNITS_PER_WORD  - size;
7569         }
7570     }
7571
7572   /* Get a local temporary for the field value.  */
7573   off = get_initialized_tmp_var (f_off, pre_p, NULL);
7574
7575   /* Emit code to branch if off >= 0.  */
7576   t = build2 (GE_EXPR, boolean_type_node, off,
7577               build_int_cst (TREE_TYPE (off), 0));
7578   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7579
7580   if (dw_align)
7581     {
7582       /* Emit: offs = (offs + 15) & -16.  */
7583       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7584                   build_int_cst (TREE_TYPE (off), 15));
7585       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7586                   build_int_cst (TREE_TYPE (off), -16));
7587       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7588     }
7589   else
7590     roundup = NULL;
7591
7592   /* Update ap.__[g|v]r_offs  */
7593   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7594               build_int_cst (TREE_TYPE (off), rsize));
7595   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7596
7597   /* String up.  */
7598   if (roundup)
7599     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7600
7601   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
7602   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7603               build_int_cst (TREE_TYPE (f_off), 0));
7604   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7605
7606   /* String up: make sure the assignment happens before the use.  */
7607   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7608   COND_EXPR_ELSE (cond1) = t;
7609
7610   /* Prepare the trees handling the argument that is passed on the stack;
7611      the top level node will store in ON_STACK.  */
7612   arg = get_initialized_tmp_var (stack, pre_p, NULL);
7613   if (align > 8)
7614     {
7615       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
7616       t = fold_convert (intDI_type_node, arg);
7617       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7618                   build_int_cst (TREE_TYPE (t), 15));
7619       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7620                   build_int_cst (TREE_TYPE (t), -16));
7621       t = fold_convert (TREE_TYPE (arg), t);
7622       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7623     }
7624   else
7625     roundup = NULL;
7626   /* Advance ap.__stack  */
7627   t = fold_convert (intDI_type_node, arg);
7628   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7629               build_int_cst (TREE_TYPE (t), size + 7));
7630   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7631               build_int_cst (TREE_TYPE (t), -8));
7632   t = fold_convert (TREE_TYPE (arg), t);
7633   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7634   /* String up roundup and advance.  */
7635   if (roundup)
7636     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7637   /* String up with arg */
7638   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7639   /* Big-endianness related address adjustment.  */
7640   if (BLOCK_REG_PADDING (mode, type, 1) == downward
7641       && size < UNITS_PER_WORD)
7642   {
7643     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7644                 size_int (UNITS_PER_WORD - size));
7645     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7646   }
7647
7648   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7649   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7650
7651   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
7652   t = off;
7653   if (adjust)
7654     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7655                 build_int_cst (TREE_TYPE (off), adjust));
7656
7657   t = fold_convert (sizetype, t);
7658   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7659
7660   if (is_ha)
7661     {
7662       /* type ha; // treat as "struct {ftype field[n];}"
7663          ... [computing offs]
7664          for (i = 0; i <nregs; ++i, offs += 16)
7665            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7666          return ha;  */
7667       int i;
7668       tree tmp_ha, field_t, field_ptr_t;
7669
7670       /* Declare a local variable.  */
7671       tmp_ha = create_tmp_var_raw (type, "ha");
7672       gimple_add_tmp_var (tmp_ha);
7673
7674       /* Establish the base type.  */
7675       switch (ag_mode)
7676         {
7677         case SFmode:
7678           field_t = float_type_node;
7679           field_ptr_t = float_ptr_type_node;
7680           break;
7681         case DFmode:
7682           field_t = double_type_node;
7683           field_ptr_t = double_ptr_type_node;
7684           break;
7685         case TFmode:
7686           field_t = long_double_type_node;
7687           field_ptr_t = long_double_ptr_type_node;
7688           break;
7689 /* The half precision and quad precision are not fully supported yet.  Enable
7690    the following code after the support is complete.  Need to find the correct
7691    type node for __fp16 *.  */
7692 #if 0
7693         case HFmode:
7694           field_t = float_type_node;
7695           field_ptr_t = float_ptr_type_node;
7696           break;
7697 #endif
7698         case V2SImode:
7699         case V4SImode:
7700             {
7701               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7702               field_t = build_vector_type_for_mode (innertype, ag_mode);
7703               field_ptr_t = build_pointer_type (field_t);
7704             }
7705           break;
7706         default:
7707           gcc_assert (0);
7708         }
7709
7710       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
7711       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7712       addr = t;
7713       t = fold_convert (field_ptr_t, addr);
7714       t = build2 (MODIFY_EXPR, field_t,
7715                   build1 (INDIRECT_REF, field_t, tmp_ha),
7716                   build1 (INDIRECT_REF, field_t, t));
7717
7718       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
7719       for (i = 1; i < nregs; ++i)
7720         {
7721           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7722           u = fold_convert (field_ptr_t, addr);
7723           u = build2 (MODIFY_EXPR, field_t,
7724                       build2 (MEM_REF, field_t, tmp_ha,
7725                               build_int_cst (field_ptr_t,
7726                                              (i *
7727                                               int_size_in_bytes (field_t)))),
7728                       build1 (INDIRECT_REF, field_t, u));
7729           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7730         }
7731
7732       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7733       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7734     }
7735
7736   COND_EXPR_ELSE (cond2) = t;
7737   addr = fold_convert (build_pointer_type (type), cond1);
7738   addr = build_va_arg_indirect_ref (addr);
7739
7740   if (indirect_p)
7741     addr = build_va_arg_indirect_ref (addr);
7742
7743   return addr;
7744 }
7745
7746 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
7747
7748 static void
7749 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7750                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7751                                 int no_rtl)
7752 {
7753   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7754   CUMULATIVE_ARGS local_cum;
7755   int gr_saved, vr_saved;
7756
7757   /* The caller has advanced CUM up to, but not beyond, the last named
7758      argument.  Advance a local copy of CUM past the last "real" named
7759      argument, to find out how many registers are left over.  */
7760   local_cum = *cum;
7761   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7762
7763   /* Found out how many registers we need to save.  */
7764   gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7765   vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7766
7767   if (TARGET_GENERAL_REGS_ONLY)
7768     {
7769       if (local_cum.aapcs_nvrn > 0)
7770         sorry ("%qs and floating point or vector arguments",
7771                "-mgeneral-regs-only");
7772       vr_saved = 0;
7773     }
7774
7775   if (!no_rtl)
7776     {
7777       if (gr_saved > 0)
7778         {
7779           rtx ptr, mem;
7780
7781           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
7782           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7783                                - gr_saved * UNITS_PER_WORD);
7784           mem = gen_frame_mem (BLKmode, ptr);
7785           set_mem_alias_set (mem, get_varargs_alias_set ());
7786
7787           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7788                                mem, gr_saved);
7789         }
7790       if (vr_saved > 0)
7791         {
7792           /* We can't use move_block_from_reg, because it will use
7793              the wrong mode, storing D regs only.  */
7794           machine_mode mode = TImode;
7795           int off, i;
7796
7797           /* Set OFF to the offset from virtual_incoming_args_rtx of
7798              the first vector register.  The VR save area lies below
7799              the GR one, and is aligned to 16 bytes.  */
7800           off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7801                                    STACK_BOUNDARY / BITS_PER_UNIT);
7802           off -= vr_saved * UNITS_PER_VREG;
7803
7804           for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7805             {
7806               rtx ptr, mem;
7807
7808               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7809               mem = gen_frame_mem (mode, ptr);
7810               set_mem_alias_set (mem, get_varargs_alias_set ());
7811               aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7812               off += UNITS_PER_VREG;
7813             }
7814         }
7815     }
7816
7817   /* We don't save the size into *PRETEND_SIZE because we want to avoid
7818      any complication of having crtl->args.pretend_args_size changed.  */
7819   cfun->machine->frame.saved_varargs_size
7820     = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7821                       STACK_BOUNDARY / BITS_PER_UNIT)
7822        + vr_saved * UNITS_PER_VREG);
7823 }
7824
7825 static void
7826 aarch64_conditional_register_usage (void)
7827 {
7828   int i;
7829   if (!TARGET_FLOAT)
7830     {
7831       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7832         {
7833           fixed_regs[i] = 1;
7834           call_used_regs[i] = 1;
7835         }
7836     }
7837 }
7838
7839 /* Walk down the type tree of TYPE counting consecutive base elements.
7840    If *MODEP is VOIDmode, then set it to the first valid floating point
7841    type.  If a non-floating point type is found, or if a floating point
7842    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7843    otherwise return the count in the sub-tree.  */
7844 static int
7845 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7846 {
7847   machine_mode mode;
7848   HOST_WIDE_INT size;
7849
7850   switch (TREE_CODE (type))
7851     {
7852     case REAL_TYPE:
7853       mode = TYPE_MODE (type);
7854       if (mode != DFmode && mode != SFmode && mode != TFmode)
7855         return -1;
7856
7857       if (*modep == VOIDmode)
7858         *modep = mode;
7859
7860       if (*modep == mode)
7861         return 1;
7862
7863       break;
7864
7865     case COMPLEX_TYPE:
7866       mode = TYPE_MODE (TREE_TYPE (type));
7867       if (mode != DFmode && mode != SFmode && mode != TFmode)
7868         return -1;
7869
7870       if (*modep == VOIDmode)
7871         *modep = mode;
7872
7873       if (*modep == mode)
7874         return 2;
7875
7876       break;
7877
7878     case VECTOR_TYPE:
7879       /* Use V2SImode and V4SImode as representatives of all 64-bit
7880          and 128-bit vector types.  */
7881       size = int_size_in_bytes (type);
7882       switch (size)
7883         {
7884         case 8:
7885           mode = V2SImode;
7886           break;
7887         case 16:
7888           mode = V4SImode;
7889           break;
7890         default:
7891           return -1;
7892         }
7893
7894       if (*modep == VOIDmode)
7895         *modep = mode;
7896
7897       /* Vector modes are considered to be opaque: two vectors are
7898          equivalent for the purposes of being homogeneous aggregates
7899          if they are the same size.  */
7900       if (*modep == mode)
7901         return 1;
7902
7903       break;
7904
7905     case ARRAY_TYPE:
7906       {
7907         int count;
7908         tree index = TYPE_DOMAIN (type);
7909
7910         /* Can't handle incomplete types nor sizes that are not
7911            fixed.  */
7912         if (!COMPLETE_TYPE_P (type)
7913             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7914           return -1;
7915
7916         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7917         if (count == -1
7918             || !index
7919             || !TYPE_MAX_VALUE (index)
7920             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7921             || !TYPE_MIN_VALUE (index)
7922             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7923             || count < 0)
7924           return -1;
7925
7926         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7927                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7928
7929         /* There must be no padding.  */
7930         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7931           return -1;
7932
7933         return count;
7934       }
7935
7936     case RECORD_TYPE:
7937       {
7938         int count = 0;
7939         int sub_count;
7940         tree field;
7941
7942         /* Can't handle incomplete types nor sizes that are not
7943            fixed.  */
7944         if (!COMPLETE_TYPE_P (type)
7945             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7946           return -1;
7947
7948         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7949           {
7950             if (TREE_CODE (field) != FIELD_DECL)
7951               continue;
7952
7953             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7954             if (sub_count < 0)
7955               return -1;
7956             count += sub_count;
7957           }
7958
7959         /* There must be no padding.  */
7960         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7961           return -1;
7962
7963         return count;
7964       }
7965
7966     case UNION_TYPE:
7967     case QUAL_UNION_TYPE:
7968       {
7969         /* These aren't very interesting except in a degenerate case.  */
7970         int count = 0;
7971         int sub_count;
7972         tree field;
7973
7974         /* Can't handle incomplete types nor sizes that are not
7975            fixed.  */
7976         if (!COMPLETE_TYPE_P (type)
7977             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7978           return -1;
7979
7980         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7981           {
7982             if (TREE_CODE (field) != FIELD_DECL)
7983               continue;
7984
7985             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7986             if (sub_count < 0)
7987               return -1;
7988             count = count > sub_count ? count : sub_count;
7989           }
7990
7991         /* There must be no padding.  */
7992         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7993           return -1;
7994
7995         return count;
7996       }
7997
7998     default:
7999       break;
8000     }
8001
8002   return -1;
8003 }
8004
8005 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
8006    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
8007    array types.  The C99 floating-point complex types are also considered
8008    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
8009    types, which are GCC extensions and out of the scope of AAPCS64, are
8010    treated as composite types here as well.
8011
8012    Note that MODE itself is not sufficient in determining whether a type
8013    is such a composite type or not.  This is because
8014    stor-layout.c:compute_record_mode may have already changed the MODE
8015    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
8016    structure with only one field may have its MODE set to the mode of the
8017    field.  Also an integer mode whose size matches the size of the
8018    RECORD_TYPE type may be used to substitute the original mode
8019    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
8020    solely relied on.  */
8021
8022 static bool
8023 aarch64_composite_type_p (const_tree type,
8024                           machine_mode mode)
8025 {
8026   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
8027     return true;
8028
8029   if (mode == BLKmode
8030       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
8031       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
8032     return true;
8033
8034   return false;
8035 }
8036
8037 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
8038    type as described in AAPCS64 \S 4.1.2.
8039
8040    See the comment above aarch64_composite_type_p for the notes on MODE.  */
8041
8042 static bool
8043 aarch64_short_vector_p (const_tree type,
8044                         machine_mode mode)
8045 {
8046   HOST_WIDE_INT size = -1;
8047
8048   if (type && TREE_CODE (type) == VECTOR_TYPE)
8049     size = int_size_in_bytes (type);
8050   else if (!aarch64_composite_type_p (type, mode)
8051            && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8052                || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
8053     size = GET_MODE_SIZE (mode);
8054
8055   return (size == 8 || size == 16) ? true : false;
8056 }
8057
8058 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8059    shall be passed or returned in simd/fp register(s) (providing these
8060    parameter passing registers are available).
8061
8062    Upon successful return, *COUNT returns the number of needed registers,
8063    *BASE_MODE returns the mode of the individual register and when IS_HAF
8064    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8065    floating-point aggregate or a homogeneous short-vector aggregate.  */
8066
8067 static bool
8068 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8069                                          const_tree type,
8070                                          machine_mode *base_mode,
8071                                          int *count,
8072                                          bool *is_ha)
8073 {
8074   machine_mode new_mode = VOIDmode;
8075   bool composite_p = aarch64_composite_type_p (type, mode);
8076
8077   if (is_ha != NULL) *is_ha = false;
8078
8079   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8080       || aarch64_short_vector_p (type, mode))
8081     {
8082       *count = 1;
8083       new_mode = mode;
8084     }
8085   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8086     {
8087       if (is_ha != NULL) *is_ha = true;
8088       *count = 2;
8089       new_mode = GET_MODE_INNER (mode);
8090     }
8091   else if (type && composite_p)
8092     {
8093       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8094
8095       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8096         {
8097           if (is_ha != NULL) *is_ha = true;
8098           *count = ag_count;
8099         }
8100       else
8101         return false;
8102     }
8103   else
8104     return false;
8105
8106   *base_mode = new_mode;
8107   return true;
8108 }
8109
8110 /* Implement TARGET_STRUCT_VALUE_RTX.  */
8111
8112 static rtx
8113 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8114                           int incoming ATTRIBUTE_UNUSED)
8115 {
8116   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8117 }
8118
8119 /* Implements target hook vector_mode_supported_p.  */
8120 static bool
8121 aarch64_vector_mode_supported_p (machine_mode mode)
8122 {
8123   if (TARGET_SIMD
8124       && (mode == V4SImode  || mode == V8HImode
8125           || mode == V16QImode || mode == V2DImode
8126           || mode == V2SImode  || mode == V4HImode
8127           || mode == V8QImode || mode == V2SFmode
8128           || mode == V4SFmode || mode == V2DFmode
8129           || mode == V1DFmode))
8130     return true;
8131
8132   return false;
8133 }
8134
8135 /* Return appropriate SIMD container
8136    for MODE within a vector of WIDTH bits.  */
8137 static machine_mode
8138 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8139 {
8140   gcc_assert (width == 64 || width == 128);
8141   if (TARGET_SIMD)
8142     {
8143       if (width == 128)
8144         switch (mode)
8145           {
8146           case DFmode:
8147             return V2DFmode;
8148           case SFmode:
8149             return V4SFmode;
8150           case SImode:
8151             return V4SImode;
8152           case HImode:
8153             return V8HImode;
8154           case QImode:
8155             return V16QImode;
8156           case DImode:
8157             return V2DImode;
8158           default:
8159             break;
8160           }
8161       else
8162         switch (mode)
8163           {
8164           case SFmode:
8165             return V2SFmode;
8166           case SImode:
8167             return V2SImode;
8168           case HImode:
8169             return V4HImode;
8170           case QImode:
8171             return V8QImode;
8172           default:
8173             break;
8174           }
8175     }
8176   return word_mode;
8177 }
8178
8179 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
8180 static machine_mode
8181 aarch64_preferred_simd_mode (machine_mode mode)
8182 {
8183   return aarch64_simd_container_mode (mode, 128);
8184 }
8185
8186 /* Return the bitmask of possible vector sizes for the vectorizer
8187    to iterate over.  */
8188 static unsigned int
8189 aarch64_autovectorize_vector_sizes (void)
8190 {
8191   return (16 | 8);
8192 }
8193
8194 /* Implement TARGET_MANGLE_TYPE.  */
8195
8196 static const char *
8197 aarch64_mangle_type (const_tree type)
8198 {
8199   /* The AArch64 ABI documents say that "__va_list" has to be
8200      managled as if it is in the "std" namespace.  */
8201   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8202     return "St9__va_list";
8203
8204   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
8205      builtin types.  */
8206   if (TYPE_NAME (type) != NULL)
8207     return aarch64_mangle_builtin_type (type);
8208
8209   /* Use the default mangling.  */
8210   return NULL;
8211 }
8212
8213
8214 /* Return true if the rtx_insn contains a MEM RTX somewhere
8215    in it.  */
8216
8217 static bool
8218 has_memory_op (rtx_insn *mem_insn)
8219 {
8220   subrtx_iterator::array_type array;
8221   FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8222     if (MEM_P (*iter))
8223       return true;
8224
8225   return false;
8226 }
8227
8228 /* Find the first rtx_insn before insn that will generate an assembly
8229    instruction.  */
8230
8231 static rtx_insn *
8232 aarch64_prev_real_insn (rtx_insn *insn)
8233 {
8234   if (!insn)
8235     return NULL;
8236
8237   do
8238     {
8239       insn = prev_real_insn (insn);
8240     }
8241   while (insn && recog_memoized (insn) < 0);
8242
8243   return insn;
8244 }
8245
8246 static bool
8247 is_madd_op (enum attr_type t1)
8248 {
8249   unsigned int i;
8250   /* A number of these may be AArch32 only.  */
8251   enum attr_type mlatypes[] = {
8252     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8253     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8254     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8255   };
8256
8257   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8258     {
8259       if (t1 == mlatypes[i])
8260         return true;
8261     }
8262
8263   return false;
8264 }
8265
8266 /* Check if there is a register dependency between a load and the insn
8267    for which we hold recog_data.  */
8268
8269 static bool
8270 dep_between_memop_and_curr (rtx memop)
8271 {
8272   rtx load_reg;
8273   int opno;
8274
8275   gcc_assert (GET_CODE (memop) == SET);
8276
8277   if (!REG_P (SET_DEST (memop)))
8278     return false;
8279
8280   load_reg = SET_DEST (memop);
8281   for (opno = 1; opno < recog_data.n_operands; opno++)
8282     {
8283       rtx operand = recog_data.operand[opno];
8284       if (REG_P (operand)
8285           && reg_overlap_mentioned_p (load_reg, operand))
8286         return true;
8287
8288     }
8289   return false;
8290 }
8291
8292
8293 /* When working around the Cortex-A53 erratum 835769,
8294    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8295    instruction and has a preceding memory instruction such that a NOP
8296    should be inserted between them.  */
8297
8298 bool
8299 aarch64_madd_needs_nop (rtx_insn* insn)
8300 {
8301   enum attr_type attr_type;
8302   rtx_insn *prev;
8303   rtx body;
8304
8305   if (!aarch64_fix_a53_err835769)
8306     return false;
8307
8308   if (recog_memoized (insn) < 0)
8309     return false;
8310
8311   attr_type = get_attr_type (insn);
8312   if (!is_madd_op (attr_type))
8313     return false;
8314
8315   prev = aarch64_prev_real_insn (insn);
8316   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8317      Restore recog state to INSN to avoid state corruption.  */
8318   extract_constrain_insn_cached (insn);
8319
8320   if (!prev || !has_memory_op (prev))
8321     return false;
8322
8323   body = single_set (prev);
8324
8325   /* If the previous insn is a memory op and there is no dependency between
8326      it and the DImode madd, emit a NOP between them.  If body is NULL then we
8327      have a complex memory operation, probably a load/store pair.
8328      Be conservative for now and emit a NOP.  */
8329   if (GET_MODE (recog_data.operand[0]) == DImode
8330       && (!body || !dep_between_memop_and_curr (body)))
8331     return true;
8332
8333   return false;
8334
8335 }
8336
8337
8338 /* Implement FINAL_PRESCAN_INSN.  */
8339
8340 void
8341 aarch64_final_prescan_insn (rtx_insn *insn)
8342 {
8343   if (aarch64_madd_needs_nop (insn))
8344     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8345 }
8346
8347
8348 /* Return the equivalent letter for size.  */
8349 static char
8350 sizetochar (int size)
8351 {
8352   switch (size)
8353     {
8354     case 64: return 'd';
8355     case 32: return 's';
8356     case 16: return 'h';
8357     case 8 : return 'b';
8358     default: gcc_unreachable ();
8359     }
8360 }
8361
8362 /* Return true iff x is a uniform vector of floating-point
8363    constants, and the constant can be represented in
8364    quarter-precision form.  Note, as aarch64_float_const_representable
8365    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
8366 static bool
8367 aarch64_vect_float_const_representable_p (rtx x)
8368 {
8369   int i = 0;
8370   REAL_VALUE_TYPE r0, ri;
8371   rtx x0, xi;
8372
8373   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8374     return false;
8375
8376   x0 = CONST_VECTOR_ELT (x, 0);
8377   if (!CONST_DOUBLE_P (x0))
8378     return false;
8379
8380   REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8381
8382   for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8383     {
8384       xi = CONST_VECTOR_ELT (x, i);
8385       if (!CONST_DOUBLE_P (xi))
8386         return false;
8387
8388       REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8389       if (!REAL_VALUES_EQUAL (r0, ri))
8390         return false;
8391     }
8392
8393   return aarch64_float_const_representable_p (x0);
8394 }
8395
8396 /* Return true for valid and false for invalid.  */
8397 bool
8398 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8399                               struct simd_immediate_info *info)
8400 {
8401 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
8402   matches = 1;                                          \
8403   for (i = 0; i < idx; i += (STRIDE))                   \
8404     if (!(TEST))                                        \
8405       matches = 0;                                      \
8406   if (matches)                                          \
8407     {                                                   \
8408       immtype = (CLASS);                                \
8409       elsize = (ELSIZE);                                \
8410       eshift = (SHIFT);                                 \
8411       emvn = (NEG);                                     \
8412       break;                                            \
8413     }
8414
8415   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8416   unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8417   unsigned char bytes[16];
8418   int immtype = -1, matches;
8419   unsigned int invmask = inverse ? 0xff : 0;
8420   int eshift, emvn;
8421
8422   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8423     {
8424       if (! (aarch64_simd_imm_zero_p (op, mode)
8425              || aarch64_vect_float_const_representable_p (op)))
8426         return false;
8427
8428       if (info)
8429         {
8430           info->value = CONST_VECTOR_ELT (op, 0);
8431           info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8432           info->mvn = false;
8433           info->shift = 0;
8434         }
8435
8436       return true;
8437     }
8438
8439   /* Splat vector constant out into a byte vector.  */
8440   for (i = 0; i < n_elts; i++)
8441     {
8442       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
8443          it must be laid out in the vector register in reverse order.  */
8444       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8445       unsigned HOST_WIDE_INT elpart;
8446       unsigned int part, parts;
8447
8448       if (CONST_INT_P (el))
8449         {
8450           elpart = INTVAL (el);
8451           parts = 1;
8452         }
8453       else if (GET_CODE (el) == CONST_DOUBLE)
8454         {
8455           elpart = CONST_DOUBLE_LOW (el);
8456           parts = 2;
8457         }
8458       else
8459         gcc_unreachable ();
8460
8461       for (part = 0; part < parts; part++)
8462         {
8463           unsigned int byte;
8464           for (byte = 0; byte < innersize; byte++)
8465             {
8466               bytes[idx++] = (elpart & 0xff) ^ invmask;
8467               elpart >>= BITS_PER_UNIT;
8468             }
8469           if (GET_CODE (el) == CONST_DOUBLE)
8470             elpart = CONST_DOUBLE_HIGH (el);
8471         }
8472     }
8473
8474   /* Sanity check.  */
8475   gcc_assert (idx == GET_MODE_SIZE (mode));
8476
8477   do
8478     {
8479       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8480              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8481
8482       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8483              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8484
8485       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8486              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8487
8488       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8489              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8490
8491       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8492
8493       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8494
8495       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8496              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8497
8498       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8499              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8500
8501       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8502              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8503
8504       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8505              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8506
8507       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8508
8509       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8510
8511       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8512              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8513
8514       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8515              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8516
8517       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8518              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8519
8520       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8521              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8522
8523       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8524
8525       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8526              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8527     }
8528   while (0);
8529
8530   if (immtype == -1)
8531     return false;
8532
8533   if (info)
8534     {
8535       info->element_width = elsize;
8536       info->mvn = emvn != 0;
8537       info->shift = eshift;
8538
8539       unsigned HOST_WIDE_INT imm = 0;
8540
8541       if (immtype >= 12 && immtype <= 15)
8542         info->msl = true;
8543
8544       /* Un-invert bytes of recognized vector, if necessary.  */
8545       if (invmask != 0)
8546         for (i = 0; i < idx; i++)
8547           bytes[i] ^= invmask;
8548
8549       if (immtype == 17)
8550         {
8551           /* FIXME: Broken on 32-bit H_W_I hosts.  */
8552           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8553
8554           for (i = 0; i < 8; i++)
8555             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8556               << (i * BITS_PER_UNIT);
8557
8558
8559           info->value = GEN_INT (imm);
8560         }
8561       else
8562         {
8563           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8564             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8565
8566           /* Construct 'abcdefgh' because the assembler cannot handle
8567              generic constants.  */
8568           if (info->mvn)
8569             imm = ~imm;
8570           imm = (imm >> info->shift) & 0xff;
8571           info->value = GEN_INT (imm);
8572         }
8573     }
8574
8575   return true;
8576 #undef CHECK
8577 }
8578
8579 /* Check of immediate shift constants are within range.  */
8580 bool
8581 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8582 {
8583   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8584   if (left)
8585     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8586   else
8587     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8588 }
8589
8590 /* Return true if X is a uniform vector where all elements
8591    are either the floating-point constant 0.0 or the
8592    integer constant 0.  */
8593 bool
8594 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8595 {
8596   return x == CONST0_RTX (mode);
8597 }
8598
8599 bool
8600 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8601 {
8602   HOST_WIDE_INT imm = INTVAL (x);
8603   int i;
8604
8605   for (i = 0; i < 8; i++)
8606     {
8607       unsigned int byte = imm & 0xff;
8608       if (byte != 0xff && byte != 0)
8609        return false;
8610       imm >>= 8;
8611     }
8612
8613   return true;
8614 }
8615
8616 bool
8617 aarch64_mov_operand_p (rtx x,
8618                        enum aarch64_symbol_context context,
8619                        machine_mode mode)
8620 {
8621   if (GET_CODE (x) == HIGH
8622       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8623     return true;
8624
8625   if (CONST_INT_P (x))
8626     return true;
8627
8628   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8629     return true;
8630
8631   return aarch64_classify_symbolic_expression (x, context)
8632     == SYMBOL_TINY_ABSOLUTE;
8633 }
8634
8635 /* Return a const_int vector of VAL.  */
8636 rtx
8637 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8638 {
8639   int nunits = GET_MODE_NUNITS (mode);
8640   rtvec v = rtvec_alloc (nunits);
8641   int i;
8642
8643   for (i=0; i < nunits; i++)
8644     RTVEC_ELT (v, i) = GEN_INT (val);
8645
8646   return gen_rtx_CONST_VECTOR (mode, v);
8647 }
8648
8649 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
8650
8651 bool
8652 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8653 {
8654   machine_mode vmode;
8655
8656   gcc_assert (!VECTOR_MODE_P (mode));
8657   vmode = aarch64_preferred_simd_mode (mode);
8658   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8659   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8660 }
8661
8662 /* Construct and return a PARALLEL RTX vector with elements numbering the
8663    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8664    the vector - from the perspective of the architecture.  This does not
8665    line up with GCC's perspective on lane numbers, so we end up with
8666    different masks depending on our target endian-ness.  The diagram
8667    below may help.  We must draw the distinction when building masks
8668    which select one half of the vector.  An instruction selecting
8669    architectural low-lanes for a big-endian target, must be described using
8670    a mask selecting GCC high-lanes.
8671
8672                  Big-Endian             Little-Endian
8673
8674 GCC             0   1   2   3           3   2   1   0
8675               | x | x | x | x |       | x | x | x | x |
8676 Architecture    3   2   1   0           3   2   1   0
8677
8678 Low Mask:         { 2, 3 }                { 0, 1 }
8679 High Mask:        { 0, 1 }                { 2, 3 }
8680 */
8681
8682 rtx
8683 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8684 {
8685   int nunits = GET_MODE_NUNITS (mode);
8686   rtvec v = rtvec_alloc (nunits / 2);
8687   int high_base = nunits / 2;
8688   int low_base = 0;
8689   int base;
8690   rtx t1;
8691   int i;
8692
8693   if (BYTES_BIG_ENDIAN)
8694     base = high ? low_base : high_base;
8695   else
8696     base = high ? high_base : low_base;
8697
8698   for (i = 0; i < nunits / 2; i++)
8699     RTVEC_ELT (v, i) = GEN_INT (base + i);
8700
8701   t1 = gen_rtx_PARALLEL (mode, v);
8702   return t1;
8703 }
8704
8705 /* Check OP for validity as a PARALLEL RTX vector with elements
8706    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8707    from the perspective of the architecture.  See the diagram above
8708    aarch64_simd_vect_par_cnst_half for more details.  */
8709
8710 bool
8711 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8712                                        bool high)
8713 {
8714   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8715   HOST_WIDE_INT count_op = XVECLEN (op, 0);
8716   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8717   int i = 0;
8718
8719   if (!VECTOR_MODE_P (mode))
8720     return false;
8721
8722   if (count_op != count_ideal)
8723     return false;
8724
8725   for (i = 0; i < count_ideal; i++)
8726     {
8727       rtx elt_op = XVECEXP (op, 0, i);
8728       rtx elt_ideal = XVECEXP (ideal, 0, i);
8729
8730       if (!CONST_INT_P (elt_op)
8731           || INTVAL (elt_ideal) != INTVAL (elt_op))
8732         return false;
8733     }
8734   return true;
8735 }
8736
8737 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
8738    HIGH (exclusive).  */
8739 void
8740 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8741                           const_tree exp)
8742 {
8743   HOST_WIDE_INT lane;
8744   gcc_assert (CONST_INT_P (operand));
8745   lane = INTVAL (operand);
8746
8747   if (lane < low || lane >= high)
8748   {
8749     if (exp)
8750       error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8751     else
8752       error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8753   }
8754 }
8755
8756 /* Return TRUE if OP is a valid vector addressing mode.  */
8757 bool
8758 aarch64_simd_mem_operand_p (rtx op)
8759 {
8760   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8761                         || REG_P (XEXP (op, 0)));
8762 }
8763
8764 /* Emit a register copy from operand to operand, taking care not to
8765    early-clobber source registers in the process.
8766
8767    COUNT is the number of components into which the copy needs to be
8768    decomposed.  */
8769 void
8770 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8771                                 unsigned int count)
8772 {
8773   unsigned int i;
8774   int rdest = REGNO (operands[0]);
8775   int rsrc = REGNO (operands[1]);
8776
8777   if (!reg_overlap_mentioned_p (operands[0], operands[1])
8778       || rdest < rsrc)
8779     for (i = 0; i < count; i++)
8780       emit_move_insn (gen_rtx_REG (mode, rdest + i),
8781                       gen_rtx_REG (mode, rsrc + i));
8782   else
8783     for (i = 0; i < count; i++)
8784       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8785                       gen_rtx_REG (mode, rsrc + count - i - 1));
8786 }
8787
8788 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8789    one of VSTRUCT modes: OI, CI or XI.  */
8790 int
8791 aarch64_simd_attr_length_move (rtx_insn *insn)
8792 {
8793   machine_mode mode;
8794
8795   extract_insn_cached (insn);
8796
8797   if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8798     {
8799       mode = GET_MODE (recog_data.operand[0]);
8800       switch (mode)
8801         {
8802         case OImode:
8803           return 8;
8804         case CImode:
8805           return 12;
8806         case XImode:
8807           return 16;
8808         default:
8809           gcc_unreachable ();
8810         }
8811     }
8812   return 4;
8813 }
8814
8815 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8816    one of VSTRUCT modes: OI, CI, EI, or XI.  */
8817 int
8818 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8819 {
8820   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8821 }
8822
8823 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
8824    alignment of a vector to 128 bits.  */
8825 static HOST_WIDE_INT
8826 aarch64_simd_vector_alignment (const_tree type)
8827 {
8828   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8829   return MIN (align, 128);
8830 }
8831
8832 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
8833 static bool
8834 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8835 {
8836   if (is_packed)
8837     return false;
8838
8839   /* We guarantee alignment for vectors up to 128-bits.  */
8840   if (tree_int_cst_compare (TYPE_SIZE (type),
8841                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8842     return false;
8843
8844   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
8845   return true;
8846 }
8847
8848 /* If VALS is a vector constant that can be loaded into a register
8849    using DUP, generate instructions to do so and return an RTX to
8850    assign to the register.  Otherwise return NULL_RTX.  */
8851 static rtx
8852 aarch64_simd_dup_constant (rtx vals)
8853 {
8854   machine_mode mode = GET_MODE (vals);
8855   machine_mode inner_mode = GET_MODE_INNER (mode);
8856   int n_elts = GET_MODE_NUNITS (mode);
8857   bool all_same = true;
8858   rtx x;
8859   int i;
8860
8861   if (GET_CODE (vals) != CONST_VECTOR)
8862     return NULL_RTX;
8863
8864   for (i = 1; i < n_elts; ++i)
8865     {
8866       x = CONST_VECTOR_ELT (vals, i);
8867       if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8868         all_same = false;
8869     }
8870
8871   if (!all_same)
8872     return NULL_RTX;
8873
8874   /* We can load this constant by using DUP and a constant in a
8875      single ARM register.  This will be cheaper than a vector
8876      load.  */
8877   x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8878   return gen_rtx_VEC_DUPLICATE (mode, x);
8879 }
8880
8881
8882 /* Generate code to load VALS, which is a PARALLEL containing only
8883    constants (for vec_init) or CONST_VECTOR, efficiently into a
8884    register.  Returns an RTX to copy into the register, or NULL_RTX
8885    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
8886 static rtx
8887 aarch64_simd_make_constant (rtx vals)
8888 {
8889   machine_mode mode = GET_MODE (vals);
8890   rtx const_dup;
8891   rtx const_vec = NULL_RTX;
8892   int n_elts = GET_MODE_NUNITS (mode);
8893   int n_const = 0;
8894   int i;
8895
8896   if (GET_CODE (vals) == CONST_VECTOR)
8897     const_vec = vals;
8898   else if (GET_CODE (vals) == PARALLEL)
8899     {
8900       /* A CONST_VECTOR must contain only CONST_INTs and
8901          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8902          Only store valid constants in a CONST_VECTOR.  */
8903       for (i = 0; i < n_elts; ++i)
8904         {
8905           rtx x = XVECEXP (vals, 0, i);
8906           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8907             n_const++;
8908         }
8909       if (n_const == n_elts)
8910         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8911     }
8912   else
8913     gcc_unreachable ();
8914
8915   if (const_vec != NULL_RTX
8916       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8917     /* Load using MOVI/MVNI.  */
8918     return const_vec;
8919   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8920     /* Loaded using DUP.  */
8921     return const_dup;
8922   else if (const_vec != NULL_RTX)
8923     /* Load from constant pool. We can not take advantage of single-cycle
8924        LD1 because we need a PC-relative addressing mode.  */
8925     return const_vec;
8926   else
8927     /* A PARALLEL containing something not valid inside CONST_VECTOR.
8928        We can not construct an initializer.  */
8929     return NULL_RTX;
8930 }
8931
8932 void
8933 aarch64_expand_vector_init (rtx target, rtx vals)
8934 {
8935   machine_mode mode = GET_MODE (target);
8936   machine_mode inner_mode = GET_MODE_INNER (mode);
8937   int n_elts = GET_MODE_NUNITS (mode);
8938   int n_var = 0;
8939   rtx any_const = NULL_RTX;
8940   bool all_same = true;
8941
8942   for (int i = 0; i < n_elts; ++i)
8943     {
8944       rtx x = XVECEXP (vals, 0, i);
8945       if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8946         ++n_var;
8947       else
8948         any_const = x;
8949
8950       if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8951         all_same = false;
8952     }
8953
8954   if (n_var == 0)
8955     {
8956       rtx constant = aarch64_simd_make_constant (vals);
8957       if (constant != NULL_RTX)
8958         {
8959           emit_move_insn (target, constant);
8960           return;
8961         }
8962     }
8963
8964   /* Splat a single non-constant element if we can.  */
8965   if (all_same)
8966     {
8967       rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8968       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8969       return;
8970     }
8971
8972   /* Half the fields (or less) are non-constant.  Load constant then overwrite
8973      varying fields.  Hope that this is more efficient than using the stack.  */
8974   if (n_var <= n_elts/2)
8975     {
8976       rtx copy = copy_rtx (vals);
8977
8978       /* Load constant part of vector.  We really don't care what goes into the
8979          parts we will overwrite, but we're more likely to be able to load the
8980          constant efficiently if it has fewer, larger, repeating parts
8981          (see aarch64_simd_valid_immediate).  */
8982       for (int i = 0; i < n_elts; i++)
8983         {
8984           rtx x = XVECEXP (vals, 0, i);
8985           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8986             continue;
8987           rtx subst = any_const;
8988           for (int bit = n_elts / 2; bit > 0; bit /= 2)
8989             {
8990               /* Look in the copied vector, as more elements are const.  */
8991               rtx test = XVECEXP (copy, 0, i ^ bit);
8992               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
8993                 {
8994                   subst = test;
8995                   break;
8996                 }
8997             }
8998           XVECEXP (copy, 0, i) = subst;
8999         }
9000       aarch64_expand_vector_init (target, copy);
9001
9002       /* Insert variables.  */
9003       enum insn_code icode = optab_handler (vec_set_optab, mode);
9004       gcc_assert (icode != CODE_FOR_nothing);
9005
9006       for (int i = 0; i < n_elts; i++)
9007         {
9008           rtx x = XVECEXP (vals, 0, i);
9009           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9010             continue;
9011           x = copy_to_mode_reg (inner_mode, x);
9012           emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
9013         }
9014       return;
9015     }
9016
9017   /* Construct the vector in memory one field at a time
9018      and load the whole vector.  */
9019   rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
9020   for (int i = 0; i < n_elts; i++)
9021     emit_move_insn (adjust_address_nv (mem, inner_mode,
9022                                     i * GET_MODE_SIZE (inner_mode)),
9023                     XVECEXP (vals, 0, i));
9024   emit_move_insn (target, mem);
9025
9026 }
9027
9028 static unsigned HOST_WIDE_INT
9029 aarch64_shift_truncation_mask (machine_mode mode)
9030 {
9031   return
9032     (aarch64_vector_mode_supported_p (mode)
9033      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
9034 }
9035
9036 #ifndef TLS_SECTION_ASM_FLAG
9037 #define TLS_SECTION_ASM_FLAG 'T'
9038 #endif
9039
9040 void
9041 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
9042                                tree decl ATTRIBUTE_UNUSED)
9043 {
9044   char flagchars[10], *f = flagchars;
9045
9046   /* If we have already declared this section, we can use an
9047      abbreviated form to switch back to it -- unless this section is
9048      part of a COMDAT groups, in which case GAS requires the full
9049      declaration every time.  */
9050   if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9051       && (flags & SECTION_DECLARED))
9052     {
9053       fprintf (asm_out_file, "\t.section\t%s\n", name);
9054       return;
9055     }
9056
9057   if (!(flags & SECTION_DEBUG))
9058     *f++ = 'a';
9059   if (flags & SECTION_WRITE)
9060     *f++ = 'w';
9061   if (flags & SECTION_CODE)
9062     *f++ = 'x';
9063   if (flags & SECTION_SMALL)
9064     *f++ = 's';
9065   if (flags & SECTION_MERGE)
9066     *f++ = 'M';
9067   if (flags & SECTION_STRINGS)
9068     *f++ = 'S';
9069   if (flags & SECTION_TLS)
9070     *f++ = TLS_SECTION_ASM_FLAG;
9071   if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9072     *f++ = 'G';
9073   *f = '\0';
9074
9075   fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9076
9077   if (!(flags & SECTION_NOTYPE))
9078     {
9079       const char *type;
9080       const char *format;
9081
9082       if (flags & SECTION_BSS)
9083         type = "nobits";
9084       else
9085         type = "progbits";
9086
9087 #ifdef TYPE_OPERAND_FMT
9088       format = "," TYPE_OPERAND_FMT;
9089 #else
9090       format = ",@%s";
9091 #endif
9092
9093       fprintf (asm_out_file, format, type);
9094
9095       if (flags & SECTION_ENTSIZE)
9096         fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9097       if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9098         {
9099           if (TREE_CODE (decl) == IDENTIFIER_NODE)
9100             fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9101           else
9102             fprintf (asm_out_file, ",%s,comdat",
9103                      IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9104         }
9105     }
9106
9107   putc ('\n', asm_out_file);
9108 }
9109
9110 /* Select a format to encode pointers in exception handling data.  */
9111 int
9112 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9113 {
9114    int type;
9115    switch (aarch64_cmodel)
9116      {
9117      case AARCH64_CMODEL_TINY:
9118      case AARCH64_CMODEL_TINY_PIC:
9119      case AARCH64_CMODEL_SMALL:
9120      case AARCH64_CMODEL_SMALL_PIC:
9121        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
9122           for everything.  */
9123        type = DW_EH_PE_sdata4;
9124        break;
9125      default:
9126        /* No assumptions here.  8-byte relocs required.  */
9127        type = DW_EH_PE_sdata8;
9128        break;
9129      }
9130    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9131 }
9132
9133 /* Emit load exclusive.  */
9134
9135 static void
9136 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9137                              rtx mem, rtx model_rtx)
9138 {
9139   rtx (*gen) (rtx, rtx, rtx);
9140
9141   switch (mode)
9142     {
9143     case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9144     case HImode: gen = gen_aarch64_load_exclusivehi; break;
9145     case SImode: gen = gen_aarch64_load_exclusivesi; break;
9146     case DImode: gen = gen_aarch64_load_exclusivedi; break;
9147     default:
9148       gcc_unreachable ();
9149     }
9150
9151   emit_insn (gen (rval, mem, model_rtx));
9152 }
9153
9154 /* Emit store exclusive.  */
9155
9156 static void
9157 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9158                               rtx rval, rtx mem, rtx model_rtx)
9159 {
9160   rtx (*gen) (rtx, rtx, rtx, rtx);
9161
9162   switch (mode)
9163     {
9164     case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9165     case HImode: gen = gen_aarch64_store_exclusivehi; break;
9166     case SImode: gen = gen_aarch64_store_exclusivesi; break;
9167     case DImode: gen = gen_aarch64_store_exclusivedi; break;
9168     default:
9169       gcc_unreachable ();
9170     }
9171
9172   emit_insn (gen (bval, rval, mem, model_rtx));
9173 }
9174
9175 /* Mark the previous jump instruction as unlikely.  */
9176
9177 static void
9178 aarch64_emit_unlikely_jump (rtx insn)
9179 {
9180   int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9181
9182   insn = emit_jump_insn (insn);
9183   add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9184 }
9185
9186 /* Expand a compare and swap pattern.  */
9187
9188 void
9189 aarch64_expand_compare_and_swap (rtx operands[])
9190 {
9191   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9192   machine_mode mode, cmp_mode;
9193   rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9194
9195   bval = operands[0];
9196   rval = operands[1];
9197   mem = operands[2];
9198   oldval = operands[3];
9199   newval = operands[4];
9200   is_weak = operands[5];
9201   mod_s = operands[6];
9202   mod_f = operands[7];
9203   mode = GET_MODE (mem);
9204   cmp_mode = mode;
9205
9206   /* Normally the succ memory model must be stronger than fail, but in the
9207      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9208      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
9209
9210   if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9211       && INTVAL (mod_s) == MEMMODEL_RELEASE)
9212     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9213
9214   switch (mode)
9215     {
9216     case QImode:
9217     case HImode:
9218       /* For short modes, we're going to perform the comparison in SImode,
9219          so do the zero-extension now.  */
9220       cmp_mode = SImode;
9221       rval = gen_reg_rtx (SImode);
9222       oldval = convert_modes (SImode, mode, oldval, true);
9223       /* Fall through.  */
9224
9225     case SImode:
9226     case DImode:
9227       /* Force the value into a register if needed.  */
9228       if (!aarch64_plus_operand (oldval, mode))
9229         oldval = force_reg (cmp_mode, oldval);
9230       break;
9231
9232     default:
9233       gcc_unreachable ();
9234     }
9235
9236   switch (mode)
9237     {
9238     case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9239     case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9240     case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9241     case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9242     default:
9243       gcc_unreachable ();
9244     }
9245
9246   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9247
9248   if (mode == QImode || mode == HImode)
9249     emit_move_insn (operands[1], gen_lowpart (mode, rval));
9250
9251   x = gen_rtx_REG (CCmode, CC_REGNUM);
9252   x = gen_rtx_EQ (SImode, x, const0_rtx);
9253   emit_insn (gen_rtx_SET (VOIDmode, bval, x));
9254 }
9255
9256 /* Split a compare and swap pattern.  */
9257
9258 void
9259 aarch64_split_compare_and_swap (rtx operands[])
9260 {
9261   rtx rval, mem, oldval, newval, scratch;
9262   machine_mode mode;
9263   bool is_weak;
9264   rtx_code_label *label1, *label2;
9265   rtx x, cond;
9266
9267   rval = operands[0];
9268   mem = operands[1];
9269   oldval = operands[2];
9270   newval = operands[3];
9271   is_weak = (operands[4] != const0_rtx);
9272   scratch = operands[7];
9273   mode = GET_MODE (mem);
9274
9275   label1 = NULL;
9276   if (!is_weak)
9277     {
9278       label1 = gen_label_rtx ();
9279       emit_label (label1);
9280     }
9281   label2 = gen_label_rtx ();
9282
9283   aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9284
9285   cond = aarch64_gen_compare_reg (NE, rval, oldval);
9286   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9287   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9288                             gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9289   aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9290
9291   aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9292
9293   if (!is_weak)
9294     {
9295       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9296       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9297                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9298       aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9299     }
9300   else
9301     {
9302       cond = gen_rtx_REG (CCmode, CC_REGNUM);
9303       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9304       emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9305     }
9306
9307   emit_label (label2);
9308 }
9309
9310 /* Split an atomic operation.  */
9311
9312 void
9313 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9314                      rtx value, rtx model_rtx, rtx cond)
9315 {
9316   machine_mode mode = GET_MODE (mem);
9317   machine_mode wmode = (mode == DImode ? DImode : SImode);
9318   rtx_code_label *label;
9319   rtx x;
9320
9321   label = gen_label_rtx ();
9322   emit_label (label);
9323
9324   if (new_out)
9325     new_out = gen_lowpart (wmode, new_out);
9326   if (old_out)
9327     old_out = gen_lowpart (wmode, old_out);
9328   else
9329     old_out = new_out;
9330   value = simplify_gen_subreg (wmode, value, mode, 0);
9331
9332   aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9333
9334   switch (code)
9335     {
9336     case SET:
9337       new_out = value;
9338       break;
9339
9340     case NOT:
9341       x = gen_rtx_AND (wmode, old_out, value);
9342       emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9343       x = gen_rtx_NOT (wmode, new_out);
9344       emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9345       break;
9346
9347     case MINUS:
9348       if (CONST_INT_P (value))
9349         {
9350           value = GEN_INT (-INTVAL (value));
9351           code = PLUS;
9352         }
9353       /* Fall through.  */
9354
9355     default:
9356       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9357       emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9358       break;
9359     }
9360
9361   aarch64_emit_store_exclusive (mode, cond, mem,
9362                                 gen_lowpart (mode, new_out), model_rtx);
9363
9364   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9365   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9366                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9367   aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9368 }
9369
9370 static void
9371 aarch64_print_extension (void)
9372 {
9373   const struct aarch64_option_extension *opt = NULL;
9374
9375   for (opt = all_extensions; opt->name != NULL; opt++)
9376     if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9377       asm_fprintf (asm_out_file, "+%s", opt->name);
9378
9379   asm_fprintf (asm_out_file, "\n");
9380 }
9381
9382 static void
9383 aarch64_start_file (void)
9384 {
9385   if (selected_arch)
9386     {
9387       asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9388       aarch64_print_extension ();
9389     }
9390   else if (selected_cpu)
9391     {
9392       const char *truncated_name
9393             = aarch64_rewrite_selected_cpu (selected_cpu->name);
9394       asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9395       aarch64_print_extension ();
9396     }
9397   default_file_start();
9398 }
9399
9400 /* Target hook for c_mode_for_suffix.  */
9401 static machine_mode
9402 aarch64_c_mode_for_suffix (char suffix)
9403 {
9404   if (suffix == 'q')
9405     return TFmode;
9406
9407   return VOIDmode;
9408 }
9409
9410 /* We can only represent floating point constants which will fit in
9411    "quarter-precision" values.  These values are characterised by
9412    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
9413    by:
9414
9415    (-1)^s * (n/16) * 2^r
9416
9417    Where:
9418      's' is the sign bit.
9419      'n' is an integer in the range 16 <= n <= 31.
9420      'r' is an integer in the range -3 <= r <= 4.  */
9421
9422 /* Return true iff X can be represented by a quarter-precision
9423    floating point immediate operand X.  Note, we cannot represent 0.0.  */
9424 bool
9425 aarch64_float_const_representable_p (rtx x)
9426 {
9427   /* This represents our current view of how many bits
9428      make up the mantissa.  */
9429   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9430   int exponent;
9431   unsigned HOST_WIDE_INT mantissa, mask;
9432   REAL_VALUE_TYPE r, m;
9433   bool fail;
9434
9435   if (!CONST_DOUBLE_P (x))
9436     return false;
9437
9438   if (GET_MODE (x) == VOIDmode)
9439     return false;
9440
9441   REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9442
9443   /* We cannot represent infinities, NaNs or +/-zero.  We won't
9444      know if we have +zero until we analyse the mantissa, but we
9445      can reject the other invalid values.  */
9446   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9447       || REAL_VALUE_MINUS_ZERO (r))
9448     return false;
9449
9450   /* Extract exponent.  */
9451   r = real_value_abs (&r);
9452   exponent = REAL_EXP (&r);
9453
9454   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9455      highest (sign) bit, with a fixed binary point at bit point_pos.
9456      m1 holds the low part of the mantissa, m2 the high part.
9457      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9458      bits for the mantissa, this can fail (low bits will be lost).  */
9459   real_ldexp (&m, &r, point_pos - exponent);
9460   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9461
9462   /* If the low part of the mantissa has bits set we cannot represent
9463      the value.  */
9464   if (w.elt (0) != 0)
9465     return false;
9466   /* We have rejected the lower HOST_WIDE_INT, so update our
9467      understanding of how many bits lie in the mantissa and
9468      look only at the high HOST_WIDE_INT.  */
9469   mantissa = w.elt (1);
9470   point_pos -= HOST_BITS_PER_WIDE_INT;
9471
9472   /* We can only represent values with a mantissa of the form 1.xxxx.  */
9473   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9474   if ((mantissa & mask) != 0)
9475     return false;
9476
9477   /* Having filtered unrepresentable values, we may now remove all
9478      but the highest 5 bits.  */
9479   mantissa >>= point_pos - 5;
9480
9481   /* We cannot represent the value 0.0, so reject it.  This is handled
9482      elsewhere.  */
9483   if (mantissa == 0)
9484     return false;
9485
9486   /* Then, as bit 4 is always set, we can mask it off, leaving
9487      the mantissa in the range [0, 15].  */
9488   mantissa &= ~(1 << 4);
9489   gcc_assert (mantissa <= 15);
9490
9491   /* GCC internally does not use IEEE754-like encoding (where normalized
9492      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
9493      Our mantissa values are shifted 4 places to the left relative to
9494      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9495      by 5 places to correct for GCC's representation.  */
9496   exponent = 5 - exponent;
9497
9498   return (exponent >= 0 && exponent <= 7);
9499 }
9500
9501 char*
9502 aarch64_output_simd_mov_immediate (rtx const_vector,
9503                                    machine_mode mode,
9504                                    unsigned width)
9505 {
9506   bool is_valid;
9507   static char templ[40];
9508   const char *mnemonic;
9509   const char *shift_op;
9510   unsigned int lane_count = 0;
9511   char element_char;
9512
9513   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9514
9515   /* This will return true to show const_vector is legal for use as either
9516      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
9517      also update INFO to show how the immediate should be generated.  */
9518   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9519   gcc_assert (is_valid);
9520
9521   element_char = sizetochar (info.element_width);
9522   lane_count = width / info.element_width;
9523
9524   mode = GET_MODE_INNER (mode);
9525   if (mode == SFmode || mode == DFmode)
9526     {
9527       gcc_assert (info.shift == 0 && ! info.mvn);
9528       if (aarch64_float_const_zero_rtx_p (info.value))
9529         info.value = GEN_INT (0);
9530       else
9531         {
9532 #define buf_size 20
9533           REAL_VALUE_TYPE r;
9534           REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9535           char float_buf[buf_size] = {'\0'};
9536           real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9537 #undef buf_size
9538
9539           if (lane_count == 1)
9540             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9541           else
9542             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9543                       lane_count, element_char, float_buf);
9544           return templ;
9545         }
9546     }
9547
9548   mnemonic = info.mvn ? "mvni" : "movi";
9549   shift_op = info.msl ? "msl" : "lsl";
9550
9551   if (lane_count == 1)
9552     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9553               mnemonic, UINTVAL (info.value));
9554   else if (info.shift)
9555     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9556               ", %s %d", mnemonic, lane_count, element_char,
9557               UINTVAL (info.value), shift_op, info.shift);
9558   else
9559     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9560               mnemonic, lane_count, element_char, UINTVAL (info.value));
9561   return templ;
9562 }
9563
9564 char*
9565 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9566                                           machine_mode mode)
9567 {
9568   machine_mode vmode;
9569
9570   gcc_assert (!VECTOR_MODE_P (mode));
9571   vmode = aarch64_simd_container_mode (mode, 64);
9572   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9573   return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9574 }
9575
9576 /* Split operands into moves from op[1] + op[2] into op[0].  */
9577
9578 void
9579 aarch64_split_combinev16qi (rtx operands[3])
9580 {
9581   unsigned int dest = REGNO (operands[0]);
9582   unsigned int src1 = REGNO (operands[1]);
9583   unsigned int src2 = REGNO (operands[2]);
9584   machine_mode halfmode = GET_MODE (operands[1]);
9585   unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9586   rtx destlo, desthi;
9587
9588   gcc_assert (halfmode == V16QImode);
9589
9590   if (src1 == dest && src2 == dest + halfregs)
9591     {
9592       /* No-op move.  Can't split to nothing; emit something.  */
9593       emit_note (NOTE_INSN_DELETED);
9594       return;
9595     }
9596
9597   /* Preserve register attributes for variable tracking.  */
9598   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9599   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9600                                GET_MODE_SIZE (halfmode));
9601
9602   /* Special case of reversed high/low parts.  */
9603   if (reg_overlap_mentioned_p (operands[2], destlo)
9604       && reg_overlap_mentioned_p (operands[1], desthi))
9605     {
9606       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9607       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9608       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9609     }
9610   else if (!reg_overlap_mentioned_p (operands[2], destlo))
9611     {
9612       /* Try to avoid unnecessary moves if part of the result
9613          is in the right place already.  */
9614       if (src1 != dest)
9615         emit_move_insn (destlo, operands[1]);
9616       if (src2 != dest + halfregs)
9617         emit_move_insn (desthi, operands[2]);
9618     }
9619   else
9620     {
9621       if (src2 != dest + halfregs)
9622         emit_move_insn (desthi, operands[2]);
9623       if (src1 != dest)
9624         emit_move_insn (destlo, operands[1]);
9625     }
9626 }
9627
9628 /* vec_perm support.  */
9629
9630 #define MAX_VECT_LEN 16
9631
9632 struct expand_vec_perm_d
9633 {
9634   rtx target, op0, op1;
9635   unsigned char perm[MAX_VECT_LEN];
9636   machine_mode vmode;
9637   unsigned char nelt;
9638   bool one_vector_p;
9639   bool testing_p;
9640 };
9641
9642 /* Generate a variable permutation.  */
9643
9644 static void
9645 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9646 {
9647   machine_mode vmode = GET_MODE (target);
9648   bool one_vector_p = rtx_equal_p (op0, op1);
9649
9650   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9651   gcc_checking_assert (GET_MODE (op0) == vmode);
9652   gcc_checking_assert (GET_MODE (op1) == vmode);
9653   gcc_checking_assert (GET_MODE (sel) == vmode);
9654   gcc_checking_assert (TARGET_SIMD);
9655
9656   if (one_vector_p)
9657     {
9658       if (vmode == V8QImode)
9659         {
9660           /* Expand the argument to a V16QI mode by duplicating it.  */
9661           rtx pair = gen_reg_rtx (V16QImode);
9662           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9663           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9664         }
9665       else
9666         {
9667           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9668         }
9669     }
9670   else
9671     {
9672       rtx pair;
9673
9674       if (vmode == V8QImode)
9675         {
9676           pair = gen_reg_rtx (V16QImode);
9677           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9678           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9679         }
9680       else
9681         {
9682           pair = gen_reg_rtx (OImode);
9683           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9684           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9685         }
9686     }
9687 }
9688
9689 void
9690 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9691 {
9692   machine_mode vmode = GET_MODE (target);
9693   unsigned int nelt = GET_MODE_NUNITS (vmode);
9694   bool one_vector_p = rtx_equal_p (op0, op1);
9695   rtx mask;
9696
9697   /* The TBL instruction does not use a modulo index, so we must take care
9698      of that ourselves.  */
9699   mask = aarch64_simd_gen_const_vector_dup (vmode,
9700       one_vector_p ? nelt - 1 : 2 * nelt - 1);
9701   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9702
9703   /* For big-endian, we also need to reverse the index within the vector
9704      (but not which vector).  */
9705   if (BYTES_BIG_ENDIAN)
9706     {
9707       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
9708       if (!one_vector_p)
9709         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9710       sel = expand_simple_binop (vmode, XOR, sel, mask,
9711                                  NULL, 0, OPTAB_LIB_WIDEN);
9712     }
9713   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9714 }
9715
9716 /* Recognize patterns suitable for the TRN instructions.  */
9717 static bool
9718 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9719 {
9720   unsigned int i, odd, mask, nelt = d->nelt;
9721   rtx out, in0, in1, x;
9722   rtx (*gen) (rtx, rtx, rtx);
9723   machine_mode vmode = d->vmode;
9724
9725   if (GET_MODE_UNIT_SIZE (vmode) > 8)
9726     return false;
9727
9728   /* Note that these are little-endian tests.
9729      We correct for big-endian later.  */
9730   if (d->perm[0] == 0)
9731     odd = 0;
9732   else if (d->perm[0] == 1)
9733     odd = 1;
9734   else
9735     return false;
9736   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9737
9738   for (i = 0; i < nelt; i += 2)
9739     {
9740       if (d->perm[i] != i + odd)
9741         return false;
9742       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9743         return false;
9744     }
9745
9746   /* Success!  */
9747   if (d->testing_p)
9748     return true;
9749
9750   in0 = d->op0;
9751   in1 = d->op1;
9752   if (BYTES_BIG_ENDIAN)
9753     {
9754       x = in0, in0 = in1, in1 = x;
9755       odd = !odd;
9756     }
9757   out = d->target;
9758
9759   if (odd)
9760     {
9761       switch (vmode)
9762         {
9763         case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9764         case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9765         case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9766         case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9767         case V4SImode: gen = gen_aarch64_trn2v4si; break;
9768         case V2SImode: gen = gen_aarch64_trn2v2si; break;
9769         case V2DImode: gen = gen_aarch64_trn2v2di; break;
9770         case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9771         case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9772         case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9773         default:
9774           return false;
9775         }
9776     }
9777   else
9778     {
9779       switch (vmode)
9780         {
9781         case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9782         case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9783         case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9784         case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9785         case V4SImode: gen = gen_aarch64_trn1v4si; break;
9786         case V2SImode: gen = gen_aarch64_trn1v2si; break;
9787         case V2DImode: gen = gen_aarch64_trn1v2di; break;
9788         case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9789         case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9790         case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9791         default:
9792           return false;
9793         }
9794     }
9795
9796   emit_insn (gen (out, in0, in1));
9797   return true;
9798 }
9799
9800 /* Recognize patterns suitable for the UZP instructions.  */
9801 static bool
9802 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9803 {
9804   unsigned int i, odd, mask, nelt = d->nelt;
9805   rtx out, in0, in1, x;
9806   rtx (*gen) (rtx, rtx, rtx);
9807   machine_mode vmode = d->vmode;
9808
9809   if (GET_MODE_UNIT_SIZE (vmode) > 8)
9810     return false;
9811
9812   /* Note that these are little-endian tests.
9813      We correct for big-endian later.  */
9814   if (d->perm[0] == 0)
9815     odd = 0;
9816   else if (d->perm[0] == 1)
9817     odd = 1;
9818   else
9819     return false;
9820   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9821
9822   for (i = 0; i < nelt; i++)
9823     {
9824       unsigned elt = (i * 2 + odd) & mask;
9825       if (d->perm[i] != elt)
9826         return false;
9827     }
9828
9829   /* Success!  */
9830   if (d->testing_p)
9831     return true;
9832
9833   in0 = d->op0;
9834   in1 = d->op1;
9835   if (BYTES_BIG_ENDIAN)
9836     {
9837       x = in0, in0 = in1, in1 = x;
9838       odd = !odd;
9839     }
9840   out = d->target;
9841
9842   if (odd)
9843     {
9844       switch (vmode)
9845         {
9846         case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9847         case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9848         case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9849         case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9850         case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9851         case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9852         case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9853         case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9854         case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9855         case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9856         default:
9857           return false;
9858         }
9859     }
9860   else
9861     {
9862       switch (vmode)
9863         {
9864         case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9865         case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9866         case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9867         case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9868         case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9869         case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9870         case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9871         case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9872         case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9873         case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9874         default:
9875           return false;
9876         }
9877     }
9878
9879   emit_insn (gen (out, in0, in1));
9880   return true;
9881 }
9882
9883 /* Recognize patterns suitable for the ZIP instructions.  */
9884 static bool
9885 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9886 {
9887   unsigned int i, high, mask, nelt = d->nelt;
9888   rtx out, in0, in1, x;
9889   rtx (*gen) (rtx, rtx, rtx);
9890   machine_mode vmode = d->vmode;
9891
9892   if (GET_MODE_UNIT_SIZE (vmode) > 8)
9893     return false;
9894
9895   /* Note that these are little-endian tests.
9896      We correct for big-endian later.  */
9897   high = nelt / 2;
9898   if (d->perm[0] == high)
9899     /* Do Nothing.  */
9900     ;
9901   else if (d->perm[0] == 0)
9902     high = 0;
9903   else
9904     return false;
9905   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9906
9907   for (i = 0; i < nelt / 2; i++)
9908     {
9909       unsigned elt = (i + high) & mask;
9910       if (d->perm[i * 2] != elt)
9911         return false;
9912       elt = (elt + nelt) & mask;
9913       if (d->perm[i * 2 + 1] != elt)
9914         return false;
9915     }
9916
9917   /* Success!  */
9918   if (d->testing_p)
9919     return true;
9920
9921   in0 = d->op0;
9922   in1 = d->op1;
9923   if (BYTES_BIG_ENDIAN)
9924     {
9925       x = in0, in0 = in1, in1 = x;
9926       high = !high;
9927     }
9928   out = d->target;
9929
9930   if (high)
9931     {
9932       switch (vmode)
9933         {
9934         case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9935         case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9936         case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9937         case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9938         case V4SImode: gen = gen_aarch64_zip2v4si; break;
9939         case V2SImode: gen = gen_aarch64_zip2v2si; break;
9940         case V2DImode: gen = gen_aarch64_zip2v2di; break;
9941         case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9942         case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9943         case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9944         default:
9945           return false;
9946         }
9947     }
9948   else
9949     {
9950       switch (vmode)
9951         {
9952         case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9953         case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9954         case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9955         case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9956         case V4SImode: gen = gen_aarch64_zip1v4si; break;
9957         case V2SImode: gen = gen_aarch64_zip1v2si; break;
9958         case V2DImode: gen = gen_aarch64_zip1v2di; break;
9959         case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9960         case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9961         case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9962         default:
9963           return false;
9964         }
9965     }
9966
9967   emit_insn (gen (out, in0, in1));
9968   return true;
9969 }
9970
9971 /* Recognize patterns for the EXT insn.  */
9972
9973 static bool
9974 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9975 {
9976   unsigned int i, nelt = d->nelt;
9977   rtx (*gen) (rtx, rtx, rtx, rtx);
9978   rtx offset;
9979
9980   unsigned int location = d->perm[0]; /* Always < nelt.  */
9981
9982   /* Check if the extracted indices are increasing by one.  */
9983   for (i = 1; i < nelt; i++)
9984     {
9985       unsigned int required = location + i;
9986       if (d->one_vector_p)
9987         {
9988           /* We'll pass the same vector in twice, so allow indices to wrap.  */
9989           required &= (nelt - 1);
9990         }
9991       if (d->perm[i] != required)
9992         return false;
9993     }
9994
9995   switch (d->vmode)
9996     {
9997     case V16QImode: gen = gen_aarch64_extv16qi; break;
9998     case V8QImode: gen = gen_aarch64_extv8qi; break;
9999     case V4HImode: gen = gen_aarch64_extv4hi; break;
10000     case V8HImode: gen = gen_aarch64_extv8hi; break;
10001     case V2SImode: gen = gen_aarch64_extv2si; break;
10002     case V4SImode: gen = gen_aarch64_extv4si; break;
10003     case V2SFmode: gen = gen_aarch64_extv2sf; break;
10004     case V4SFmode: gen = gen_aarch64_extv4sf; break;
10005     case V2DImode: gen = gen_aarch64_extv2di; break;
10006     case V2DFmode: gen = gen_aarch64_extv2df; break;
10007     default:
10008       return false;
10009     }
10010
10011   /* Success! */
10012   if (d->testing_p)
10013     return true;
10014
10015   /* The case where (location == 0) is a no-op for both big- and little-endian,
10016      and is removed by the mid-end at optimization levels -O1 and higher.  */
10017
10018   if (BYTES_BIG_ENDIAN && (location != 0))
10019     {
10020       /* After setup, we want the high elements of the first vector (stored
10021          at the LSB end of the register), and the low elements of the second
10022          vector (stored at the MSB end of the register). So swap.  */
10023       std::swap (d->op0, d->op1);
10024       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
10025       location = nelt - location;
10026     }
10027
10028   offset = GEN_INT (location);
10029   emit_insn (gen (d->target, d->op0, d->op1, offset));
10030   return true;
10031 }
10032
10033 /* Recognize patterns for the REV insns.  */
10034
10035 static bool
10036 aarch64_evpc_rev (struct expand_vec_perm_d *d)
10037 {
10038   unsigned int i, j, diff, nelt = d->nelt;
10039   rtx (*gen) (rtx, rtx);
10040
10041   if (!d->one_vector_p)
10042     return false;
10043
10044   diff = d->perm[0];
10045   switch (diff)
10046     {
10047     case 7:
10048       switch (d->vmode)
10049         {
10050         case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10051         case V8QImode: gen = gen_aarch64_rev64v8qi;  break;
10052         default:
10053           return false;
10054         }
10055       break;
10056     case 3:
10057       switch (d->vmode)
10058         {
10059         case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10060         case V8QImode: gen = gen_aarch64_rev32v8qi;  break;
10061         case V8HImode: gen = gen_aarch64_rev64v8hi;  break;
10062         case V4HImode: gen = gen_aarch64_rev64v4hi;  break;
10063         default:
10064           return false;
10065         }
10066       break;
10067     case 1:
10068       switch (d->vmode)
10069         {
10070         case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10071         case V8QImode: gen = gen_aarch64_rev16v8qi;  break;
10072         case V8HImode: gen = gen_aarch64_rev32v8hi;  break;
10073         case V4HImode: gen = gen_aarch64_rev32v4hi;  break;
10074         case V4SImode: gen = gen_aarch64_rev64v4si;  break;
10075         case V2SImode: gen = gen_aarch64_rev64v2si;  break;
10076         case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
10077         case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
10078         default:
10079           return false;
10080         }
10081       break;
10082     default:
10083       return false;
10084     }
10085
10086   for (i = 0; i < nelt ; i += diff + 1)
10087     for (j = 0; j <= diff; j += 1)
10088       {
10089         /* This is guaranteed to be true as the value of diff
10090            is 7, 3, 1 and we should have enough elements in the
10091            queue to generate this.  Getting a vector mask with a
10092            value of diff other than these values implies that
10093            something is wrong by the time we get here.  */
10094         gcc_assert (i + j < nelt);
10095         if (d->perm[i + j] != i + diff - j)
10096           return false;
10097       }
10098
10099   /* Success! */
10100   if (d->testing_p)
10101     return true;
10102
10103   emit_insn (gen (d->target, d->op0));
10104   return true;
10105 }
10106
10107 static bool
10108 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10109 {
10110   rtx (*gen) (rtx, rtx, rtx);
10111   rtx out = d->target;
10112   rtx in0;
10113   machine_mode vmode = d->vmode;
10114   unsigned int i, elt, nelt = d->nelt;
10115   rtx lane;
10116
10117   elt = d->perm[0];
10118   for (i = 1; i < nelt; i++)
10119     {
10120       if (elt != d->perm[i])
10121         return false;
10122     }
10123
10124   /* The generic preparation in aarch64_expand_vec_perm_const_1
10125      swaps the operand order and the permute indices if it finds
10126      d->perm[0] to be in the second operand.  Thus, we can always
10127      use d->op0 and need not do any extra arithmetic to get the
10128      correct lane number.  */
10129   in0 = d->op0;
10130   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
10131
10132   switch (vmode)
10133     {
10134     case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10135     case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10136     case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10137     case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10138     case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10139     case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10140     case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10141     case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10142     case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10143     case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10144     default:
10145       return false;
10146     }
10147
10148   emit_insn (gen (out, in0, lane));
10149   return true;
10150 }
10151
10152 static bool
10153 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10154 {
10155   rtx rperm[MAX_VECT_LEN], sel;
10156   machine_mode vmode = d->vmode;
10157   unsigned int i, nelt = d->nelt;
10158
10159   if (d->testing_p)
10160     return true;
10161
10162   /* Generic code will try constant permutation twice.  Once with the
10163      original mode and again with the elements lowered to QImode.
10164      So wait and don't do the selector expansion ourselves.  */
10165   if (vmode != V8QImode && vmode != V16QImode)
10166     return false;
10167
10168   for (i = 0; i < nelt; ++i)
10169     {
10170       int nunits = GET_MODE_NUNITS (vmode);
10171
10172       /* If big-endian and two vectors we end up with a weird mixed-endian
10173          mode on NEON.  Reverse the index within each word but not the word
10174          itself.  */
10175       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10176                                            : d->perm[i]);
10177     }
10178   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10179   sel = force_reg (vmode, sel);
10180
10181   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10182   return true;
10183 }
10184
10185 static bool
10186 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10187 {
10188   /* The pattern matching functions above are written to look for a small
10189      number to begin the sequence (0, 1, N/2).  If we begin with an index
10190      from the second operand, we can swap the operands.  */
10191   if (d->perm[0] >= d->nelt)
10192     {
10193       unsigned i, nelt = d->nelt;
10194
10195       gcc_assert (nelt == (nelt & -nelt));
10196       for (i = 0; i < nelt; ++i)
10197         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
10198
10199       std::swap (d->op0, d->op1);
10200     }
10201
10202   if (TARGET_SIMD)
10203     {
10204       if (aarch64_evpc_rev (d))
10205         return true;
10206       else if (aarch64_evpc_ext (d))
10207         return true;
10208       else if (aarch64_evpc_dup (d))
10209         return true;
10210       else if (aarch64_evpc_zip (d))
10211         return true;
10212       else if (aarch64_evpc_uzp (d))
10213         return true;
10214       else if (aarch64_evpc_trn (d))
10215         return true;
10216       return aarch64_evpc_tbl (d);
10217     }
10218   return false;
10219 }
10220
10221 /* Expand a vec_perm_const pattern.  */
10222
10223 bool
10224 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10225 {
10226   struct expand_vec_perm_d d;
10227   int i, nelt, which;
10228
10229   d.target = target;
10230   d.op0 = op0;
10231   d.op1 = op1;
10232
10233   d.vmode = GET_MODE (target);
10234   gcc_assert (VECTOR_MODE_P (d.vmode));
10235   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10236   d.testing_p = false;
10237
10238   for (i = which = 0; i < nelt; ++i)
10239     {
10240       rtx e = XVECEXP (sel, 0, i);
10241       int ei = INTVAL (e) & (2 * nelt - 1);
10242       which |= (ei < nelt ? 1 : 2);
10243       d.perm[i] = ei;
10244     }
10245
10246   switch (which)
10247     {
10248     default:
10249       gcc_unreachable ();
10250
10251     case 3:
10252       d.one_vector_p = false;
10253       if (!rtx_equal_p (op0, op1))
10254         break;
10255
10256       /* The elements of PERM do not suggest that only the first operand
10257          is used, but both operands are identical.  Allow easier matching
10258          of the permutation by folding the permutation into the single
10259          input vector.  */
10260       /* Fall Through.  */
10261     case 2:
10262       for (i = 0; i < nelt; ++i)
10263         d.perm[i] &= nelt - 1;
10264       d.op0 = op1;
10265       d.one_vector_p = true;
10266       break;
10267
10268     case 1:
10269       d.op1 = op0;
10270       d.one_vector_p = true;
10271       break;
10272     }
10273
10274   return aarch64_expand_vec_perm_const_1 (&d);
10275 }
10276
10277 static bool
10278 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10279                                      const unsigned char *sel)
10280 {
10281   struct expand_vec_perm_d d;
10282   unsigned int i, nelt, which;
10283   bool ret;
10284
10285   d.vmode = vmode;
10286   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10287   d.testing_p = true;
10288   memcpy (d.perm, sel, nelt);
10289
10290   /* Calculate whether all elements are in one vector.  */
10291   for (i = which = 0; i < nelt; ++i)
10292     {
10293       unsigned char e = d.perm[i];
10294       gcc_assert (e < 2 * nelt);
10295       which |= (e < nelt ? 1 : 2);
10296     }
10297
10298   /* If all elements are from the second vector, reindex as if from the
10299      first vector.  */
10300   if (which == 2)
10301     for (i = 0; i < nelt; ++i)
10302       d.perm[i] -= nelt;
10303
10304   /* Check whether the mask can be applied to a single vector.  */
10305   d.one_vector_p = (which != 3);
10306
10307   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10308   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10309   if (!d.one_vector_p)
10310     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10311
10312   start_sequence ();
10313   ret = aarch64_expand_vec_perm_const_1 (&d);
10314   end_sequence ();
10315
10316   return ret;
10317 }
10318
10319 rtx
10320 aarch64_reverse_mask (enum machine_mode mode)
10321 {
10322   /* We have to reverse each vector because we dont have
10323      a permuted load that can reverse-load according to ABI rules.  */
10324   rtx mask;
10325   rtvec v = rtvec_alloc (16);
10326   int i, j;
10327   int nunits = GET_MODE_NUNITS (mode);
10328   int usize = GET_MODE_UNIT_SIZE (mode);
10329
10330   gcc_assert (BYTES_BIG_ENDIAN);
10331   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10332
10333   for (i = 0; i < nunits; i++)
10334     for (j = 0; j < usize; j++)
10335       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10336   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10337   return force_reg (V16QImode, mask);
10338 }
10339
10340 /* Implement MODES_TIEABLE_P.  */
10341
10342 bool
10343 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10344 {
10345   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10346     return true;
10347
10348   /* We specifically want to allow elements of "structure" modes to
10349      be tieable to the structure.  This more general condition allows
10350      other rarer situations too.  */
10351   if (TARGET_SIMD
10352       && aarch64_vector_mode_p (mode1)
10353       && aarch64_vector_mode_p (mode2))
10354     return true;
10355
10356   return false;
10357 }
10358
10359 /* Return a new RTX holding the result of moving POINTER forward by
10360    AMOUNT bytes.  */
10361
10362 static rtx
10363 aarch64_move_pointer (rtx pointer, int amount)
10364 {
10365   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10366
10367   return adjust_automodify_address (pointer, GET_MODE (pointer),
10368                                     next, amount);
10369 }
10370
10371 /* Return a new RTX holding the result of moving POINTER forward by the
10372    size of the mode it points to.  */
10373
10374 static rtx
10375 aarch64_progress_pointer (rtx pointer)
10376 {
10377   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10378
10379   return aarch64_move_pointer (pointer, amount);
10380 }
10381
10382 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10383    MODE bytes.  */
10384
10385 static void
10386 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10387                                               machine_mode mode)
10388 {
10389   rtx reg = gen_reg_rtx (mode);
10390
10391   /* "Cast" the pointers to the correct mode.  */
10392   *src = adjust_address (*src, mode, 0);
10393   *dst = adjust_address (*dst, mode, 0);
10394   /* Emit the memcpy.  */
10395   emit_move_insn (reg, *src);
10396   emit_move_insn (*dst, reg);
10397   /* Move the pointers forward.  */
10398   *src = aarch64_progress_pointer (*src);
10399   *dst = aarch64_progress_pointer (*dst);
10400 }
10401
10402 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
10403    we succeed, otherwise return false.  */
10404
10405 bool
10406 aarch64_expand_movmem (rtx *operands)
10407 {
10408   unsigned int n;
10409   rtx dst = operands[0];
10410   rtx src = operands[1];
10411   rtx base;
10412   bool speed_p = !optimize_function_for_size_p (cfun);
10413
10414   /* When optimizing for size, give a better estimate of the length of a
10415      memcpy call, but use the default otherwise.  */
10416   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10417
10418   /* We can't do anything smart if the amount to copy is not constant.  */
10419   if (!CONST_INT_P (operands[2]))
10420     return false;
10421
10422   n = UINTVAL (operands[2]);
10423
10424   /* Try to keep the number of instructions low.  For cases below 16 bytes we
10425      need to make at most two moves.  For cases above 16 bytes it will be one
10426      move for each 16 byte chunk, then at most two additional moves.  */
10427   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10428     return false;
10429
10430   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10431   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10432
10433   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10434   src = adjust_automodify_address (src, VOIDmode, base, 0);
10435
10436   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10437      1-byte chunk.  */
10438   if (n < 4)
10439     {
10440       if (n >= 2)
10441         {
10442           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10443           n -= 2;
10444         }
10445
10446       if (n == 1)
10447         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10448
10449       return true;
10450     }
10451
10452   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
10453      4-byte chunk, partially overlapping with the previously copied chunk.  */
10454   if (n < 8)
10455     {
10456       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10457       n -= 4;
10458       if (n > 0)
10459         {
10460           int move = n - 4;
10461
10462           src = aarch64_move_pointer (src, move);
10463           dst = aarch64_move_pointer (dst, move);
10464           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10465         }
10466       return true;
10467     }
10468
10469   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
10470      them, then (if applicable) an 8-byte chunk.  */
10471   while (n >= 8)
10472     {
10473       if (n / 16)
10474         {
10475           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10476           n -= 16;
10477         }
10478       else
10479         {
10480           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10481           n -= 8;
10482         }
10483     }
10484
10485   /* Finish the final bytes of the copy.  We can always do this in one
10486      instruction.  We either copy the exact amount we need, or partially
10487      overlap with the previous chunk we copied and copy 8-bytes.  */
10488   if (n == 0)
10489     return true;
10490   else if (n == 1)
10491     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10492   else if (n == 2)
10493     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10494   else if (n == 4)
10495     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10496   else
10497     {
10498       if (n == 3)
10499         {
10500           src = aarch64_move_pointer (src, -1);
10501           dst = aarch64_move_pointer (dst, -1);
10502           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10503         }
10504       else
10505         {
10506           int move = n - 8;
10507
10508           src = aarch64_move_pointer (src, move);
10509           dst = aarch64_move_pointer (dst, move);
10510           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10511         }
10512     }
10513
10514   return true;
10515 }
10516
10517 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
10518
10519 static unsigned HOST_WIDE_INT
10520 aarch64_asan_shadow_offset (void)
10521 {
10522   return (HOST_WIDE_INT_1 << 36);
10523 }
10524
10525 static bool
10526 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10527                                         unsigned int align,
10528                                         enum by_pieces_operation op,
10529                                         bool speed_p)
10530 {
10531   /* STORE_BY_PIECES can be used when copying a constant string, but
10532      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10533      For now we always fail this and let the move_by_pieces code copy
10534      the string from read-only memory.  */
10535   if (op == STORE_BY_PIECES)
10536     return false;
10537
10538   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10539 }
10540
10541 static enum machine_mode
10542 aarch64_code_to_ccmode (enum rtx_code code)
10543 {
10544   switch (code)
10545     {
10546     case NE:
10547       return CC_DNEmode;
10548
10549     case EQ:
10550       return CC_DEQmode;
10551
10552     case LE:
10553       return CC_DLEmode;
10554
10555     case LT:
10556       return CC_DLTmode;
10557
10558     case GE:
10559       return CC_DGEmode;
10560
10561     case GT:
10562       return CC_DGTmode;
10563
10564     case LEU:
10565       return CC_DLEUmode;
10566
10567     case LTU:
10568       return CC_DLTUmode;
10569
10570     case GEU:
10571       return CC_DGEUmode;
10572
10573     case GTU:
10574       return CC_DGTUmode;
10575
10576     default:
10577       return CCmode;
10578     }
10579 }
10580
10581 static rtx
10582 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10583                         int code, tree treeop0, tree treeop1)
10584 {
10585   enum machine_mode op_mode, cmp_mode, cc_mode;
10586   rtx op0, op1, cmp, target;
10587   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10588   enum insn_code icode;
10589   struct expand_operand ops[4];
10590
10591   cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10592   if (cc_mode == CCmode)
10593     return NULL_RTX;
10594
10595   start_sequence ();
10596   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10597
10598   op_mode = GET_MODE (op0);
10599   if (op_mode == VOIDmode)
10600     op_mode = GET_MODE (op1);
10601
10602   switch (op_mode)
10603     {
10604     case QImode:
10605     case HImode:
10606     case SImode:
10607       cmp_mode = SImode;
10608       icode = CODE_FOR_cmpsi;
10609       break;
10610
10611     case DImode:
10612       cmp_mode = DImode;
10613       icode = CODE_FOR_cmpdi;
10614       break;
10615
10616     default:
10617       end_sequence ();
10618       return NULL_RTX;
10619     }
10620
10621   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10622   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10623   if (!op0 || !op1)
10624     {
10625       end_sequence ();
10626       return NULL_RTX;
10627     }
10628   *prep_seq = get_insns ();
10629   end_sequence ();
10630
10631   cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10632   target = gen_rtx_REG (CCmode, CC_REGNUM);
10633
10634   create_output_operand (&ops[0], target, CCmode);
10635   create_fixed_operand (&ops[1], cmp);
10636   create_fixed_operand (&ops[2], op0);
10637   create_fixed_operand (&ops[3], op1);
10638
10639   start_sequence ();
10640   if (!maybe_expand_insn (icode, 4, ops))
10641     {
10642       end_sequence ();
10643       return NULL_RTX;
10644     }
10645   *gen_seq = get_insns ();
10646   end_sequence ();
10647
10648   return gen_rtx_REG (cc_mode, CC_REGNUM);
10649 }
10650
10651 static rtx
10652 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10653                        tree treeop0, tree treeop1, int bit_code)
10654 {
10655   rtx op0, op1, cmp0, cmp1, target;
10656   enum machine_mode op_mode, cmp_mode, cc_mode;
10657   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10658   enum insn_code icode = CODE_FOR_ccmp_andsi;
10659   struct expand_operand ops[6];
10660
10661   cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10662   if (cc_mode == CCmode)
10663     return NULL_RTX;
10664
10665   push_to_sequence ((rtx_insn*) *prep_seq);
10666   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10667
10668   op_mode = GET_MODE (op0);
10669   if (op_mode == VOIDmode)
10670     op_mode = GET_MODE (op1);
10671
10672   switch (op_mode)
10673     {
10674     case QImode:
10675     case HImode:
10676     case SImode:
10677       cmp_mode = SImode;
10678       icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10679                                                 : CODE_FOR_ccmp_iorsi;
10680       break;
10681
10682     case DImode:
10683       cmp_mode = DImode;
10684       icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10685                                                 : CODE_FOR_ccmp_iordi;
10686       break;
10687
10688     default:
10689       end_sequence ();
10690       return NULL_RTX;
10691     }
10692
10693   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10694   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10695   if (!op0 || !op1)
10696     {
10697       end_sequence ();
10698       return NULL_RTX;
10699     }
10700   *prep_seq = get_insns ();
10701   end_sequence ();
10702
10703   target = gen_rtx_REG (cc_mode, CC_REGNUM);
10704   cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10705   cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10706
10707   create_fixed_operand (&ops[0], prev);
10708   create_fixed_operand (&ops[1], target);
10709   create_fixed_operand (&ops[2], op0);
10710   create_fixed_operand (&ops[3], op1);
10711   create_fixed_operand (&ops[4], cmp0);
10712   create_fixed_operand (&ops[5], cmp1);
10713
10714   push_to_sequence ((rtx_insn*) *gen_seq);
10715   if (!maybe_expand_insn (icode, 6, ops))
10716     {
10717       end_sequence ();
10718       return NULL_RTX;
10719     }
10720
10721   *gen_seq = get_insns ();
10722   end_sequence ();
10723
10724   return target;
10725 }
10726
10727 #undef TARGET_GEN_CCMP_FIRST
10728 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10729
10730 #undef TARGET_GEN_CCMP_NEXT
10731 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10732
10733 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
10734    instruction fusion of some sort.  */
10735
10736 static bool
10737 aarch64_macro_fusion_p (void)
10738 {
10739   return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10740 }
10741
10742
10743 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
10744    should be kept together during scheduling.  */
10745
10746 static bool
10747 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10748 {
10749   rtx set_dest;
10750   rtx prev_set = single_set (prev);
10751   rtx curr_set = single_set (curr);
10752   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
10753   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10754
10755   if (!aarch64_macro_fusion_p ())
10756     return false;
10757
10758   if (simple_sets_p
10759       && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10760     {
10761       /* We are trying to match:
10762          prev (mov)  == (set (reg r0) (const_int imm16))
10763          curr (movk) == (set (zero_extract (reg r0)
10764                                            (const_int 16)
10765                                            (const_int 16))
10766                              (const_int imm16_1))  */
10767
10768       set_dest = SET_DEST (curr_set);
10769
10770       if (GET_CODE (set_dest) == ZERO_EXTRACT
10771           && CONST_INT_P (SET_SRC (curr_set))
10772           && CONST_INT_P (SET_SRC (prev_set))
10773           && CONST_INT_P (XEXP (set_dest, 2))
10774           && INTVAL (XEXP (set_dest, 2)) == 16
10775           && REG_P (XEXP (set_dest, 0))
10776           && REG_P (SET_DEST (prev_set))
10777           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10778         {
10779           return true;
10780         }
10781     }
10782
10783   if (simple_sets_p
10784       && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10785     {
10786
10787       /*  We're trying to match:
10788           prev (adrp) == (set (reg r1)
10789                               (high (symbol_ref ("SYM"))))
10790           curr (add) == (set (reg r0)
10791                              (lo_sum (reg r1)
10792                                      (symbol_ref ("SYM"))))
10793           Note that r0 need not necessarily be the same as r1, especially
10794           during pre-regalloc scheduling.  */
10795
10796       if (satisfies_constraint_Ush (SET_SRC (prev_set))
10797           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10798         {
10799           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10800               && REG_P (XEXP (SET_SRC (curr_set), 0))
10801               && REGNO (XEXP (SET_SRC (curr_set), 0))
10802                  == REGNO (SET_DEST (prev_set))
10803               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10804                               XEXP (SET_SRC (curr_set), 1)))
10805             return true;
10806         }
10807     }
10808
10809   if (simple_sets_p
10810       && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10811     {
10812
10813       /* We're trying to match:
10814          prev (movk) == (set (zero_extract (reg r0)
10815                                            (const_int 16)
10816                                            (const_int 32))
10817                              (const_int imm16_1))
10818          curr (movk) == (set (zero_extract (reg r0)
10819                                            (const_int 16)
10820                                            (const_int 48))
10821                              (const_int imm16_2))  */
10822
10823       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10824           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10825           && REG_P (XEXP (SET_DEST (prev_set), 0))
10826           && REG_P (XEXP (SET_DEST (curr_set), 0))
10827           && REGNO (XEXP (SET_DEST (prev_set), 0))
10828              == REGNO (XEXP (SET_DEST (curr_set), 0))
10829           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10830           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10831           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10832           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10833           && CONST_INT_P (SET_SRC (prev_set))
10834           && CONST_INT_P (SET_SRC (curr_set)))
10835         return true;
10836
10837     }
10838   if (simple_sets_p
10839       && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10840     {
10841       /* We're trying to match:
10842           prev (adrp) == (set (reg r0)
10843                               (high (symbol_ref ("SYM"))))
10844           curr (ldr) == (set (reg r1)
10845                              (mem (lo_sum (reg r0)
10846                                              (symbol_ref ("SYM")))))
10847                  or
10848           curr (ldr) == (set (reg r1)
10849                              (zero_extend (mem
10850                                            (lo_sum (reg r0)
10851                                                    (symbol_ref ("SYM"))))))  */
10852       if (satisfies_constraint_Ush (SET_SRC (prev_set))
10853           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10854         {
10855           rtx curr_src = SET_SRC (curr_set);
10856
10857           if (GET_CODE (curr_src) == ZERO_EXTEND)
10858             curr_src = XEXP (curr_src, 0);
10859
10860           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10861               && REG_P (XEXP (XEXP (curr_src, 0), 0))
10862               && REGNO (XEXP (XEXP (curr_src, 0), 0))
10863                  == REGNO (SET_DEST (prev_set))
10864               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10865                               XEXP (SET_SRC (prev_set), 0)))
10866               return true;
10867         }
10868     }
10869
10870   if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10871       && any_condjump_p (curr))
10872     {
10873       enum attr_type prev_type = get_attr_type (prev);
10874
10875       /* FIXME: this misses some which is considered simple arthematic
10876          instructions for ThunderX.  Simple shifts are missed here.  */
10877       if (prev_type == TYPE_ALUS_SREG
10878           || prev_type == TYPE_ALUS_IMM
10879           || prev_type == TYPE_LOGICS_REG
10880           || prev_type == TYPE_LOGICS_IMM)
10881         return true;
10882     }
10883
10884   return false;
10885 }
10886
10887 /* If MEM is in the form of [base+offset], extract the two parts
10888    of address and set to BASE and OFFSET, otherwise return false
10889    after clearing BASE and OFFSET.  */
10890
10891 bool
10892 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10893 {
10894   rtx addr;
10895
10896   gcc_assert (MEM_P (mem));
10897
10898   addr = XEXP (mem, 0);
10899
10900   if (REG_P (addr))
10901     {
10902       *base = addr;
10903       *offset = const0_rtx;
10904       return true;
10905     }
10906
10907   if (GET_CODE (addr) == PLUS
10908       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10909     {
10910       *base = XEXP (addr, 0);
10911       *offset = XEXP (addr, 1);
10912       return true;
10913     }
10914
10915   *base = NULL_RTX;
10916   *offset = NULL_RTX;
10917
10918   return false;
10919 }
10920
10921 /* Types for scheduling fusion.  */
10922 enum sched_fusion_type
10923 {
10924   SCHED_FUSION_NONE = 0,
10925   SCHED_FUSION_LD_SIGN_EXTEND,
10926   SCHED_FUSION_LD_ZERO_EXTEND,
10927   SCHED_FUSION_LD,
10928   SCHED_FUSION_ST,
10929   SCHED_FUSION_NUM
10930 };
10931
10932 /* If INSN is a load or store of address in the form of [base+offset],
10933    extract the two parts and set to BASE and OFFSET.  Return scheduling
10934    fusion type this INSN is.  */
10935
10936 static enum sched_fusion_type
10937 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10938 {
10939   rtx x, dest, src;
10940   enum sched_fusion_type fusion = SCHED_FUSION_LD;
10941
10942   gcc_assert (INSN_P (insn));
10943   x = PATTERN (insn);
10944   if (GET_CODE (x) != SET)
10945     return SCHED_FUSION_NONE;
10946
10947   src = SET_SRC (x);
10948   dest = SET_DEST (x);
10949
10950   if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10951       && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10952     return SCHED_FUSION_NONE;
10953
10954   if (GET_CODE (src) == SIGN_EXTEND)
10955     {
10956       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10957       src = XEXP (src, 0);
10958       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10959         return SCHED_FUSION_NONE;
10960     }
10961   else if (GET_CODE (src) == ZERO_EXTEND)
10962     {
10963       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10964       src = XEXP (src, 0);
10965       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10966         return SCHED_FUSION_NONE;
10967     }
10968
10969   if (GET_CODE (src) == MEM && REG_P (dest))
10970     extract_base_offset_in_addr (src, base, offset);
10971   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10972     {
10973       fusion = SCHED_FUSION_ST;
10974       extract_base_offset_in_addr (dest, base, offset);
10975     }
10976   else
10977     return SCHED_FUSION_NONE;
10978
10979   if (*base == NULL_RTX || *offset == NULL_RTX)
10980     fusion = SCHED_FUSION_NONE;
10981
10982   return fusion;
10983 }
10984
10985 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10986
10987    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
10988    and PRI are only calculated for these instructions.  For other instruction,
10989    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
10990    type instruction fusion can be added by returning different priorities.
10991
10992    It's important that irrelevant instructions get the largest FUSION_PRI.  */
10993
10994 static void
10995 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10996                                int *fusion_pri, int *pri)
10997 {
10998   int tmp, off_val;
10999   rtx base, offset;
11000   enum sched_fusion_type fusion;
11001
11002   gcc_assert (INSN_P (insn));
11003
11004   tmp = max_pri - 1;
11005   fusion = fusion_load_store (insn, &base, &offset);
11006   if (fusion == SCHED_FUSION_NONE)
11007     {
11008       *pri = tmp;
11009       *fusion_pri = tmp;
11010       return;
11011     }
11012
11013   /* Set FUSION_PRI according to fusion type and base register.  */
11014   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
11015
11016   /* Calculate PRI.  */
11017   tmp /= 2;
11018
11019   /* INSN with smaller offset goes first.  */
11020   off_val = (int)(INTVAL (offset));
11021   if (off_val >= 0)
11022     tmp -= (off_val & 0xfffff);
11023   else
11024     tmp += ((- off_val) & 0xfffff);
11025
11026   *pri = tmp;
11027   return;
11028 }
11029
11030 /* Given OPERANDS of consecutive load/store, check if we can merge
11031    them into ldp/stp.  LOAD is true if they are load instructions.
11032    MODE is the mode of memory operands.  */
11033
11034 bool
11035 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
11036                                 enum machine_mode mode)
11037 {
11038   HOST_WIDE_INT offval_1, offval_2, msize;
11039   enum reg_class rclass_1, rclass_2;
11040   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
11041
11042   if (load)
11043     {
11044       mem_1 = operands[1];
11045       mem_2 = operands[3];
11046       reg_1 = operands[0];
11047       reg_2 = operands[2];
11048       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11049       if (REGNO (reg_1) == REGNO (reg_2))
11050         return false;
11051     }
11052   else
11053     {
11054       mem_1 = operands[0];
11055       mem_2 = operands[2];
11056       reg_1 = operands[1];
11057       reg_2 = operands[3];
11058     }
11059
11060   /* The mems cannot be volatile.  */
11061   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11062     return false;
11063
11064   /* Check if the addresses are in the form of [base+offset].  */
11065   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11066   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11067     return false;
11068   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11069   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11070     return false;
11071
11072   /* Check if the bases are same.  */
11073   if (!rtx_equal_p (base_1, base_2))
11074     return false;
11075
11076   offval_1 = INTVAL (offset_1);
11077   offval_2 = INTVAL (offset_2);
11078   msize = GET_MODE_SIZE (mode);
11079   /* Check if the offsets are consecutive.  */
11080   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11081     return false;
11082
11083   /* Check if the addresses are clobbered by load.  */
11084   if (load)
11085     {
11086       if (reg_mentioned_p (reg_1, mem_1))
11087         return false;
11088
11089       /* In increasing order, the last load can clobber the address.  */
11090       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11091       return false;
11092     }
11093
11094   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11095     rclass_1 = FP_REGS;
11096   else
11097     rclass_1 = GENERAL_REGS;
11098
11099   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11100     rclass_2 = FP_REGS;
11101   else
11102     rclass_2 = GENERAL_REGS;
11103
11104   /* Check if the registers are of same class.  */
11105   if (rclass_1 != rclass_2)
11106     return false;
11107
11108   return true;
11109 }
11110
11111 /* Given OPERANDS of consecutive load/store, check if we can merge
11112    them into ldp/stp by adjusting the offset.  LOAD is true if they
11113    are load instructions.  MODE is the mode of memory operands.
11114
11115    Given below consecutive stores:
11116
11117      str  w1, [xb, 0x100]
11118      str  w1, [xb, 0x104]
11119      str  w1, [xb, 0x108]
11120      str  w1, [xb, 0x10c]
11121
11122    Though the offsets are out of the range supported by stp, we can
11123    still pair them after adjusting the offset, like:
11124
11125      add  scratch, xb, 0x100
11126      stp  w1, w1, [scratch]
11127      stp  w1, w1, [scratch, 0x8]
11128
11129    The peephole patterns detecting this opportunity should guarantee
11130    the scratch register is avaliable.  */
11131
11132 bool
11133 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11134                                        enum machine_mode mode)
11135 {
11136   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11137   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11138   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11139   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11140
11141   if (load)
11142     {
11143       reg_1 = operands[0];
11144       mem_1 = operands[1];
11145       reg_2 = operands[2];
11146       mem_2 = operands[3];
11147       reg_3 = operands[4];
11148       mem_3 = operands[5];
11149       reg_4 = operands[6];
11150       mem_4 = operands[7];
11151       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11152                   && REG_P (reg_3) && REG_P (reg_4));
11153       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11154         return false;
11155     }
11156   else
11157     {
11158       mem_1 = operands[0];
11159       reg_1 = operands[1];
11160       mem_2 = operands[2];
11161       reg_2 = operands[3];
11162       mem_3 = operands[4];
11163       reg_3 = operands[5];
11164       mem_4 = operands[6];
11165       reg_4 = operands[7];
11166     }
11167   /* Skip if memory operand is by itslef valid for ldp/stp.  */
11168   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11169     return false;
11170
11171   /* The mems cannot be volatile.  */
11172   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11173       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
11174     return false;
11175
11176   /* Check if the addresses are in the form of [base+offset].  */
11177   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11178   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11179     return false;
11180   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11181   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11182     return false;
11183   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11184   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11185     return false;
11186   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11187   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11188     return false;
11189
11190   /* Check if the bases are same.  */
11191   if (!rtx_equal_p (base_1, base_2)
11192       || !rtx_equal_p (base_2, base_3)
11193       || !rtx_equal_p (base_3, base_4))
11194     return false;
11195
11196   offval_1 = INTVAL (offset_1);
11197   offval_2 = INTVAL (offset_2);
11198   offval_3 = INTVAL (offset_3);
11199   offval_4 = INTVAL (offset_4);
11200   msize = GET_MODE_SIZE (mode);
11201   /* Check if the offsets are consecutive.  */
11202   if ((offval_1 != (offval_2 + msize)
11203        || offval_1 != (offval_3 + msize * 2)
11204        || offval_1 != (offval_4 + msize * 3))
11205       && (offval_4 != (offval_3 + msize)
11206           || offval_4 != (offval_2 + msize * 2)
11207           || offval_4 != (offval_1 + msize * 3)))
11208     return false;
11209
11210   /* Check if the addresses are clobbered by load.  */
11211   if (load)
11212     {
11213       if (reg_mentioned_p (reg_1, mem_1)
11214           || reg_mentioned_p (reg_2, mem_2)
11215           || reg_mentioned_p (reg_3, mem_3))
11216         return false;
11217
11218       /* In increasing order, the last load can clobber the address.  */
11219       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11220         return false;
11221     }
11222
11223   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11224     rclass_1 = FP_REGS;
11225   else
11226     rclass_1 = GENERAL_REGS;
11227
11228   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11229     rclass_2 = FP_REGS;
11230   else
11231     rclass_2 = GENERAL_REGS;
11232
11233   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11234     rclass_3 = FP_REGS;
11235   else
11236     rclass_3 = GENERAL_REGS;
11237
11238   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11239     rclass_4 = FP_REGS;
11240   else
11241     rclass_4 = GENERAL_REGS;
11242
11243   /* Check if the registers are of same class.  */
11244   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11245     return false;
11246
11247   return true;
11248 }
11249
11250 /* Given OPERANDS of consecutive load/store, this function pairs them
11251    into ldp/stp after adjusting the offset.  It depends on the fact
11252    that addresses of load/store instructions are in increasing order.
11253    MODE is the mode of memory operands.  CODE is the rtl operator
11254    which should be applied to all memory operands, it's SIGN_EXTEND,
11255    ZERO_EXTEND or UNKNOWN.  */
11256
11257 bool
11258 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11259                              enum machine_mode mode, RTX_CODE code)
11260 {
11261   rtx base, offset, t1, t2;
11262   rtx mem_1, mem_2, mem_3, mem_4;
11263   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11264
11265   if (load)
11266     {
11267       mem_1 = operands[1];
11268       mem_2 = operands[3];
11269       mem_3 = operands[5];
11270       mem_4 = operands[7];
11271     }
11272   else
11273     {
11274       mem_1 = operands[0];
11275       mem_2 = operands[2];
11276       mem_3 = operands[4];
11277       mem_4 = operands[6];
11278       gcc_assert (code == UNKNOWN);
11279     }
11280
11281   extract_base_offset_in_addr (mem_1, &base, &offset);
11282   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11283
11284   /* Adjust offset thus it can fit in ldp/stp instruction.  */
11285   msize = GET_MODE_SIZE (mode);
11286   stp_off_limit = msize * 0x40;
11287   off_val = INTVAL (offset);
11288   abs_off = (off_val < 0) ? -off_val : off_val;
11289   new_off = abs_off % stp_off_limit;
11290   adj_off = abs_off - new_off;
11291
11292   /* Further adjust to make sure all offsets are OK.  */
11293   if ((new_off + msize * 2) >= stp_off_limit)
11294     {
11295       adj_off += stp_off_limit;
11296       new_off -= stp_off_limit;
11297     }
11298
11299   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
11300   if (adj_off >= 0x1000)
11301     return false;
11302
11303   if (off_val < 0)
11304     {
11305       adj_off = -adj_off;
11306       new_off = -new_off;
11307     }
11308
11309   /* Create new memory references.  */
11310   mem_1 = change_address (mem_1, VOIDmode,
11311                           plus_constant (DImode, operands[8], new_off));
11312
11313   /* Check if the adjusted address is OK for ldp/stp.  */
11314   if (!aarch64_mem_pair_operand (mem_1, mode))
11315     return false;
11316
11317   msize = GET_MODE_SIZE (mode);
11318   mem_2 = change_address (mem_2, VOIDmode,
11319                           plus_constant (DImode,
11320                                          operands[8],
11321                                          new_off + msize));
11322   mem_3 = change_address (mem_3, VOIDmode,
11323                           plus_constant (DImode,
11324                                          operands[8],
11325                                          new_off + msize * 2));
11326   mem_4 = change_address (mem_4, VOIDmode,
11327                           plus_constant (DImode,
11328                                          operands[8],
11329                                          new_off + msize * 3));
11330
11331   if (code == ZERO_EXTEND)
11332     {
11333       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11334       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11335       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11336       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11337     }
11338   else if (code == SIGN_EXTEND)
11339     {
11340       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11341       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11342       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11343       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11344     }
11345
11346   if (load)
11347     {
11348       operands[1] = mem_1;
11349       operands[3] = mem_2;
11350       operands[5] = mem_3;
11351       operands[7] = mem_4;
11352     }
11353   else
11354     {
11355       operands[0] = mem_1;
11356       operands[2] = mem_2;
11357       operands[4] = mem_3;
11358       operands[6] = mem_4;
11359     }
11360
11361   /* Emit adjusting instruction.  */
11362   emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11363                           plus_constant (DImode, base, adj_off)));
11364   /* Emit ldp/stp instructions.  */
11365   t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11366   t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11367   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11368   t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11369   t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11370   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11371   return true;
11372 }
11373
11374 #undef TARGET_ADDRESS_COST
11375 #define TARGET_ADDRESS_COST aarch64_address_cost
11376
11377 /* This hook will determines whether unnamed bitfields affect the alignment
11378    of the containing structure.  The hook returns true if the structure
11379    should inherit the alignment requirements of an unnamed bitfield's
11380    type.  */
11381 #undef TARGET_ALIGN_ANON_BITFIELD
11382 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11383
11384 #undef TARGET_ASM_ALIGNED_DI_OP
11385 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11386
11387 #undef TARGET_ASM_ALIGNED_HI_OP
11388 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11389
11390 #undef TARGET_ASM_ALIGNED_SI_OP
11391 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11392
11393 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11394 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11395   hook_bool_const_tree_hwi_hwi_const_tree_true
11396
11397 #undef TARGET_ASM_FILE_START
11398 #define TARGET_ASM_FILE_START aarch64_start_file
11399
11400 #undef TARGET_ASM_OUTPUT_MI_THUNK
11401 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11402
11403 #undef TARGET_ASM_SELECT_RTX_SECTION
11404 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11405
11406 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11407 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11408
11409 #undef TARGET_BUILD_BUILTIN_VA_LIST
11410 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11411
11412 #undef TARGET_CALLEE_COPIES
11413 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11414
11415 #undef TARGET_CAN_ELIMINATE
11416 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11417
11418 #undef TARGET_CANNOT_FORCE_CONST_MEM
11419 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11420
11421 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11422 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11423
11424 /* Only the least significant bit is used for initialization guard
11425    variables.  */
11426 #undef TARGET_CXX_GUARD_MASK_BIT
11427 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11428
11429 #undef TARGET_C_MODE_FOR_SUFFIX
11430 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11431
11432 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11433 #undef  TARGET_DEFAULT_TARGET_FLAGS
11434 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11435 #endif
11436
11437 #undef TARGET_CLASS_MAX_NREGS
11438 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11439
11440 #undef TARGET_BUILTIN_DECL
11441 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11442
11443 #undef  TARGET_EXPAND_BUILTIN
11444 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11445
11446 #undef TARGET_EXPAND_BUILTIN_VA_START
11447 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11448
11449 #undef TARGET_FOLD_BUILTIN
11450 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11451
11452 #undef TARGET_FUNCTION_ARG
11453 #define TARGET_FUNCTION_ARG aarch64_function_arg
11454
11455 #undef TARGET_FUNCTION_ARG_ADVANCE
11456 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11457
11458 #undef TARGET_FUNCTION_ARG_BOUNDARY
11459 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11460
11461 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11462 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11463
11464 #undef TARGET_FUNCTION_VALUE
11465 #define TARGET_FUNCTION_VALUE aarch64_function_value
11466
11467 #undef TARGET_FUNCTION_VALUE_REGNO_P
11468 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11469
11470 #undef TARGET_FRAME_POINTER_REQUIRED
11471 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11472
11473 #undef TARGET_GIMPLE_FOLD_BUILTIN
11474 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11475
11476 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11477 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11478
11479 #undef  TARGET_INIT_BUILTINS
11480 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
11481
11482 #undef TARGET_LEGITIMATE_ADDRESS_P
11483 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11484
11485 #undef TARGET_LEGITIMATE_CONSTANT_P
11486 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11487
11488 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11489 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11490
11491 #undef TARGET_LRA_P
11492 #define TARGET_LRA_P hook_bool_void_true
11493
11494 #undef TARGET_MANGLE_TYPE
11495 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11496
11497 #undef TARGET_MEMORY_MOVE_COST
11498 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11499
11500 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11501 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11502
11503 #undef TARGET_MUST_PASS_IN_STACK
11504 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11505
11506 /* This target hook should return true if accesses to volatile bitfields
11507    should use the narrowest mode possible.  It should return false if these
11508    accesses should use the bitfield container type.  */
11509 #undef TARGET_NARROW_VOLATILE_BITFIELD
11510 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11511
11512 #undef  TARGET_OPTION_OVERRIDE
11513 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11514
11515 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11516 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11517   aarch64_override_options_after_change
11518
11519 #undef TARGET_PASS_BY_REFERENCE
11520 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11521
11522 #undef TARGET_PREFERRED_RELOAD_CLASS
11523 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11524
11525 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11526 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11527
11528 #undef TARGET_SECONDARY_RELOAD
11529 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11530
11531 #undef TARGET_SHIFT_TRUNCATION_MASK
11532 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11533
11534 #undef TARGET_SETUP_INCOMING_VARARGS
11535 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11536
11537 #undef TARGET_STRUCT_VALUE_RTX
11538 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
11539
11540 #undef TARGET_REGISTER_MOVE_COST
11541 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11542
11543 #undef TARGET_RETURN_IN_MEMORY
11544 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11545
11546 #undef TARGET_RETURN_IN_MSB
11547 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11548
11549 #undef TARGET_RTX_COSTS
11550 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11551
11552 #undef TARGET_SCHED_ISSUE_RATE
11553 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11554
11555 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11556 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11557   aarch64_sched_first_cycle_multipass_dfa_lookahead
11558
11559 #undef TARGET_TRAMPOLINE_INIT
11560 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11561
11562 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11563 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11564
11565 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11566 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11567
11568 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11569 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11570
11571 #undef TARGET_VECTORIZE_ADD_STMT_COST
11572 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11573
11574 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11575 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11576   aarch64_builtin_vectorization_cost
11577
11578 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11579 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11580
11581 #undef TARGET_VECTORIZE_BUILTINS
11582 #define TARGET_VECTORIZE_BUILTINS
11583
11584 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11585 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11586   aarch64_builtin_vectorized_function
11587
11588 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11589 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11590   aarch64_autovectorize_vector_sizes
11591
11592 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11593 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11594   aarch64_atomic_assign_expand_fenv
11595
11596 /* Section anchor support.  */
11597
11598 #undef TARGET_MIN_ANCHOR_OFFSET
11599 #define TARGET_MIN_ANCHOR_OFFSET -256
11600
11601 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11602    byte offset; we can do much more for larger data types, but have no way
11603    to determine the size of the access.  We assume accesses are aligned.  */
11604 #undef TARGET_MAX_ANCHOR_OFFSET
11605 #define TARGET_MAX_ANCHOR_OFFSET 4095
11606
11607 #undef TARGET_VECTOR_ALIGNMENT
11608 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11609
11610 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11611 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11612   aarch64_simd_vector_alignment_reachable
11613
11614 /* vec_perm support.  */
11615
11616 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11617 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11618   aarch64_vectorize_vec_perm_const_ok
11619
11620
11621 #undef TARGET_FIXED_CONDITION_CODE_REGS
11622 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11623
11624 #undef TARGET_FLAGS_REGNUM
11625 #define TARGET_FLAGS_REGNUM CC_REGNUM
11626
11627 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11628 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11629
11630 #undef TARGET_ASAN_SHADOW_OFFSET
11631 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11632
11633 #undef TARGET_LEGITIMIZE_ADDRESS
11634 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11635
11636 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11637 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11638   aarch64_use_by_pieces_infrastructure_p
11639
11640 #undef TARGET_CAN_USE_DOLOOP_P
11641 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11642
11643 #undef TARGET_SCHED_MACRO_FUSION_P
11644 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11645
11646 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11647 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11648
11649 #undef TARGET_SCHED_FUSION_PRIORITY
11650 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11651
11652 struct gcc_target targetm = TARGET_INITIALIZER;
11653
11654 #include "gt-aarch64.h"