1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
23 #include "coretypes.h"
25 #include "insn-codes.h"
27 #include "insn-attr.h"
31 #include "double-int.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
44 #include "dominance.h"
50 #include "cfgcleanup.h"
52 #include "basic-block.h"
54 #include "hard-reg-set.h"
59 #include "statistics.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
72 #include "target-def.h"
73 #include "targhooks.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
84 #include "gimple-expr.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
98 #include "cortex-a57-fma-steering.h"
100 /* Defined for convenience. */
101 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
103 /* Classifies an address.
106 A simple base register plus immediate offset.
109 A base register indexed by immediate offset with writeback.
112 A base register indexed by (optionally scaled) register.
115 A base register indexed by (optionally scaled) zero-extended register.
118 A base register indexed by (optionally scaled) sign-extended register.
121 A LO_SUM rtx with a base register and "LO12" symbol relocation.
124 A constant symbolic address, in pc-relative literal pool. */
126 enum aarch64_address_type
{
136 struct aarch64_address_info
{
137 enum aarch64_address_type type
;
141 enum aarch64_symbol_type symbol_type
;
144 struct simd_immediate_info
153 /* The current code model. */
154 enum aarch64_code_model aarch64_cmodel
;
157 #undef TARGET_HAVE_TLS
158 #define TARGET_HAVE_TLS 1
161 static bool aarch64_composite_type_p (const_tree
, machine_mode
);
162 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode
,
164 machine_mode
*, int *,
166 static void aarch64_elf_asm_constructor (rtx
, int) ATTRIBUTE_UNUSED
;
167 static void aarch64_elf_asm_destructor (rtx
, int) ATTRIBUTE_UNUSED
;
168 static void aarch64_override_options_after_change (void);
169 static bool aarch64_vector_mode_supported_p (machine_mode
);
170 static unsigned bit_count (unsigned HOST_WIDE_INT
);
171 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
,
172 const unsigned char *sel
);
173 static int aarch64_address_cost (rtx
, machine_mode
, addr_space_t
, bool);
175 /* Major revision number of the ARM Architecture implemented by the target. */
176 unsigned aarch64_architecture_version
;
178 /* The processor for which instructions should be scheduled. */
179 enum aarch64_processor aarch64_tune
= cortexa53
;
181 /* The current tuning set. */
182 const struct tune_params
*aarch64_tune_params
;
184 /* Mask to specify which instructions we are allowed to generate. */
185 unsigned long aarch64_isa_flags
= 0;
187 /* Mask to specify which instruction scheduling options should be used. */
188 unsigned long aarch64_tune_flags
= 0;
190 /* Tuning parameters. */
192 static const struct cpu_addrcost_table generic_addrcost_table
=
202 0, /* register_offset */
203 0, /* register_extend */
207 static const struct cpu_addrcost_table cortexa57_addrcost_table
=
217 0, /* register_offset */
218 0, /* register_extend */
222 static const struct cpu_addrcost_table xgene1_addrcost_table
=
232 0, /* register_offset */
233 1, /* register_extend */
237 static const struct cpu_regmove_cost generic_regmove_cost
=
240 /* Avoid the use of slow int<->fp moves for spilling by setting
241 their cost higher than memmov_cost. */
247 static const struct cpu_regmove_cost cortexa57_regmove_cost
=
250 /* Avoid the use of slow int<->fp moves for spilling by setting
251 their cost higher than memmov_cost. */
257 static const struct cpu_regmove_cost cortexa53_regmove_cost
=
260 /* Avoid the use of slow int<->fp moves for spilling by setting
261 their cost higher than memmov_cost. */
267 static const struct cpu_regmove_cost thunderx_regmove_cost
=
275 static const struct cpu_regmove_cost xgene1_regmove_cost
=
278 /* Avoid the use of slow int<->fp moves for spilling by setting
279 their cost higher than memmov_cost. */
285 /* Generic costs for vector insn classes. */
286 static const struct cpu_vector_cost generic_vector_cost
=
288 1, /* scalar_stmt_cost */
289 1, /* scalar_load_cost */
290 1, /* scalar_store_cost */
291 1, /* vec_stmt_cost */
292 1, /* vec_to_scalar_cost */
293 1, /* scalar_to_vec_cost */
294 1, /* vec_align_load_cost */
295 1, /* vec_unalign_load_cost */
296 1, /* vec_unalign_store_cost */
297 1, /* vec_store_cost */
298 3, /* cond_taken_branch_cost */
299 1 /* cond_not_taken_branch_cost */
302 /* Generic costs for vector insn classes. */
303 static const struct cpu_vector_cost cortexa57_vector_cost
=
305 1, /* scalar_stmt_cost */
306 4, /* scalar_load_cost */
307 1, /* scalar_store_cost */
308 3, /* vec_stmt_cost */
309 8, /* vec_to_scalar_cost */
310 8, /* scalar_to_vec_cost */
311 5, /* vec_align_load_cost */
312 5, /* vec_unalign_load_cost */
313 1, /* vec_unalign_store_cost */
314 1, /* vec_store_cost */
315 1, /* cond_taken_branch_cost */
316 1 /* cond_not_taken_branch_cost */
319 /* Generic costs for vector insn classes. */
320 static const struct cpu_vector_cost xgene1_vector_cost
=
322 1, /* scalar_stmt_cost */
323 5, /* scalar_load_cost */
324 1, /* scalar_store_cost */
325 2, /* vec_stmt_cost */
326 4, /* vec_to_scalar_cost */
327 4, /* scalar_to_vec_cost */
328 10, /* vec_align_load_cost */
329 10, /* vec_unalign_load_cost */
330 2, /* vec_unalign_store_cost */
331 2, /* vec_store_cost */
332 2, /* cond_taken_branch_cost */
333 1 /* cond_not_taken_branch_cost */
336 #define AARCH64_FUSE_NOTHING (0)
337 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
338 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
339 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
340 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
341 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
343 static const struct tune_params generic_tunings
=
345 &cortexa57_extra_costs
,
346 &generic_addrcost_table
,
347 &generic_regmove_cost
,
348 &generic_vector_cost
,
351 AARCH64_FUSE_NOTHING
, /* fuseable_ops */
352 8, /* function_align. */
355 2, /* int_reassoc_width. */
356 4, /* fp_reassoc_width. */
357 1 /* vec_reassoc_width. */
360 static const struct tune_params cortexa53_tunings
=
362 &cortexa53_extra_costs
,
363 &generic_addrcost_table
,
364 &cortexa53_regmove_cost
,
365 &generic_vector_cost
,
368 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
369 | AARCH64_FUSE_MOVK_MOVK
| AARCH64_FUSE_ADRP_LDR
), /* fuseable_ops */
370 8, /* function_align. */
373 2, /* int_reassoc_width. */
374 4, /* fp_reassoc_width. */
375 1 /* vec_reassoc_width. */
378 static const struct tune_params cortexa57_tunings
=
380 &cortexa57_extra_costs
,
381 &cortexa57_addrcost_table
,
382 &cortexa57_regmove_cost
,
383 &cortexa57_vector_cost
,
386 (AARCH64_FUSE_MOV_MOVK
| AARCH64_FUSE_ADRP_ADD
387 | AARCH64_FUSE_MOVK_MOVK
), /* fuseable_ops */
388 16, /* function_align. */
391 2, /* int_reassoc_width. */
392 4, /* fp_reassoc_width. */
393 1 /* vec_reassoc_width. */
396 static const struct tune_params thunderx_tunings
=
398 &thunderx_extra_costs
,
399 &generic_addrcost_table
,
400 &thunderx_regmove_cost
,
401 &generic_vector_cost
,
404 AARCH64_FUSE_CMP_BRANCH
, /* fuseable_ops */
405 8, /* function_align. */
408 2, /* int_reassoc_width. */
409 4, /* fp_reassoc_width. */
410 1 /* vec_reassoc_width. */
413 static const struct tune_params xgene1_tunings
=
416 &xgene1_addrcost_table
,
417 &xgene1_regmove_cost
,
421 AARCH64_FUSE_NOTHING
, /* fuseable_ops */
422 16, /* function_align. */
424 16, /* loop_align. */
425 2, /* int_reassoc_width. */
426 4, /* fp_reassoc_width. */
427 1 /* vec_reassoc_width. */
430 /* A processor implementing AArch64. */
433 const char *const name
;
434 enum aarch64_processor core
;
436 unsigned architecture_version
;
437 const unsigned long flags
;
438 const struct tune_params
*const tune
;
441 /* Processor cores implementing AArch64. */
442 static const struct processor all_cores
[] =
444 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
445 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
446 #include "aarch64-cores.def"
448 {"generic", cortexa53
, "8", 8, AARCH64_FL_FOR_ARCH8
, &generic_tunings
},
449 {NULL
, aarch64_none
, NULL
, 0, 0, NULL
}
452 /* Architectures implementing AArch64. */
453 static const struct processor all_architectures
[] =
455 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
456 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
457 #include "aarch64-arches.def"
459 {NULL
, aarch64_none
, NULL
, 0, 0, NULL
}
462 /* Target specification. These are populated as commandline arguments
463 are processed, or NULL if not specified. */
464 static const struct processor
*selected_arch
;
465 static const struct processor
*selected_cpu
;
466 static const struct processor
*selected_tune
;
468 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
470 /* An ISA extension in the co-processor and main instruction set space. */
471 struct aarch64_option_extension
473 const char *const name
;
474 const unsigned long flags_on
;
475 const unsigned long flags_off
;
478 /* ISA extensions in AArch64. */
479 static const struct aarch64_option_extension all_extensions
[] =
481 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
482 {NAME, FLAGS_ON, FLAGS_OFF},
483 #include "aarch64-option-extensions.def"
484 #undef AARCH64_OPT_EXTENSION
488 /* Used to track the size of an address when generating a pre/post
489 increment address. */
490 static machine_mode aarch64_memory_reference_mode
;
492 /* A table of valid AArch64 "bitmask immediate" values for
493 logical instructions. */
495 #define AARCH64_NUM_BITMASKS 5334
496 static unsigned HOST_WIDE_INT aarch64_bitmasks
[AARCH64_NUM_BITMASKS
];
498 typedef enum aarch64_cond_code
500 AARCH64_EQ
= 0, AARCH64_NE
, AARCH64_CS
, AARCH64_CC
, AARCH64_MI
, AARCH64_PL
,
501 AARCH64_VS
, AARCH64_VC
, AARCH64_HI
, AARCH64_LS
, AARCH64_GE
, AARCH64_LT
,
502 AARCH64_GT
, AARCH64_LE
, AARCH64_AL
, AARCH64_NV
506 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
508 /* The condition codes of the processor, and the inverse function. */
509 static const char * const aarch64_condition_codes
[] =
511 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
512 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
516 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED
)
522 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED
,
523 enum machine_mode mode
)
525 if (VECTOR_MODE_P (mode
))
526 return aarch64_tune_params
->vec_reassoc_width
;
527 if (INTEGRAL_MODE_P (mode
))
528 return aarch64_tune_params
->int_reassoc_width
;
529 if (FLOAT_MODE_P (mode
))
530 return aarch64_tune_params
->fp_reassoc_width
;
534 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
536 aarch64_dbx_register_number (unsigned regno
)
538 if (GP_REGNUM_P (regno
))
539 return AARCH64_DWARF_R0
+ regno
- R0_REGNUM
;
540 else if (regno
== SP_REGNUM
)
541 return AARCH64_DWARF_SP
;
542 else if (FP_REGNUM_P (regno
))
543 return AARCH64_DWARF_V0
+ regno
- V0_REGNUM
;
545 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
546 equivalent DWARF register. */
547 return DWARF_FRAME_REGISTERS
;
550 /* Return TRUE if MODE is any of the large INT modes. */
552 aarch64_vect_struct_mode_p (machine_mode mode
)
554 return mode
== OImode
|| mode
== CImode
|| mode
== XImode
;
557 /* Return TRUE if MODE is any of the vector modes. */
559 aarch64_vector_mode_p (machine_mode mode
)
561 return aarch64_vector_mode_supported_p (mode
)
562 || aarch64_vect_struct_mode_p (mode
);
565 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
567 aarch64_array_mode_supported_p (machine_mode mode
,
568 unsigned HOST_WIDE_INT nelems
)
571 && AARCH64_VALID_SIMD_QREG_MODE (mode
)
572 && (nelems
>= 2 && nelems
<= 4))
578 /* Implement HARD_REGNO_NREGS. */
581 aarch64_hard_regno_nregs (unsigned regno
, machine_mode mode
)
583 switch (aarch64_regno_regclass (regno
))
587 return (GET_MODE_SIZE (mode
) + UNITS_PER_VREG
- 1) / UNITS_PER_VREG
;
589 return (GET_MODE_SIZE (mode
) + UNITS_PER_WORD
- 1) / UNITS_PER_WORD
;
594 /* Implement HARD_REGNO_MODE_OK. */
597 aarch64_hard_regno_mode_ok (unsigned regno
, machine_mode mode
)
599 if (GET_MODE_CLASS (mode
) == MODE_CC
)
600 return regno
== CC_REGNUM
;
602 if (regno
== SP_REGNUM
)
603 /* The purpose of comparing with ptr_mode is to support the
604 global register variable associated with the stack pointer
605 register via the syntax of asm ("wsp") in ILP32. */
606 return mode
== Pmode
|| mode
== ptr_mode
;
608 if (regno
== FRAME_POINTER_REGNUM
|| regno
== ARG_POINTER_REGNUM
)
609 return mode
== Pmode
;
611 if (GP_REGNUM_P (regno
) && ! aarch64_vect_struct_mode_p (mode
))
614 if (FP_REGNUM_P (regno
))
616 if (aarch64_vect_struct_mode_p (mode
))
618 (regno
+ aarch64_hard_regno_nregs (regno
, mode
) - 1) <= V31_REGNUM
;
626 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
628 aarch64_hard_regno_caller_save_mode (unsigned regno
, unsigned nregs
,
631 /* Handle modes that fit within single registers. */
632 if (nregs
== 1 && GET_MODE_SIZE (mode
) <= 16)
634 if (GET_MODE_SIZE (mode
) >= 4)
639 /* Fall back to generic for multi-reg and very large modes. */
641 return choose_hard_reg_mode (regno
, nregs
, false);
644 /* Return true if calls to DECL should be treated as
645 long-calls (ie called via a register). */
647 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED
)
652 /* Return true if calls to symbol-ref SYM should be treated as
653 long-calls (ie called via a register). */
655 aarch64_is_long_call_p (rtx sym
)
657 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym
));
660 /* Return true if the offsets to a zero/sign-extract operation
661 represent an expression that matches an extend operation. The
662 operands represent the parameters from
664 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
666 aarch64_is_extend_from_extract (machine_mode mode
, rtx mult_imm
,
669 HOST_WIDE_INT mult_val
, extract_val
;
671 if (! CONST_INT_P (mult_imm
) || ! CONST_INT_P (extract_imm
))
674 mult_val
= INTVAL (mult_imm
);
675 extract_val
= INTVAL (extract_imm
);
678 && extract_val
< GET_MODE_BITSIZE (mode
)
679 && exact_log2 (extract_val
& ~7) > 0
680 && (extract_val
& 7) <= 4
681 && mult_val
== (1 << (extract_val
& 7)))
687 /* Emit an insn that's a simple single-set. Both the operands must be
688 known to be valid. */
690 emit_set_insn (rtx x
, rtx y
)
692 return emit_insn (gen_rtx_SET (VOIDmode
, x
, y
));
695 /* X and Y are two things to compare using CODE. Emit the compare insn and
696 return the rtx for register 0 in the proper mode. */
698 aarch64_gen_compare_reg (RTX_CODE code
, rtx x
, rtx y
)
700 machine_mode mode
= SELECT_CC_MODE (code
, x
, y
);
701 rtx cc_reg
= gen_rtx_REG (mode
, CC_REGNUM
);
703 emit_set_insn (cc_reg
, gen_rtx_COMPARE (mode
, x
, y
));
707 /* Build the SYMBOL_REF for __tls_get_addr. */
709 static GTY(()) rtx tls_get_addr_libfunc
;
712 aarch64_tls_get_addr (void)
714 if (!tls_get_addr_libfunc
)
715 tls_get_addr_libfunc
= init_one_libfunc ("__tls_get_addr");
716 return tls_get_addr_libfunc
;
719 /* Return the TLS model to use for ADDR. */
721 static enum tls_model
722 tls_symbolic_operand_type (rtx addr
)
724 enum tls_model tls_kind
= TLS_MODEL_NONE
;
727 if (GET_CODE (addr
) == CONST
)
729 split_const (addr
, &sym
, &addend
);
730 if (GET_CODE (sym
) == SYMBOL_REF
)
731 tls_kind
= SYMBOL_REF_TLS_MODEL (sym
);
733 else if (GET_CODE (addr
) == SYMBOL_REF
)
734 tls_kind
= SYMBOL_REF_TLS_MODEL (addr
);
739 /* We'll allow lo_sum's in addresses in our legitimate addresses
740 so that combine would take care of combining addresses where
741 necessary, but for generation purposes, we'll generate the address
744 tmp = hi (symbol_ref); adrp x1, foo
745 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
749 adrp x1, :got:foo adrp tmp, :tlsgd:foo
750 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
754 Load TLS symbol, depending on TLS mechanism and TLS access model.
756 Global Dynamic - Traditional TLS:
758 add dest, tmp, #:tlsgd_lo12:imm
761 Global Dynamic - TLS Descriptors:
762 adrp dest, :tlsdesc:imm
763 ldr tmp, [dest, #:tlsdesc_lo12:imm]
764 add dest, dest, #:tlsdesc_lo12:imm
771 adrp tmp, :gottprel:imm
772 ldr dest, [tmp, #:gottprel_lo12:imm]
777 add t0, tp, #:tprel_hi12:imm, lsl #12
778 add t0, t0, #:tprel_lo12_nc:imm
782 aarch64_load_symref_appropriately (rtx dest
, rtx imm
,
783 enum aarch64_symbol_type type
)
787 case SYMBOL_SMALL_ABSOLUTE
:
789 /* In ILP32, the mode of dest can be either SImode or DImode. */
791 machine_mode mode
= GET_MODE (dest
);
793 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
795 if (can_create_pseudo_p ())
796 tmp_reg
= gen_reg_rtx (mode
);
798 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
799 emit_insn (gen_add_losym (dest
, tmp_reg
, imm
));
803 case SYMBOL_TINY_ABSOLUTE
:
804 emit_insn (gen_rtx_SET (Pmode
, dest
, imm
));
807 case SYMBOL_SMALL_GOT
:
809 /* In ILP32, the mode of dest can be either SImode or DImode,
810 while the got entry is always of SImode size. The mode of
811 dest depends on how dest is used: if dest is assigned to a
812 pointer (e.g. in the memory), it has SImode; it may have
813 DImode if dest is dereferenced to access the memory.
814 This is why we have to handle three different ldr_got_small
815 patterns here (two patterns for ILP32). */
817 machine_mode mode
= GET_MODE (dest
);
819 if (can_create_pseudo_p ())
820 tmp_reg
= gen_reg_rtx (mode
);
822 emit_move_insn (tmp_reg
, gen_rtx_HIGH (mode
, imm
));
823 if (mode
== ptr_mode
)
826 emit_insn (gen_ldr_got_small_di (dest
, tmp_reg
, imm
));
828 emit_insn (gen_ldr_got_small_si (dest
, tmp_reg
, imm
));
832 gcc_assert (mode
== Pmode
);
833 emit_insn (gen_ldr_got_small_sidi (dest
, tmp_reg
, imm
));
839 case SYMBOL_SMALL_TLSGD
:
842 rtx result
= gen_rtx_REG (Pmode
, R0_REGNUM
);
845 aarch64_emit_call_insn (gen_tlsgd_small (result
, imm
));
846 insns
= get_insns ();
849 RTL_CONST_CALL_P (insns
) = 1;
850 emit_libcall_block (insns
, dest
, result
, imm
);
854 case SYMBOL_SMALL_TLSDESC
:
856 machine_mode mode
= GET_MODE (dest
);
857 rtx x0
= gen_rtx_REG (mode
, R0_REGNUM
);
860 gcc_assert (mode
== Pmode
|| mode
== ptr_mode
);
862 /* In ILP32, the got entry is always of SImode size. Unlike
863 small GOT, the dest is fixed at reg 0. */
865 emit_insn (gen_tlsdesc_small_si (imm
));
867 emit_insn (gen_tlsdesc_small_di (imm
));
868 tp
= aarch64_load_tp (NULL
);
871 tp
= gen_lowpart (mode
, tp
);
873 emit_insn (gen_rtx_SET (mode
, dest
, gen_rtx_PLUS (mode
, tp
, x0
)));
874 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
878 case SYMBOL_SMALL_GOTTPREL
:
880 /* In ILP32, the mode of dest can be either SImode or DImode,
881 while the got entry is always of SImode size. The mode of
882 dest depends on how dest is used: if dest is assigned to a
883 pointer (e.g. in the memory), it has SImode; it may have
884 DImode if dest is dereferenced to access the memory.
885 This is why we have to handle three different tlsie_small
886 patterns here (two patterns for ILP32). */
887 machine_mode mode
= GET_MODE (dest
);
888 rtx tmp_reg
= gen_reg_rtx (mode
);
889 rtx tp
= aarch64_load_tp (NULL
);
891 if (mode
== ptr_mode
)
894 emit_insn (gen_tlsie_small_di (tmp_reg
, imm
));
897 emit_insn (gen_tlsie_small_si (tmp_reg
, imm
));
898 tp
= gen_lowpart (mode
, tp
);
903 gcc_assert (mode
== Pmode
);
904 emit_insn (gen_tlsie_small_sidi (tmp_reg
, imm
));
907 emit_insn (gen_rtx_SET (mode
, dest
, gen_rtx_PLUS (mode
, tp
, tmp_reg
)));
908 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
912 case SYMBOL_SMALL_TPREL
:
914 rtx tp
= aarch64_load_tp (NULL
);
916 if (GET_MODE (dest
) != Pmode
)
917 tp
= gen_lowpart (GET_MODE (dest
), tp
);
919 emit_insn (gen_tlsle_small (dest
, tp
, imm
));
920 set_unique_reg_note (get_last_insn (), REG_EQUIV
, imm
);
924 case SYMBOL_TINY_GOT
:
925 emit_insn (gen_ldr_got_tiny (dest
, imm
));
933 /* Emit a move from SRC to DEST. Assume that the move expanders can
934 handle all moves if !can_create_pseudo_p (). The distinction is
935 important because, unlike emit_move_insn, the move expanders know
936 how to force Pmode objects into the constant pool even when the
937 constant pool address is not itself legitimate. */
939 aarch64_emit_move (rtx dest
, rtx src
)
941 return (can_create_pseudo_p ()
942 ? emit_move_insn (dest
, src
)
943 : emit_move_insn_1 (dest
, src
));
946 /* Split a 128-bit move operation into two 64-bit move operations,
947 taking care to handle partial overlap of register to register
948 copies. Special cases are needed when moving between GP regs and
949 FP regs. SRC can be a register, constant or memory; DST a register
950 or memory. If either operand is memory it must not have any side
953 aarch64_split_128bit_move (rtx dst
, rtx src
)
958 machine_mode mode
= GET_MODE (dst
);
960 gcc_assert (mode
== TImode
|| mode
== TFmode
);
961 gcc_assert (!(side_effects_p (src
) || side_effects_p (dst
)));
962 gcc_assert (mode
== GET_MODE (src
) || GET_MODE (src
) == VOIDmode
);
964 if (REG_P (dst
) && REG_P (src
))
966 int src_regno
= REGNO (src
);
967 int dst_regno
= REGNO (dst
);
969 /* Handle FP <-> GP regs. */
970 if (FP_REGNUM_P (dst_regno
) && GP_REGNUM_P (src_regno
))
972 src_lo
= gen_lowpart (word_mode
, src
);
973 src_hi
= gen_highpart (word_mode
, src
);
977 emit_insn (gen_aarch64_movtilow_di (dst
, src_lo
));
978 emit_insn (gen_aarch64_movtihigh_di (dst
, src_hi
));
982 emit_insn (gen_aarch64_movtflow_di (dst
, src_lo
));
983 emit_insn (gen_aarch64_movtfhigh_di (dst
, src_hi
));
987 else if (GP_REGNUM_P (dst_regno
) && FP_REGNUM_P (src_regno
))
989 dst_lo
= gen_lowpart (word_mode
, dst
);
990 dst_hi
= gen_highpart (word_mode
, dst
);
994 emit_insn (gen_aarch64_movdi_tilow (dst_lo
, src
));
995 emit_insn (gen_aarch64_movdi_tihigh (dst_hi
, src
));
999 emit_insn (gen_aarch64_movdi_tflow (dst_lo
, src
));
1000 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi
, src
));
1006 dst_lo
= gen_lowpart (word_mode
, dst
);
1007 dst_hi
= gen_highpart (word_mode
, dst
);
1008 src_lo
= gen_lowpart (word_mode
, src
);
1009 src_hi
= gen_highpart_mode (word_mode
, mode
, src
);
1011 /* At most one pairing may overlap. */
1012 if (reg_overlap_mentioned_p (dst_lo
, src_hi
))
1014 aarch64_emit_move (dst_hi
, src_hi
);
1015 aarch64_emit_move (dst_lo
, src_lo
);
1019 aarch64_emit_move (dst_lo
, src_lo
);
1020 aarch64_emit_move (dst_hi
, src_hi
);
1025 aarch64_split_128bit_move_p (rtx dst
, rtx src
)
1027 return (! REG_P (src
)
1028 || ! (FP_REGNUM_P (REGNO (dst
)) && FP_REGNUM_P (REGNO (src
))));
1031 /* Split a complex SIMD combine. */
1034 aarch64_split_simd_combine (rtx dst
, rtx src1
, rtx src2
)
1036 machine_mode src_mode
= GET_MODE (src1
);
1037 machine_mode dst_mode
= GET_MODE (dst
);
1039 gcc_assert (VECTOR_MODE_P (dst_mode
));
1041 if (REG_P (dst
) && REG_P (src1
) && REG_P (src2
))
1043 rtx (*gen
) (rtx
, rtx
, rtx
);
1048 gen
= gen_aarch64_simd_combinev8qi
;
1051 gen
= gen_aarch64_simd_combinev4hi
;
1054 gen
= gen_aarch64_simd_combinev2si
;
1057 gen
= gen_aarch64_simd_combinev2sf
;
1060 gen
= gen_aarch64_simd_combinedi
;
1063 gen
= gen_aarch64_simd_combinedf
;
1069 emit_insn (gen (dst
, src1
, src2
));
1074 /* Split a complex SIMD move. */
1077 aarch64_split_simd_move (rtx dst
, rtx src
)
1079 machine_mode src_mode
= GET_MODE (src
);
1080 machine_mode dst_mode
= GET_MODE (dst
);
1082 gcc_assert (VECTOR_MODE_P (dst_mode
));
1084 if (REG_P (dst
) && REG_P (src
))
1086 rtx (*gen
) (rtx
, rtx
);
1088 gcc_assert (VECTOR_MODE_P (src_mode
));
1093 gen
= gen_aarch64_split_simd_movv16qi
;
1096 gen
= gen_aarch64_split_simd_movv8hi
;
1099 gen
= gen_aarch64_split_simd_movv4si
;
1102 gen
= gen_aarch64_split_simd_movv2di
;
1105 gen
= gen_aarch64_split_simd_movv4sf
;
1108 gen
= gen_aarch64_split_simd_movv2df
;
1114 emit_insn (gen (dst
, src
));
1120 aarch64_force_temporary (machine_mode mode
, rtx x
, rtx value
)
1122 if (can_create_pseudo_p ())
1123 return force_reg (mode
, value
);
1126 x
= aarch64_emit_move (x
, value
);
1133 aarch64_add_offset (machine_mode mode
, rtx temp
, rtx reg
, HOST_WIDE_INT offset
)
1135 if (!aarch64_plus_immediate (GEN_INT (offset
), mode
))
1138 /* Load the full offset into a register. This
1139 might be improvable in the future. */
1140 high
= GEN_INT (offset
);
1142 high
= aarch64_force_temporary (mode
, temp
, high
);
1143 reg
= aarch64_force_temporary (mode
, temp
,
1144 gen_rtx_PLUS (mode
, high
, reg
));
1146 return plus_constant (mode
, reg
, offset
);
1150 aarch64_internal_mov_immediate (rtx dest
, rtx imm
, bool generate
,
1153 unsigned HOST_WIDE_INT mask
;
1156 unsigned HOST_WIDE_INT val
;
1159 int one_match
, zero_match
, first_not_ffff_match
;
1162 if (CONST_INT_P (imm
) && aarch64_move_imm (INTVAL (imm
), mode
))
1165 emit_insn (gen_rtx_SET (VOIDmode
, dest
, imm
));
1172 /* We know we can't do this in 1 insn, and we must be able to do it
1173 in two; so don't mess around looking for sequences that don't buy
1177 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1178 GEN_INT (INTVAL (imm
) & 0xffff)));
1179 emit_insn (gen_insv_immsi (dest
, GEN_INT (16),
1180 GEN_INT ((INTVAL (imm
) >> 16) & 0xffff)));
1186 /* Remaining cases are all for DImode. */
1189 subtargets
= optimize
&& can_create_pseudo_p ();
1194 first_not_ffff_match
= -1;
1196 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1198 if ((val
& mask
) == mask
)
1202 if (first_not_ffff_match
< 0)
1203 first_not_ffff_match
= i
;
1204 if ((val
& mask
) == 0)
1211 /* Set one of the quarters and then insert back into result. */
1212 mask
= 0xffffll
<< first_not_ffff_match
;
1215 emit_insn (gen_rtx_SET (VOIDmode
, dest
, GEN_INT (val
| mask
)));
1216 emit_insn (gen_insv_immdi (dest
, GEN_INT (first_not_ffff_match
),
1217 GEN_INT ((val
>> first_not_ffff_match
)
1224 if (zero_match
== 2)
1225 goto simple_sequence
;
1227 mask
= 0x0ffff0000UL
;
1228 for (i
= 16; i
< 64; i
+= 16, mask
<<= 16)
1230 HOST_WIDE_INT comp
= mask
& ~(mask
- 1);
1232 if (aarch64_uimm12_shift (val
- (val
& mask
)))
1236 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1237 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1238 GEN_INT (val
& mask
)));
1239 emit_insn (gen_adddi3 (dest
, subtarget
,
1240 GEN_INT (val
- (val
& mask
))));
1245 else if (aarch64_uimm12_shift (-(val
- ((val
+ comp
) & mask
))))
1249 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1250 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1251 GEN_INT ((val
+ comp
) & mask
)));
1252 emit_insn (gen_adddi3 (dest
, subtarget
,
1253 GEN_INT (val
- ((val
+ comp
) & mask
))));
1258 else if (aarch64_uimm12_shift (val
- ((val
- comp
) | ~mask
)))
1262 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1263 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1264 GEN_INT ((val
- comp
) | ~mask
)));
1265 emit_insn (gen_adddi3 (dest
, subtarget
,
1266 GEN_INT (val
- ((val
- comp
) | ~mask
))));
1271 else if (aarch64_uimm12_shift (-(val
- (val
| ~mask
))))
1275 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1276 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1277 GEN_INT (val
| ~mask
)));
1278 emit_insn (gen_adddi3 (dest
, subtarget
,
1279 GEN_INT (val
- (val
| ~mask
))));
1286 /* See if we can do it by arithmetically combining two
1288 for (i
= 0; i
< AARCH64_NUM_BITMASKS
; i
++)
1293 if (aarch64_uimm12_shift (val
- aarch64_bitmasks
[i
])
1294 || aarch64_uimm12_shift (-val
+ aarch64_bitmasks
[i
]))
1298 subtarget
= subtargets
? gen_reg_rtx (DImode
) : dest
;
1299 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1300 GEN_INT (aarch64_bitmasks
[i
])));
1301 emit_insn (gen_adddi3 (dest
, subtarget
,
1302 GEN_INT (val
- aarch64_bitmasks
[i
])));
1308 for (j
= 0; j
< 64; j
+= 16, mask
<<= 16)
1310 if ((aarch64_bitmasks
[i
] & ~mask
) == (val
& ~mask
))
1314 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1315 GEN_INT (aarch64_bitmasks
[i
])));
1316 emit_insn (gen_insv_immdi (dest
, GEN_INT (j
),
1317 GEN_INT ((val
>> j
) & 0xffff)));
1325 /* See if we can do it by logically combining two immediates. */
1326 for (i
= 0; i
< AARCH64_NUM_BITMASKS
; i
++)
1328 if ((aarch64_bitmasks
[i
] & val
) == aarch64_bitmasks
[i
])
1332 for (j
= i
+ 1; j
< AARCH64_NUM_BITMASKS
; j
++)
1333 if (val
== (aarch64_bitmasks
[i
] | aarch64_bitmasks
[j
]))
1337 subtarget
= subtargets
? gen_reg_rtx (mode
) : dest
;
1338 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1339 GEN_INT (aarch64_bitmasks
[i
])));
1340 emit_insn (gen_iordi3 (dest
, subtarget
,
1341 GEN_INT (aarch64_bitmasks
[j
])));
1347 else if ((val
& aarch64_bitmasks
[i
]) == val
)
1351 for (j
= i
+ 1; j
< AARCH64_NUM_BITMASKS
; j
++)
1352 if (val
== (aarch64_bitmasks
[j
] & aarch64_bitmasks
[i
]))
1356 subtarget
= subtargets
? gen_reg_rtx (mode
) : dest
;
1357 emit_insn (gen_rtx_SET (VOIDmode
, subtarget
,
1358 GEN_INT (aarch64_bitmasks
[j
])));
1359 emit_insn (gen_anddi3 (dest
, subtarget
,
1360 GEN_INT (aarch64_bitmasks
[i
])));
1368 if (one_match
> zero_match
)
1370 /* Set either first three quarters or all but the third. */
1371 mask
= 0xffffll
<< (16 - first_not_ffff_match
);
1373 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1374 GEN_INT (val
| mask
| 0xffffffff00000000ull
)));
1377 /* Now insert other two quarters. */
1378 for (i
= first_not_ffff_match
+ 16, mask
<<= (first_not_ffff_match
<< 1);
1379 i
< 64; i
+= 16, mask
<<= 16)
1381 if ((val
& mask
) != mask
)
1384 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1385 GEN_INT ((val
>> i
) & 0xffff)));
1395 for (i
= 0; i
< 64; i
+= 16, mask
<<= 16)
1397 if ((val
& mask
) != 0)
1402 emit_insn (gen_rtx_SET (VOIDmode
, dest
,
1403 GEN_INT (val
& mask
)));
1410 emit_insn (gen_insv_immdi (dest
, GEN_INT (i
),
1411 GEN_INT ((val
>> i
) & 0xffff)));
/* NOTE(review): extraction-damaged text — the leading integers are the
   original file's line numbers and many original lines (braces, the
   `switch (sty)` header, returns) were dropped.  Comments below annotate
   only what the surviving tokens show; confirm against upstream GCC.  */
/* Expand a move of IMM (symbol, label, const, HIGH, or integer) into DEST.
   DEST must be SImode or DImode (asserted below).  Symbolic operands are
   classified and loaded appropriately; plain integers fall through to
   aarch64_internal_mov_immediate at the end.  */
1422 aarch64_expand_mov_immediate (rtx dest
, rtx imm
)
1424 machine_mode mode
= GET_MODE (dest
);
1426 gcc_assert (mode
== SImode
|| mode
== DImode
);
1428 /* Check on what type of symbol it is. */
1429 if (GET_CODE (imm
) == SYMBOL_REF
1430 || GET_CODE (imm
) == LABEL_REF
1431 || GET_CODE (imm
) == CONST
)
1433 rtx mem
, base
, offset
;
1434 enum aarch64_symbol_type sty
;
1436 /* If we have (const (plus symbol offset)), separate out the offset
1437 before we start classifying the symbol. */
1438 split_const (imm
, &base
, &offset
);
1440 sty
= aarch64_classify_symbol (base
, offset
, SYMBOL_CONTEXT_ADR
);
/* Case: symbol must be materialised via the constant pool.  If a non-zero
   offset cannot be forced to memory, add it separately via a temporary.  */
1443 case SYMBOL_FORCE_TO_MEM
:
1444 if (offset
!= const0_rtx
1445 && targetm
.cannot_force_const_mem (mode
, imm
))
1447 gcc_assert (can_create_pseudo_p ());
1448 base
= aarch64_force_temporary (mode
, dest
, base
);
1449 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1450 aarch64_emit_move (dest
, base
);
/* Force the constant into the pool in ptr_mode; zero-extend the MEM if
   DEST is wider than ptr_mode (ILP32 case — confirm).  */
1453 mem
= force_const_mem (ptr_mode
, imm
);
1455 if (mode
!= ptr_mode
)
1456 mem
= gen_rtx_ZERO_EXTEND (mode
, mem
);
1457 emit_insn (gen_rtx_SET (VOIDmode
, dest
, mem
));
/* TLS and GOT-relative symbol kinds: any addend must be added separately,
   since the relocation applies to the bare symbol only.  */
1460 case SYMBOL_SMALL_TLSGD
:
1461 case SYMBOL_SMALL_TLSDESC
:
1462 case SYMBOL_SMALL_GOTTPREL
:
1463 case SYMBOL_SMALL_GOT
:
1464 case SYMBOL_TINY_GOT
:
1465 if (offset
!= const0_rtx
)
1467 gcc_assert(can_create_pseudo_p ());
1468 base
= aarch64_force_temporary (mode
, dest
, base
);
1469 base
= aarch64_add_offset (mode
, NULL
, base
, INTVAL (offset
));
1470 aarch64_emit_move (dest
, base
);
/* Directly addressable symbol kinds: emit the appropriate load sequence.  */
1475 case SYMBOL_SMALL_TPREL
:
1476 case SYMBOL_SMALL_ABSOLUTE
:
1477 case SYMBOL_TINY_ABSOLUTE
:
1478 aarch64_load_symref_appropriately (dest
, imm
, sty
);
/* Non-CONST_INT, non-symbol immediates: HIGH parts are emitted as-is,
   anything else goes through the constant pool.  */
1486 if (!CONST_INT_P (imm
))
1488 if (GET_CODE (imm
) == HIGH
)
1489 emit_insn (gen_rtx_SET (VOIDmode
, dest
, imm
));
1492 rtx mem
= force_const_mem (mode
, imm
);
1494 emit_insn (gen_rtx_SET (VOIDmode
, dest
, mem
));
/* Plain integer immediate: defer to the MOVZ/MOVN/MOVK expander.  */
1500 aarch64_internal_mov_immediate (dest
, imm
, true, GET_MODE (dest
));
/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  Both parameters are unused;
   the surviving comment says sibcalls are currently always allowed
   (the `return true;` line was lost in extraction — confirm upstream).  */
1504 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED
,
1505 tree exp ATTRIBUTE_UNUSED
)
1507 /* Currently, always true. */
1511 /* Implement TARGET_PASS_BY_REFERENCE. */
/* Implement TARGET_PASS_BY_REFERENCE (AAPCS64): decide whether an argument
   of MODE/TYPE is passed by reference.  Size is taken from TYPE for BLKmode,
   otherwise from the mode; the final rule passes by reference anything
   larger than 2 registers unless it is an HFA/HVA candidate.
   NOTE(review): several parameter lines and branches were dropped by the
   extraction (mode/type/size declarations are not visible here).  */
1514 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED
,
1517 bool named ATTRIBUTE_UNUSED
)
1520 machine_mode dummymode
;
1523 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1524 size
= (mode
== BLKmode
&& type
)
1525 ? int_size_in_bytes (type
) : (int) GET_MODE_SIZE (mode
);
1527 /* Aggregates are passed by reference based on their size. */
1528 if (type
&& AGGREGATE_TYPE_P (type
))
1530 size
= int_size_in_bytes (type
);
1533 /* Variable sized arguments are always returned by reference. */
1537 /* Can this be a candidate to be passed in fp/simd register(s)? */
1538 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
1543 /* Arguments which are variable sized or larger than 2 registers are
1544 passed by reference unless they are a homogenous floating point
1546 return size
> 2 * UNITS_PER_WORD
;
1549 /* Return TRUE if VALTYPE is padded to its least significant bits. */
/* Return true if VALTYPE is returned padded into the most significant
   bits (big-endian composite return rule).  Always false on little-endian;
   false for non-composites, for sizes outside (0,16], and for HFA/HVA
   candidates, which live in the low bits of fp/simd registers.  */
1551 aarch64_return_in_msb (const_tree valtype
)
1553 machine_mode dummy_mode
;
1556 /* Never happens in little-endian mode. */
1557 if (!BYTES_BIG_ENDIAN
)
1560 /* Only composite types smaller than or equal to 16 bytes can
1561 be potentially returned in registers. */
1562 if (!aarch64_composite_type_p (valtype
, TYPE_MODE (valtype
))
1563 || int_size_in_bytes (valtype
) <= 0
1564 || int_size_in_bytes (valtype
) > 16)
1567 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1568 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1569 is always passed/returned in the least significant bits of fp/simd
1571 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype
), valtype
,
1572 &dummy_mode
, &dummy_int
, NULL
))
1578 /* Implement TARGET_FUNCTION_VALUE.
1579 Define how to find the value returned by a function. */
/* Implement TARGET_FUNCTION_VALUE: compute the RTX for a function's return
   value of TYPE.  Integral types are promoted; MSB-returned composites get
   an integer mode big enough for the rounded-up size; HFA/HVA candidates
   are returned in V0..Vn (a PARALLEL of EXPR_LISTs for multi-register
   aggregates); everything else comes back in R0.
   NOTE(review): extraction dropped unsignedp/count/i/par declarations.  */
1582 aarch64_function_value (const_tree type
, const_tree func
,
1583 bool outgoing ATTRIBUTE_UNUSED
)
1588 machine_mode ag_mode
;
1590 mode
= TYPE_MODE (type
);
1591 if (INTEGRAL_TYPE_P (type
))
1592 mode
= promote_function_mode (type
, mode
, &unsignedp
, func
, 1);
/* Big-endian MSB-padded return: widen to a whole number of words.  */
1594 if (aarch64_return_in_msb (type
))
1596 HOST_WIDE_INT size
= int_size_in_bytes (type
);
1598 if (size
% UNITS_PER_WORD
!= 0)
1600 size
+= UNITS_PER_WORD
- size
% UNITS_PER_WORD
;
1601 mode
= mode_for_size (size
* BITS_PER_UNIT
, MODE_INT
, 0);
/* HFA/HVA: single register for scalars, PARALLEL over V0+i otherwise.  */
1605 if (aarch64_vfp_is_call_or_return_candidate (mode
, type
,
1606 &ag_mode
, &count
, NULL
))
1608 if (!aarch64_composite_type_p (type
, mode
))
1610 gcc_assert (count
== 1 && mode
== ag_mode
);
1611 return gen_rtx_REG (mode
, V0_REGNUM
);
1618 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (count
));
1619 for (i
= 0; i
< count
; i
++)
1621 rtx tmp
= gen_rtx_REG (ag_mode
, V0_REGNUM
+ i
);
1622 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
1623 GEN_INT (i
* GET_MODE_SIZE (ag_mode
)));
1624 XVECEXP (par
, 0, i
) = tmp
;
/* General-register return.  */
1630 return gen_rtx_REG (mode
, R0_REGNUM
);
1633 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1634 Return true if REGNO is the number of a hard register in which the values
1635 of called function may come back. */
/* Implement TARGET_FUNCTION_VALUE_REGNO_P: true if REGNO can carry part of
   a function's return value — X0/X1 for integers and small structs, or
   V0..V(HA_MAX_NUM_FLDS-1) when fp/simd registers are available.  */
1638 aarch64_function_value_regno_p (const unsigned int regno
)
1640 /* Maximum of 16 bytes can be returned in the general registers. Examples
1641 of 16-byte return values are: 128-bit integers and 16-byte small
1642 structures (excluding homogeneous floating-point aggregates). */
1643 if (regno
== R0_REGNUM
|| regno
== R1_REGNUM
)
1646 /* Up to four fp/simd registers can return a function value, e.g. a
1647 homogeneous floating-point aggregate having four members. */
1648 if (regno
>= V0_REGNUM
&& regno
< V0_REGNUM
+ HA_MAX_NUM_FLDS
)
1649 return !TARGET_GENERAL_REGS_ONLY
;
1654 /* Implement TARGET_RETURN_IN_MEMORY.
1656 If the type T of the result of a function is such that
1658 would require that arg be passed as a value in a register (or set of
1659 registers) according to the parameter passing rules, then the result
1660 is returned in the same registers as would be used for such an
/* Implement TARGET_RETURN_IN_MEMORY: scalars and HFA/HVA candidates are
   returned in registers; other types are returned in memory when their
   size is negative (variable) or exceeds two registers.  */
1664 aarch64_return_in_memory (const_tree type
, const_tree fndecl ATTRIBUTE_UNUSED
)
1667 machine_mode ag_mode
;
1670 if (!AGGREGATE_TYPE_P (type
)
1671 && TREE_CODE (type
) != COMPLEX_TYPE
1672 && TREE_CODE (type
) != VECTOR_TYPE
)
1673 /* Simple scalar types always returned in registers. */
1676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type
),
1683 /* Types larger than 2 registers returned in memory. */
1684 size
= int_size_in_bytes (type
);
1685 return (size
< 0 || size
> 2 * UNITS_PER_WORD
);
/* Thin wrapper: test whether an argument of MODE/TYPE is an fp/simd
   (HFA/HVA) candidate, recording the element mode in the cumulative-args
   state and the register count in *NREGS (trailing arguments lost in
   extraction — confirm upstream).  */
1689 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v
, machine_mode mode
,
1690 const_tree type
, int *nregs
)
1692 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1693 return aarch64_vfp_is_call_or_return_candidate (mode
,
1695 &pcum
->aapcs_vfp_rmode
,
1700 /* Given MODE and TYPE of a function argument, return the alignment in
1701 bits. The idea is to suppress any stronger alignment requested by
1702 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1703 This is a helper function for local use only. */
/* Compute the AAPCS64 natural alignment (in bits) of an argument of
   MODE/TYPE: TYPE_ALIGN when the type's own mode matches MODE, otherwise
   the mode's alignment; mode alignment also when TYPE is absent or
   zero-sized (the guarding branches were dropped by extraction).  */
1706 aarch64_function_arg_alignment (machine_mode mode
, const_tree type
)
1708 unsigned int alignment
;
1712 if (!integer_zerop (TYPE_SIZE (type
)))
1714 if (TYPE_MODE (type
) == mode
)
1715 alignment
= TYPE_ALIGN (type
);
1717 alignment
= GET_MODE_ALIGNMENT (mode
);
1723 alignment
= GET_MODE_ALIGNMENT (mode
);
1728 /* Layout a function argument according to the AAPCS64 rules. The rule
1729 numbers refer to the rule numbers in the AAPCS64. */
/* Layout one argument per the AAPCS64 rules (rule numbers C.1-C.14 refer
   to that document).  Tries SIMD/FP registers first, then general
   registers, then the stack; results are recorded in *PCUM
   (aapcs_reg / aapcs_nextnvrn / aapcs_nextncrn / aapcs_stack_words).
   NOTE(review): extraction-damaged — `size`, `i`, `par` declarations and
   several closing braces/else branches are missing from this view.  */
1732 aarch64_layout_arg (cumulative_args_t pcum_v
, machine_mode mode
,
1734 bool named ATTRIBUTE_UNUSED
)
1736 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1737 int ncrn
, nvrn
, nregs
;
1738 bool allocate_ncrn
, allocate_nvrn
;
1741 /* We need to do this once per argument. */
1742 if (pcum
->aapcs_arg_processed
)
1745 pcum
->aapcs_arg_processed
= true;
1747 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1749 = AARCH64_ROUND_UP (type
? int_size_in_bytes (type
) : GET_MODE_SIZE (mode
),
1752 allocate_ncrn
= (type
) ? !(FLOAT_TYPE_P (type
)) : !FLOAT_MODE_P (mode
);
1753 allocate_nvrn
= aarch64_vfp_is_call_candidate (pcum_v
,
1758 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1759 The following code thus handles passing by SIMD/FP registers first. */
1761 nvrn
= pcum
->aapcs_nvrn
;
1763 /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
1764 and homogenous short-vector aggregates (HVA). */
1767 if (nvrn
+ nregs
<= NUM_FP_ARG_REGS
)
1769 pcum
->aapcs_nextnvrn
= nvrn
+ nregs
;
1770 if (!aarch64_composite_type_p (type
, mode
))
1772 gcc_assert (nregs
== 1);
1773 pcum
->aapcs_reg
= gen_rtx_REG (mode
, V0_REGNUM
+ nvrn
);
/* Composite HFA/HVA: build a PARALLEL of (reg, byte-offset) pairs.  */
1779 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
1780 for (i
= 0; i
< nregs
; i
++)
1782 rtx tmp
= gen_rtx_REG (pcum
->aapcs_vfp_rmode
,
1783 V0_REGNUM
+ nvrn
+ i
);
1784 tmp
= gen_rtx_EXPR_LIST
1786 GEN_INT (i
* GET_MODE_SIZE (pcum
->aapcs_vfp_rmode
)));
1787 XVECEXP (par
, 0, i
) = tmp
;
1789 pcum
->aapcs_reg
= par
;
1795 /* C.3 NSRN is set to 8. */
1796 pcum
->aapcs_nextnvrn
= NUM_FP_ARG_REGS
;
/* General-register path (C.6-C.9).  */
1801 ncrn
= pcum
->aapcs_ncrn
;
1802 nregs
= size
/ UNITS_PER_WORD
;
1804 /* C6 - C9. though the sign and zero extension semantics are
1805 handled elsewhere. This is the case where the argument fits
1806 entirely general registers. */
1807 if (allocate_ncrn
&& (ncrn
+ nregs
<= NUM_ARG_REGS
))
1809 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
1811 gcc_assert (nregs
== 0 || nregs
== 1 || nregs
== 2);
1813 /* C.8 if the argument has an alignment of 16 then the NGRN is
1814 rounded up to the next even number. */
1815 if (nregs
== 2 && alignment
== 16 * BITS_PER_UNIT
&& ncrn
% 2)
1818 gcc_assert (ncrn
+ nregs
<= NUM_ARG_REGS
);
1820 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1821 A reg is still generated for it, but the caller should be smart
1822 enough not to use it. */
1823 if (nregs
== 0 || nregs
== 1 || GET_MODE_CLASS (mode
) == MODE_INT
)
1825 pcum
->aapcs_reg
= gen_rtx_REG (mode
, R0_REGNUM
+ ncrn
);
/* Two-word non-integer argument: PARALLEL over word-sized pieces.  */
1832 par
= gen_rtx_PARALLEL (mode
, rtvec_alloc (nregs
));
1833 for (i
= 0; i
< nregs
; i
++)
1835 rtx tmp
= gen_rtx_REG (word_mode
, R0_REGNUM
+ ncrn
+ i
);
1836 tmp
= gen_rtx_EXPR_LIST (VOIDmode
, tmp
,
1837 GEN_INT (i
* UNITS_PER_WORD
));
1838 XVECEXP (par
, 0, i
) = tmp
;
1840 pcum
->aapcs_reg
= par
;
1843 pcum
->aapcs_nextncrn
= ncrn
+ nregs
;
/* C.11: no general registers left — NGRN is set to 8.  */
1848 pcum
->aapcs_nextncrn
= NUM_ARG_REGS
;
1850 /* The argument is passed on stack; record the needed number of words for
1851 this argument and align the total size if necessary. */
1853 pcum
->aapcs_stack_words
= size
/ UNITS_PER_WORD
;
1854 if (aarch64_function_arg_alignment (mode
, type
) == 16 * BITS_PER_UNIT
)
1855 pcum
->aapcs_stack_size
= AARCH64_ROUND_UP (pcum
->aapcs_stack_size
,
1856 16 / UNITS_PER_WORD
);
1860 /* Implement TARGET_FUNCTION_ARG. */
/* Implement TARGET_FUNCTION_ARG: lay out the current argument (VOIDmode is
   the end-of-args sentinel — its early-return line was lost in extraction)
   and return the register RTX chosen by aarch64_layout_arg, or the cached
   NULL when the argument goes on the stack.  */
1863 aarch64_function_arg (cumulative_args_t pcum_v
, machine_mode mode
,
1864 const_tree type
, bool named
)
1866 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1867 gcc_assert (pcum
->pcs_variant
== ARM_PCS_AAPCS64
);
1869 if (mode
== VOIDmode
)
1872 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
1873 return pcum
->aapcs_reg
;
/* Initialize *PCUM for scanning a function's arguments: zero all register
   and stack counters, select the AAPCS64 variant, and mark no argument as
   processed yet.  All other parameters are unused.  */
1877 aarch64_init_cumulative_args (CUMULATIVE_ARGS
*pcum
,
1878 const_tree fntype ATTRIBUTE_UNUSED
,
1879 rtx libname ATTRIBUTE_UNUSED
,
1880 const_tree fndecl ATTRIBUTE_UNUSED
,
1881 unsigned n_named ATTRIBUTE_UNUSED
)
1883 pcum
->aapcs_ncrn
= 0;
1884 pcum
->aapcs_nvrn
= 0;
1885 pcum
->aapcs_nextncrn
= 0;
1886 pcum
->aapcs_nextnvrn
= 0;
1887 pcum
->pcs_variant
= ARM_PCS_AAPCS64
;
1888 pcum
->aapcs_reg
= NULL_RTX
;
1889 pcum
->aapcs_arg_processed
= false;
1890 pcum
->aapcs_stack_words
= 0;
1891 pcum
->aapcs_stack_size
= 0;
/* Implement TARGET_FUNCTION_ARG_ADVANCE: commit the layout of the current
   argument — exactly one of register/stack must have been chosen (XOR
   assert) — then promote the "next" counters to current and reset the
   per-argument state.  NOTE(review): the mode/type/named parameter lines
   were dropped by extraction.  */
1897 aarch64_function_arg_advance (cumulative_args_t pcum_v
,
1902 CUMULATIVE_ARGS
*pcum
= get_cumulative_args (pcum_v
);
1903 if (pcum
->pcs_variant
== ARM_PCS_AAPCS64
)
1905 aarch64_layout_arg (pcum_v
, mode
, type
, named
);
1906 gcc_assert ((pcum
->aapcs_reg
!= NULL_RTX
)
1907 != (pcum
->aapcs_stack_words
!= 0));
1908 pcum
->aapcs_arg_processed
= false;
1909 pcum
->aapcs_ncrn
= pcum
->aapcs_nextncrn
;
1910 pcum
->aapcs_nvrn
= pcum
->aapcs_nextnvrn
;
1911 pcum
->aapcs_stack_size
+= pcum
->aapcs_stack_words
;
1912 pcum
->aapcs_stack_words
= 0;
1913 pcum
->aapcs_reg
= NULL_RTX
;
/* True if REGNO may hold a function argument: X0..X7 or V0..V7.  */
1918 aarch64_function_arg_regno_p (unsigned regno
)
1920 return ((GP_REGNUM_P (regno
) && regno
< R0_REGNUM
+ NUM_ARG_REGS
)
1921 || (FP_REGNUM_P (regno
) && regno
< V0_REGNUM
+ NUM_FP_ARG_REGS
));
1924 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1925 PARM_BOUNDARY bits of alignment, but will be given anything up
1926 to STACK_BOUNDARY bits if the type requires it. This makes sure
1927 that both before and after the layout of each argument, the Next
1928 Stacked Argument Address (NSAA) will have a minimum alignment of
/* Implement FUNCTION_ARG_BOUNDARY: clamp the argument's natural alignment
   to [PARM_BOUNDARY, STACK_BOUNDARY] (the final `return alignment;` was
   lost in extraction — confirm upstream).  */
1932 aarch64_function_arg_boundary (machine_mode mode
, const_tree type
)
1934 unsigned int alignment
= aarch64_function_arg_alignment (mode
, type
);
1936 if (alignment
< PARM_BOUNDARY
)
1937 alignment
= PARM_BOUNDARY
;
1938 if (alignment
> STACK_BOUNDARY
)
1939 alignment
= STACK_BOUNDARY
;
1943 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1945 Return true if an argument passed on the stack should be padded upwards,
1946 i.e. if the least-significant byte of the stack slot has useful data.
1948 Small aggregate types are placed in the lowest memory address.
1950 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
/* For FUNCTION_ARG_PADDING: decide whether a stack argument of MODE/TYPE
   is padded upward (data at the low address of the slot).  Little-endian
   is always upward; big-endian pads scalars downward and composites
   upward (AAPCS64 B.4/C.3/C.5/C.14).  */
1953 aarch64_pad_arg_upward (machine_mode mode
, const_tree type
)
1955 /* On little-endian targets, the least significant byte of every stack
1956 argument is passed at the lowest byte address of the stack slot. */
1957 if (!BYTES_BIG_ENDIAN
)
1960 /* Otherwise, integral, floating-point and pointer types are padded downward:
1961 the least significant byte of a stack argument is passed at the highest
1962 byte address of the stack slot. */
1964 ? (INTEGRAL_TYPE_P (type
) || SCALAR_FLOAT_TYPE_P (type
)
1965 || POINTER_TYPE_P (type
))
1966 : (SCALAR_INT_MODE_P (mode
) || SCALAR_FLOAT_MODE_P (mode
)))
1969 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1973 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1975 It specifies padding for the last (may also be the only)
1976 element of a block move between registers and memory. If
1977 assuming the block is in the memory, padding upward means that
1978 the last element is padded after its highest significant byte,
1979 while in downward padding, the last element is padded at the
1980 its least significant byte side.
1982 Small aggregates and small complex types are always padded
1985 We don't need to worry about homogeneous floating-point or
1986 short-vector aggregates; their move is not affected by the
1987 padding direction determined here. Regardless of endianness,
1988 each element of such an aggregate is put in the least
1989 significant bits of a fp/simd register.
1991 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1992 register has useful data, and return the opposite if the most
1993 significant byte does. */
/* For BLOCK_REG_PADDING: padding direction for the last element of a block
   move between registers and memory.  Small (< 2 words) big-endian
   composites are padded upward; otherwise default to !BYTES_BIG_ENDIAN.  */
1996 aarch64_pad_reg_upward (machine_mode mode
, const_tree type
,
1997 bool first ATTRIBUTE_UNUSED
)
2000 /* Small composite types are always padded upward. */
2001 if (BYTES_BIG_ENDIAN
&& aarch64_composite_type_p (type
, mode
))
2003 HOST_WIDE_INT size
= (type
? int_size_in_bytes (type
)
2004 : GET_MODE_SIZE (mode
));
2005 if (size
< 2 * UNITS_PER_WORD
)
2009 /* Otherwise, use the default padding. */
2010 return !BYTES_BIG_ENDIAN
;
/* Implement TARGET_LIBGCC_CMP_RETURN_MODE.  Only the signature survived
   extraction; the body (a single mode return upstream) is missing —
   confirm against upstream GCC before relying on this.  */
2014 aarch64_libgcc_cmp_return_mode (void)
/* Implement TARGET_FRAME_POINTER_REQUIRED: re-enable the frame pointer
   (disabled by -fomit-leaf-frame-pointer) when the function is not a leaf
   or LR is live.  The return statements were dropped by extraction.  */
2020 aarch64_frame_pointer_required (void)
2022 /* In aarch64_override_options_after_change
2023 flag_omit_leaf_frame_pointer turns off the frame pointer by
2024 default. Turn it back on now if we've not got a leaf
2026 if (flag_omit_leaf_frame_pointer
2027 && (!crtl
->is_leaf
|| df_regs_ever_live_p (LR_REGNUM
)))
2033 /* Mark the registers that need to be saved by the callee and calculate
2034 the size of the callee-saved registers area and frame record (both FP
2035 and LR may be omitted). */
/* Lay out the stack frame: mark callee-saved registers that need slots,
   assign byte offsets, pick the two write-back (stp/ldp pre/post-index)
   candidate registers, and compute saved_regs_size / hard_fp_offset /
   frame_size, all cached in cfun->machine->frame.  Idempotent after
   reload (early return via frame.laid_out).
   NOTE(review): extraction-damaged — `regno` declaration, braces and a
   few assignment RHS lines (e.g. for the EH data registers) are missing
   from this view; confirm details against upstream GCC.  */
2037 aarch64_layout_frame (void)
2039 HOST_WIDE_INT offset
= 0;
2042 if (reload_completed
&& cfun
->machine
->frame
.laid_out
)
2045 #define SLOT_NOT_REQUIRED (-2)
2046 #define SLOT_REQUIRED (-1)
2048 cfun
->machine
->frame
.wb_candidate1
= FIRST_PSEUDO_REGISTER
;
2049 cfun
->machine
->frame
.wb_candidate2
= FIRST_PSEUDO_REGISTER
;
2051 /* First mark all the registers that really need to be saved... */
2052 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2053 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2055 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2056 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_NOT_REQUIRED
;
2058 /* ... that includes the eh data registers (if needed)... */
2059 if (crtl
->calls_eh_return
)
2060 for (regno
= 0; EH_RETURN_DATA_REGNO (regno
) != INVALID_REGNUM
; regno
++)
2061 cfun
->machine
->frame
.reg_offset
[EH_RETURN_DATA_REGNO (regno
)]
2064 /* ... and any callee saved register that dataflow says is live. */
2065 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2066 if (df_regs_ever_live_p (regno
)
2067 && (regno
== R30_REGNUM
2068 || !call_used_regs
[regno
]))
2069 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2071 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2072 if (df_regs_ever_live_p (regno
)
2073 && !call_used_regs
[regno
])
2074 cfun
->machine
->frame
.reg_offset
[regno
] = SLOT_REQUIRED
;
2076 if (frame_pointer_needed
)
2078 /* FP and LR are placed in the linkage record. */
2079 cfun
->machine
->frame
.reg_offset
[R29_REGNUM
] = 0;
2080 cfun
->machine
->frame
.wb_candidate1
= R29_REGNUM
;
2081 cfun
->machine
->frame
.reg_offset
[R30_REGNUM
] = UNITS_PER_WORD
;
2082 cfun
->machine
->frame
.wb_candidate2
= R30_REGNUM
;
2083 cfun
->machine
->frame
.hardfp_offset
= 2 * UNITS_PER_WORD
;
2084 offset
+= 2 * UNITS_PER_WORD
;
2087 /* Now assign stack slots for them. */
2088 for (regno
= R0_REGNUM
; regno
<= R30_REGNUM
; regno
++)
2089 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2091 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2092 if (cfun
->machine
->frame
.wb_candidate1
== FIRST_PSEUDO_REGISTER
)
2093 cfun
->machine
->frame
.wb_candidate1
= regno
;
2094 else if (cfun
->machine
->frame
.wb_candidate2
== FIRST_PSEUDO_REGISTER
)
2095 cfun
->machine
->frame
.wb_candidate2
= regno
;
2096 offset
+= UNITS_PER_WORD
;
/* FP/SIMD save slots; wb_candidate2 is only taken from the vector bank
   when wb_candidate1 is also a vector register (pair must share mode).  */
2099 for (regno
= V0_REGNUM
; regno
<= V31_REGNUM
; regno
++)
2100 if (cfun
->machine
->frame
.reg_offset
[regno
] == SLOT_REQUIRED
)
2102 cfun
->machine
->frame
.reg_offset
[regno
] = offset
;
2103 if (cfun
->machine
->frame
.wb_candidate1
== FIRST_PSEUDO_REGISTER
)
2104 cfun
->machine
->frame
.wb_candidate1
= regno
;
2105 else if (cfun
->machine
->frame
.wb_candidate2
== FIRST_PSEUDO_REGISTER
2106 && cfun
->machine
->frame
.wb_candidate1
>= V0_REGNUM
)
2107 cfun
->machine
->frame
.wb_candidate2
= regno
;
2108 offset
+= UNITS_PER_WORD
;
/* Round the save area up to the stack boundary and record totals.  */
2111 cfun
->machine
->frame
.padding0
=
2112 (AARCH64_ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
) - offset
);
2113 offset
= AARCH64_ROUND_UP (offset
, STACK_BOUNDARY
/ BITS_PER_UNIT
);
2115 cfun
->machine
->frame
.saved_regs_size
= offset
;
2117 cfun
->machine
->frame
.hard_fp_offset
2118 = AARCH64_ROUND_UP (cfun
->machine
->frame
.saved_varargs_size
2120 + cfun
->machine
->frame
.saved_regs_size
,
2121 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2123 cfun
->machine
->frame
.frame_size
2124 = AARCH64_ROUND_UP (cfun
->machine
->frame
.hard_fp_offset
2125 + crtl
->outgoing_args_size
,
2126 STACK_BOUNDARY
/ BITS_PER_UNIT
);
2128 cfun
->machine
->frame
.laid_out
= true;
/* True if REGNO has been assigned a (non-negative) save slot offset by
   aarch64_layout_frame, i.e. it is saved in the prologue.  */
2132 aarch64_register_saved_on_entry (int regno
)
2134 return cfun
->machine
->frame
.reg_offset
[regno
] >= 0;
/* Advance REGNO to the next callee-saved register in [REGNO, LIMIT] that
   actually has a save slot (the increment/return lines were dropped by
   extraction).  */
2138 aarch64_next_callee_save (unsigned regno
, unsigned limit
)
2140 while (regno
<= limit
&& !aarch64_register_saved_on_entry (regno
))
/* Push REGNO (in MODE) with a pre-modify store that also decrements the
   stack pointer by ADJUSTMENT, and mark the insn frame-related.  */
2146 aarch64_pushwb_single_reg (machine_mode mode
, unsigned regno
,
2147 HOST_WIDE_INT adjustment
)
2149 rtx base_rtx
= stack_pointer_rtx
;
2152 reg
= gen_rtx_REG (mode
, regno
);
2153 mem
= gen_rtx_PRE_MODIFY (Pmode
, base_rtx
,
2154 plus_constant (Pmode
, base_rtx
, -adjustment
));
2155 mem
= gen_rtx_MEM (mode
, mem
);
2157 insn
= emit_move_insn (mem
, reg
);
2158 RTX_FRAME_RELATED_P (insn
) = 1;
/* Build a store-pair-with-writeback insn (DI or DF variant, selected by a
   switch on MODE whose header was lost in extraction) that stores REG and
   REG2 while pre-decrementing BASE by ADJUSTMENT.  */
2162 aarch64_gen_storewb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
2163 HOST_WIDE_INT adjustment
)
2168 return gen_storewb_pairdi_di (base
, base
, reg
, reg2
,
2169 GEN_INT (-adjustment
),
2170 GEN_INT (UNITS_PER_WORD
- adjustment
));
2172 return gen_storewb_pairdf_di (base
, base
, reg
, reg2
,
2173 GEN_INT (-adjustment
),
2174 GEN_INT (UNITS_PER_WORD
- adjustment
));
/* Push the pair REGNO1/REGNO2 (in MODE) with writeback of ADJUSTMENT on
   the stack pointer; mark the parallel's parts and the insn itself
   frame-related for CFI generation.  */
2181 aarch64_pushwb_pair_reg (machine_mode mode
, unsigned regno1
,
2182 unsigned regno2
, HOST_WIDE_INT adjustment
)
2185 rtx reg1
= gen_rtx_REG (mode
, regno1
);
2186 rtx reg2
= gen_rtx_REG (mode
, regno2
);
2188 insn
= emit_insn (aarch64_gen_storewb_pair (mode
, stack_pointer_rtx
, reg1
,
2190 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 2)) = 1;
2191 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
2192 RTX_FRAME_RELATED_P (insn
) = 1;
/* Build a load-pair-with-writeback insn (DI or DF variant; the selecting
   switch header was lost in extraction) that loads REG and REG2 and
   post-increments BASE by ADJUSTMENT.  */
2196 aarch64_gen_loadwb_pair (machine_mode mode
, rtx base
, rtx reg
, rtx reg2
,
2197 HOST_WIDE_INT adjustment
)
2202 return gen_loadwb_pairdi_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
2203 GEN_INT (UNITS_PER_WORD
));
2205 return gen_loadwb_pairdf_di (base
, base
, reg
, reg2
, GEN_INT (adjustment
),
2206 GEN_INT (UNITS_PER_WORD
));
/* Build a plain store-pair insn (DI or DF variant per MODE; the selecting
   switch and the reg2/mem2 parameter line were lost in extraction).  */
2213 aarch64_gen_store_pair (machine_mode mode
, rtx mem1
, rtx reg1
, rtx mem2
,
2219 return gen_store_pairdi (mem1
, reg1
, mem2
, reg2
);
2222 return gen_store_pairdf (mem1
, reg1
, mem2
, reg2
);
/* Build a plain load-pair insn (DI or DF variant per MODE; the selecting
   switch and the trailing parameter line were lost in extraction).  */
2230 aarch64_gen_load_pair (machine_mode mode
, rtx reg1
, rtx mem1
, rtx reg2
,
2236 return gen_load_pairdi (reg1
, mem1
, reg2
, mem2
);
2239 return gen_load_pairdf (reg1
, mem1
, reg2
, mem2
);
/* Emit prologue saves of callee-saved registers in [START, LIMIT] (MODE is
   DImode for GP regs, DFmode for FP regs) at START_OFFSET from SP, using
   store-pair insns when two consecutive save slots are adjacent, single
   moves otherwise.  SKIP_WB skips the registers already saved by the
   write-back push.  NOTE(review): extraction dropped the regno/regno2/
   reg/mem declarations, the loop condition line, and several braces.  */
2248 aarch64_save_callee_saves (machine_mode mode
, HOST_WIDE_INT start_offset
,
2249 unsigned start
, unsigned limit
, bool skip_wb
)
2252 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
2253 ? gen_frame_mem
: gen_rtx_MEM
);
2257 for (regno
= aarch64_next_callee_save (start
, limit
);
2259 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
2262 HOST_WIDE_INT offset
;
/* Skip the write-back candidates when SKIP_WB (guard's `if (skip_wb`
   line missing from this view).  */
2265 && (regno
== cfun
->machine
->frame
.wb_candidate1
2266 || regno
== cfun
->machine
->frame
.wb_candidate2
))
2269 reg
= gen_rtx_REG (mode
, regno
);
2270 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
2271 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
2274 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
/* Pair the store when the next saved register's slot is exactly one
   word above this one.  */
2277 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
2278 == cfun
->machine
->frame
.reg_offset
[regno2
]))
2281 rtx reg2
= gen_rtx_REG (mode
, regno2
);
2284 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
2285 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, stack_pointer_rtx
,
2287 insn
= emit_insn (aarch64_gen_store_pair (mode
, mem
, reg
, mem2
,
2290 /* The first part of a frame-related parallel insn is
2291 always assumed to be relevant to the frame
2292 calculations; subsequent parts, are only
2293 frame-related if explicitly marked. */
2294 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn
), 0, 1)) = 1;
2298 insn
= emit_move_insn (mem
, reg
);
2300 RTX_FRAME_RELATED_P (insn
) = 1;
/* Emit epilogue restores of callee-saved registers in [START, LIMIT],
   mirroring aarch64_save_callee_saves: load pairs when slots are adjacent,
   single loads otherwise.  Each restored register is recorded as a
   REG_CFA_RESTORE note in *CFI_OPS for the caller to attach.
   NOTE(review): extraction dropped the regno/regno2/reg/mem declarations,
   the loop condition, and the skip_wb guard's opening line.  */
2305 aarch64_restore_callee_saves (machine_mode mode
,
2306 HOST_WIDE_INT start_offset
, unsigned start
,
2307 unsigned limit
, bool skip_wb
, rtx
*cfi_ops
)
2309 rtx base_rtx
= stack_pointer_rtx
;
2310 rtx (*gen_mem_ref
) (machine_mode
, rtx
) = (frame_pointer_needed
2311 ? gen_frame_mem
: gen_rtx_MEM
);
2314 HOST_WIDE_INT offset
;
2316 for (regno
= aarch64_next_callee_save (start
, limit
);
2318 regno
= aarch64_next_callee_save (regno
+ 1, limit
))
/* Skip registers restored by the write-back pop when SKIP_WB.  */
2323 && (regno
== cfun
->machine
->frame
.wb_candidate1
2324 || regno
== cfun
->machine
->frame
.wb_candidate2
))
2327 reg
= gen_rtx_REG (mode
, regno
);
2328 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno
];
2329 mem
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
2331 regno2
= aarch64_next_callee_save (regno
+ 1, limit
);
/* Pair the load when the next slot is exactly one word above.  */
2334 && ((cfun
->machine
->frame
.reg_offset
[regno
] + UNITS_PER_WORD
)
2335 == cfun
->machine
->frame
.reg_offset
[regno2
]))
2337 rtx reg2
= gen_rtx_REG (mode
, regno2
);
2340 offset
= start_offset
+ cfun
->machine
->frame
.reg_offset
[regno2
];
2341 mem2
= gen_mem_ref (mode
, plus_constant (Pmode
, base_rtx
, offset
));
2342 emit_insn (aarch64_gen_load_pair (mode
, reg
, mem
, reg2
, mem2
));
2344 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg2
, *cfi_ops
);
2348 emit_move_insn (reg
, mem
);
2349 *cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, reg
, *cfi_ops
);
2353 /* AArch64 stack frames generated by this compiler look like:
2355 +-------------------------------+
2357 | incoming stack arguments |
2359 +-------------------------------+
2360 | | <-- incoming stack pointer (aligned)
2361 | callee-allocated save area |
2362 | for register varargs |
2364 +-------------------------------+
2365 | local variables | <-- frame_pointer_rtx
2367 +-------------------------------+
2369 +-------------------------------+ |
2370 | callee-saved registers | | frame.saved_regs_size
2371 +-------------------------------+ |
2373 +-------------------------------+ |
2374 | FP' | / <- hard_frame_pointer_rtx (aligned)
2375 +-------------------------------+
2376 | dynamic allocation |
2377 +-------------------------------+
2379 +-------------------------------+
2380 | outgoing stack arguments | <-- arg_pointer
2382 +-------------------------------+
2383 | | <-- stack_pointer_rtx (aligned)
2385 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2386 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2389 /* Generate the prologue instructions for entry into a function.
2390 Establish the stack frame by decreasing the stack pointer with a
2391 properly calculated size and, if necessary, create a frame record
2392 filled with the values of LR and previous frame pointer. The
2393 current FP is also set up if it is in use. */
2396 aarch64_expand_prologue (void)
2398 /* sub sp, sp, #<frame_size>
2399 stp {fp, lr}, [sp, #<frame_size> - 16]
2400 add fp, sp, #<frame_size> - hardfp_offset
2401 stp {cs_reg}, [fp, #-16] etc.
2403 sub sp, sp, <final_adjustment_if_any>
2405 HOST_WIDE_INT frame_size
, offset
;
2406 HOST_WIDE_INT fp_offset
; /* Offset from hard FP to SP. */
2407 HOST_WIDE_INT hard_fp_offset
;
2410 aarch64_layout_frame ();
2412 offset
= frame_size
= cfun
->machine
->frame
.frame_size
;
2413 hard_fp_offset
= cfun
->machine
->frame
.hard_fp_offset
;
2414 fp_offset
= frame_size
- hard_fp_offset
;
2416 if (flag_stack_usage_info
)
2417 current_function_static_stack_size
= frame_size
;
2419 /* Store pairs and load pairs have a range only -512 to 504. */
2422 /* When the frame has a large size, an initial decrease is done on
2423 the stack pointer to jump over the callee-allocated save area for
2424 register varargs, the local variable area and/or the callee-saved
2425 register area. This will allow the pre-index write-back
2426 store pair instructions to be used for setting up the stack frame
2428 offset
= hard_fp_offset
;
2430 offset
= cfun
->machine
->frame
.saved_regs_size
;
2432 frame_size
-= (offset
+ crtl
->outgoing_args_size
);
2435 if (frame_size
>= 0x1000000)
2437 rtx op0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
2438 emit_move_insn (op0
, GEN_INT (-frame_size
));
2439 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
, op0
));
2441 add_reg_note (insn
, REG_CFA_ADJUST_CFA
,
2442 gen_rtx_SET (VOIDmode
, stack_pointer_rtx
,
2443 plus_constant (Pmode
, stack_pointer_rtx
,
2445 RTX_FRAME_RELATED_P (insn
) = 1;
2447 else if (frame_size
> 0)
2449 int hi_ofs
= frame_size
& 0xfff000;
2450 int lo_ofs
= frame_size
& 0x000fff;
2454 insn
= emit_insn (gen_add2_insn
2455 (stack_pointer_rtx
, GEN_INT (-hi_ofs
)));
2456 RTX_FRAME_RELATED_P (insn
) = 1;
2460 insn
= emit_insn (gen_add2_insn
2461 (stack_pointer_rtx
, GEN_INT (-lo_ofs
)));
2462 RTX_FRAME_RELATED_P (insn
) = 1;
2471 bool skip_wb
= false;
2473 if (frame_pointer_needed
)
2479 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
,
2480 GEN_INT (-offset
)));
2481 RTX_FRAME_RELATED_P (insn
) = 1;
2483 aarch64_save_callee_saves (DImode
, fp_offset
, R29_REGNUM
,
2487 aarch64_pushwb_pair_reg (DImode
, R29_REGNUM
, R30_REGNUM
, offset
);
2489 /* Set up frame pointer to point to the location of the
2490 previous frame pointer on the stack. */
2491 insn
= emit_insn (gen_add3_insn (hard_frame_pointer_rtx
,
2493 GEN_INT (fp_offset
)));
2494 RTX_FRAME_RELATED_P (insn
) = 1;
2495 emit_insn (gen_stack_tie (stack_pointer_rtx
, hard_frame_pointer_rtx
));
2499 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
2500 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
2503 || reg1
== FIRST_PSEUDO_REGISTER
2504 || (reg2
== FIRST_PSEUDO_REGISTER
2507 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
,
2508 GEN_INT (-offset
)));
2509 RTX_FRAME_RELATED_P (insn
) = 1;
2513 machine_mode mode1
= (reg1
<= R30_REGNUM
) ? DImode
: DFmode
;
2517 if (reg2
== FIRST_PSEUDO_REGISTER
)
2518 aarch64_pushwb_single_reg (mode1
, reg1
, offset
);
2520 aarch64_pushwb_pair_reg (mode1
, reg1
, reg2
, offset
);
2524 aarch64_save_callee_saves (DImode
, fp_offset
, R0_REGNUM
, R30_REGNUM
,
2526 aarch64_save_callee_saves (DFmode
, fp_offset
, V0_REGNUM
, V31_REGNUM
,
2530 /* when offset >= 512,
2531 sub sp, sp, #<outgoing_args_size> */
2532 if (frame_size
> -1)
2534 if (crtl
->outgoing_args_size
> 0)
2536 insn
= emit_insn (gen_add2_insn
2538 GEN_INT (- crtl
->outgoing_args_size
)));
2539 RTX_FRAME_RELATED_P (insn
) = 1;
2544 /* Return TRUE if we can use a simple_return insn.
2546 This function checks whether the callee saved stack is empty, which
2547 means no restore actions are need. The pro_and_epilogue will use
2548 this to check whether shrink-wrapping opt is feasible. */
/* Return true if a simple_return insn can be used: only after reload, and
   only when the laid-out frame is empty (nothing to restore), enabling
   shrink-wrapping.  */
2551 aarch64_use_return_insn_p (void)
2553 if (!reload_completed
)
2559 aarch64_layout_frame ();
2561 return cfun
->machine
->frame
.frame_size
== 0;
2564 /* Generate the epilogue instructions for returning from a function. */
/* Emit the function epilogue (mirror of the prologue): undo the
   outgoing-args adjustment (or restore SP from FP after alloca), restore
   callee-saves with CFA-restore notes, pop the write-back pair, undo the
   large initial SP decrease, reset the CFA, handle the EH-return stack
   adjustment, and (unless FOR_SIBCALL) emit the return jump.
   NOTE(review): extraction dropped many lines (insn/cfi_ops/dyn_adj
   declarations, several condition and argument tails); annotations below
   follow the surviving text only — confirm upstream.  */
2566 aarch64_expand_epilogue (bool for_sibcall
)
2568 HOST_WIDE_INT frame_size
, offset
;
2569 HOST_WIDE_INT fp_offset
;
2570 HOST_WIDE_INT hard_fp_offset
;
2572 /* We need to add memory barrier to prevent read from deallocated stack. */
2573 bool need_barrier_p
= (get_frame_size () != 0
2574 || cfun
->machine
->frame
.saved_varargs_size
);
2576 aarch64_layout_frame ();
2578 offset
= frame_size
= cfun
->machine
->frame
.frame_size
;
2579 hard_fp_offset
= cfun
->machine
->frame
.hard_fp_offset
;
2580 fp_offset
= frame_size
- hard_fp_offset
;
2582 /* Store pairs and load pairs have a range only -512 to 504. */
2585 offset
= hard_fp_offset
;
2587 offset
= cfun
->machine
->frame
.saved_regs_size
;
2589 frame_size
-= (offset
+ crtl
->outgoing_args_size
);
2591 if (!frame_pointer_needed
&& crtl
->outgoing_args_size
> 0)
2593 insn
= emit_insn (gen_add2_insn
2595 GEN_INT (crtl
->outgoing_args_size
)));
2596 RTX_FRAME_RELATED_P (insn
) = 1;
2602 /* If there were outgoing arguments or we've done dynamic stack
2603 allocation, then restore the stack pointer from the frame
2604 pointer. This is at most one insn and more efficient than using
2605 GCC's internal mechanism. */
2606 if (frame_pointer_needed
2607 && (crtl
->outgoing_args_size
|| cfun
->calls_alloca
))
2609 if (cfun
->calls_alloca
)
2610 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
))
;
2612 insn
= emit_insn (gen_add3_insn (stack_pointer_rtx
,
2613 hard_frame_pointer_rtx
,
2615 offset
= offset
- fp_offset
;
/* Restore callee-saves, collecting REG_CFA_RESTORE notes in cfi_ops.  */
2620 unsigned reg1
= cfun
->machine
->frame
.wb_candidate1
;
2621 unsigned reg2
= cfun
->machine
->frame
.wb_candidate2
;
2622 bool skip_wb
= true;
2625 if (frame_pointer_needed
)
2628 || reg1
== FIRST_PSEUDO_REGISTER
2629 || (reg2
== FIRST_PSEUDO_REGISTER
2633 aarch64_restore_callee_saves (DImode
, fp_offset
, R0_REGNUM
, R30_REGNUM
,
2635 aarch64_restore_callee_saves (DFmode
, fp_offset
, V0_REGNUM
, V31_REGNUM
,
2639 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
/* Pop the write-back pair (or a single register via post-modify).  */
2643 machine_mode mode1
= (reg1
<= R30_REGNUM
) ? DImode
: DFmode
;
2644 rtx rreg1
= gen_rtx_REG (mode1
, reg1
);
2646 cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, rreg1
, cfi_ops
);
2647 if (reg2
== FIRST_PSEUDO_REGISTER
)
2649 rtx mem
= plus_constant (Pmode
, stack_pointer_rtx
, offset
);
2650 mem
= gen_rtx_POST_MODIFY (Pmode
, stack_pointer_rtx
, mem
);
2651 mem
= gen_rtx_MEM (mode1
, mem
);
2652 insn
= emit_move_insn (rreg1
, mem
);
2656 rtx rreg2
= gen_rtx_REG (mode1
, reg2
);
2658 cfi_ops
= alloc_reg_note (REG_CFA_RESTORE
, rreg2
, cfi_ops
);
2659 insn
= emit_insn (aarch64_gen_loadwb_pair
2660 (mode1
, stack_pointer_rtx
, rreg1
,
2666 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
,
2670 /* Reset the CFA to be SP + FRAME_SIZE. */
2671 rtx new_cfa
= stack_pointer_rtx
;
2673 new_cfa
= plus_constant (Pmode
, new_cfa
, frame_size
);
2674 cfi_ops
= alloc_reg_note (REG_CFA_DEF_CFA
, new_cfa
, cfi_ops
);
2675 REG_NOTES (insn
) = cfi_ops
;
2676 RTX_FRAME_RELATED_P (insn
) = 1;
/* Undo the large initial SP decrease, mirroring the prologue split.  */
2682 emit_insn (gen_stack_tie (stack_pointer_rtx
, stack_pointer_rtx
));
2684 if (frame_size
>= 0x1000000)
2686 rtx op0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
2687 emit_move_insn (op0
, GEN_INT (frame_size
));
2688 insn
= emit_insn (gen_add2_insn (stack_pointer_rtx
, op0
));
2692 int hi_ofs
= frame_size
& 0xfff000;
2693 int lo_ofs
= frame_size
& 0x000fff;
2695 if (hi_ofs
&& lo_ofs
)
2697 insn
= emit_insn (gen_add2_insn
2698 (stack_pointer_rtx
, GEN_INT (hi_ofs
)));
2699 RTX_FRAME_RELATED_P (insn
) = 1;
2700 frame_size
= lo_ofs
;
2702 insn
= emit_insn (gen_add2_insn
2703 (stack_pointer_rtx
, GEN_INT (frame_size
)));
2706 /* Reset the CFA to be SP + 0. */
2707 add_reg_note (insn
, REG_CFA_DEF_CFA
, stack_pointer_rtx
);
2708 RTX_FRAME_RELATED_P (insn
) = 1;
2711 /* Stack adjustment for exception handler. */
2712 if (crtl
->calls_eh_return
)
2714 /* We need to unwind the stack by the offset computed by
2715 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2716 to be SP; letting the CFA move during this adjustment
2717 is just as correct as retaining the CFA from the body
2718 of the function. Therefore, do nothing special. */
2719 emit_insn (gen_add2_insn (stack_pointer_rtx
, EH_RETURN_STACKADJ_RTX
));
2722 emit_use (gen_rtx_REG (DImode
, LR_REGNUM
));
2724 emit_jump_insn (ret_rtx
);
2727 /* Return the place to copy the exception unwinding return address to.
2728 This will probably be a stack slot, but could (in theory be the
2729 return register). */
2731 aarch64_final_eh_return_addr (void)
2733 HOST_WIDE_INT fp_offset
;
2735 aarch64_layout_frame ();
2737 fp_offset
= cfun
->machine
->frame
.frame_size
2738 - cfun
->machine
->frame
.hard_fp_offset
;
2740 if (cfun
->machine
->frame
.reg_offset
[LR_REGNUM
] < 0)
2741 return gen_rtx_REG (DImode
, LR_REGNUM
);
2743 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2744 result in a store to save LR introduced by builtin_eh_return () being
2745 incorrectly deleted because the alias is not detected.
2746 So in the calculation of the address to copy the exception unwinding
2747 return address to, we note 2 cases.
2748 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2749 we return a SP-relative location since all the addresses are SP-relative
2750 in this case. This prevents the store from being optimized away.
2751 If the fp_offset is not 0, then the addresses will be FP-relative and
2752 therefore we return a FP-relative location. */
2754 if (frame_pointer_needed
)
2757 return gen_frame_mem (DImode
,
2758 plus_constant (Pmode
, hard_frame_pointer_rtx
, UNITS_PER_WORD
));
2760 return gen_frame_mem (DImode
,
2761 plus_constant (Pmode
, stack_pointer_rtx
, UNITS_PER_WORD
));
2764 /* If FP is not needed, we calculate the location of LR, which would be
2765 at the top of the saved registers block. */
2767 return gen_frame_mem (DImode
,
2768 plus_constant (Pmode
,
2771 + cfun
->machine
->frame
.saved_regs_size
2772 - 2 * UNITS_PER_WORD
));
2775 /* Possibly output code to build up a constant in a register. For
2776 the benefit of the costs infrastructure, returns the number of
2777 instructions which would be emitted. GENERATE inhibits or
2778 enables code generation. */
2781 aarch64_build_constant (int regnum
, HOST_WIDE_INT val
, bool generate
)
2785 if (aarch64_bitmask_imm (val
, DImode
))
2788 emit_move_insn (gen_rtx_REG (Pmode
, regnum
), GEN_INT (val
));
2796 HOST_WIDE_INT valp
= val
>> 16;
2800 for (i
= 16; i
< 64; i
+= 16)
2802 valm
= (valp
& 0xffff);
2813 /* zcount contains the number of additional MOVK instructions
2814 required if the constant is built up with an initial MOVZ instruction,
2815 while ncount is the number of MOVK instructions required if starting
2816 with a MOVN instruction. Choose the sequence that yields the fewest
2817 number of instructions, preferring MOVZ instructions when they are both
2819 if (ncount
< zcount
)
2822 emit_move_insn (gen_rtx_REG (Pmode
, regnum
),
2823 GEN_INT (val
| ~(HOST_WIDE_INT
) 0xffff));
2830 emit_move_insn (gen_rtx_REG (Pmode
, regnum
),
2831 GEN_INT (val
& 0xffff));
2838 for (i
= 16; i
< 64; i
+= 16)
2840 if ((val
& 0xffff) != tval
)
2843 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode
, regnum
),
2845 GEN_INT (val
& 0xffff)));
2855 aarch64_add_constant (int regnum
, int scratchreg
, HOST_WIDE_INT delta
)
2857 HOST_WIDE_INT mdelta
= delta
;
2858 rtx this_rtx
= gen_rtx_REG (Pmode
, regnum
);
2859 rtx scratch_rtx
= gen_rtx_REG (Pmode
, scratchreg
);
2864 if (mdelta
>= 4096 * 4096)
2866 (void) aarch64_build_constant (scratchreg
, delta
, true);
2867 emit_insn (gen_add3_insn (this_rtx
, this_rtx
, scratch_rtx
));
2869 else if (mdelta
> 0)
2873 emit_insn (gen_rtx_SET (Pmode
, scratch_rtx
, GEN_INT (mdelta
/ 4096)));
2874 rtx shift
= gen_rtx_ASHIFT (Pmode
, scratch_rtx
, GEN_INT (12));
2876 emit_insn (gen_rtx_SET (Pmode
, this_rtx
,
2877 gen_rtx_MINUS (Pmode
, this_rtx
, shift
)));
2879 emit_insn (gen_rtx_SET (Pmode
, this_rtx
,
2880 gen_rtx_PLUS (Pmode
, this_rtx
, shift
)));
2882 if (mdelta
% 4096 != 0)
2884 scratch_rtx
= GEN_INT ((delta
< 0 ? -1 : 1) * (mdelta
% 4096));
2885 emit_insn (gen_rtx_SET (Pmode
, this_rtx
,
2886 gen_rtx_PLUS (Pmode
, this_rtx
, scratch_rtx
)));
2891 /* Output code to add DELTA to the first argument, and then jump
2892 to FUNCTION. Used for C++ multiple inheritance. */
2894 aarch64_output_mi_thunk (FILE *file
, tree thunk ATTRIBUTE_UNUSED
,
2895 HOST_WIDE_INT delta
,
2896 HOST_WIDE_INT vcall_offset
,
2899 /* The this pointer is always in x0. Note that this differs from
2900 Arm where the this pointer maybe bumped to r1 if r0 is required
2901 to return a pointer to an aggregate. On AArch64 a result value
2902 pointer will be in x8. */
2903 int this_regno
= R0_REGNUM
;
2904 rtx this_rtx
, temp0
, temp1
, addr
, funexp
;
2907 reload_completed
= 1;
2908 emit_note (NOTE_INSN_PROLOGUE_END
);
2910 if (vcall_offset
== 0)
2911 aarch64_add_constant (this_regno
, IP1_REGNUM
, delta
);
2914 gcc_assert ((vcall_offset
& (POINTER_BYTES
- 1)) == 0);
2916 this_rtx
= gen_rtx_REG (Pmode
, this_regno
);
2917 temp0
= gen_rtx_REG (Pmode
, IP0_REGNUM
);
2918 temp1
= gen_rtx_REG (Pmode
, IP1_REGNUM
);
2923 if (delta
>= -256 && delta
< 256)
2924 addr
= gen_rtx_PRE_MODIFY (Pmode
, this_rtx
,
2925 plus_constant (Pmode
, this_rtx
, delta
));
2927 aarch64_add_constant (this_regno
, IP1_REGNUM
, delta
);
2930 if (Pmode
== ptr_mode
)
2931 aarch64_emit_move (temp0
, gen_rtx_MEM (ptr_mode
, addr
));
2933 aarch64_emit_move (temp0
,
2934 gen_rtx_ZERO_EXTEND (Pmode
,
2935 gen_rtx_MEM (ptr_mode
, addr
)));
2937 if (vcall_offset
>= -256 && vcall_offset
< 4096 * POINTER_BYTES
)
2938 addr
= plus_constant (Pmode
, temp0
, vcall_offset
);
2941 (void) aarch64_build_constant (IP1_REGNUM
, vcall_offset
, true);
2942 addr
= gen_rtx_PLUS (Pmode
, temp0
, temp1
);
2945 if (Pmode
== ptr_mode
)
2946 aarch64_emit_move (temp1
, gen_rtx_MEM (ptr_mode
,addr
));
2948 aarch64_emit_move (temp1
,
2949 gen_rtx_SIGN_EXTEND (Pmode
,
2950 gen_rtx_MEM (ptr_mode
, addr
)));
2952 emit_insn (gen_add2_insn (this_rtx
, temp1
));
2955 /* Generate a tail call to the target function. */
2956 if (!TREE_USED (function
))
2958 assemble_external (function
);
2959 TREE_USED (function
) = 1;
2961 funexp
= XEXP (DECL_RTL (function
), 0);
2962 funexp
= gen_rtx_MEM (FUNCTION_MODE
, funexp
);
2963 insn
= emit_call_insn (gen_sibcall (funexp
, const0_rtx
, NULL_RTX
));
2964 SIBLING_CALL_P (insn
) = 1;
2966 insn
= get_insns ();
2967 shorten_branches (insn
);
2968 final_start_function (insn
, file
, 1);
2969 final (insn
, file
, 1);
2970 final_end_function ();
2972 /* Stop pretending to be a post-reload pass. */
2973 reload_completed
= 0;
2977 aarch64_tls_referenced_p (rtx x
)
2979 if (!TARGET_HAVE_TLS
)
2981 subrtx_iterator::array_type array
;
2982 FOR_EACH_SUBRTX (iter
, array
, x
, ALL
)
2984 const_rtx x
= *iter
;
2985 if (GET_CODE (x
) == SYMBOL_REF
&& SYMBOL_REF_TLS_MODEL (x
) != 0)
2987 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2988 TLS offsets, not real symbol references. */
2989 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
2990 iter
.skip_subrtxes ();
/* qsort/bsearch comparison function for the aarch64_bitmasks table:
   three-way compare of two unsigned HOST_WIDE_INT immediates.  */
static int
aarch64_bitmasks_cmp (const void *i1, const void *i2)
{
  const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
  const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;

  if (*imm1 < *imm2)
    return -1;
  if (*imm1 > *imm2)
    return +1;
  return 0;
}
3011 aarch64_build_bitmask_table (void)
3013 unsigned HOST_WIDE_INT mask
, imm
;
3014 unsigned int log_e
, e
, s
, r
;
3015 unsigned int nimms
= 0;
3017 for (log_e
= 1; log_e
<= 6; log_e
++)
3021 mask
= ~(HOST_WIDE_INT
) 0;
3023 mask
= ((HOST_WIDE_INT
) 1 << e
) - 1;
3024 for (s
= 1; s
< e
; s
++)
3026 for (r
= 0; r
< e
; r
++)
3028 /* set s consecutive bits to 1 (s < 64) */
3029 imm
= ((unsigned HOST_WIDE_INT
)1 << s
) - 1;
3030 /* rotate right by r */
3032 imm
= ((imm
>> r
) | (imm
<< (e
- r
))) & mask
;
3033 /* replicate the constant depending on SIMD size */
3035 case 1: imm
|= (imm
<< 2);
3036 case 2: imm
|= (imm
<< 4);
3037 case 3: imm
|= (imm
<< 8);
3038 case 4: imm
|= (imm
<< 16);
3039 case 5: imm
|= (imm
<< 32);
3045 gcc_assert (nimms
< AARCH64_NUM_BITMASKS
);
3046 aarch64_bitmasks
[nimms
++] = imm
;
3051 gcc_assert (nimms
== AARCH64_NUM_BITMASKS
);
3052 qsort (aarch64_bitmasks
, nimms
, sizeof (aarch64_bitmasks
[0]),
3053 aarch64_bitmasks_cmp
);
/* Return true if val can be encoded as a 12-bit unsigned immediate with
   a left shift of 0 or 12 bits.  */
static bool
aarch64_uimm12_shift (HOST_WIDE_INT val)
{
  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
	  );
}
/* Return true if val is an immediate that can be loaded into a
   register by a MOVZ instruction.  */
static bool
aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
{
  if (GET_MODE_SIZE (mode) > 4)
    {
      /* 64-bit: also allow the 16-bit chunk in the two high positions.  */
      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
	return 1;
    }
  else
    {
      /* Ignore sign extension.  */
      val &= (HOST_WIDE_INT) 0xffffffff;
    }
  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
	  || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
}
3089 /* Return true if val is a valid bitmask immediate. */
3091 aarch64_bitmask_imm (HOST_WIDE_INT val
, machine_mode mode
)
3093 if (GET_MODE_SIZE (mode
) < 8)
3095 /* Replicate bit pattern. */
3096 val
&= (HOST_WIDE_INT
) 0xffffffff;
3099 return bsearch (&val
, aarch64_bitmasks
, AARCH64_NUM_BITMASKS
,
3100 sizeof (aarch64_bitmasks
[0]), aarch64_bitmasks_cmp
) != NULL
;
3104 /* Return true if val is an immediate that can be loaded into a
3105 register in a single instruction. */
3107 aarch64_move_imm (HOST_WIDE_INT val
, machine_mode mode
)
3109 if (aarch64_movw_imm (val
, mode
) || aarch64_movw_imm (~val
, mode
))
3111 return aarch64_bitmask_imm (val
, mode
);
3115 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED
, rtx x
)
3119 if (GET_CODE (x
) == HIGH
)
3122 split_const (x
, &base
, &offset
);
3123 if (GET_CODE (base
) == SYMBOL_REF
|| GET_CODE (base
) == LABEL_REF
)
3125 if (aarch64_classify_symbol (base
, offset
, SYMBOL_CONTEXT_ADR
)
3126 != SYMBOL_FORCE_TO_MEM
)
3129 /* Avoid generating a 64-bit relocation in ILP32; leave
3130 to aarch64_expand_mov_immediate to handle it properly. */
3131 return mode
!= ptr_mode
;
3134 return aarch64_tls_referenced_p (x
);
3137 /* Return true if register REGNO is a valid index register.
3138 STRICT_P is true if REG_OK_STRICT is in effect. */
3141 aarch64_regno_ok_for_index_p (int regno
, bool strict_p
)
3143 if (!HARD_REGISTER_NUM_P (regno
))
3151 regno
= reg_renumber
[regno
];
3153 return GP_REGNUM_P (regno
);
3156 /* Return true if register REGNO is a valid base register for mode MODE.
3157 STRICT_P is true if REG_OK_STRICT is in effect. */
3160 aarch64_regno_ok_for_base_p (int regno
, bool strict_p
)
3162 if (!HARD_REGISTER_NUM_P (regno
))
3170 regno
= reg_renumber
[regno
];
3173 /* The fake registers will be eliminated to either the stack or
3174 hard frame pointer, both of which are usually valid base registers.
3175 Reload deals with the cases where the eliminated form isn't valid. */
3176 return (GP_REGNUM_P (regno
)
3177 || regno
== SP_REGNUM
3178 || regno
== FRAME_POINTER_REGNUM
3179 || regno
== ARG_POINTER_REGNUM
);
3182 /* Return true if X is a valid base register for mode MODE.
3183 STRICT_P is true if REG_OK_STRICT is in effect. */
3186 aarch64_base_register_rtx_p (rtx x
, bool strict_p
)
3188 if (!strict_p
&& GET_CODE (x
) == SUBREG
)
3191 return (REG_P (x
) && aarch64_regno_ok_for_base_p (REGNO (x
), strict_p
));
3194 /* Return true if address offset is a valid index. If it is, fill in INFO
3195 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3198 aarch64_classify_index (struct aarch64_address_info
*info
, rtx x
,
3199 machine_mode mode
, bool strict_p
)
3201 enum aarch64_address_type type
;
3206 if ((REG_P (x
) || GET_CODE (x
) == SUBREG
)
3207 && GET_MODE (x
) == Pmode
)
3209 type
= ADDRESS_REG_REG
;
3213 /* (sign_extend:DI (reg:SI)) */
3214 else if ((GET_CODE (x
) == SIGN_EXTEND
3215 || GET_CODE (x
) == ZERO_EXTEND
)
3216 && GET_MODE (x
) == DImode
3217 && GET_MODE (XEXP (x
, 0)) == SImode
)
3219 type
= (GET_CODE (x
) == SIGN_EXTEND
)
3220 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3221 index
= XEXP (x
, 0);
3224 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3225 else if (GET_CODE (x
) == MULT
3226 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
3227 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
3228 && GET_MODE (XEXP (x
, 0)) == DImode
3229 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
3230 && CONST_INT_P (XEXP (x
, 1)))
3232 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
3233 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3234 index
= XEXP (XEXP (x
, 0), 0);
3235 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
3237 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3238 else if (GET_CODE (x
) == ASHIFT
3239 && (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
3240 || GET_CODE (XEXP (x
, 0)) == ZERO_EXTEND
)
3241 && GET_MODE (XEXP (x
, 0)) == DImode
3242 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == SImode
3243 && CONST_INT_P (XEXP (x
, 1)))
3245 type
= (GET_CODE (XEXP (x
, 0)) == SIGN_EXTEND
)
3246 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3247 index
= XEXP (XEXP (x
, 0), 0);
3248 shift
= INTVAL (XEXP (x
, 1));
3250 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3251 else if ((GET_CODE (x
) == SIGN_EXTRACT
3252 || GET_CODE (x
) == ZERO_EXTRACT
)
3253 && GET_MODE (x
) == DImode
3254 && GET_CODE (XEXP (x
, 0)) == MULT
3255 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3256 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
3258 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
3259 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3260 index
= XEXP (XEXP (x
, 0), 0);
3261 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
3262 if (INTVAL (XEXP (x
, 1)) != 32 + shift
3263 || INTVAL (XEXP (x
, 2)) != 0)
3266 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3267 (const_int 0xffffffff<<shift)) */
3268 else if (GET_CODE (x
) == AND
3269 && GET_MODE (x
) == DImode
3270 && GET_CODE (XEXP (x
, 0)) == MULT
3271 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3272 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
3273 && CONST_INT_P (XEXP (x
, 1)))
3275 type
= ADDRESS_REG_UXTW
;
3276 index
= XEXP (XEXP (x
, 0), 0);
3277 shift
= exact_log2 (INTVAL (XEXP (XEXP (x
, 0), 1)));
3278 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
3281 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3282 else if ((GET_CODE (x
) == SIGN_EXTRACT
3283 || GET_CODE (x
) == ZERO_EXTRACT
)
3284 && GET_MODE (x
) == DImode
3285 && GET_CODE (XEXP (x
, 0)) == ASHIFT
3286 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3287 && CONST_INT_P (XEXP (XEXP (x
, 0), 1)))
3289 type
= (GET_CODE (x
) == SIGN_EXTRACT
)
3290 ? ADDRESS_REG_SXTW
: ADDRESS_REG_UXTW
;
3291 index
= XEXP (XEXP (x
, 0), 0);
3292 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
3293 if (INTVAL (XEXP (x
, 1)) != 32 + shift
3294 || INTVAL (XEXP (x
, 2)) != 0)
3297 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3298 (const_int 0xffffffff<<shift)) */
3299 else if (GET_CODE (x
) == AND
3300 && GET_MODE (x
) == DImode
3301 && GET_CODE (XEXP (x
, 0)) == ASHIFT
3302 && GET_MODE (XEXP (XEXP (x
, 0), 0)) == DImode
3303 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
3304 && CONST_INT_P (XEXP (x
, 1)))
3306 type
= ADDRESS_REG_UXTW
;
3307 index
= XEXP (XEXP (x
, 0), 0);
3308 shift
= INTVAL (XEXP (XEXP (x
, 0), 1));
3309 if (INTVAL (XEXP (x
, 1)) != (HOST_WIDE_INT
)0xffffffff << shift
)
3312 /* (mult:P (reg:P) (const_int scale)) */
3313 else if (GET_CODE (x
) == MULT
3314 && GET_MODE (x
) == Pmode
3315 && GET_MODE (XEXP (x
, 0)) == Pmode
3316 && CONST_INT_P (XEXP (x
, 1)))
3318 type
= ADDRESS_REG_REG
;
3319 index
= XEXP (x
, 0);
3320 shift
= exact_log2 (INTVAL (XEXP (x
, 1)));
3322 /* (ashift:P (reg:P) (const_int shift)) */
3323 else if (GET_CODE (x
) == ASHIFT
3324 && GET_MODE (x
) == Pmode
3325 && GET_MODE (XEXP (x
, 0)) == Pmode
3326 && CONST_INT_P (XEXP (x
, 1)))
3328 type
= ADDRESS_REG_REG
;
3329 index
= XEXP (x
, 0);
3330 shift
= INTVAL (XEXP (x
, 1));
3335 if (GET_CODE (index
) == SUBREG
)
3336 index
= SUBREG_REG (index
);
3339 (shift
> 0 && shift
<= 3
3340 && (1 << shift
) == GET_MODE_SIZE (mode
)))
3342 && aarch64_regno_ok_for_index_p (REGNO (index
), strict_p
))
3345 info
->offset
= index
;
3346 info
->shift
= shift
;
/* Return true if OFFSET is a signed 7-bit value scaled by the size of
   MODE (the ldp/stp offset range).  */
static bool
aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= -64 * GET_MODE_SIZE (mode)
	  && offset < 64 * GET_MODE_SIZE (mode)
	  && offset % GET_MODE_SIZE (mode) == 0);
}
/* Return true if OFFSET fits the unscaled signed 9-bit immediate range
   of ldur/stur ([-256, 255]); the mode is irrelevant for unscaled
   addressing.  */
static bool
offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
			       HOST_WIDE_INT offset)
{
  return offset >= -256 && offset < 256;
}
/* Return true if OFFSET is an unsigned 12-bit value scaled by the size
   of MODE (the ldr/str unsigned-offset range).  */
static bool
offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
{
  return (offset >= 0
	  && offset < 4096 * GET_MODE_SIZE (mode)
	  && offset % GET_MODE_SIZE (mode) == 0);
}
3376 /* Return true if X is a valid address for machine mode MODE. If it is,
3377 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3378 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3381 aarch64_classify_address (struct aarch64_address_info
*info
,
3382 rtx x
, machine_mode mode
,
3383 RTX_CODE outer_code
, bool strict_p
)
3385 enum rtx_code code
= GET_CODE (x
);
3388 /* On BE, we use load/store pair for all large int mode load/stores. */
3389 bool load_store_pair_p
= (outer_code
== PARALLEL
3390 || (BYTES_BIG_ENDIAN
3391 && aarch64_vect_struct_mode_p (mode
)));
3393 bool allow_reg_index_p
=
3395 && (GET_MODE_SIZE (mode
) != 16 || aarch64_vector_mode_supported_p (mode
))
3396 && !aarch64_vect_struct_mode_p (mode
);
3398 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3400 if (aarch64_vect_struct_mode_p (mode
) && !BYTES_BIG_ENDIAN
3401 && (code
!= POST_INC
&& code
!= REG
))
3408 info
->type
= ADDRESS_REG_IMM
;
3410 info
->offset
= const0_rtx
;
3411 return aarch64_base_register_rtx_p (x
, strict_p
);
3419 && (op0
== virtual_stack_vars_rtx
3420 || op0
== frame_pointer_rtx
3421 || op0
== arg_pointer_rtx
)
3422 && CONST_INT_P (op1
))
3424 info
->type
= ADDRESS_REG_IMM
;
3431 if (GET_MODE_SIZE (mode
) != 0
3432 && CONST_INT_P (op1
)
3433 && aarch64_base_register_rtx_p (op0
, strict_p
))
3435 HOST_WIDE_INT offset
= INTVAL (op1
);
3437 info
->type
= ADDRESS_REG_IMM
;
3441 /* TImode and TFmode values are allowed in both pairs of X
3442 registers and individual Q registers. The available
3444 X,X: 7-bit signed scaled offset
3445 Q: 9-bit signed offset
3446 We conservatively require an offset representable in either mode.
3448 if (mode
== TImode
|| mode
== TFmode
)
3449 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
3450 && offset_9bit_signed_unscaled_p (mode
, offset
));
3452 /* A 7bit offset check because OImode will emit a ldp/stp
3453 instruction (only big endian will get here).
3454 For ldp/stp instructions, the offset is scaled for the size of a
3455 single element of the pair. */
3457 return aarch64_offset_7bit_signed_scaled_p (TImode
, offset
);
3459 /* Three 9/12 bit offsets checks because CImode will emit three
3460 ldr/str instructions (only big endian will get here). */
3462 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
3463 && (offset_9bit_signed_unscaled_p (V16QImode
, offset
+ 32)
3464 || offset_12bit_unsigned_scaled_p (V16QImode
,
3467 /* Two 7bit offsets checks because XImode will emit two ldp/stp
3468 instructions (only big endian will get here). */
3470 return (aarch64_offset_7bit_signed_scaled_p (TImode
, offset
)
3471 && aarch64_offset_7bit_signed_scaled_p (TImode
,
3474 if (load_store_pair_p
)
3475 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
3476 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
3478 return (offset_9bit_signed_unscaled_p (mode
, offset
)
3479 || offset_12bit_unsigned_scaled_p (mode
, offset
));
3482 if (allow_reg_index_p
)
3484 /* Look for base + (scaled/extended) index register. */
3485 if (aarch64_base_register_rtx_p (op0
, strict_p
)
3486 && aarch64_classify_index (info
, op1
, mode
, strict_p
))
3491 if (aarch64_base_register_rtx_p (op1
, strict_p
)
3492 && aarch64_classify_index (info
, op0
, mode
, strict_p
))
3505 info
->type
= ADDRESS_REG_WB
;
3506 info
->base
= XEXP (x
, 0);
3507 info
->offset
= NULL_RTX
;
3508 return aarch64_base_register_rtx_p (info
->base
, strict_p
);
3512 info
->type
= ADDRESS_REG_WB
;
3513 info
->base
= XEXP (x
, 0);
3514 if (GET_CODE (XEXP (x
, 1)) == PLUS
3515 && CONST_INT_P (XEXP (XEXP (x
, 1), 1))
3516 && rtx_equal_p (XEXP (XEXP (x
, 1), 0), info
->base
)
3517 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
3519 HOST_WIDE_INT offset
;
3520 info
->offset
= XEXP (XEXP (x
, 1), 1);
3521 offset
= INTVAL (info
->offset
);
3523 /* TImode and TFmode values are allowed in both pairs of X
3524 registers and individual Q registers. The available
3526 X,X: 7-bit signed scaled offset
3527 Q: 9-bit signed offset
3528 We conservatively require an offset representable in either mode.
3530 if (mode
== TImode
|| mode
== TFmode
)
3531 return (aarch64_offset_7bit_signed_scaled_p (mode
, offset
)
3532 && offset_9bit_signed_unscaled_p (mode
, offset
));
3534 if (load_store_pair_p
)
3535 return ((GET_MODE_SIZE (mode
) == 4 || GET_MODE_SIZE (mode
) == 8)
3536 && aarch64_offset_7bit_signed_scaled_p (mode
, offset
));
3538 return offset_9bit_signed_unscaled_p (mode
, offset
);
3545 /* load literal: pc-relative constant pool entry. Only supported
3546 for SI mode or larger. */
3547 info
->type
= ADDRESS_SYMBOLIC
;
3549 if (!load_store_pair_p
&& GET_MODE_SIZE (mode
) >= 4)
3553 split_const (x
, &sym
, &addend
);
3554 return (GET_CODE (sym
) == LABEL_REF
3555 || (GET_CODE (sym
) == SYMBOL_REF
3556 && CONSTANT_POOL_ADDRESS_P (sym
)));
3561 info
->type
= ADDRESS_LO_SUM
;
3562 info
->base
= XEXP (x
, 0);
3563 info
->offset
= XEXP (x
, 1);
3564 if (allow_reg_index_p
3565 && aarch64_base_register_rtx_p (info
->base
, strict_p
))
3568 split_const (info
->offset
, &sym
, &offs
);
3569 if (GET_CODE (sym
) == SYMBOL_REF
3570 && (aarch64_classify_symbol (sym
, offs
, SYMBOL_CONTEXT_MEM
)
3571 == SYMBOL_SMALL_ABSOLUTE
))
3573 /* The symbol and offset must be aligned to the access size. */
3575 unsigned int ref_size
;
3577 if (CONSTANT_POOL_ADDRESS_P (sym
))
3578 align
= GET_MODE_ALIGNMENT (get_pool_mode (sym
));
3579 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym
))
3581 tree exp
= SYMBOL_REF_DECL (sym
);
3582 align
= TYPE_ALIGN (TREE_TYPE (exp
));
3583 align
= CONSTANT_ALIGNMENT (exp
, align
);
3585 else if (SYMBOL_REF_DECL (sym
))
3586 align
= DECL_ALIGN (SYMBOL_REF_DECL (sym
));
3587 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym
)
3588 && SYMBOL_REF_BLOCK (sym
) != NULL
)
3589 align
= SYMBOL_REF_BLOCK (sym
)->alignment
;
3591 align
= BITS_PER_UNIT
;
3593 ref_size
= GET_MODE_SIZE (mode
);
3595 ref_size
= GET_MODE_SIZE (DImode
);
3597 return ((INTVAL (offs
) & (ref_size
- 1)) == 0
3598 && ((align
/ BITS_PER_UNIT
) & (ref_size
- 1)) == 0);
3609 aarch64_symbolic_address_p (rtx x
)
3613 split_const (x
, &x
, &offset
);
3614 return GET_CODE (x
) == SYMBOL_REF
|| GET_CODE (x
) == LABEL_REF
;
3617 /* Classify the base of symbolic expression X, given that X appears in
3620 enum aarch64_symbol_type
3621 aarch64_classify_symbolic_expression (rtx x
,
3622 enum aarch64_symbol_context context
)
3626 split_const (x
, &x
, &offset
);
3627 return aarch64_classify_symbol (x
, offset
, context
);
3631 /* Return TRUE if X is a legitimate address for accessing memory in
3634 aarch64_legitimate_address_hook_p (machine_mode mode
, rtx x
, bool strict_p
)
3636 struct aarch64_address_info addr
;
3638 return aarch64_classify_address (&addr
, x
, mode
, MEM
, strict_p
);
3641 /* Return TRUE if X is a legitimate address for accessing memory in
3642 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3645 aarch64_legitimate_address_p (machine_mode mode
, rtx x
,
3646 RTX_CODE outer_code
, bool strict_p
)
3648 struct aarch64_address_info addr
;
3650 return aarch64_classify_address (&addr
, x
, mode
, outer_code
, strict_p
);
3653 /* Return TRUE if rtx X is immediate constant 0.0 */
3655 aarch64_float_const_zero_rtx_p (rtx x
)
3659 if (GET_MODE (x
) == VOIDmode
)
3662 REAL_VALUE_FROM_CONST_DOUBLE (r
, x
);
3663 if (REAL_VALUE_MINUS_ZERO (r
))
3664 return !HONOR_SIGNED_ZEROS (GET_MODE (x
));
3665 return REAL_VALUES_EQUAL (r
, dconst0
);
3668 /* Return the fixed registers used for condition codes. */
3671 aarch64_fixed_condition_code_regs (unsigned int *p1
, unsigned int *p2
)
3674 *p2
= INVALID_REGNUM
;
3678 /* Emit call insn with PAT and do aarch64-specific handling. */
3681 aarch64_emit_call_insn (rtx pat
)
3683 rtx insn
= emit_call_insn (pat
);
3685 rtx
*fusage
= &CALL_INSN_FUNCTION_USAGE (insn
);
3686 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP0_REGNUM
));
3687 clobber_reg (fusage
, gen_rtx_REG (word_mode
, IP1_REGNUM
));
3691 aarch64_select_cc_mode (RTX_CODE code
, rtx x
, rtx y
)
3693 /* All floating point compares return CCFP if it is an equality
3694 comparison, and CCFPE otherwise. */
3695 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
3722 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
3724 && (code
== EQ
|| code
== NE
|| code
== LT
|| code
== GE
)
3725 && (GET_CODE (x
) == PLUS
|| GET_CODE (x
) == MINUS
|| GET_CODE (x
) == AND
3726 || GET_CODE (x
) == NEG
))
3729 /* A compare with a shifted operand. Because of canonicalization,
3730 the comparison will have to be swapped when we emit the assembly
3732 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
3733 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
3734 && (GET_CODE (x
) == ASHIFT
|| GET_CODE (x
) == ASHIFTRT
3735 || GET_CODE (x
) == LSHIFTRT
3736 || GET_CODE (x
) == ZERO_EXTEND
|| GET_CODE (x
) == SIGN_EXTEND
))
3739 /* Similarly for a negated operand, but we can only do this for
3741 if ((GET_MODE (x
) == SImode
|| GET_MODE (x
) == DImode
)
3742 && (REG_P (y
) || GET_CODE (y
) == SUBREG
)
3743 && (code
== EQ
|| code
== NE
)
3744 && GET_CODE (x
) == NEG
)
3747 /* A compare of a mode narrower than SI mode against zero can be done
3748 by extending the value in the comparison. */
3749 if ((GET_MODE (x
) == QImode
|| GET_MODE (x
) == HImode
)
3751 /* Only use sign-extension if we really need it. */
3752 return ((code
== GT
|| code
== GE
|| code
== LE
|| code
== LT
)
3753 ? CC_SESWPmode
: CC_ZESWPmode
);
3755 /* For everything else, return CCmode. */
3760 aarch64_get_condition_code_1 (enum machine_mode
, enum rtx_code
);
3763 aarch64_get_condition_code (rtx x
)
3765 machine_mode mode
= GET_MODE (XEXP (x
, 0));
3766 enum rtx_code comp_code
= GET_CODE (x
);
3768 if (GET_MODE_CLASS (mode
) != MODE_CC
)
3769 mode
= SELECT_CC_MODE (comp_code
, XEXP (x
, 0), XEXP (x
, 1));
3770 return aarch64_get_condition_code_1 (mode
, comp_code
);
3774 aarch64_get_condition_code_1 (enum machine_mode mode
, enum rtx_code comp_code
)
3776 int ne
= -1, eq
= -1;
3783 case GE
: return AARCH64_GE
;
3784 case GT
: return AARCH64_GT
;
3785 case LE
: return AARCH64_LS
;
3786 case LT
: return AARCH64_MI
;
3787 case NE
: return AARCH64_NE
;
3788 case EQ
: return AARCH64_EQ
;
3789 case ORDERED
: return AARCH64_VC
;
3790 case UNORDERED
: return AARCH64_VS
;
3791 case UNLT
: return AARCH64_LT
;
3792 case UNLE
: return AARCH64_LE
;
3793 case UNGT
: return AARCH64_HI
;
3794 case UNGE
: return AARCH64_PL
;
3852 case NE
: return AARCH64_NE
;
3853 case EQ
: return AARCH64_EQ
;
3854 case GE
: return AARCH64_GE
;
3855 case GT
: return AARCH64_GT
;
3856 case LE
: return AARCH64_LE
;
3857 case LT
: return AARCH64_LT
;
3858 case GEU
: return AARCH64_CS
;
3859 case GTU
: return AARCH64_HI
;
3860 case LEU
: return AARCH64_LS
;
3861 case LTU
: return AARCH64_CC
;
3871 case NE
: return AARCH64_NE
;
3872 case EQ
: return AARCH64_EQ
;
3873 case GE
: return AARCH64_LE
;
3874 case GT
: return AARCH64_LT
;
3875 case LE
: return AARCH64_GE
;
3876 case LT
: return AARCH64_GT
;
3877 case GEU
: return AARCH64_LS
;
3878 case GTU
: return AARCH64_CC
;
3879 case LEU
: return AARCH64_CS
;
3880 case LTU
: return AARCH64_HI
;
3888 case NE
: return AARCH64_NE
;
3889 case EQ
: return AARCH64_EQ
;
3890 case GE
: return AARCH64_PL
;
3891 case LT
: return AARCH64_MI
;
3899 case NE
: return AARCH64_NE
;
3900 case EQ
: return AARCH64_EQ
;
3910 if (comp_code
== NE
)
3913 if (comp_code
== EQ
)
3920 aarch64_const_vec_all_same_in_range_p (rtx x
,
3921 HOST_WIDE_INT minval
,
3922 HOST_WIDE_INT maxval
)
3924 HOST_WIDE_INT firstval
;
3927 if (GET_CODE (x
) != CONST_VECTOR
3928 || GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_INT
)
3931 firstval
= INTVAL (CONST_VECTOR_ELT (x
, 0));
3932 if (firstval
< minval
|| firstval
> maxval
)
3935 count
= CONST_VECTOR_NUNITS (x
);
3936 for (i
= 1; i
< count
; i
++)
3937 if (INTVAL (CONST_VECTOR_ELT (x
, i
)) != firstval
)
3944 aarch64_const_vec_all_same_int_p (rtx x
, HOST_WIDE_INT val
)
3946 return aarch64_const_vec_all_same_in_range_p (x
, val
, val
);
3950 bit_count (unsigned HOST_WIDE_INT value
)
#define AARCH64_CC_V 1
#define AARCH64_CC_C (1 << 1)
#define AARCH64_CC_Z (1 << 2)
#define AARCH64_CC_N (1 << 3)

/* N Z C V flags for ccmp.  The first code is for AND op and the other
   is for IOR op.  Indexed by AARCH64_COND_CODE.  */
static const int aarch64_nzcv_codes[][2] =
{
  {AARCH64_CC_Z, 0}, /* EQ, Z == 1.  */
  {0, AARCH64_CC_Z}, /* NE, Z == 0.  */
  {AARCH64_CC_C, 0}, /* CS, C == 1.  */
  {0, AARCH64_CC_C}, /* CC, C == 0.  */
  {AARCH64_CC_N, 0}, /* MI, N == 1.  */
  {0, AARCH64_CC_N}, /* PL, N == 0.  */
  {AARCH64_CC_V, 0}, /* VS, V == 1.  */
  {0, AARCH64_CC_V}, /* VC, V == 0.  */
  {AARCH64_CC_C, 0}, /* HI, C ==1 && Z == 0.  */
  {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0).  */
  {0, AARCH64_CC_V}, /* GE, N == V.  */
  {AARCH64_CC_V, 0}, /* LT, N != V.  */
  {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V.  */
  {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V).  */
  {0, 0}, /* AL, Any.  */
  {0, 0}, /* NV, Any.  */
};
3992 aarch64_ccmp_mode_to_code (enum machine_mode mode
)
4033 aarch64_print_operand (FILE *f
, rtx x
, char code
)
4037 /* An integer or symbol address without a preceding # sign. */
4039 switch (GET_CODE (x
))
4042 fprintf (f
, HOST_WIDE_INT_PRINT_DEC
, INTVAL (x
));
4046 output_addr_const (f
, x
);
4050 if (GET_CODE (XEXP (x
, 0)) == PLUS
4051 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
)
4053 output_addr_const (f
, x
);
4059 output_operand_lossage ("Unsupported operand for code '%c'", code
);
4064 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4068 if (!CONST_INT_P (x
)
4069 || (n
= exact_log2 (INTVAL (x
) & ~7)) <= 0)
4071 output_operand_lossage ("invalid operand for '%%%c'", code
);
4087 output_operand_lossage ("invalid operand for '%%%c'", code
);
4097 /* Print N such that 2^N == X. */
4098 if (!CONST_INT_P (x
) || (n
= exact_log2 (INTVAL (x
))) < 0)
4100 output_operand_lossage ("invalid operand for '%%%c'", code
);
4104 asm_fprintf (f
, "%d", n
);
4109 /* Print the number of non-zero bits in X (a const_int). */
4110 if (!CONST_INT_P (x
))
4112 output_operand_lossage ("invalid operand for '%%%c'", code
);
4116 asm_fprintf (f
, "%u", bit_count (INTVAL (x
)));
4120 /* Print the higher numbered register of a pair (TImode) of regs. */
4121 if (!REG_P (x
) || !GP_REGNUM_P (REGNO (x
) + 1))
4123 output_operand_lossage ("invalid operand for '%%%c'", code
);
4127 asm_fprintf (f
, "%s", reg_names
[REGNO (x
) + 1]);
4133 /* Print a condition (eq, ne, etc). */
4135 /* CONST_TRUE_RTX means always -- that's the default. */
4136 if (x
== const_true_rtx
)
4139 if (!COMPARISON_P (x
))
4141 output_operand_lossage ("invalid operand for '%%%c'", code
);
4145 cond_code
= aarch64_get_condition_code (x
);
4146 gcc_assert (cond_code
>= 0);
4147 fputs (aarch64_condition_codes
[cond_code
], f
);
4154 /* Print the inverse of a condition (eq <-> ne, etc). */
4156 /* CONST_TRUE_RTX means never -- that's the default. */
4157 if (x
== const_true_rtx
)
4163 if (!COMPARISON_P (x
))
4165 output_operand_lossage ("invalid operand for '%%%c'", code
);
4168 cond_code
= aarch64_get_condition_code (x
);
4169 gcc_assert (cond_code
>= 0);
4170 fputs (aarch64_condition_codes
[AARCH64_INVERSE_CONDITION_CODE
4180 /* Print a scalar FP/SIMD register name. */
4181 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4183 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4186 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - V0_REGNUM
);
4193 /* Print the first FP/SIMD register name in a list. */
4194 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4196 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4199 asm_fprintf (f
, "v%d", REGNO (x
) - V0_REGNUM
+ (code
- 'S'));
4203 /* Print a scalar FP/SIMD register name + 1. */
4204 if (!REG_P (x
) || !FP_REGNUM_P (REGNO (x
)))
4206 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code
);
4209 asm_fprintf (f
, "q%d", REGNO (x
) - V0_REGNUM
+ 1);
4213 /* Print bottom 16 bits of integer constant in hex. */
4214 if (!CONST_INT_P (x
))
4216 output_operand_lossage ("invalid operand for '%%%c'", code
);
4219 asm_fprintf (f
, "0x%wx", UINTVAL (x
) & 0xffff);
4224 /* Print a general register name or the zero register (32-bit or
4227 || (CONST_DOUBLE_P (x
) && aarch64_float_const_zero_rtx_p (x
)))
4229 asm_fprintf (f
, "%czr", code
);
4233 if (REG_P (x
) && GP_REGNUM_P (REGNO (x
)))
4235 asm_fprintf (f
, "%c%d", code
, REGNO (x
) - R0_REGNUM
);
4239 if (REG_P (x
) && REGNO (x
) == SP_REGNUM
)
4241 asm_fprintf (f
, "%ssp", code
== 'w' ? "w" : "");
4248 /* Print a normal operand, if it's a general register, then we
4252 output_operand_lossage ("missing operand");
4256 switch (GET_CODE (x
))
4259 asm_fprintf (f
, "%s", reg_names
[REGNO (x
)]);
4263 aarch64_memory_reference_mode
= GET_MODE (x
);
4264 output_address (XEXP (x
, 0));
4269 output_addr_const (asm_out_file
, x
);
4273 asm_fprintf (f
, "%wd", INTVAL (x
));
4277 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_VECTOR_INT
)
4280 aarch64_const_vec_all_same_in_range_p (x
,
4282 HOST_WIDE_INT_MAX
));
4283 asm_fprintf (f
, "%wd", INTVAL (CONST_VECTOR_ELT (x
, 0)));
4285 else if (aarch64_simd_imm_zero_p (x
, GET_MODE (x
)))
4294 /* CONST_DOUBLE can represent a double-width integer.
4295 In this case, the mode of x is VOIDmode. */
4296 if (GET_MODE (x
) == VOIDmode
)
4298 else if (aarch64_float_const_zero_rtx_p (x
))
4303 else if (aarch64_float_const_representable_p (x
))
4306 char float_buf
[buf_size
] = {'\0'};
4308 REAL_VALUE_FROM_CONST_DOUBLE (r
, x
);
4309 real_to_decimal_for_mode (float_buf
, &r
,
4312 asm_fprintf (asm_out_file
, "%s", float_buf
);
4316 output_operand_lossage ("invalid constant");
4319 output_operand_lossage ("invalid operand");
4325 if (GET_CODE (x
) == HIGH
)
4328 switch (aarch64_classify_symbolic_expression (x
, SYMBOL_CONTEXT_ADR
))
4330 case SYMBOL_SMALL_GOT
:
4331 asm_fprintf (asm_out_file
, ":got:");
4334 case SYMBOL_SMALL_TLSGD
:
4335 asm_fprintf (asm_out_file
, ":tlsgd:");
4338 case SYMBOL_SMALL_TLSDESC
:
4339 asm_fprintf (asm_out_file
, ":tlsdesc:");
4342 case SYMBOL_SMALL_GOTTPREL
:
4343 asm_fprintf (asm_out_file
, ":gottprel:");
4346 case SYMBOL_SMALL_TPREL
:
4347 asm_fprintf (asm_out_file
, ":tprel:");
4350 case SYMBOL_TINY_GOT
:
4357 output_addr_const (asm_out_file
, x
);
4361 switch (aarch64_classify_symbolic_expression (x
, SYMBOL_CONTEXT_ADR
))
4363 case SYMBOL_SMALL_GOT
:
4364 asm_fprintf (asm_out_file
, ":lo12:");
4367 case SYMBOL_SMALL_TLSGD
:
4368 asm_fprintf (asm_out_file
, ":tlsgd_lo12:");
4371 case SYMBOL_SMALL_TLSDESC
:
4372 asm_fprintf (asm_out_file
, ":tlsdesc_lo12:");
4375 case SYMBOL_SMALL_GOTTPREL
:
4376 asm_fprintf (asm_out_file
, ":gottprel_lo12:");
4379 case SYMBOL_SMALL_TPREL
:
4380 asm_fprintf (asm_out_file
, ":tprel_lo12_nc:");
4383 case SYMBOL_TINY_GOT
:
4384 asm_fprintf (asm_out_file
, ":got:");
4390 output_addr_const (asm_out_file
, x
);
4395 switch (aarch64_classify_symbolic_expression (x
, SYMBOL_CONTEXT_ADR
))
4397 case SYMBOL_SMALL_TPREL
:
4398 asm_fprintf (asm_out_file
, ":tprel_hi12:");
4403 output_addr_const (asm_out_file
, x
);
4411 if (!COMPARISON_P (x
))
4413 output_operand_lossage ("invalid operand for '%%%c'", code
);
4417 cond_code
= aarch64_get_condition_code_1 (CCmode
, GET_CODE (x
));
4418 gcc_assert (cond_code
>= 0);
4419 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
][0]);
4428 if (!COMPARISON_P (x
))
4430 output_operand_lossage ("invalid operand for '%%%c'", code
);
4434 cond_code
= aarch64_get_condition_code_1 (CCmode
, GET_CODE (x
));
4435 gcc_assert (cond_code
>= 0);
4436 asm_fprintf (f
, "%d", aarch64_nzcv_codes
[cond_code
][1]);
4441 output_operand_lossage ("invalid operand prefix '%%%c'", code
);
4447 aarch64_print_operand_address (FILE *f
, rtx x
)
4449 struct aarch64_address_info addr
;
4451 if (aarch64_classify_address (&addr
, x
, aarch64_memory_reference_mode
,
4455 case ADDRESS_REG_IMM
:
4456 if (addr
.offset
== const0_rtx
)
4457 asm_fprintf (f
, "[%s]", reg_names
[REGNO (addr
.base
)]);
4459 asm_fprintf (f
, "[%s, %wd]", reg_names
[REGNO (addr
.base
)],
4460 INTVAL (addr
.offset
));
4463 case ADDRESS_REG_REG
:
4464 if (addr
.shift
== 0)
4465 asm_fprintf (f
, "[%s, %s]", reg_names
[REGNO (addr
.base
)],
4466 reg_names
[REGNO (addr
.offset
)]);
4468 asm_fprintf (f
, "[%s, %s, lsl %u]", reg_names
[REGNO (addr
.base
)],
4469 reg_names
[REGNO (addr
.offset
)], addr
.shift
);
4472 case ADDRESS_REG_UXTW
:
4473 if (addr
.shift
== 0)
4474 asm_fprintf (f
, "[%s, w%d, uxtw]", reg_names
[REGNO (addr
.base
)],
4475 REGNO (addr
.offset
) - R0_REGNUM
);
4477 asm_fprintf (f
, "[%s, w%d, uxtw %u]", reg_names
[REGNO (addr
.base
)],
4478 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
4481 case ADDRESS_REG_SXTW
:
4482 if (addr
.shift
== 0)
4483 asm_fprintf (f
, "[%s, w%d, sxtw]", reg_names
[REGNO (addr
.base
)],
4484 REGNO (addr
.offset
) - R0_REGNUM
);
4486 asm_fprintf (f
, "[%s, w%d, sxtw %u]", reg_names
[REGNO (addr
.base
)],
4487 REGNO (addr
.offset
) - R0_REGNUM
, addr
.shift
);
4490 case ADDRESS_REG_WB
:
4491 switch (GET_CODE (x
))
4494 asm_fprintf (f
, "[%s, %d]!", reg_names
[REGNO (addr
.base
)],
4495 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4498 asm_fprintf (f
, "[%s], %d", reg_names
[REGNO (addr
.base
)],
4499 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4502 asm_fprintf (f
, "[%s, -%d]!", reg_names
[REGNO (addr
.base
)],
4503 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4506 asm_fprintf (f
, "[%s], -%d", reg_names
[REGNO (addr
.base
)],
4507 GET_MODE_SIZE (aarch64_memory_reference_mode
));
4510 asm_fprintf (f
, "[%s, %wd]!", reg_names
[REGNO (addr
.base
)],
4511 INTVAL (addr
.offset
));
4514 asm_fprintf (f
, "[%s], %wd", reg_names
[REGNO (addr
.base
)],
4515 INTVAL (addr
.offset
));
4522 case ADDRESS_LO_SUM
:
4523 asm_fprintf (f
, "[%s, #:lo12:", reg_names
[REGNO (addr
.base
)]);
4524 output_addr_const (f
, addr
.offset
);
4525 asm_fprintf (f
, "]");
4528 case ADDRESS_SYMBOLIC
:
4532 output_addr_const (f
, x
);
4536 aarch64_label_mentioned_p (rtx x
)
4541 if (GET_CODE (x
) == LABEL_REF
)
4544 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4545 referencing instruction, but they are constant offsets, not
4547 if (GET_CODE (x
) == UNSPEC
&& XINT (x
, 1) == UNSPEC_TLS
)
4550 fmt
= GET_RTX_FORMAT (GET_CODE (x
));
4551 for (i
= GET_RTX_LENGTH (GET_CODE (x
)) - 1; i
>= 0; i
--)
4557 for (j
= XVECLEN (x
, i
) - 1; j
>= 0; j
--)
4558 if (aarch64_label_mentioned_p (XVECEXP (x
, i
, j
)))
4561 else if (fmt
[i
] == 'e' && aarch64_label_mentioned_p (XEXP (x
, i
)))
4568 /* Implement REGNO_REG_CLASS. */
4571 aarch64_regno_regclass (unsigned regno
)
4573 if (GP_REGNUM_P (regno
))
4574 return GENERAL_REGS
;
4576 if (regno
== SP_REGNUM
)
4579 if (regno
== FRAME_POINTER_REGNUM
4580 || regno
== ARG_POINTER_REGNUM
)
4581 return POINTER_REGS
;
4583 if (FP_REGNUM_P (regno
))
4584 return FP_LO_REGNUM_P (regno
) ? FP_LO_REGS
: FP_REGS
;
4590 aarch64_legitimize_address (rtx x
, rtx
/* orig_x */, machine_mode mode
)
4592 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4593 where mask is selected by alignment and size of the offset.
4594 We try to pick as large a range for the offset as possible to
4595 maximize the chance of a CSE. However, for aligned addresses
4596 we limit the range to 4k so that structures with different sized
4597 elements are likely to use the same base. */
4599 if (GET_CODE (x
) == PLUS
&& CONST_INT_P (XEXP (x
, 1)))
4601 HOST_WIDE_INT offset
= INTVAL (XEXP (x
, 1));
4602 HOST_WIDE_INT base_offset
;
4604 /* Does it look like we'll need a load/store-pair operation? */
4605 if (GET_MODE_SIZE (mode
) > 16
4607 base_offset
= ((offset
+ 64 * GET_MODE_SIZE (mode
))
4608 & ~((128 * GET_MODE_SIZE (mode
)) - 1));
4609 /* For offsets aren't a multiple of the access size, the limit is
4611 else if (offset
& (GET_MODE_SIZE (mode
) - 1))
4612 base_offset
= (offset
+ 0x100) & ~0x1ff;
4614 base_offset
= offset
& ~0xfff;
4616 if (base_offset
== 0)
4619 offset
-= base_offset
;
4620 rtx base_reg
= gen_reg_rtx (Pmode
);
4621 rtx val
= force_operand (plus_constant (Pmode
, XEXP (x
, 0), base_offset
),
4623 emit_move_insn (base_reg
, val
);
4624 x
= plus_constant (Pmode
, base_reg
, offset
);
4630 /* Try a machine-dependent way of reloading an illegitimate address
4631 operand. If we find one, push the reload and return the new rtx. */
4634 aarch64_legitimize_reload_address (rtx
*x_p
,
4636 int opnum
, int type
,
4637 int ind_levels ATTRIBUTE_UNUSED
)
4641 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4642 if (aarch64_vect_struct_mode_p (mode
)
4643 && GET_CODE (x
) == PLUS
4644 && REG_P (XEXP (x
, 0))
4645 && CONST_INT_P (XEXP (x
, 1)))
4649 push_reload (orig_rtx
, NULL_RTX
, x_p
, NULL
,
4650 BASE_REG_CLASS
, GET_MODE (x
), VOIDmode
, 0, 0,
4651 opnum
, (enum reload_type
) type
);
4655 /* We must recognize output that we have already generated ourselves. */
4656 if (GET_CODE (x
) == PLUS
4657 && GET_CODE (XEXP (x
, 0)) == PLUS
4658 && REG_P (XEXP (XEXP (x
, 0), 0))
4659 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
4660 && CONST_INT_P (XEXP (x
, 1)))
4662 push_reload (XEXP (x
, 0), NULL_RTX
, &XEXP (x
, 0), NULL
,
4663 BASE_REG_CLASS
, GET_MODE (x
), VOIDmode
, 0, 0,
4664 opnum
, (enum reload_type
) type
);
4668 /* We wish to handle large displacements off a base register by splitting
4669 the addend across an add and the mem insn. This can cut the number of
4670 extra insns needed from 3 to 1. It is only useful for load/store of a
4671 single register with 12 bit offset field. */
4672 if (GET_CODE (x
) == PLUS
4673 && REG_P (XEXP (x
, 0))
4674 && CONST_INT_P (XEXP (x
, 1))
4675 && HARD_REGISTER_P (XEXP (x
, 0))
4678 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x
, 0)), true))
4680 HOST_WIDE_INT val
= INTVAL (XEXP (x
, 1));
4681 HOST_WIDE_INT low
= val
& 0xfff;
4682 HOST_WIDE_INT high
= val
- low
;
4685 machine_mode xmode
= GET_MODE (x
);
4687 /* In ILP32, xmode can be either DImode or SImode. */
4688 gcc_assert (xmode
== DImode
|| xmode
== SImode
);
4690 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4691 BLKmode alignment. */
4692 if (GET_MODE_SIZE (mode
) == 0)
4695 offs
= low
% GET_MODE_SIZE (mode
);
4697 /* Align misaligned offset by adjusting high part to compensate. */
4700 if (aarch64_uimm12_shift (high
+ offs
))
4709 offs
= GET_MODE_SIZE (mode
) - offs
;
4711 high
= high
+ (low
& 0x1000) - offs
;
4716 /* Check for overflow. */
4717 if (high
+ low
!= val
)
4720 cst
= GEN_INT (high
);
4721 if (!aarch64_uimm12_shift (high
))
4722 cst
= force_const_mem (xmode
, cst
);
4724 /* Reload high part into base reg, leaving the low part
4725 in the mem instruction.
4726 Note that replacing this gen_rtx_PLUS with plus_constant is
4727 wrong in this case because we rely on the
4728 (plus (plus reg c1) c2) structure being preserved so that
4729 XEXP (*p, 0) in push_reload below uses the correct term. */
4730 x
= gen_rtx_PLUS (xmode
,
4731 gen_rtx_PLUS (xmode
, XEXP (x
, 0), cst
),
4734 push_reload (XEXP (x
, 0), NULL_RTX
, &XEXP (x
, 0), NULL
,
4735 BASE_REG_CLASS
, xmode
, VOIDmode
, 0, 0,
4736 opnum
, (enum reload_type
) type
);
4745 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED
, rtx x
,
4748 secondary_reload_info
*sri
)
4750 /* Without the TARGET_SIMD instructions we cannot move a Q register
4751 to a Q register directly. We need a scratch. */
4752 if (REG_P (x
) && (mode
== TFmode
|| mode
== TImode
) && mode
== GET_MODE (x
)
4753 && FP_REGNUM_P (REGNO (x
)) && !TARGET_SIMD
4754 && reg_class_subset_p (rclass
, FP_REGS
))
4757 sri
->icode
= CODE_FOR_aarch64_reload_movtf
;
4758 else if (mode
== TImode
)
4759 sri
->icode
= CODE_FOR_aarch64_reload_movti
;
4763 /* A TFmode or TImode memory access should be handled via an FP_REGS
4764 because AArch64 has richer addressing modes for LDR/STR instructions
4765 than LDP/STP instructions. */
4766 if (!TARGET_GENERAL_REGS_ONLY
&& rclass
== GENERAL_REGS
4767 && GET_MODE_SIZE (mode
) == 16 && MEM_P (x
))
4770 if (rclass
== FP_REGS
&& (mode
== TImode
|| mode
== TFmode
) && CONSTANT_P(x
))
4771 return GENERAL_REGS
;
4777 aarch64_can_eliminate (const int from
, const int to
)
4779 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4780 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4782 if (frame_pointer_needed
)
4784 if (from
== ARG_POINTER_REGNUM
&& to
== HARD_FRAME_POINTER_REGNUM
)
4786 if (from
== ARG_POINTER_REGNUM
&& to
== STACK_POINTER_REGNUM
)
4788 if (from
== FRAME_POINTER_REGNUM
&& to
== STACK_POINTER_REGNUM
4789 && !cfun
->calls_alloca
)
4791 if (from
== FRAME_POINTER_REGNUM
&& to
== HARD_FRAME_POINTER_REGNUM
)
4798 /* If we decided that we didn't need a leaf frame pointer but then used
4799 LR in the function, then we'll want a frame pointer after all, so
4800 prevent this elimination to ensure a frame pointer is used. */
4801 if (to
== STACK_POINTER_REGNUM
4802 && flag_omit_leaf_frame_pointer
4803 && df_regs_ever_live_p (LR_REGNUM
))
4811 aarch64_initial_elimination_offset (unsigned from
, unsigned to
)
4813 aarch64_layout_frame ();
4815 if (to
== HARD_FRAME_POINTER_REGNUM
)
4817 if (from
== ARG_POINTER_REGNUM
)
4818 return cfun
->machine
->frame
.frame_size
- crtl
->outgoing_args_size
;
4820 if (from
== FRAME_POINTER_REGNUM
)
4821 return (cfun
->machine
->frame
.hard_fp_offset
4822 - cfun
->machine
->frame
.saved_varargs_size
);
4825 if (to
== STACK_POINTER_REGNUM
)
4827 if (from
== FRAME_POINTER_REGNUM
)
4828 return (cfun
->machine
->frame
.frame_size
4829 - cfun
->machine
->frame
.saved_varargs_size
);
4832 return cfun
->machine
->frame
.frame_size
;
4835 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4839 aarch64_return_addr (int count
, rtx frame ATTRIBUTE_UNUSED
)
4843 return get_hard_reg_initial_val (Pmode
, LR_REGNUM
);
4848 aarch64_asm_trampoline_template (FILE *f
)
4852 asm_fprintf (f
, "\tldr\tw%d, .+16\n", IP1_REGNUM
- R0_REGNUM
);
4853 asm_fprintf (f
, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM
- R0_REGNUM
);
4857 asm_fprintf (f
, "\tldr\t%s, .+16\n", reg_names
[IP1_REGNUM
]);
4858 asm_fprintf (f
, "\tldr\t%s, .+20\n", reg_names
[STATIC_CHAIN_REGNUM
]);
4860 asm_fprintf (f
, "\tbr\t%s\n", reg_names
[IP1_REGNUM
]);
4861 assemble_aligned_integer (4, const0_rtx
);
4862 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
4863 assemble_aligned_integer (POINTER_BYTES
, const0_rtx
);
4867 aarch64_trampoline_init (rtx m_tramp
, tree fndecl
, rtx chain_value
)
4869 rtx fnaddr
, mem
, a_tramp
;
4870 const int tramp_code_sz
= 16;
4872 /* Don't need to copy the trailing D-words, we fill those in below. */
4873 emit_block_move (m_tramp
, assemble_trampoline_template (),
4874 GEN_INT (tramp_code_sz
), BLOCK_OP_NORMAL
);
4875 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
);
4876 fnaddr
= XEXP (DECL_RTL (fndecl
), 0);
4877 if (GET_MODE (fnaddr
) != ptr_mode
)
4878 fnaddr
= convert_memory_address (ptr_mode
, fnaddr
);
4879 emit_move_insn (mem
, fnaddr
);
4881 mem
= adjust_address (m_tramp
, ptr_mode
, tramp_code_sz
+ POINTER_BYTES
);
4882 emit_move_insn (mem
, chain_value
);
4884 /* XXX We should really define a "clear_cache" pattern and use
4885 gen_clear_cache(). */
4886 a_tramp
= XEXP (m_tramp
, 0);
4887 emit_library_call (gen_rtx_SYMBOL_REF (Pmode
, "__clear_cache"),
4888 LCT_NORMAL
, VOIDmode
, 2, a_tramp
, ptr_mode
,
4889 plus_constant (ptr_mode
, a_tramp
, TRAMPOLINE_SIZE
),
4893 static unsigned char
4894 aarch64_class_max_nregs (reg_class_t regclass
, machine_mode mode
)
4898 case CALLER_SAVE_REGS
:
4905 aarch64_vector_mode_p (mode
) ? (GET_MODE_SIZE (mode
) + 15) / 16 :
4906 (GET_MODE_SIZE (mode
) + 7) / 8;
4920 aarch64_preferred_reload_class (rtx x
, reg_class_t regclass
)
4922 if (regclass
== POINTER_REGS
)
4923 return GENERAL_REGS
;
4925 if (regclass
== STACK_REG
)
4928 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x
)), POINTER_REGS
))
4934 /* If it's an integer immediate that MOVI can't handle, then
4935 FP_REGS is not an option, so we return NO_REGS instead. */
4936 if (CONST_INT_P (x
) && reg_class_subset_p (regclass
, FP_REGS
)
4937 && !aarch64_simd_imm_scalar_p (x
, GET_MODE (x
)))
4940 /* Register eliminiation can result in a request for
4941 SP+constant->FP_REGS. We cannot support such operations which
4942 use SP as source and an FP_REG as destination, so reject out
4944 if (! reg_class_subset_p (regclass
, GENERAL_REGS
) && GET_CODE (x
) == PLUS
)
4946 rtx lhs
= XEXP (x
, 0);
4948 /* Look through a possible SUBREG introduced by ILP32. */
4949 if (GET_CODE (lhs
) == SUBREG
)
4950 lhs
= SUBREG_REG (lhs
);
4952 gcc_assert (REG_P (lhs
));
4953 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs
)),
/* Implement ASM_OUTPUT_LABELREF.  Print NAME to stream F, letting
   asm_fprintf's %U apply the user-label prefix.  */

void
aarch64_asm_output_labelref (FILE* f, const char *name)
{
  asm_fprintf (f, "%U%s", name);
}
4968 aarch64_elf_asm_constructor (rtx symbol
, int priority
)
4970 if (priority
== DEFAULT_INIT_PRIORITY
)
4971 default_ctor_section_asm_out_constructor (symbol
, priority
);
4976 snprintf (buf
, sizeof (buf
), ".init_array.%.5u", priority
);
4977 s
= get_section (buf
, SECTION_WRITE
, NULL
);
4978 switch_to_section (s
);
4979 assemble_align (POINTER_SIZE
);
4980 assemble_aligned_integer (POINTER_BYTES
, symbol
);
4985 aarch64_elf_asm_destructor (rtx symbol
, int priority
)
4987 if (priority
== DEFAULT_INIT_PRIORITY
)
4988 default_dtor_section_asm_out_destructor (symbol
, priority
);
4993 snprintf (buf
, sizeof (buf
), ".fini_array.%.5u", priority
);
4994 s
= get_section (buf
, SECTION_WRITE
, NULL
);
4995 switch_to_section (s
);
4996 assemble_align (POINTER_SIZE
);
4997 assemble_aligned_integer (POINTER_BYTES
, symbol
);
5002 aarch64_output_casesi (rtx
*operands
)
5006 rtx diff_vec
= PATTERN (NEXT_INSN (as_a
<rtx_insn
*> (operands
[2])));
5008 static const char *const patterns
[4][2] =
5011 "ldrb\t%w3, [%0,%w1,uxtw]",
5012 "add\t%3, %4, %w3, sxtb #2"
5015 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5016 "add\t%3, %4, %w3, sxth #2"
5019 "ldr\t%w3, [%0,%w1,uxtw #2]",
5020 "add\t%3, %4, %w3, sxtw #2"
5022 /* We assume that DImode is only generated when not optimizing and
5023 that we don't really need 64-bit address offsets. That would
5024 imply an object file with 8GB of code in a single function! */
5026 "ldr\t%w3, [%0,%w1,uxtw #2]",
5027 "add\t%3, %4, %w3, sxtw #2"
5031 gcc_assert (GET_CODE (diff_vec
) == ADDR_DIFF_VEC
);
5033 index
= exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec
)));
5035 gcc_assert (index
>= 0 && index
<= 3);
5037 /* Need to implement table size reduction, by chaning the code below. */
5038 output_asm_insn (patterns
[index
][0], operands
);
5039 ASM_GENERATE_INTERNAL_LABEL (label
, "Lrtx", CODE_LABEL_NUMBER (operands
[2]));
5040 snprintf (buf
, sizeof (buf
),
5041 "adr\t%%4, %s", targetm
.strip_name_encoding (label
));
5042 output_asm_insn (buf
, operands
);
5043 output_asm_insn (patterns
[index
][1], operands
);
5044 output_asm_insn ("br\t%3", operands
);
5045 assemble_label (asm_out_file
, label
);
5050 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5051 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5055 aarch64_uxt_size (int shift
, HOST_WIDE_INT mask
)
5057 if (shift
>= 0 && shift
<= 3)
5060 for (size
= 8; size
<= 32; size
*= 2)
5062 HOST_WIDE_INT bits
= ((HOST_WIDE_INT
)1U << size
) - 1;
5063 if (mask
== bits
<< shift
)
5071 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED
,
5072 const_rtx x ATTRIBUTE_UNUSED
)
5074 /* We can't use blocks for constants when we're using a per-function
5080 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED
,
5081 rtx x ATTRIBUTE_UNUSED
,
5082 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED
)
5084 /* Force all constant pool entries into the current function section. */
5085 return function_section (current_function_decl
);
5091 /* Helper function for rtx cost calculation. Strip a shift expression
5092 from X. Returns the inner operand if successful, or the original
5093 expression on failure. */
5095 aarch64_strip_shift (rtx x
)
5099 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5100 we can convert both to ROR during final output. */
5101 if ((GET_CODE (op
) == ASHIFT
5102 || GET_CODE (op
) == ASHIFTRT
5103 || GET_CODE (op
) == LSHIFTRT
5104 || GET_CODE (op
) == ROTATERT
5105 || GET_CODE (op
) == ROTATE
)
5106 && CONST_INT_P (XEXP (op
, 1)))
5107 return XEXP (op
, 0);
5109 if (GET_CODE (op
) == MULT
5110 && CONST_INT_P (XEXP (op
, 1))
5111 && ((unsigned) exact_log2 (INTVAL (XEXP (op
, 1)))) < 64)
5112 return XEXP (op
, 0);
5117 /* Helper function for rtx cost calculation. Strip an extend
5118 expression from X. Returns the inner operand if successful, or the
5119 original expression on failure. We deal with a number of possible
5120 canonicalization variations here. */
5122 aarch64_strip_extend (rtx x
)
5126 /* Zero and sign extraction of a widened value. */
5127 if ((GET_CODE (op
) == ZERO_EXTRACT
|| GET_CODE (op
) == SIGN_EXTRACT
)
5128 && XEXP (op
, 2) == const0_rtx
5129 && GET_CODE (XEXP (op
, 0)) == MULT
5130 && aarch64_is_extend_from_extract (GET_MODE (op
), XEXP (XEXP (op
, 0), 1),
5132 return XEXP (XEXP (op
, 0), 0);
5134 /* It can also be represented (for zero-extend) as an AND with an
5136 if (GET_CODE (op
) == AND
5137 && GET_CODE (XEXP (op
, 0)) == MULT
5138 && CONST_INT_P (XEXP (XEXP (op
, 0), 1))
5139 && CONST_INT_P (XEXP (op
, 1))
5140 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op
, 0), 1))),
5141 INTVAL (XEXP (op
, 1))) != 0)
5142 return XEXP (XEXP (op
, 0), 0);
5144 /* Now handle extended register, as this may also have an optional
5145 left shift by 1..4. */
5146 if (GET_CODE (op
) == ASHIFT
5147 && CONST_INT_P (XEXP (op
, 1))
5148 && ((unsigned HOST_WIDE_INT
) INTVAL (XEXP (op
, 1))) <= 4)
5151 if (GET_CODE (op
) == ZERO_EXTEND
5152 || GET_CODE (op
) == SIGN_EXTEND
)
5161 /* Helper function for rtx cost calculation. Calculate the cost of
5162 a MULT, which may be part of a multiply-accumulate rtx. Return
5163 the calculated cost of the expression, recursing manually in to
5164 operands where needed. */
5167 aarch64_rtx_mult_cost (rtx x
, int code
, int outer
, bool speed
)
5170 const struct cpu_cost_table
*extra_cost
5171 = aarch64_tune_params
->insn_extra_cost
;
5173 bool maybe_fma
= (outer
== PLUS
|| outer
== MINUS
);
5174 machine_mode mode
= GET_MODE (x
);
5176 gcc_checking_assert (code
== MULT
);
5181 if (VECTOR_MODE_P (mode
))
5182 mode
= GET_MODE_INNER (mode
);
5184 /* Integer multiply/fma. */
5185 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5187 /* The multiply will be canonicalized as a shift, cost it as such. */
5188 if (CONST_INT_P (op1
)
5189 && exact_log2 (INTVAL (op1
)) > 0)
5194 /* ADD (shifted register). */
5195 cost
+= extra_cost
->alu
.arith_shift
;
5197 /* LSL (immediate). */
5198 cost
+= extra_cost
->alu
.shift
;
5201 cost
+= rtx_cost (op0
, GET_CODE (op0
), 0, speed
);
5206 /* Integer multiplies or FMAs have zero/sign extending variants. */
5207 if ((GET_CODE (op0
) == ZERO_EXTEND
5208 && GET_CODE (op1
) == ZERO_EXTEND
)
5209 || (GET_CODE (op0
) == SIGN_EXTEND
5210 && GET_CODE (op1
) == SIGN_EXTEND
))
5212 cost
+= rtx_cost (XEXP (op0
, 0), MULT
, 0, speed
)
5213 + rtx_cost (XEXP (op1
, 0), MULT
, 1, speed
);
5218 /* MADD/SMADDL/UMADDL. */
5219 cost
+= extra_cost
->mult
[0].extend_add
;
5221 /* MUL/SMULL/UMULL. */
5222 cost
+= extra_cost
->mult
[0].extend
;
5228 /* This is either an integer multiply or an FMA. In both cases
5229 we want to recurse and cost the operands. */
5230 cost
+= rtx_cost (op0
, MULT
, 0, speed
)
5231 + rtx_cost (op1
, MULT
, 1, speed
);
5237 cost
+= extra_cost
->mult
[mode
== DImode
].add
;
5240 cost
+= extra_cost
->mult
[mode
== DImode
].simple
;
5249 /* Floating-point FMA/FMUL can also support negations of the
5251 if (GET_CODE (op0
) == NEG
)
5252 op0
= XEXP (op0
, 0);
5253 if (GET_CODE (op1
) == NEG
)
5254 op1
= XEXP (op1
, 0);
5257 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5258 cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
5261 cost
+= extra_cost
->fp
[mode
== DFmode
].mult
;
5264 cost
+= rtx_cost (op0
, MULT
, 0, speed
)
5265 + rtx_cost (op1
, MULT
, 1, speed
);
5271 aarch64_address_cost (rtx x
,
5273 addr_space_t as ATTRIBUTE_UNUSED
,
5276 enum rtx_code c
= GET_CODE (x
);
5277 const struct cpu_addrcost_table
*addr_cost
= aarch64_tune_params
->addr_cost
;
5278 struct aarch64_address_info info
;
5282 if (!aarch64_classify_address (&info
, x
, mode
, c
, false))
5284 if (GET_CODE (x
) == CONST
|| GET_CODE (x
) == SYMBOL_REF
)
5286 /* This is a CONST or SYMBOL ref which will be split
5287 in a different way depending on the code model in use.
5288 Cost it through the generic infrastructure. */
5289 int cost_symbol_ref
= rtx_cost (x
, MEM
, 1, speed
);
5290 /* Divide through by the cost of one instruction to
5291 bring it to the same units as the address costs. */
5292 cost_symbol_ref
/= COSTS_N_INSNS (1);
5293 /* The cost is then the cost of preparing the address,
5294 followed by an immediate (possibly 0) offset. */
5295 return cost_symbol_ref
+ addr_cost
->imm_offset
;
5299 /* This is most likely a jump table from a case
5301 return addr_cost
->register_offset
;
5307 case ADDRESS_LO_SUM
:
5308 case ADDRESS_SYMBOLIC
:
5309 case ADDRESS_REG_IMM
:
5310 cost
+= addr_cost
->imm_offset
;
5313 case ADDRESS_REG_WB
:
5314 if (c
== PRE_INC
|| c
== PRE_DEC
|| c
== PRE_MODIFY
)
5315 cost
+= addr_cost
->pre_modify
;
5316 else if (c
== POST_INC
|| c
== POST_DEC
|| c
== POST_MODIFY
)
5317 cost
+= addr_cost
->post_modify
;
5323 case ADDRESS_REG_REG
:
5324 cost
+= addr_cost
->register_offset
;
5327 case ADDRESS_REG_UXTW
:
5328 case ADDRESS_REG_SXTW
:
5329 cost
+= addr_cost
->register_extend
;
5339 /* For the sake of calculating the cost of the shifted register
5340 component, we can treat same sized modes in the same way. */
5341 switch (GET_MODE_BITSIZE (mode
))
5344 cost
+= addr_cost
->addr_scale_costs
.hi
;
5348 cost
+= addr_cost
->addr_scale_costs
.si
;
5352 cost
+= addr_cost
->addr_scale_costs
.di
;
5355 /* We can't tell, or this is a 128-bit vector. */
5357 cost
+= addr_cost
->addr_scale_costs
.ti
;
5365 /* Return true if the RTX X in mode MODE is a zero or sign extract
5366 usable in an ADD or SUB (extended register) instruction. */
5368 aarch64_rtx_arith_op_extract_p (rtx x
, machine_mode mode
)
5370 /* Catch add with a sign extract.
5371 This is add_<optab><mode>_multp2. */
5372 if (GET_CODE (x
) == SIGN_EXTRACT
5373 || GET_CODE (x
) == ZERO_EXTRACT
)
5375 rtx op0
= XEXP (x
, 0);
5376 rtx op1
= XEXP (x
, 1);
5377 rtx op2
= XEXP (x
, 2);
5379 if (GET_CODE (op0
) == MULT
5380 && CONST_INT_P (op1
)
5381 && op2
== const0_rtx
5382 && CONST_INT_P (XEXP (op0
, 1))
5383 && aarch64_is_extend_from_extract (mode
,
5395 aarch64_frint_unspec_p (unsigned int u
)
5413 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5414 storing it in *COST. Result is true if the total cost of the operation
5415 has now been calculated. */
5417 aarch64_if_then_else_costs (rtx op0
, rtx op1
, rtx op2
, int *cost
, bool speed
)
5421 enum rtx_code cmpcode
;
5423 if (COMPARISON_P (op0
))
5425 inner
= XEXP (op0
, 0);
5426 comparator
= XEXP (op0
, 1);
5427 cmpcode
= GET_CODE (op0
);
5432 comparator
= const0_rtx
;
5436 if (GET_CODE (op1
) == PC
|| GET_CODE (op2
) == PC
)
5438 /* Conditional branch. */
5439 if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
5443 if (cmpcode
== NE
|| cmpcode
== EQ
)
5445 if (comparator
== const0_rtx
)
5447 /* TBZ/TBNZ/CBZ/CBNZ. */
5448 if (GET_CODE (inner
) == ZERO_EXTRACT
)
5450 *cost
+= rtx_cost (XEXP (inner
, 0), ZERO_EXTRACT
,
5454 *cost
+= rtx_cost (inner
, cmpcode
, 0, speed
);
5459 else if (cmpcode
== LT
|| cmpcode
== GE
)
5462 if (comparator
== const0_rtx
)
5467 else if (GET_MODE_CLASS (GET_MODE (inner
)) == MODE_CC
)
5469 /* It's a conditional operation based on the status flags,
5470 so it must be some flavor of CSEL. */
5472 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5473 if (GET_CODE (op1
) == NEG
5474 || GET_CODE (op1
) == NOT
5475 || (GET_CODE (op1
) == PLUS
&& XEXP (op1
, 1) == const1_rtx
))
5476 op1
= XEXP (op1
, 0);
5478 *cost
+= rtx_cost (op1
, IF_THEN_ELSE
, 1, speed
);
5479 *cost
+= rtx_cost (op2
, IF_THEN_ELSE
, 2, speed
);
5483 /* We don't know what this is, cost all operands. */
5487 /* Calculate the cost of calculating X, storing it in *COST. Result
5488 is true if the total cost of the operation has now been calculated. */
5490 aarch64_rtx_costs (rtx x
, int code
, int outer ATTRIBUTE_UNUSED
,
5491 int param ATTRIBUTE_UNUSED
, int *cost
, bool speed
)
5494 const struct cpu_cost_table
*extra_cost
5495 = aarch64_tune_params
->insn_extra_cost
;
5496 machine_mode mode
= GET_MODE (x
);
5498 /* By default, assume that everything has equivalent cost to the
5499 cheapest instruction. Any additional costs are applied as a delta
5500 above this default. */
5501 *cost
= COSTS_N_INSNS (1);
5503 /* TODO: The cost infrastructure currently does not handle
5504 vector operations. Assume that all vector operations
5505 are equally expensive. */
5506 if (VECTOR_MODE_P (mode
))
5509 *cost
+= extra_cost
->vect
.alu
;
5516 /* The cost depends entirely on the operands to SET. */
5521 switch (GET_CODE (op0
))
5526 rtx address
= XEXP (op0
, 0);
5527 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5528 *cost
+= extra_cost
->ldst
.store
;
5529 else if (mode
== SFmode
)
5530 *cost
+= extra_cost
->ldst
.storef
;
5531 else if (mode
== DFmode
)
5532 *cost
+= extra_cost
->ldst
.stored
;
5535 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
5539 *cost
+= rtx_cost (op1
, SET
, 1, speed
);
5543 if (! REG_P (SUBREG_REG (op0
)))
5544 *cost
+= rtx_cost (SUBREG_REG (op0
), SET
, 0, speed
);
5548 /* const0_rtx is in general free, but we will use an
5549 instruction to set a register to 0. */
5550 if (REG_P (op1
) || op1
== const0_rtx
)
5552 /* The cost is 1 per register copied. */
5553 int n_minus_1
= (GET_MODE_SIZE (GET_MODE (op0
)) - 1)
5555 *cost
= COSTS_N_INSNS (n_minus_1
+ 1);
5558 /* Cost is just the cost of the RHS of the set. */
5559 *cost
+= rtx_cost (op1
, SET
, 1, speed
);
5564 /* Bit-field insertion. Strip any redundant widening of
5565 the RHS to meet the width of the target. */
5566 if (GET_CODE (op1
) == SUBREG
)
5567 op1
= SUBREG_REG (op1
);
5568 if ((GET_CODE (op1
) == ZERO_EXTEND
5569 || GET_CODE (op1
) == SIGN_EXTEND
)
5570 && CONST_INT_P (XEXP (op0
, 1))
5571 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1
, 0)))
5572 >= INTVAL (XEXP (op0
, 1))))
5573 op1
= XEXP (op1
, 0);
5575 if (CONST_INT_P (op1
))
5577 /* MOV immediate is assumed to always be cheap. */
5578 *cost
= COSTS_N_INSNS (1);
5584 *cost
+= extra_cost
->alu
.bfi
;
5585 *cost
+= rtx_cost (op1
, (enum rtx_code
) code
, 1, speed
);
5591 /* We can't make sense of this, assume default cost. */
5592 *cost
= COSTS_N_INSNS (1);
5598 /* If an instruction can incorporate a constant within the
5599 instruction, the instruction's expression avoids calling
5600 rtx_cost() on the constant. If rtx_cost() is called on a
5601 constant, then it is usually because the constant must be
5602 moved into a register by one or more instructions.
5604 The exception is constant 0, which can be expressed
5605 as XZR/WZR and is therefore free. The exception to this is
5606 if we have (set (reg) (const0_rtx)) in which case we must cost
5607 the move. However, we can catch that when we cost the SET, so
5608 we don't need to consider that here. */
5609 if (x
== const0_rtx
)
5613 /* To an approximation, building any other constant is
5614 proportionally expensive to the number of instructions
5615 required to build that constant. This is true whether we
5616 are compiling for SPEED or otherwise. */
5617 *cost
= COSTS_N_INSNS (aarch64_internal_mov_immediate
5618 (NULL_RTX
, x
, false, mode
));
5625 /* mov[df,sf]_aarch64. */
5626 if (aarch64_float_const_representable_p (x
))
5627 /* FMOV (scalar immediate). */
5628 *cost
+= extra_cost
->fp
[mode
== DFmode
].fpconst
;
5629 else if (!aarch64_float_const_zero_rtx_p (x
))
5631 /* This will be a load from memory. */
5633 *cost
+= extra_cost
->ldst
.loadd
;
5635 *cost
+= extra_cost
->ldst
.loadf
;
5638 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5639 or MOV v0.s[0], wzr - neither of which are modeled by the
5640 cost tables. Just use the default cost. */
5650 /* For loads we want the base cost of a load, plus an
5651 approximation for the additional cost of the addressing
5653 rtx address
= XEXP (x
, 0);
5654 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5655 *cost
+= extra_cost
->ldst
.load
;
5656 else if (mode
== SFmode
)
5657 *cost
+= extra_cost
->ldst
.loadf
;
5658 else if (mode
== DFmode
)
5659 *cost
+= extra_cost
->ldst
.loadd
;
5662 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
5671 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_INT
)
5673 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
5674 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
5677 *cost
+= rtx_cost (XEXP (op0
, 0), NEG
, 0, speed
);
5681 /* Cost this as SUB wzr, X. */
5682 op0
= CONST0_RTX (GET_MODE (x
));
5687 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_FLOAT
)
5689 /* Support (neg(fma...)) as a single instruction only if
5690 sign of zeros is unimportant. This matches the decision
5691 making in aarch64.md. */
5692 if (GET_CODE (op0
) == FMA
&& !HONOR_SIGNED_ZEROS (GET_MODE (op0
)))
5695 *cost
= rtx_cost (op0
, NEG
, 0, speed
);
5700 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
5709 *cost
+= extra_cost
->alu
.clz
;
5717 if (op1
== const0_rtx
5718 && GET_CODE (op0
) == AND
)
5724 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
)
5726 /* TODO: A write to the CC flags possibly costs extra, this
5727 needs encoding in the cost tables. */
5729 /* CC_ZESWPmode supports zero extend for free. */
5730 if (GET_MODE (x
) == CC_ZESWPmode
&& GET_CODE (op0
) == ZERO_EXTEND
)
5731 op0
= XEXP (op0
, 0);
5734 if (GET_CODE (op0
) == AND
)
5740 if (GET_CODE (op0
) == PLUS
)
5742 /* ADDS (and CMN alias). */
5747 if (GET_CODE (op0
) == MINUS
)
5754 if (GET_CODE (op1
) == NEG
)
5758 *cost
+= extra_cost
->alu
.arith
;
5760 *cost
+= rtx_cost (op0
, COMPARE
, 0, speed
);
5761 *cost
+= rtx_cost (XEXP (op1
, 0), NEG
, 1, speed
);
5767 Compare can freely swap the order of operands, and
5768 canonicalization puts the more complex operation first.
5769 But the integer MINUS logic expects the shift/extend
5770 operation in op1. */
5772 || (GET_CODE (op0
) == SUBREG
&& REG_P (SUBREG_REG (op0
)))))
5780 if (GET_MODE_CLASS (GET_MODE (op0
)) == MODE_FLOAT
)
5784 *cost
+= extra_cost
->fp
[mode
== DFmode
].compare
;
5786 if (CONST_DOUBLE_P (op1
) && aarch64_float_const_zero_rtx_p (op1
))
5788 /* FCMP supports constant 0.0 for no extra cost. */
5802 /* Detect valid immediates. */
5803 if ((GET_MODE_CLASS (mode
) == MODE_INT
5804 || (GET_MODE_CLASS (mode
) == MODE_CC
5805 && GET_MODE_CLASS (GET_MODE (op0
)) == MODE_INT
))
5806 && CONST_INT_P (op1
)
5807 && aarch64_uimm12_shift (INTVAL (op1
)))
5809 *cost
+= rtx_cost (op0
, MINUS
, 0, speed
);
5812 /* SUB(S) (immediate). */
5813 *cost
+= extra_cost
->alu
.arith
;
5818 /* Look for SUB (extended register). */
5819 if (aarch64_rtx_arith_op_extract_p (op1
, mode
))
5822 *cost
+= extra_cost
->alu
.extend_arith
;
5824 *cost
+= rtx_cost (XEXP (XEXP (op1
, 0), 0),
5825 (enum rtx_code
) GET_CODE (op1
),
5830 rtx new_op1
= aarch64_strip_extend (op1
);
5832 /* Cost this as an FMA-alike operation. */
5833 if ((GET_CODE (new_op1
) == MULT
5834 || GET_CODE (new_op1
) == ASHIFT
)
5837 *cost
+= aarch64_rtx_mult_cost (new_op1
, MULT
,
5838 (enum rtx_code
) code
,
5840 *cost
+= rtx_cost (op0
, MINUS
, 0, speed
);
5844 *cost
+= rtx_cost (new_op1
, MINUS
, 1, speed
);
5848 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5850 *cost
+= extra_cost
->alu
.arith
;
5851 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
5853 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
5866 if (GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMPARE
5867 || GET_RTX_CLASS (GET_CODE (op0
)) == RTX_COMM_COMPARE
)
5870 *cost
+= rtx_cost (XEXP (op0
, 0), PLUS
, 0, speed
);
5871 *cost
+= rtx_cost (op1
, PLUS
, 1, speed
);
5875 if (GET_MODE_CLASS (mode
) == MODE_INT
5876 && CONST_INT_P (op1
)
5877 && aarch64_uimm12_shift (INTVAL (op1
)))
5879 *cost
+= rtx_cost (op0
, PLUS
, 0, speed
);
5882 /* ADD (immediate). */
5883 *cost
+= extra_cost
->alu
.arith
;
5887 /* Look for ADD (extended register). */
5888 if (aarch64_rtx_arith_op_extract_p (op0
, mode
))
5891 *cost
+= extra_cost
->alu
.extend_arith
;
5893 *cost
+= rtx_cost (XEXP (XEXP (op0
, 0), 0),
5894 (enum rtx_code
) GET_CODE (op0
),
5899 /* Strip any extend, leave shifts behind as we will
5900 cost them through mult_cost. */
5901 new_op0
= aarch64_strip_extend (op0
);
5903 if (GET_CODE (new_op0
) == MULT
5904 || GET_CODE (new_op0
) == ASHIFT
)
5906 *cost
+= aarch64_rtx_mult_cost (new_op0
, MULT
, PLUS
,
5908 *cost
+= rtx_cost (op1
, PLUS
, 1, speed
);
5912 *cost
+= (rtx_cost (new_op0
, PLUS
, 0, speed
)
5913 + rtx_cost (op1
, PLUS
, 1, speed
));
5917 if (GET_MODE_CLASS (mode
) == MODE_INT
)
5919 *cost
+= extra_cost
->alu
.arith
;
5920 else if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
5922 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
5928 *cost
= COSTS_N_INSNS (1);
5931 *cost
+= extra_cost
->alu
.rev
;
5936 if (aarch_rev16_p (x
))
5938 *cost
= COSTS_N_INSNS (1);
5941 *cost
+= extra_cost
->alu
.rev
;
5953 && GET_CODE (op0
) == MULT
5954 && CONST_INT_P (XEXP (op0
, 1))
5955 && CONST_INT_P (op1
)
5956 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0
, 1))),
5959 /* This is a UBFM/SBFM. */
5960 *cost
+= rtx_cost (XEXP (op0
, 0), ZERO_EXTRACT
, 0, speed
);
5962 *cost
+= extra_cost
->alu
.bfx
;
5966 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_INT
)
5968 /* We possibly get the immediate for free, this is not
5970 if (CONST_INT_P (op1
)
5971 && aarch64_bitmask_imm (INTVAL (op1
), GET_MODE (x
)))
5973 *cost
+= rtx_cost (op0
, (enum rtx_code
) code
, 0, speed
);
5976 *cost
+= extra_cost
->alu
.logical
;
5984 /* Handle ORN, EON, or BIC. */
5985 if (GET_CODE (op0
) == NOT
)
5986 op0
= XEXP (op0
, 0);
5988 new_op0
= aarch64_strip_shift (op0
);
5990 /* If we had a shift on op0 then this is a logical-shift-
5991 by-register/immediate operation. Otherwise, this is just
5992 a logical operation. */
5997 /* Shift by immediate. */
5998 if (CONST_INT_P (XEXP (op0
, 1)))
5999 *cost
+= extra_cost
->alu
.log_shift
;
6001 *cost
+= extra_cost
->alu
.log_shift_reg
;
6004 *cost
+= extra_cost
->alu
.logical
;
6007 /* In both cases we want to cost both operands. */
6008 *cost
+= rtx_cost (new_op0
, (enum rtx_code
) code
, 0, speed
)
6009 + rtx_cost (op1
, (enum rtx_code
) code
, 1, speed
);
6019 *cost
+= extra_cost
->alu
.logical
;
6021 /* The logical instruction could have the shifted register form,
6022 but the cost is the same if the shift is processed as a separate
6023 instruction, so we don't bother with it here. */
6029 /* If a value is written in SI mode, then zero extended to DI
6030 mode, the operation will in general be free as a write to
6031 a 'w' register implicitly zeroes the upper bits of an 'x'
6032 register. However, if this is
6034 (set (reg) (zero_extend (reg)))
6036 we must cost the explicit register move. */
6038 && GET_MODE (op0
) == SImode
6041 int op_cost
= rtx_cost (XEXP (x
, 0), ZERO_EXTEND
, 0, speed
);
6043 if (!op_cost
&& speed
)
6045 *cost
+= extra_cost
->alu
.extend
;
6047 /* Free, the cost is that of the SI mode operation. */
6052 else if (MEM_P (XEXP (x
, 0)))
6054 /* All loads can zero extend to any size for free. */
6055 *cost
= rtx_cost (XEXP (x
, 0), ZERO_EXTEND
, param
, speed
);
6061 *cost
+= extra_cost
->alu
.extend
;
6066 if (MEM_P (XEXP (x
, 0)))
6071 rtx address
= XEXP (XEXP (x
, 0), 0);
6072 *cost
+= extra_cost
->ldst
.load_sign_extend
;
6075 COSTS_N_INSNS (aarch64_address_cost (address
, mode
,
6082 *cost
+= extra_cost
->alu
.extend
;
6089 if (CONST_INT_P (op1
))
6091 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
6094 *cost
+= extra_cost
->alu
.shift
;
6096 /* We can incorporate zero/sign extend for free. */
6097 if (GET_CODE (op0
) == ZERO_EXTEND
6098 || GET_CODE (op0
) == SIGN_EXTEND
)
6099 op0
= XEXP (op0
, 0);
6101 *cost
+= rtx_cost (op0
, ASHIFT
, 0, speed
);
6108 *cost
+= extra_cost
->alu
.shift_reg
;
6110 return false; /* All arguments need to be in registers. */
6120 if (CONST_INT_P (op1
))
6122 /* ASR (immediate) and friends. */
6124 *cost
+= extra_cost
->alu
.shift
;
6126 *cost
+= rtx_cost (op0
, (enum rtx_code
) code
, 0, speed
);
6132 /* ASR (register) and friends. */
6134 *cost
+= extra_cost
->alu
.shift_reg
;
6136 return false; /* All arguments need to be in registers. */
6141 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
)
6145 *cost
+= extra_cost
->ldst
.load
;
6147 else if (aarch64_cmodel
== AARCH64_CMODEL_SMALL
6148 || aarch64_cmodel
== AARCH64_CMODEL_SMALL_PIC
)
6150 /* ADRP, followed by ADD. */
6151 *cost
+= COSTS_N_INSNS (1);
6153 *cost
+= 2 * extra_cost
->alu
.arith
;
6155 else if (aarch64_cmodel
== AARCH64_CMODEL_TINY
6156 || aarch64_cmodel
== AARCH64_CMODEL_TINY_PIC
)
6160 *cost
+= extra_cost
->alu
.arith
;
6165 /* One extra load instruction, after accessing the GOT. */
6166 *cost
+= COSTS_N_INSNS (1);
6168 *cost
+= extra_cost
->ldst
.load
;
6174 /* ADRP/ADD (immediate). */
6176 *cost
+= extra_cost
->alu
.arith
;
6183 *cost
+= extra_cost
->alu
.bfx
;
6185 /* We can trust that the immediates used will be correct (there
6186 are no by-register forms), so we need only cost op0. */
6187 *cost
+= rtx_cost (XEXP (x
, 0), (enum rtx_code
) code
, 0, speed
);
6191 *cost
+= aarch64_rtx_mult_cost (x
, MULT
, 0, speed
);
6192 /* aarch64_rtx_mult_cost always handles recursion to its
6200 if (GET_MODE_CLASS (GET_MODE (x
)) == MODE_INT
)
6201 *cost
+= (extra_cost
->mult
[GET_MODE (x
) == DImode
].add
6202 + extra_cost
->mult
[GET_MODE (x
) == DImode
].idiv
);
6203 else if (GET_MODE (x
) == DFmode
)
6204 *cost
+= (extra_cost
->fp
[1].mult
6205 + extra_cost
->fp
[1].div
);
6206 else if (GET_MODE (x
) == SFmode
)
6207 *cost
+= (extra_cost
->fp
[0].mult
6208 + extra_cost
->fp
[0].div
);
6210 return false; /* All arguments need to be in registers. */
6217 if (GET_MODE_CLASS (mode
) == MODE_INT
)
6218 /* There is no integer SQRT, so only DIV and UDIV can get
6220 *cost
+= extra_cost
->mult
[mode
== DImode
].idiv
;
6222 *cost
+= extra_cost
->fp
[mode
== DFmode
].div
;
6224 return false; /* All arguments need to be in registers. */
6227 return aarch64_if_then_else_costs (XEXP (x
, 0), XEXP (x
, 1),
6228 XEXP (x
, 2), cost
, speed
);
6241 return false; /* All arguments must be in registers. */
6249 *cost
+= extra_cost
->fp
[mode
== DFmode
].fma
;
6251 /* FMSUB, FNMADD, and FNMSUB are free. */
6252 if (GET_CODE (op0
) == NEG
)
6253 op0
= XEXP (op0
, 0);
6255 if (GET_CODE (op2
) == NEG
)
6256 op2
= XEXP (op2
, 0);
6258 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6259 and the by-element operand as operand 0. */
6260 if (GET_CODE (op1
) == NEG
)
6261 op1
= XEXP (op1
, 0);
6263 /* Catch vector-by-element operations. The by-element operand can
6264 either be (vec_duplicate (vec_select (x))) or just
6265 (vec_select (x)), depending on whether we are multiplying by
6266 a vector or a scalar.
6268 Canonicalization is not very good in these cases, FMA4 will put the
6269 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6270 if (GET_CODE (op0
) == VEC_DUPLICATE
)
6271 op0
= XEXP (op0
, 0);
6272 else if (GET_CODE (op1
) == VEC_DUPLICATE
)
6273 op1
= XEXP (op1
, 0);
6275 if (GET_CODE (op0
) == VEC_SELECT
)
6276 op0
= XEXP (op0
, 0);
6277 else if (GET_CODE (op1
) == VEC_SELECT
)
6278 op1
= XEXP (op1
, 0);
6280 /* If the remaining parameters are not registers,
6281 get the cost to put them into registers. */
6282 *cost
+= rtx_cost (op0
, FMA
, 0, speed
);
6283 *cost
+= rtx_cost (op1
, FMA
, 1, speed
);
6284 *cost
+= rtx_cost (op2
, FMA
, 2, speed
);
6289 *cost
+= extra_cost
->fp
[mode
== DFmode
].widen
;
6292 case FLOAT_TRUNCATE
:
6294 *cost
+= extra_cost
->fp
[mode
== DFmode
].narrow
;
6300 /* Strip the rounding part. They will all be implemented
6301 by the fcvt* family of instructions anyway. */
6302 if (GET_CODE (x
) == UNSPEC
)
6304 unsigned int uns_code
= XINT (x
, 1);
6306 if (uns_code
== UNSPEC_FRINTA
6307 || uns_code
== UNSPEC_FRINTM
6308 || uns_code
== UNSPEC_FRINTN
6309 || uns_code
== UNSPEC_FRINTP
6310 || uns_code
== UNSPEC_FRINTZ
)
6311 x
= XVECEXP (x
, 0, 0);
6315 *cost
+= extra_cost
->fp
[GET_MODE (x
) == DFmode
].toint
;
6317 *cost
+= rtx_cost (x
, (enum rtx_code
) code
, 0, speed
);
6321 if (GET_MODE_CLASS (mode
) == MODE_FLOAT
)
6323 /* FABS and FNEG are analogous. */
6325 *cost
+= extra_cost
->fp
[mode
== DFmode
].neg
;
6329 /* Integer ABS will either be split to
6330 two arithmetic instructions, or will be an ABS
6331 (scalar), which we don't model. */
6332 *cost
= COSTS_N_INSNS (2);
6334 *cost
+= 2 * extra_cost
->alu
.arith
;
6342 /* FMAXNM/FMINNM/FMAX/FMIN.
6343 TODO: This may not be accurate for all implementations, but
6344 we do not model this in the cost tables. */
6345 *cost
+= extra_cost
->fp
[mode
== DFmode
].addsub
;
6350 /* The floating point round to integer frint* instructions. */
6351 if (aarch64_frint_unspec_p (XINT (x
, 1)))
6354 *cost
+= extra_cost
->fp
[mode
== DFmode
].roundint
;
6359 if (XINT (x
, 1) == UNSPEC_RBIT
)
6362 *cost
+= extra_cost
->alu
.rev
;
6370 /* Decompose <su>muldi3_highpart. */
6371 if (/* (truncate:DI */
6374 && GET_MODE (XEXP (x
, 0)) == TImode
6375 && GET_CODE (XEXP (x
, 0)) == LSHIFTRT
6377 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == MULT
6378 /* (ANY_EXTEND:TI (reg:DI))
6379 (ANY_EXTEND:TI (reg:DI))) */
6380 && ((GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == ZERO_EXTEND
6381 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == ZERO_EXTEND
)
6382 || (GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 0)) == SIGN_EXTEND
6383 && GET_CODE (XEXP (XEXP (XEXP (x
, 0), 0), 1)) == SIGN_EXTEND
))
6384 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0)) == DImode
6385 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0)) == DImode
6386 /* (const_int 64) */
6387 && CONST_INT_P (XEXP (XEXP (x
, 0), 1))
6388 && UINTVAL (XEXP (XEXP (x
, 0), 1)) == 64)
6392 *cost
+= extra_cost
->mult
[mode
== DImode
].extend
;
6393 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 0), 0),
6395 *cost
+= rtx_cost (XEXP (XEXP (XEXP (XEXP (x
, 0), 0), 1), 0),
6405 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
6407 "\nFailed to cost RTX. Assuming default cost.\n");
6412 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6413 calculated for X. This cost is stored in *COST. Returns true
6414 if the total cost of X was calculated. */
6416 aarch64_rtx_costs_wrapper (rtx x
, int code
, int outer
,
6417 int param
, int *cost
, bool speed
)
6419 bool result
= aarch64_rtx_costs (x
, code
, outer
, param
, cost
, speed
);
6421 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
6423 print_rtl_single (dump_file
, x
);
6424 fprintf (dump_file
, "\n%s cost: %d (%s)\n",
6425 speed
? "Hot" : "Cold",
6426 *cost
, result
? "final" : "partial");
6433 aarch64_register_move_cost (machine_mode mode
,
6434 reg_class_t from_i
, reg_class_t to_i
)
6436 enum reg_class from
= (enum reg_class
) from_i
;
6437 enum reg_class to
= (enum reg_class
) to_i
;
6438 const struct cpu_regmove_cost
*regmove_cost
6439 = aarch64_tune_params
->regmove_cost
;
6441 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6442 if (to
== CALLER_SAVE_REGS
|| to
== POINTER_REGS
)
6445 if (from
== CALLER_SAVE_REGS
|| from
== POINTER_REGS
)
6446 from
= GENERAL_REGS
;
6448 /* Moving between GPR and stack cost is the same as GP2GP. */
6449 if ((from
== GENERAL_REGS
&& to
== STACK_REG
)
6450 || (to
== GENERAL_REGS
&& from
== STACK_REG
))
6451 return regmove_cost
->GP2GP
;
6453 /* To/From the stack register, we move via the gprs. */
6454 if (to
== STACK_REG
|| from
== STACK_REG
)
6455 return aarch64_register_move_cost (mode
, from
, GENERAL_REGS
)
6456 + aarch64_register_move_cost (mode
, GENERAL_REGS
, to
);
6458 if (GET_MODE_SIZE (mode
) == 16)
6460 /* 128-bit operations on general registers require 2 instructions. */
6461 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
6462 return regmove_cost
->GP2GP
* 2;
6463 else if (from
== GENERAL_REGS
)
6464 return regmove_cost
->GP2FP
* 2;
6465 else if (to
== GENERAL_REGS
)
6466 return regmove_cost
->FP2GP
* 2;
6468 /* When AdvSIMD instructions are disabled it is not possible to move
6469 a 128-bit value directly between Q registers. This is handled in
6470 secondary reload. A general register is used as a scratch to move
6471 the upper DI value and the lower DI value is moved directly,
6472 hence the cost is the sum of three moves. */
6474 return regmove_cost
->GP2FP
+ regmove_cost
->FP2GP
+ regmove_cost
->FP2FP
;
6476 return regmove_cost
->FP2FP
;
6479 if (from
== GENERAL_REGS
&& to
== GENERAL_REGS
)
6480 return regmove_cost
->GP2GP
;
6481 else if (from
== GENERAL_REGS
)
6482 return regmove_cost
->GP2FP
;
6483 else if (to
== GENERAL_REGS
)
6484 return regmove_cost
->FP2GP
;
6486 return regmove_cost
->FP2FP
;
6490 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED
,
6491 reg_class_t rclass ATTRIBUTE_UNUSED
,
6492 bool in ATTRIBUTE_UNUSED
)
6494 return aarch64_tune_params
->memmov_cost
;
6497 /* Return the number of instructions that can be issued per cycle. */
6499 aarch64_sched_issue_rate (void)
6501 return aarch64_tune_params
->issue_rate
;
6505 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6507 int issue_rate
= aarch64_sched_issue_rate ();
6509 return issue_rate
> 1 && !sched_fusion
? issue_rate
: 0;
6512 /* Vectorizer cost model target hooks. */
6514 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6516 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost
,
6518 int misalign ATTRIBUTE_UNUSED
)
6522 switch (type_of_cost
)
6525 return aarch64_tune_params
->vec_costs
->scalar_stmt_cost
;
6528 return aarch64_tune_params
->vec_costs
->scalar_load_cost
;
6531 return aarch64_tune_params
->vec_costs
->scalar_store_cost
;
6534 return aarch64_tune_params
->vec_costs
->vec_stmt_cost
;
6537 return aarch64_tune_params
->vec_costs
->vec_align_load_cost
;
6540 return aarch64_tune_params
->vec_costs
->vec_store_cost
;
6543 return aarch64_tune_params
->vec_costs
->vec_to_scalar_cost
;
6546 return aarch64_tune_params
->vec_costs
->scalar_to_vec_cost
;
6548 case unaligned_load
:
6549 return aarch64_tune_params
->vec_costs
->vec_unalign_load_cost
;
6551 case unaligned_store
:
6552 return aarch64_tune_params
->vec_costs
->vec_unalign_store_cost
;
6554 case cond_branch_taken
:
6555 return aarch64_tune_params
->vec_costs
->cond_taken_branch_cost
;
6557 case cond_branch_not_taken
:
6558 return aarch64_tune_params
->vec_costs
->cond_not_taken_branch_cost
;
6561 case vec_promote_demote
:
6562 return aarch64_tune_params
->vec_costs
->vec_stmt_cost
;
6565 elements
= TYPE_VECTOR_SUBPARTS (vectype
);
6566 return elements
/ 2 + 1;
6573 /* Implement targetm.vectorize.add_stmt_cost. */
6575 aarch64_add_stmt_cost (void *data
, int count
, enum vect_cost_for_stmt kind
,
6576 struct _stmt_vec_info
*stmt_info
, int misalign
,
6577 enum vect_cost_model_location where
)
6579 unsigned *cost
= (unsigned *) data
;
6580 unsigned retval
= 0;
6582 if (flag_vect_cost_model
)
6584 tree vectype
= stmt_info
? stmt_vectype (stmt_info
) : NULL_TREE
;
6586 aarch64_builtin_vectorization_cost (kind
, vectype
, misalign
);
6588 /* Statements in an inner loop relative to the loop being
6589 vectorized are weighted more heavily. The value here is
6590 a function (linear for now) of the loop nest level. */
6591 if (where
== vect_body
&& stmt_info
&& stmt_in_inner_loop_p (stmt_info
))
6593 loop_vec_info loop_info
= STMT_VINFO_LOOP_VINFO (stmt_info
);
6594 struct loop
*loop
= LOOP_VINFO_LOOP (loop_info
);
6595 unsigned nest_level
= loop_depth (loop
);
6597 count
*= nest_level
;
6600 retval
= (unsigned) (count
* stmt_cost
);
6601 cost
[where
] += retval
;
6607 static void initialize_aarch64_code_model (void);
6609 /* Parse the architecture extension string. */
6612 aarch64_parse_extension (char *str
)
6614 /* The extension string is parsed left to right. */
6615 const struct aarch64_option_extension
*opt
= NULL
;
6617 /* Flag to say whether we are adding or removing an extension. */
6618 int adding_ext
= -1;
6620 while (str
!= NULL
&& *str
!= 0)
6626 ext
= strchr (str
, '+');
6633 if (len
>= 2 && strncmp (str
, "no", 2) == 0)
6644 error ("missing feature modifier after %qs", adding_ext
? "+"
6649 /* Scan over the extensions table trying to find an exact match. */
6650 for (opt
= all_extensions
; opt
->name
!= NULL
; opt
++)
6652 if (strlen (opt
->name
) == len
&& strncmp (opt
->name
, str
, len
) == 0)
6654 /* Add or remove the extension. */
6656 aarch64_isa_flags
|= opt
->flags_on
;
6658 aarch64_isa_flags
&= ~(opt
->flags_off
);
6663 if (opt
->name
== NULL
)
6665 /* Extension not found in list. */
6666 error ("unknown feature modifier %qs", str
);
6676 /* Parse the ARCH string. */
6679 aarch64_parse_arch (void)
6682 const struct processor
*arch
;
6683 char *str
= (char *) alloca (strlen (aarch64_arch_string
) + 1);
6686 strcpy (str
, aarch64_arch_string
);
6688 ext
= strchr (str
, '+');
6697 error ("missing arch name in -march=%qs", str
);
6701 /* Loop through the list of supported ARCHs to find a match. */
6702 for (arch
= all_architectures
; arch
->name
!= NULL
; arch
++)
6704 if (strlen (arch
->name
) == len
&& strncmp (arch
->name
, str
, len
) == 0)
6706 selected_arch
= arch
;
6707 aarch64_isa_flags
= selected_arch
->flags
;
6710 selected_cpu
= &all_cores
[selected_arch
->core
];
6714 /* ARCH string contains at least one extension. */
6715 aarch64_parse_extension (ext
);
6718 if (strcmp (selected_arch
->arch
, selected_cpu
->arch
))
6720 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6721 selected_cpu
->name
, selected_arch
->name
);
6728 /* ARCH name not found in list. */
6729 error ("unknown value %qs for -march", str
);
6733 /* Parse the CPU string. */
6736 aarch64_parse_cpu (void)
6739 const struct processor
*cpu
;
6740 char *str
= (char *) alloca (strlen (aarch64_cpu_string
) + 1);
6743 strcpy (str
, aarch64_cpu_string
);
6745 ext
= strchr (str
, '+');
6754 error ("missing cpu name in -mcpu=%qs", str
);
6758 /* Loop through the list of supported CPUs to find a match. */
6759 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
6761 if (strlen (cpu
->name
) == len
&& strncmp (cpu
->name
, str
, len
) == 0)
6764 aarch64_isa_flags
= selected_cpu
->flags
;
6768 /* CPU string contains at least one extension. */
6769 aarch64_parse_extension (ext
);
6776 /* CPU name not found in list. */
6777 error ("unknown value %qs for -mcpu", str
);
6781 /* Parse the TUNE string. */
6784 aarch64_parse_tune (void)
6786 const struct processor
*cpu
;
6787 char *str
= (char *) alloca (strlen (aarch64_tune_string
) + 1);
6788 strcpy (str
, aarch64_tune_string
);
6790 /* Loop through the list of supported CPUs to find a match. */
6791 for (cpu
= all_cores
; cpu
->name
!= NULL
; cpu
++)
6793 if (strcmp (cpu
->name
, str
) == 0)
6795 selected_tune
= cpu
;
6800 /* CPU name not found in list. */
6801 error ("unknown value %qs for -mtune", str
);
6806 /* Implement TARGET_OPTION_OVERRIDE. */
6809 aarch64_override_options (void)
6811 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6812 If either of -march or -mtune is given, they override their
6813 respective component of -mcpu.
6815 So, first parse AARCH64_CPU_STRING, then the others, be careful
6816 with -march as, if -mcpu is not present on the command line, march
6817 must set a sensible default CPU. */
6818 if (aarch64_cpu_string
)
6820 aarch64_parse_cpu ();
6823 if (aarch64_arch_string
)
6825 aarch64_parse_arch ();
6828 if (aarch64_tune_string
)
6830 aarch64_parse_tune ();
6833 #ifndef HAVE_AS_MABI_OPTION
6834 /* The compiler may have been configured with 2.23.* binutils, which does
6835 not have support for ILP32. */
6837 error ("Assembler does not support -mabi=ilp32");
6840 initialize_aarch64_code_model ();
6842 aarch64_build_bitmask_table ();
6844 /* This target defaults to strict volatile bitfields. */
6845 if (flag_strict_volatile_bitfields
< 0 && abi_version_at_least (2))
6846 flag_strict_volatile_bitfields
= 1;
6848 /* If the user did not specify a processor, choose the default
6849 one for them. This will be the CPU set during configuration using
6850 --with-cpu, otherwise it is "generic". */
6853 selected_cpu
= &all_cores
[TARGET_CPU_DEFAULT
& 0x3f];
6854 aarch64_isa_flags
= TARGET_CPU_DEFAULT
>> 6;
6857 gcc_assert (selected_cpu
);
6860 selected_tune
= selected_cpu
;
6862 aarch64_tune_flags
= selected_tune
->flags
;
6863 aarch64_tune
= selected_tune
->core
;
6864 aarch64_tune_params
= selected_tune
->tune
;
6865 aarch64_architecture_version
= selected_cpu
->architecture_version
;
6867 if (aarch64_fix_a53_err835769
== 2)
6869 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6870 aarch64_fix_a53_err835769
= 1;
6872 aarch64_fix_a53_err835769
= 0;
6876 /* If not opzimizing for size, set the default
6877 alignment to what the target wants */
6880 if (align_loops
<= 0)
6881 align_loops
= aarch64_tune_params
->loop_align
;
6882 if (align_jumps
<= 0)
6883 align_jumps
= aarch64_tune_params
->jump_align
;
6884 if (align_functions
<= 0)
6885 align_functions
= aarch64_tune_params
->function_align
;
6888 if (AARCH64_TUNE_FMA_STEERING
)
6889 aarch64_register_fma_steering ();
6891 aarch64_override_options_after_change ();
6894 /* Implement targetm.override_options_after_change. */
6897 aarch64_override_options_after_change (void)
6899 if (flag_omit_frame_pointer
)
6900 flag_omit_leaf_frame_pointer
= false;
6901 else if (flag_omit_leaf_frame_pointer
)
6902 flag_omit_frame_pointer
= true;
6905 static struct machine_function
*
6906 aarch64_init_machine_status (void)
6908 struct machine_function
*machine
;
6909 machine
= ggc_cleared_alloc
<machine_function
> ();
6914 aarch64_init_expanders (void)
6916 init_machine_status
= aarch64_init_machine_status
;
6919 /* A checking mechanism for the implementation of the various code models. */
6921 initialize_aarch64_code_model (void)
6925 switch (aarch64_cmodel_var
)
6927 case AARCH64_CMODEL_TINY
:
6928 aarch64_cmodel
= AARCH64_CMODEL_TINY_PIC
;
6930 case AARCH64_CMODEL_SMALL
:
6931 aarch64_cmodel
= AARCH64_CMODEL_SMALL_PIC
;
6933 case AARCH64_CMODEL_LARGE
:
6934 sorry ("code model %qs with -f%s", "large",
6935 flag_pic
> 1 ? "PIC" : "pic");
6941 aarch64_cmodel
= aarch64_cmodel_var
;
6944 /* Return true if SYMBOL_REF X binds locally. */
6947 aarch64_symbol_binds_local_p (const_rtx x
)
6949 return (SYMBOL_REF_DECL (x
)
6950 ? targetm
.binds_local_p (SYMBOL_REF_DECL (x
))
6951 : SYMBOL_REF_LOCAL_P (x
));
6954 /* Return true if SYMBOL_REF X is thread local */
6956 aarch64_tls_symbol_p (rtx x
)
6958 if (! TARGET_HAVE_TLS
)
6961 if (GET_CODE (x
) != SYMBOL_REF
)
6964 return SYMBOL_REF_TLS_MODEL (x
) != 0;
6967 /* Classify a TLS symbol into one of the TLS kinds. */
6968 enum aarch64_symbol_type
6969 aarch64_classify_tls_symbol (rtx x
)
6971 enum tls_model tls_kind
= tls_symbolic_operand_type (x
);
6975 case TLS_MODEL_GLOBAL_DYNAMIC
:
6976 case TLS_MODEL_LOCAL_DYNAMIC
:
6977 return TARGET_TLS_DESC
? SYMBOL_SMALL_TLSDESC
: SYMBOL_SMALL_TLSGD
;
6979 case TLS_MODEL_INITIAL_EXEC
:
6980 return SYMBOL_SMALL_GOTTPREL
;
6982 case TLS_MODEL_LOCAL_EXEC
:
6983 return SYMBOL_SMALL_TPREL
;
6985 case TLS_MODEL_EMULATED
:
6986 case TLS_MODEL_NONE
:
6987 return SYMBOL_FORCE_TO_MEM
;
6994 /* Return the method that should be used to access SYMBOL_REF or
6995 LABEL_REF X in context CONTEXT. */
6997 enum aarch64_symbol_type
6998 aarch64_classify_symbol (rtx x
, rtx offset
,
6999 enum aarch64_symbol_context context ATTRIBUTE_UNUSED
)
7001 if (GET_CODE (x
) == LABEL_REF
)
7003 switch (aarch64_cmodel
)
7005 case AARCH64_CMODEL_LARGE
:
7006 return SYMBOL_FORCE_TO_MEM
;
7008 case AARCH64_CMODEL_TINY_PIC
:
7009 case AARCH64_CMODEL_TINY
:
7010 return SYMBOL_TINY_ABSOLUTE
;
7012 case AARCH64_CMODEL_SMALL_PIC
:
7013 case AARCH64_CMODEL_SMALL
:
7014 return SYMBOL_SMALL_ABSOLUTE
;
7021 if (GET_CODE (x
) == SYMBOL_REF
)
7023 if (aarch64_cmodel
== AARCH64_CMODEL_LARGE
)
7024 return SYMBOL_FORCE_TO_MEM
;
7026 if (aarch64_tls_symbol_p (x
))
7027 return aarch64_classify_tls_symbol (x
);
7029 switch (aarch64_cmodel
)
7031 case AARCH64_CMODEL_TINY
:
7032 /* When we retreive symbol + offset address, we have to make sure
7033 the offset does not cause overflow of the final address. But
7034 we have no way of knowing the address of symbol at compile time
7035 so we can't accurately say if the distance between the PC and
7036 symbol + offset is outside the addressible range of +/-1M in the
7037 TINY code model. So we rely on images not being greater than
7038 1M and cap the offset at 1M and anything beyond 1M will have to
7039 be loaded using an alternative mechanism. */
7040 if (SYMBOL_REF_WEAK (x
)
7041 || INTVAL (offset
) < -1048575 || INTVAL (offset
) > 1048575)
7042 return SYMBOL_FORCE_TO_MEM
;
7043 return SYMBOL_TINY_ABSOLUTE
;
7045 case AARCH64_CMODEL_SMALL
:
7046 /* Same reasoning as the tiny code model, but the offset cap here is
7048 if (SYMBOL_REF_WEAK (x
)
7049 || !IN_RANGE (INTVAL (offset
), HOST_WIDE_INT_C (-4294967263),
7050 HOST_WIDE_INT_C (4294967264)))
7051 return SYMBOL_FORCE_TO_MEM
;
7052 return SYMBOL_SMALL_ABSOLUTE
;
7054 case AARCH64_CMODEL_TINY_PIC
:
7055 if (!aarch64_symbol_binds_local_p (x
))
7056 return SYMBOL_TINY_GOT
;
7057 return SYMBOL_TINY_ABSOLUTE
;
7059 case AARCH64_CMODEL_SMALL_PIC
:
7060 if (!aarch64_symbol_binds_local_p (x
))
7061 return SYMBOL_SMALL_GOT
;
7062 return SYMBOL_SMALL_ABSOLUTE
;
7069 /* By default push everything into the constant pool. */
7070 return SYMBOL_FORCE_TO_MEM
;
7074 aarch64_constant_address_p (rtx x
)
7076 return (CONSTANT_P (x
) && memory_address_p (DImode
, x
));
7080 aarch64_legitimate_pic_operand_p (rtx x
)
7082 if (GET_CODE (x
) == SYMBOL_REF
7083 || (GET_CODE (x
) == CONST
7084 && GET_CODE (XEXP (x
, 0)) == PLUS
7085 && GET_CODE (XEXP (XEXP (x
, 0), 0)) == SYMBOL_REF
))
7091 /* Return true if X holds either a quarter-precision or
7092 floating-point +0.0 constant. */
7094 aarch64_valid_floating_const (machine_mode mode
, rtx x
)
7096 if (!CONST_DOUBLE_P (x
))
7099 /* TODO: We could handle moving 0.0 to a TFmode register,
7100 but first we would like to refactor the movtf_aarch64
7101 to be more amicable to split moves properly and
7102 correctly gate on TARGET_SIMD. For now - reject all
7103 constants which are not to SFmode or DFmode registers. */
7104 if (!(mode
== SFmode
|| mode
== DFmode
))
7107 if (aarch64_float_const_zero_rtx_p (x
))
7109 return aarch64_float_const_representable_p (x
);
7113 aarch64_legitimate_constant_p (machine_mode mode
, rtx x
)
7115 /* Do not allow vector struct mode constants. We could support
7116 0 and -1 easily, but they need support in aarch64-simd.md. */
7117 if (TARGET_SIMD
&& aarch64_vect_struct_mode_p (mode
))
7120 /* This could probably go away because
7121 we now decompose CONST_INTs according to expand_mov_immediate. */
7122 if ((GET_CODE (x
) == CONST_VECTOR
7123 && aarch64_simd_valid_immediate (x
, mode
, false, NULL
))
7124 || CONST_INT_P (x
) || aarch64_valid_floating_const (mode
, x
))
7125 return !targetm
.cannot_force_const_mem (mode
, x
);
7127 if (GET_CODE (x
) == HIGH
7128 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
7131 return aarch64_constant_address_p (x
);
7135 aarch64_load_tp (rtx target
)
7138 || GET_MODE (target
) != Pmode
7139 || !register_operand (target
, Pmode
))
7140 target
= gen_reg_rtx (Pmode
);
7142 /* Can return in any reg. */
7143 emit_insn (gen_aarch64_load_tp_hard (target
));
7147 /* On AAPCS systems, this is the "struct __va_list". */
7148 static GTY(()) tree va_list_type
;
7150 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7151 Return the type to use as __builtin_va_list.
7153 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7165 aarch64_build_builtin_va_list (void)
7168 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
7170 /* Create the type. */
7171 va_list_type
= lang_hooks
.types
.make_type (RECORD_TYPE
);
7172 /* Give it the required name. */
7173 va_list_name
= build_decl (BUILTINS_LOCATION
,
7175 get_identifier ("__va_list"),
7177 DECL_ARTIFICIAL (va_list_name
) = 1;
7178 TYPE_NAME (va_list_type
) = va_list_name
;
7179 TYPE_STUB_DECL (va_list_type
) = va_list_name
;
7181 /* Create the fields. */
7182 f_stack
= build_decl (BUILTINS_LOCATION
,
7183 FIELD_DECL
, get_identifier ("__stack"),
7185 f_grtop
= build_decl (BUILTINS_LOCATION
,
7186 FIELD_DECL
, get_identifier ("__gr_top"),
7188 f_vrtop
= build_decl (BUILTINS_LOCATION
,
7189 FIELD_DECL
, get_identifier ("__vr_top"),
7191 f_groff
= build_decl (BUILTINS_LOCATION
,
7192 FIELD_DECL
, get_identifier ("__gr_offs"),
7194 f_vroff
= build_decl (BUILTINS_LOCATION
,
7195 FIELD_DECL
, get_identifier ("__vr_offs"),
7198 DECL_ARTIFICIAL (f_stack
) = 1;
7199 DECL_ARTIFICIAL (f_grtop
) = 1;
7200 DECL_ARTIFICIAL (f_vrtop
) = 1;
7201 DECL_ARTIFICIAL (f_groff
) = 1;
7202 DECL_ARTIFICIAL (f_vroff
) = 1;
7204 DECL_FIELD_CONTEXT (f_stack
) = va_list_type
;
7205 DECL_FIELD_CONTEXT (f_grtop
) = va_list_type
;
7206 DECL_FIELD_CONTEXT (f_vrtop
) = va_list_type
;
7207 DECL_FIELD_CONTEXT (f_groff
) = va_list_type
;
7208 DECL_FIELD_CONTEXT (f_vroff
) = va_list_type
;
7210 TYPE_FIELDS (va_list_type
) = f_stack
;
7211 DECL_CHAIN (f_stack
) = f_grtop
;
7212 DECL_CHAIN (f_grtop
) = f_vrtop
;
7213 DECL_CHAIN (f_vrtop
) = f_groff
;
7214 DECL_CHAIN (f_groff
) = f_vroff
;
7216 /* Compute its layout. */
7217 layout_type (va_list_type
);
7219 return va_list_type
;
7222 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7224 aarch64_expand_builtin_va_start (tree valist
, rtx nextarg ATTRIBUTE_UNUSED
)
7226 const CUMULATIVE_ARGS
*cum
;
7227 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
7228 tree stack
, grtop
, vrtop
, groff
, vroff
;
7230 int gr_save_area_size
;
7231 int vr_save_area_size
;
7234 cum
= &crtl
->args
.info
;
7236 = (NUM_ARG_REGS
- cum
->aapcs_ncrn
) * UNITS_PER_WORD
;
7238 = (NUM_FP_ARG_REGS
- cum
->aapcs_nvrn
) * UNITS_PER_VREG
;
7240 if (TARGET_GENERAL_REGS_ONLY
)
7242 if (cum
->aapcs_nvrn
> 0)
7243 sorry ("%qs and floating point or vector arguments",
7244 "-mgeneral-regs-only");
7245 vr_save_area_size
= 0;
7248 f_stack
= TYPE_FIELDS (va_list_type_node
);
7249 f_grtop
= DECL_CHAIN (f_stack
);
7250 f_vrtop
= DECL_CHAIN (f_grtop
);
7251 f_groff
= DECL_CHAIN (f_vrtop
);
7252 f_vroff
= DECL_CHAIN (f_groff
);
7254 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), valist
, f_stack
,
7256 grtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
), valist
, f_grtop
,
7258 vrtop
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
), valist
, f_vrtop
,
7260 groff
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
), valist
, f_groff
,
7262 vroff
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
), valist
, f_vroff
,
7265 /* Emit code to initialize STACK, which points to the next varargs stack
7266 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7267 by named arguments. STACK is 8-byte aligned. */
7268 t
= make_tree (TREE_TYPE (stack
), virtual_incoming_args_rtx
);
7269 if (cum
->aapcs_stack_size
> 0)
7270 t
= fold_build_pointer_plus_hwi (t
, cum
->aapcs_stack_size
* UNITS_PER_WORD
);
7271 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), stack
, t
);
7272 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7274 /* Emit code to initialize GRTOP, the top of the GR save area.
7275 virtual_incoming_args_rtx should have been 16 byte aligned. */
7276 t
= make_tree (TREE_TYPE (grtop
), virtual_incoming_args_rtx
);
7277 t
= build2 (MODIFY_EXPR
, TREE_TYPE (grtop
), grtop
, t
);
7278 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7280 /* Emit code to initialize VRTOP, the top of the VR save area.
7281 This address is gr_save_area_bytes below GRTOP, rounded
7282 down to the next 16-byte boundary. */
7283 t
= make_tree (TREE_TYPE (vrtop
), virtual_incoming_args_rtx
);
7284 vr_offset
= AARCH64_ROUND_UP (gr_save_area_size
,
7285 STACK_BOUNDARY
/ BITS_PER_UNIT
);
7288 t
= fold_build_pointer_plus_hwi (t
, -vr_offset
);
7289 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vrtop
), vrtop
, t
);
7290 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7292 /* Emit code to initialize GROFF, the offset from GRTOP of the
7293 next GPR argument. */
7294 t
= build2 (MODIFY_EXPR
, TREE_TYPE (groff
), groff
,
7295 build_int_cst (TREE_TYPE (groff
), -gr_save_area_size
));
7296 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7298 /* Likewise emit code to initialize VROFF, the offset from FTOP
7299 of the next VR argument. */
7300 t
= build2 (MODIFY_EXPR
, TREE_TYPE (vroff
), vroff
,
7301 build_int_cst (TREE_TYPE (vroff
), -vr_save_area_size
));
7302 expand_expr (t
, const0_rtx
, VOIDmode
, EXPAND_NORMAL
);
7305 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7308 aarch64_gimplify_va_arg_expr (tree valist
, tree type
, gimple_seq
*pre_p
,
7309 gimple_seq
*post_p ATTRIBUTE_UNUSED
)
7313 bool is_ha
; /* is HFA or HVA. */
7314 bool dw_align
; /* double-word align. */
7315 machine_mode ag_mode
= VOIDmode
;
7319 tree f_stack
, f_grtop
, f_vrtop
, f_groff
, f_vroff
;
7320 tree stack
, f_top
, f_off
, off
, arg
, roundup
, on_stack
;
7321 HOST_WIDE_INT size
, rsize
, adjust
, align
;
7322 tree t
, u
, cond1
, cond2
;
7324 indirect_p
= pass_by_reference (NULL
, TYPE_MODE (type
), type
, false);
7326 type
= build_pointer_type (type
);
7328 mode
= TYPE_MODE (type
);
7330 f_stack
= TYPE_FIELDS (va_list_type_node
);
7331 f_grtop
= DECL_CHAIN (f_stack
);
7332 f_vrtop
= DECL_CHAIN (f_grtop
);
7333 f_groff
= DECL_CHAIN (f_vrtop
);
7334 f_vroff
= DECL_CHAIN (f_groff
);
7336 stack
= build3 (COMPONENT_REF
, TREE_TYPE (f_stack
), unshare_expr (valist
),
7337 f_stack
, NULL_TREE
);
7338 size
= int_size_in_bytes (type
);
7339 align
= aarch64_function_arg_alignment (mode
, type
) / BITS_PER_UNIT
;
7343 if (aarch64_vfp_is_call_or_return_candidate (mode
,
7349 /* TYPE passed in fp/simd registers. */
7350 if (TARGET_GENERAL_REGS_ONLY
)
7351 sorry ("%qs and floating point or vector arguments",
7352 "-mgeneral-regs-only");
7354 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_vrtop
),
7355 unshare_expr (valist
), f_vrtop
, NULL_TREE
);
7356 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_vroff
),
7357 unshare_expr (valist
), f_vroff
, NULL_TREE
);
7359 rsize
= nregs
* UNITS_PER_VREG
;
7363 if (BYTES_BIG_ENDIAN
&& GET_MODE_SIZE (ag_mode
) < UNITS_PER_VREG
)
7364 adjust
= UNITS_PER_VREG
- GET_MODE_SIZE (ag_mode
);
7366 else if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
7367 && size
< UNITS_PER_VREG
)
7369 adjust
= UNITS_PER_VREG
- size
;
7374 /* TYPE passed in general registers. */
7375 f_top
= build3 (COMPONENT_REF
, TREE_TYPE (f_grtop
),
7376 unshare_expr (valist
), f_grtop
, NULL_TREE
);
7377 f_off
= build3 (COMPONENT_REF
, TREE_TYPE (f_groff
),
7378 unshare_expr (valist
), f_groff
, NULL_TREE
);
7379 rsize
= (size
+ UNITS_PER_WORD
- 1) & -UNITS_PER_WORD
;
7380 nregs
= rsize
/ UNITS_PER_WORD
;
7385 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
7386 && size
< UNITS_PER_WORD
)
7388 adjust
= UNITS_PER_WORD
- size
;
7392 /* Get a local temporary for the field value. */
7393 off
= get_initialized_tmp_var (f_off
, pre_p
, NULL
);
7395 /* Emit code to branch if off >= 0. */
7396 t
= build2 (GE_EXPR
, boolean_type_node
, off
,
7397 build_int_cst (TREE_TYPE (off
), 0));
7398 cond1
= build3 (COND_EXPR
, ptr_type_node
, t
, NULL_TREE
, NULL_TREE
);
7402 /* Emit: offs = (offs + 15) & -16. */
7403 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
7404 build_int_cst (TREE_TYPE (off
), 15));
7405 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (off
), t
,
7406 build_int_cst (TREE_TYPE (off
), -16));
7407 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (off
), off
, t
);
7412 /* Update ap.__[g|v]r_offs */
7413 t
= build2 (PLUS_EXPR
, TREE_TYPE (off
), off
,
7414 build_int_cst (TREE_TYPE (off
), rsize
));
7415 t
= build2 (MODIFY_EXPR
, TREE_TYPE (f_off
), unshare_expr (f_off
), t
);
7419 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
7421 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7422 u
= build2 (GT_EXPR
, boolean_type_node
, unshare_expr (f_off
),
7423 build_int_cst (TREE_TYPE (f_off
), 0));
7424 cond2
= build3 (COND_EXPR
, ptr_type_node
, u
, NULL_TREE
, NULL_TREE
);
7426 /* String up: make sure the assignment happens before the use. */
7427 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (cond2
), t
, cond2
);
7428 COND_EXPR_ELSE (cond1
) = t
;
7430 /* Prepare the trees handling the argument that is passed on the stack;
7431 the top level node will store in ON_STACK. */
7432 arg
= get_initialized_tmp_var (stack
, pre_p
, NULL
);
7435 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7436 t
= fold_convert (intDI_type_node
, arg
);
7437 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
7438 build_int_cst (TREE_TYPE (t
), 15));
7439 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
7440 build_int_cst (TREE_TYPE (t
), -16));
7441 t
= fold_convert (TREE_TYPE (arg
), t
);
7442 roundup
= build2 (MODIFY_EXPR
, TREE_TYPE (arg
), arg
, t
);
7446 /* Advance ap.__stack */
7447 t
= fold_convert (intDI_type_node
, arg
);
7448 t
= build2 (PLUS_EXPR
, TREE_TYPE (t
), t
,
7449 build_int_cst (TREE_TYPE (t
), size
+ 7));
7450 t
= build2 (BIT_AND_EXPR
, TREE_TYPE (t
), t
,
7451 build_int_cst (TREE_TYPE (t
), -8));
7452 t
= fold_convert (TREE_TYPE (arg
), t
);
7453 t
= build2 (MODIFY_EXPR
, TREE_TYPE (stack
), unshare_expr (stack
), t
);
7454 /* String up roundup and advance. */
7456 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), roundup
, t
);
7457 /* String up with arg */
7458 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), t
, arg
);
7459 /* Big-endianness related address adjustment. */
7460 if (BLOCK_REG_PADDING (mode
, type
, 1) == downward
7461 && size
< UNITS_PER_WORD
)
7463 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (arg
), arg
,
7464 size_int (UNITS_PER_WORD
- size
));
7465 on_stack
= build2 (COMPOUND_EXPR
, TREE_TYPE (arg
), on_stack
, t
);
7468 COND_EXPR_THEN (cond1
) = unshare_expr (on_stack
);
7469 COND_EXPR_THEN (cond2
) = unshare_expr (on_stack
);
7471 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7474 t
= build2 (PREINCREMENT_EXPR
, TREE_TYPE (off
), off
,
7475 build_int_cst (TREE_TYPE (off
), adjust
));
7477 t
= fold_convert (sizetype
, t
);
7478 t
= build2 (POINTER_PLUS_EXPR
, TREE_TYPE (f_top
), f_top
, t
);
7482 /* type ha; // treat as "struct {ftype field[n];}"
7483 ... [computing offs]
7484 for (i = 0; i <nregs; ++i, offs += 16)
7485 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7488 tree tmp_ha
, field_t
, field_ptr_t
;
7490 /* Declare a local variable. */
7491 tmp_ha
= create_tmp_var_raw (type
, "ha");
7492 gimple_add_tmp_var (tmp_ha
);
7494 /* Establish the base type. */
7498 field_t
= float_type_node
;
7499 field_ptr_t
= float_ptr_type_node
;
7502 field_t
= double_type_node
;
7503 field_ptr_t
= double_ptr_type_node
;
7506 field_t
= long_double_type_node
;
7507 field_ptr_t
= long_double_ptr_type_node
;
7509 /* The half precision and quad precision are not fully supported yet. Enable
7510 the following code after the support is complete. Need to find the correct
7511 type node for __fp16 *. */
7514 field_t
= float_type_node
;
7515 field_ptr_t
= float_ptr_type_node
;
7521 tree innertype
= make_signed_type (GET_MODE_PRECISION (SImode
));
7522 field_t
= build_vector_type_for_mode (innertype
, ag_mode
);
7523 field_ptr_t
= build_pointer_type (field_t
);
7530 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
7531 tmp_ha
= build1 (ADDR_EXPR
, field_ptr_t
, tmp_ha
);
7533 t
= fold_convert (field_ptr_t
, addr
);
7534 t
= build2 (MODIFY_EXPR
, field_t
,
7535 build1 (INDIRECT_REF
, field_t
, tmp_ha
),
7536 build1 (INDIRECT_REF
, field_t
, t
));
7538 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7539 for (i
= 1; i
< nregs
; ++i
)
7541 addr
= fold_build_pointer_plus_hwi (addr
, UNITS_PER_VREG
);
7542 u
= fold_convert (field_ptr_t
, addr
);
7543 u
= build2 (MODIFY_EXPR
, field_t
,
7544 build2 (MEM_REF
, field_t
, tmp_ha
,
7545 build_int_cst (field_ptr_t
,
7547 int_size_in_bytes (field_t
)))),
7548 build1 (INDIRECT_REF
, field_t
, u
));
7549 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (t
), t
, u
);
7552 u
= fold_convert (TREE_TYPE (f_top
), tmp_ha
);
7553 t
= build2 (COMPOUND_EXPR
, TREE_TYPE (f_top
), t
, u
);
7556 COND_EXPR_ELSE (cond2
) = t
;
7557 addr
= fold_convert (build_pointer_type (type
), cond1
);
7558 addr
= build_va_arg_indirect_ref (addr
);
7561 addr
= build_va_arg_indirect_ref (addr
);
7566 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7569 aarch64_setup_incoming_varargs (cumulative_args_t cum_v
, machine_mode mode
,
7570 tree type
, int *pretend_size ATTRIBUTE_UNUSED
,
7573 CUMULATIVE_ARGS
*cum
= get_cumulative_args (cum_v
);
7574 CUMULATIVE_ARGS local_cum
;
7575 int gr_saved
, vr_saved
;
7577 /* The caller has advanced CUM up to, but not beyond, the last named
7578 argument. Advance a local copy of CUM past the last "real" named
7579 argument, to find out how many registers are left over. */
7581 aarch64_function_arg_advance (pack_cumulative_args(&local_cum
), mode
, type
, true);
7583 /* Found out how many registers we need to save. */
7584 gr_saved
= NUM_ARG_REGS
- local_cum
.aapcs_ncrn
;
7585 vr_saved
= NUM_FP_ARG_REGS
- local_cum
.aapcs_nvrn
;
7587 if (TARGET_GENERAL_REGS_ONLY
)
7589 if (local_cum
.aapcs_nvrn
> 0)
7590 sorry ("%qs and floating point or vector arguments",
7591 "-mgeneral-regs-only");
7601 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7602 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
,
7603 - gr_saved
* UNITS_PER_WORD
);
7604 mem
= gen_frame_mem (BLKmode
, ptr
);
7605 set_mem_alias_set (mem
, get_varargs_alias_set ());
7607 move_block_from_reg (local_cum
.aapcs_ncrn
+ R0_REGNUM
,
7612 /* We can't use move_block_from_reg, because it will use
7613 the wrong mode, storing D regs only. */
7614 machine_mode mode
= TImode
;
7617 /* Set OFF to the offset from virtual_incoming_args_rtx of
7618 the first vector register. The VR save area lies below
7619 the GR one, and is aligned to 16 bytes. */
7620 off
= -AARCH64_ROUND_UP (gr_saved
* UNITS_PER_WORD
,
7621 STACK_BOUNDARY
/ BITS_PER_UNIT
);
7622 off
-= vr_saved
* UNITS_PER_VREG
;
7624 for (i
= local_cum
.aapcs_nvrn
; i
< NUM_FP_ARG_REGS
; ++i
)
7628 ptr
= plus_constant (Pmode
, virtual_incoming_args_rtx
, off
);
7629 mem
= gen_frame_mem (mode
, ptr
);
7630 set_mem_alias_set (mem
, get_varargs_alias_set ());
7631 aarch64_emit_move (mem
, gen_rtx_REG (mode
, V0_REGNUM
+ i
));
7632 off
+= UNITS_PER_VREG
;
7637 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7638 any complication of having crtl->args.pretend_args_size changed. */
7639 cfun
->machine
->frame
.saved_varargs_size
7640 = (AARCH64_ROUND_UP (gr_saved
* UNITS_PER_WORD
,
7641 STACK_BOUNDARY
/ BITS_PER_UNIT
)
7642 + vr_saved
* UNITS_PER_VREG
);
7646 aarch64_conditional_register_usage (void)
7651 for (i
= V0_REGNUM
; i
<= V31_REGNUM
; i
++)
7654 call_used_regs
[i
] = 1;
7659 /* Walk down the type tree of TYPE counting consecutive base elements.
7660 If *MODEP is VOIDmode, then set it to the first valid floating point
7661 type. If a non-floating point type is found, or if a floating point
7662 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7663 otherwise return the count in the sub-tree. */
7665 aapcs_vfp_sub_candidate (const_tree type
, machine_mode
*modep
)
7670 switch (TREE_CODE (type
))
7673 mode
= TYPE_MODE (type
);
7674 if (mode
!= DFmode
&& mode
!= SFmode
&& mode
!= TFmode
)
7677 if (*modep
== VOIDmode
)
7686 mode
= TYPE_MODE (TREE_TYPE (type
));
7687 if (mode
!= DFmode
&& mode
!= SFmode
&& mode
!= TFmode
)
7690 if (*modep
== VOIDmode
)
7699 /* Use V2SImode and V4SImode as representatives of all 64-bit
7700 and 128-bit vector types. */
7701 size
= int_size_in_bytes (type
);
7714 if (*modep
== VOIDmode
)
7717 /* Vector modes are considered to be opaque: two vectors are
7718 equivalent for the purposes of being homogeneous aggregates
7719 if they are the same size. */
7728 tree index
= TYPE_DOMAIN (type
);
7730 /* Can't handle incomplete types nor sizes that are not
7732 if (!COMPLETE_TYPE_P (type
)
7733 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
7736 count
= aapcs_vfp_sub_candidate (TREE_TYPE (type
), modep
);
7739 || !TYPE_MAX_VALUE (index
)
7740 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index
))
7741 || !TYPE_MIN_VALUE (index
)
7742 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index
))
7746 count
*= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index
))
7747 - tree_to_uhwi (TYPE_MIN_VALUE (index
)));
7749 /* There must be no padding. */
7750 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
7762 /* Can't handle incomplete types nor sizes that are not
7764 if (!COMPLETE_TYPE_P (type
)
7765 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
7768 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
7770 if (TREE_CODE (field
) != FIELD_DECL
)
7773 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
7779 /* There must be no padding. */
7780 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
7787 case QUAL_UNION_TYPE
:
7789 /* These aren't very interesting except in a degenerate case. */
7794 /* Can't handle incomplete types nor sizes that are not
7796 if (!COMPLETE_TYPE_P (type
)
7797 || TREE_CODE (TYPE_SIZE (type
)) != INTEGER_CST
)
7800 for (field
= TYPE_FIELDS (type
); field
; field
= TREE_CHAIN (field
))
7802 if (TREE_CODE (field
) != FIELD_DECL
)
7805 sub_count
= aapcs_vfp_sub_candidate (TREE_TYPE (field
), modep
);
7808 count
= count
> sub_count
? count
: sub_count
;
7811 /* There must be no padding. */
7812 if (wi::ne_p (TYPE_SIZE (type
), count
* GET_MODE_BITSIZE (*modep
)))
7825 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7826 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7827 array types. The C99 floating-point complex types are also considered
7828 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7829 types, which are GCC extensions and out of the scope of AAPCS64, are
7830 treated as composite types here as well.
7832 Note that MODE itself is not sufficient in determining whether a type
7833 is such a composite type or not. This is because
7834 stor-layout.c:compute_record_mode may have already changed the MODE
7835 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7836 structure with only one field may have its MODE set to the mode of the
7837 field. Also an integer mode whose size matches the size of the
7838 RECORD_TYPE type may be used to substitute the original mode
7839 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7840 solely relied on. */
7843 aarch64_composite_type_p (const_tree type
,
7846 if (type
&& (AGGREGATE_TYPE_P (type
) || TREE_CODE (type
) == COMPLEX_TYPE
))
7850 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
7851 || GET_MODE_CLASS (mode
) == MODE_COMPLEX_INT
)
7857 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7858 type as described in AAPCS64 \S 4.1.2.
7860 See the comment above aarch64_composite_type_p for the notes on MODE. */
7863 aarch64_short_vector_p (const_tree type
,
7866 HOST_WIDE_INT size
= -1;
7868 if (type
&& TREE_CODE (type
) == VECTOR_TYPE
)
7869 size
= int_size_in_bytes (type
);
7870 else if (!aarch64_composite_type_p (type
, mode
)
7871 && (GET_MODE_CLASS (mode
) == MODE_VECTOR_INT
7872 || GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
))
7873 size
= GET_MODE_SIZE (mode
);
7875 return (size
== 8 || size
== 16) ? true : false;
7878 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7879 shall be passed or returned in simd/fp register(s) (providing these
7880 parameter passing registers are available).
7882 Upon successful return, *COUNT returns the number of needed registers,
7883 *BASE_MODE returns the mode of the individual register and when IS_HAF
7884 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7885 floating-point aggregate or a homogeneous short-vector aggregate. */
7888 aarch64_vfp_is_call_or_return_candidate (machine_mode mode
,
7890 machine_mode
*base_mode
,
7894 machine_mode new_mode
= VOIDmode
;
7895 bool composite_p
= aarch64_composite_type_p (type
, mode
);
7897 if (is_ha
!= NULL
) *is_ha
= false;
7899 if ((!composite_p
&& GET_MODE_CLASS (mode
) == MODE_FLOAT
)
7900 || aarch64_short_vector_p (type
, mode
))
7905 else if (GET_MODE_CLASS (mode
) == MODE_COMPLEX_FLOAT
)
7907 if (is_ha
!= NULL
) *is_ha
= true;
7909 new_mode
= GET_MODE_INNER (mode
);
7911 else if (type
&& composite_p
)
7913 int ag_count
= aapcs_vfp_sub_candidate (type
, &new_mode
);
7915 if (ag_count
> 0 && ag_count
<= HA_MAX_NUM_FLDS
)
7917 if (is_ha
!= NULL
) *is_ha
= true;
7926 *base_mode
= new_mode
;
7930 /* Implement TARGET_STRUCT_VALUE_RTX. */
7933 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED
,
7934 int incoming ATTRIBUTE_UNUSED
)
7936 return gen_rtx_REG (Pmode
, AARCH64_STRUCT_VALUE_REGNUM
);
7939 /* Implements target hook vector_mode_supported_p. */
7941 aarch64_vector_mode_supported_p (machine_mode mode
)
7944 && (mode
== V4SImode
|| mode
== V8HImode
7945 || mode
== V16QImode
|| mode
== V2DImode
7946 || mode
== V2SImode
|| mode
== V4HImode
7947 || mode
== V8QImode
|| mode
== V2SFmode
7948 || mode
== V4SFmode
|| mode
== V2DFmode
7949 || mode
== V1DFmode
))
7955 /* Return appropriate SIMD container
7956 for MODE within a vector of WIDTH bits. */
7958 aarch64_simd_container_mode (machine_mode mode
, unsigned width
)
7960 gcc_assert (width
== 64 || width
== 128);
7999 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8001 aarch64_preferred_simd_mode (machine_mode mode
)
8003 return aarch64_simd_container_mode (mode
, 128);
/* Return the bitmask of possible vector sizes for the vectorizer
   to iterate over: both 16-byte and 8-byte vectors.  */
static unsigned int
aarch64_autovectorize_vector_sizes (void)
{
  return (16 | 8);
}
8014 /* Implement TARGET_MANGLE_TYPE. */
8017 aarch64_mangle_type (const_tree type
)
8019 /* The AArch64 ABI documents say that "__va_list" has to be
8020 managled as if it is in the "std" namespace. */
8021 if (lang_hooks
.types_compatible_p (CONST_CAST_TREE (type
), va_list_type
))
8022 return "St9__va_list";
8024 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8026 if (TYPE_NAME (type
) != NULL
)
8027 return aarch64_mangle_builtin_type (type
);
8029 /* Use the default mangling. */
8034 /* Return true if the rtx_insn contains a MEM RTX somewhere
8038 has_memory_op (rtx_insn
*mem_insn
)
8040 subrtx_iterator::array_type array
;
8041 FOR_EACH_SUBRTX (iter
, array
, PATTERN (mem_insn
), ALL
)
8048 /* Find the first rtx_insn before insn that will generate an assembly
8052 aarch64_prev_real_insn (rtx_insn
*insn
)
8059 insn
= prev_real_insn (insn
);
8061 while (insn
&& recog_memoized (insn
) < 0);
8067 is_madd_op (enum attr_type t1
)
8070 /* A number of these may be AArch32 only. */
8071 enum attr_type mlatypes
[] = {
8072 TYPE_MLA
, TYPE_MLAS
, TYPE_SMLAD
, TYPE_SMLADX
, TYPE_SMLAL
, TYPE_SMLALD
,
8073 TYPE_SMLALS
, TYPE_SMLALXY
, TYPE_SMLAWX
, TYPE_SMLAWY
, TYPE_SMLAXY
,
8074 TYPE_SMMLA
, TYPE_UMLAL
, TYPE_UMLALS
,TYPE_SMLSD
, TYPE_SMLSDX
, TYPE_SMLSLD
8077 for (i
= 0; i
< sizeof (mlatypes
) / sizeof (enum attr_type
); i
++)
8079 if (t1
== mlatypes
[i
])
8086 /* Check if there is a register dependency between a load and the insn
8087 for which we hold recog_data. */
8090 dep_between_memop_and_curr (rtx memop
)
8095 gcc_assert (GET_CODE (memop
) == SET
);
8097 if (!REG_P (SET_DEST (memop
)))
8100 load_reg
= SET_DEST (memop
);
8101 for (opno
= 1; opno
< recog_data
.n_operands
; opno
++)
8103 rtx operand
= recog_data
.operand
[opno
];
8105 && reg_overlap_mentioned_p (load_reg
, operand
))
8113 /* When working around the Cortex-A53 erratum 835769,
8114 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8115 instruction and has a preceding memory instruction such that a NOP
8116 should be inserted between them. */
8119 aarch64_madd_needs_nop (rtx_insn
* insn
)
8121 enum attr_type attr_type
;
8125 if (!aarch64_fix_a53_err835769
)
8128 if (recog_memoized (insn
) < 0)
8131 attr_type
= get_attr_type (insn
);
8132 if (!is_madd_op (attr_type
))
8135 prev
= aarch64_prev_real_insn (insn
);
8136 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8137 Restore recog state to INSN to avoid state corruption. */
8138 extract_constrain_insn_cached (insn
);
8140 if (!prev
|| !has_memory_op (prev
))
8143 body
= single_set (prev
);
8145 /* If the previous insn is a memory op and there is no dependency between
8146 it and the DImode madd, emit a NOP between them. If body is NULL then we
8147 have a complex memory operation, probably a load/store pair.
8148 Be conservative for now and emit a NOP. */
8149 if (GET_MODE (recog_data
.operand
[0]) == DImode
8150 && (!body
|| !dep_between_memop_and_curr (body
)))
8158 /* Implement FINAL_PRESCAN_INSN. */
8161 aarch64_final_prescan_insn (rtx_insn
*insn
)
8163 if (aarch64_madd_needs_nop (insn
))
8164 fprintf (asm_out_file
, "\tnop // between mem op and mult-accumulate\n");
8168 /* Return the equivalent letter for size. */
8170 sizetochar (int size
)
8174 case 64: return 'd';
8175 case 32: return 's';
8176 case 16: return 'h';
8177 case 8 : return 'b';
8178 default: gcc_unreachable ();
8182 /* Return true iff x is a uniform vector of floating-point
8183 constants, and the constant can be represented in
8184 quarter-precision form. Note, as aarch64_float_const_representable
8185 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8187 aarch64_vect_float_const_representable_p (rtx x
)
8190 REAL_VALUE_TYPE r0
, ri
;
8193 if (GET_MODE_CLASS (GET_MODE (x
)) != MODE_VECTOR_FLOAT
)
8196 x0
= CONST_VECTOR_ELT (x
, 0);
8197 if (!CONST_DOUBLE_P (x0
))
8200 REAL_VALUE_FROM_CONST_DOUBLE (r0
, x0
);
8202 for (i
= 1; i
< CONST_VECTOR_NUNITS (x
); i
++)
8204 xi
= CONST_VECTOR_ELT (x
, i
);
8205 if (!CONST_DOUBLE_P (xi
))
8208 REAL_VALUE_FROM_CONST_DOUBLE (ri
, xi
);
8209 if (!REAL_VALUES_EQUAL (r0
, ri
))
8213 return aarch64_float_const_representable_p (x0
);
8216 /* Return true for valid and false for invalid. */
8218 aarch64_simd_valid_immediate (rtx op
, machine_mode mode
, bool inverse
,
8219 struct simd_immediate_info
*info
)
8221 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8223 for (i = 0; i < idx; i += (STRIDE)) \
8228 immtype = (CLASS); \
8229 elsize = (ELSIZE); \
8235 unsigned int i
, elsize
= 0, idx
= 0, n_elts
= CONST_VECTOR_NUNITS (op
);
8236 unsigned int innersize
= GET_MODE_SIZE (GET_MODE_INNER (mode
));
8237 unsigned char bytes
[16];
8238 int immtype
= -1, matches
;
8239 unsigned int invmask
= inverse
? 0xff : 0;
8242 if (GET_MODE_CLASS (mode
) == MODE_VECTOR_FLOAT
)
8244 if (! (aarch64_simd_imm_zero_p (op
, mode
)
8245 || aarch64_vect_float_const_representable_p (op
)))
8250 info
->value
= CONST_VECTOR_ELT (op
, 0);
8251 info
->element_width
= GET_MODE_BITSIZE (GET_MODE (info
->value
));
8259 /* Splat vector constant out into a byte vector. */
8260 for (i
= 0; i
< n_elts
; i
++)
8262 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8263 it must be laid out in the vector register in reverse order. */
8264 rtx el
= CONST_VECTOR_ELT (op
, BYTES_BIG_ENDIAN
? (n_elts
- 1 - i
) : i
);
8265 unsigned HOST_WIDE_INT elpart
;
8266 unsigned int part
, parts
;
8268 if (CONST_INT_P (el
))
8270 elpart
= INTVAL (el
);
8273 else if (GET_CODE (el
) == CONST_DOUBLE
)
8275 elpart
= CONST_DOUBLE_LOW (el
);
8281 for (part
= 0; part
< parts
; part
++)
8284 for (byte
= 0; byte
< innersize
; byte
++)
8286 bytes
[idx
++] = (elpart
& 0xff) ^ invmask
;
8287 elpart
>>= BITS_PER_UNIT
;
8289 if (GET_CODE (el
) == CONST_DOUBLE
)
8290 elpart
= CONST_DOUBLE_HIGH (el
);
8295 gcc_assert (idx
== GET_MODE_SIZE (mode
));
8299 CHECK (4, 32, 0, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0
8300 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 0, 0);
8302 CHECK (4, 32, 1, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
8303 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
8305 CHECK (4, 32, 2, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
8306 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
8308 CHECK (4, 32, 3, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
8309 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == bytes
[3], 24, 0);
8311 CHECK (2, 16, 4, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0, 0, 0);
8313 CHECK (2, 16, 5, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1], 8, 0);
8315 CHECK (4, 32, 6, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff
8316 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 0, 1);
8318 CHECK (4, 32, 7, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
8319 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
8321 CHECK (4, 32, 8, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
8322 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
8324 CHECK (4, 32, 9, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
8325 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == bytes
[3], 24, 1);
8327 CHECK (2, 16, 10, bytes
[i
] == bytes
[0] && bytes
[i
+ 1] == 0xff, 0, 1);
8329 CHECK (2, 16, 11, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1], 8, 1);
8331 CHECK (4, 32, 12, bytes
[i
] == 0xff && bytes
[i
+ 1] == bytes
[1]
8332 && bytes
[i
+ 2] == 0 && bytes
[i
+ 3] == 0, 8, 0);
8334 CHECK (4, 32, 13, bytes
[i
] == 0 && bytes
[i
+ 1] == bytes
[1]
8335 && bytes
[i
+ 2] == 0xff && bytes
[i
+ 3] == 0xff, 8, 1);
8337 CHECK (4, 32, 14, bytes
[i
] == 0xff && bytes
[i
+ 1] == 0xff
8338 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0, 16, 0);
8340 CHECK (4, 32, 15, bytes
[i
] == 0 && bytes
[i
+ 1] == 0
8341 && bytes
[i
+ 2] == bytes
[2] && bytes
[i
+ 3] == 0xff, 16, 1);
8343 CHECK (1, 8, 16, bytes
[i
] == bytes
[0], 0, 0);
8345 CHECK (1, 64, 17, (bytes
[i
] == 0 || bytes
[i
] == 0xff)
8346 && bytes
[i
] == bytes
[(i
+ 8) % idx
], 0, 0);
8355 info
->element_width
= elsize
;
8356 info
->mvn
= emvn
!= 0;
8357 info
->shift
= eshift
;
8359 unsigned HOST_WIDE_INT imm
= 0;
8361 if (immtype
>= 12 && immtype
<= 15)
8364 /* Un-invert bytes of recognized vector, if necessary. */
8366 for (i
= 0; i
< idx
; i
++)
8367 bytes
[i
] ^= invmask
;
8371 /* FIXME: Broken on 32-bit H_W_I hosts. */
8372 gcc_assert (sizeof (HOST_WIDE_INT
) == 8);
8374 for (i
= 0; i
< 8; i
++)
8375 imm
|= (unsigned HOST_WIDE_INT
) (bytes
[i
] ? 0xff : 0)
8376 << (i
* BITS_PER_UNIT
);
8379 info
->value
= GEN_INT (imm
);
8383 for (i
= 0; i
< elsize
/ BITS_PER_UNIT
; i
++)
8384 imm
|= (unsigned HOST_WIDE_INT
) bytes
[i
] << (i
* BITS_PER_UNIT
);
8386 /* Construct 'abcdefgh' because the assembler cannot handle
8387 generic constants. */
8390 imm
= (imm
>> info
->shift
) & 0xff;
8391 info
->value
= GEN_INT (imm
);
8399 /* Check of immediate shift constants are within range. */
8401 aarch64_simd_shift_imm_p (rtx x
, machine_mode mode
, bool left
)
8403 int bit_width
= GET_MODE_UNIT_SIZE (mode
) * BITS_PER_UNIT
;
8405 return aarch64_const_vec_all_same_in_range_p (x
, 0, bit_width
- 1);
8407 return aarch64_const_vec_all_same_in_range_p (x
, 1, bit_width
);
8410 /* Return true if X is a uniform vector where all elements
8411 are either the floating-point constant 0.0 or the
8412 integer constant 0. */
8414 aarch64_simd_imm_zero_p (rtx x
, machine_mode mode
)
8416 return x
== CONST0_RTX (mode
);
8420 aarch64_simd_imm_scalar_p (rtx x
, machine_mode mode ATTRIBUTE_UNUSED
)
8422 HOST_WIDE_INT imm
= INTVAL (x
);
8425 for (i
= 0; i
< 8; i
++)
8427 unsigned int byte
= imm
& 0xff;
8428 if (byte
!= 0xff && byte
!= 0)
8437 aarch64_mov_operand_p (rtx x
,
8438 enum aarch64_symbol_context context
,
8441 if (GET_CODE (x
) == HIGH
8442 && aarch64_valid_symref (XEXP (x
, 0), GET_MODE (XEXP (x
, 0))))
8445 if (CONST_INT_P (x
))
8448 if (GET_CODE (x
) == SYMBOL_REF
&& mode
== DImode
&& CONSTANT_ADDRESS_P (x
))
8451 return aarch64_classify_symbolic_expression (x
, context
)
8452 == SYMBOL_TINY_ABSOLUTE
;
8455 /* Return a const_int vector of VAL. */
8457 aarch64_simd_gen_const_vector_dup (machine_mode mode
, int val
)
8459 int nunits
= GET_MODE_NUNITS (mode
);
8460 rtvec v
= rtvec_alloc (nunits
);
8463 for (i
=0; i
< nunits
; i
++)
8464 RTVEC_ELT (v
, i
) = GEN_INT (val
);
8466 return gen_rtx_CONST_VECTOR (mode
, v
);
8469 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8472 aarch64_simd_scalar_immediate_valid_for_move (rtx op
, machine_mode mode
)
8476 gcc_assert (!VECTOR_MODE_P (mode
));
8477 vmode
= aarch64_preferred_simd_mode (mode
);
8478 rtx op_v
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (op
));
8479 return aarch64_simd_valid_immediate (op_v
, vmode
, false, NULL
);
8482 /* Construct and return a PARALLEL RTX vector with elements numbering the
8483 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8484 the vector - from the perspective of the architecture. This does not
8485 line up with GCC's perspective on lane numbers, so we end up with
8486 different masks depending on our target endian-ness. The diagram
8487 below may help. We must draw the distinction when building masks
8488 which select one half of the vector. An instruction selecting
8489 architectural low-lanes for a big-endian target, must be described using
8490 a mask selecting GCC high-lanes.
8492 Big-Endian Little-Endian
8495 | x | x | x | x | | x | x | x | x |
8496 Architecture 3 2 1 0 3 2 1 0
8498 Low Mask: { 2, 3 } { 0, 1 }
8499 High Mask: { 0, 1 } { 2, 3 }
8503 aarch64_simd_vect_par_cnst_half (machine_mode mode
, bool high
)
8505 int nunits
= GET_MODE_NUNITS (mode
);
8506 rtvec v
= rtvec_alloc (nunits
/ 2);
8507 int high_base
= nunits
/ 2;
8513 if (BYTES_BIG_ENDIAN
)
8514 base
= high
? low_base
: high_base
;
8516 base
= high
? high_base
: low_base
;
8518 for (i
= 0; i
< nunits
/ 2; i
++)
8519 RTVEC_ELT (v
, i
) = GEN_INT (base
+ i
);
8521 t1
= gen_rtx_PARALLEL (mode
, v
);
8525 /* Check OP for validity as a PARALLEL RTX vector with elements
8526 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8527 from the perspective of the architecture. See the diagram above
8528 aarch64_simd_vect_par_cnst_half for more details. */
8531 aarch64_simd_check_vect_par_cnst_half (rtx op
, machine_mode mode
,
8534 rtx ideal
= aarch64_simd_vect_par_cnst_half (mode
, high
);
8535 HOST_WIDE_INT count_op
= XVECLEN (op
, 0);
8536 HOST_WIDE_INT count_ideal
= XVECLEN (ideal
, 0);
8539 if (!VECTOR_MODE_P (mode
))
8542 if (count_op
!= count_ideal
)
8545 for (i
= 0; i
< count_ideal
; i
++)
8547 rtx elt_op
= XVECEXP (op
, 0, i
);
8548 rtx elt_ideal
= XVECEXP (ideal
, 0, i
);
8550 if (!CONST_INT_P (elt_op
)
8551 || INTVAL (elt_ideal
) != INTVAL (elt_op
))
8557 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8558 HIGH (exclusive). */
8560 aarch64_simd_lane_bounds (rtx operand
, HOST_WIDE_INT low
, HOST_WIDE_INT high
,
8564 gcc_assert (CONST_INT_P (operand
));
8565 lane
= INTVAL (operand
);
8567 if (lane
< low
|| lane
>= high
)
8570 error ("%Klane %ld out of range %ld - %ld", exp
, lane
, low
, high
- 1);
8572 error ("lane %ld out of range %ld - %ld", lane
, low
, high
- 1);
8576 /* Return TRUE if OP is a valid vector addressing mode. */
8578 aarch64_simd_mem_operand_p (rtx op
)
8580 return MEM_P (op
) && (GET_CODE (XEXP (op
, 0)) == POST_INC
8581 || REG_P (XEXP (op
, 0)));
8584 /* Emit a register copy from operand to operand, taking care not to
8585 early-clobber source registers in the process.
8587 COUNT is the number of components into which the copy needs to be
8590 aarch64_simd_emit_reg_reg_move (rtx
*operands
, enum machine_mode mode
,
8594 int rdest
= REGNO (operands
[0]);
8595 int rsrc
= REGNO (operands
[1]);
8597 if (!reg_overlap_mentioned_p (operands
[0], operands
[1])
8599 for (i
= 0; i
< count
; i
++)
8600 emit_move_insn (gen_rtx_REG (mode
, rdest
+ i
),
8601 gen_rtx_REG (mode
, rsrc
+ i
));
8603 for (i
= 0; i
< count
; i
++)
8604 emit_move_insn (gen_rtx_REG (mode
, rdest
+ count
- i
- 1),
8605 gen_rtx_REG (mode
, rsrc
+ count
- i
- 1));
8608 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8609 one of VSTRUCT modes: OI, CI or XI. */
8611 aarch64_simd_attr_length_move (rtx_insn
*insn
)
8615 extract_insn_cached (insn
);
8617 if (REG_P (recog_data
.operand
[0]) && REG_P (recog_data
.operand
[1]))
8619 mode
= GET_MODE (recog_data
.operand
[0]);
8635 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8636 one of VSTRUCT modes: OI, CI, EI, or XI. */
8638 aarch64_simd_attr_length_rglist (enum machine_mode mode
)
8640 return (GET_MODE_SIZE (mode
) / UNITS_PER_VREG
) * 4;
8643 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8644 alignment of a vector to 128 bits. */
8645 static HOST_WIDE_INT
8646 aarch64_simd_vector_alignment (const_tree type
)
8648 HOST_WIDE_INT align
= tree_to_shwi (TYPE_SIZE (type
));
8649 return MIN (align
, 128);
8652 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8654 aarch64_simd_vector_alignment_reachable (const_tree type
, bool is_packed
)
8659 /* We guarantee alignment for vectors up to 128-bits. */
8660 if (tree_int_cst_compare (TYPE_SIZE (type
),
8661 bitsize_int (BIGGEST_ALIGNMENT
)) > 0)
8664 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8668 /* If VALS is a vector constant that can be loaded into a register
8669 using DUP, generate instructions to do so and return an RTX to
8670 assign to the register. Otherwise return NULL_RTX. */
8672 aarch64_simd_dup_constant (rtx vals
)
8674 machine_mode mode
= GET_MODE (vals
);
8675 machine_mode inner_mode
= GET_MODE_INNER (mode
);
8676 int n_elts
= GET_MODE_NUNITS (mode
);
8677 bool all_same
= true;
8681 if (GET_CODE (vals
) != CONST_VECTOR
)
8684 for (i
= 1; i
< n_elts
; ++i
)
8686 x
= CONST_VECTOR_ELT (vals
, i
);
8687 if (!rtx_equal_p (x
, CONST_VECTOR_ELT (vals
, 0)))
8694 /* We can load this constant by using DUP and a constant in a
8695 single ARM register. This will be cheaper than a vector
8697 x
= copy_to_mode_reg (inner_mode
, CONST_VECTOR_ELT (vals
, 0));
8698 return gen_rtx_VEC_DUPLICATE (mode
, x
);
8702 /* Generate code to load VALS, which is a PARALLEL containing only
8703 constants (for vec_init) or CONST_VECTOR, efficiently into a
8704 register. Returns an RTX to copy into the register, or NULL_RTX
8705 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8707 aarch64_simd_make_constant (rtx vals
)
8709 machine_mode mode
= GET_MODE (vals
);
8711 rtx const_vec
= NULL_RTX
;
8712 int n_elts
= GET_MODE_NUNITS (mode
);
8716 if (GET_CODE (vals
) == CONST_VECTOR
)
8718 else if (GET_CODE (vals
) == PARALLEL
)
8720 /* A CONST_VECTOR must contain only CONST_INTs and
8721 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8722 Only store valid constants in a CONST_VECTOR. */
8723 for (i
= 0; i
< n_elts
; ++i
)
8725 rtx x
= XVECEXP (vals
, 0, i
);
8726 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
8729 if (n_const
== n_elts
)
8730 const_vec
= gen_rtx_CONST_VECTOR (mode
, XVEC (vals
, 0));
8735 if (const_vec
!= NULL_RTX
8736 && aarch64_simd_valid_immediate (const_vec
, mode
, false, NULL
))
8737 /* Load using MOVI/MVNI. */
8739 else if ((const_dup
= aarch64_simd_dup_constant (vals
)) != NULL_RTX
)
8740 /* Loaded using DUP. */
8742 else if (const_vec
!= NULL_RTX
)
8743 /* Load from constant pool. We can not take advantage of single-cycle
8744 LD1 because we need a PC-relative addressing mode. */
8747 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8748 We can not construct an initializer. */
8753 aarch64_expand_vector_init (rtx target
, rtx vals
)
8755 machine_mode mode
= GET_MODE (target
);
8756 machine_mode inner_mode
= GET_MODE_INNER (mode
);
8757 int n_elts
= GET_MODE_NUNITS (mode
);
8759 rtx any_const
= NULL_RTX
;
8760 bool all_same
= true;
8762 for (int i
= 0; i
< n_elts
; ++i
)
8764 rtx x
= XVECEXP (vals
, 0, i
);
8765 if (!CONST_INT_P (x
) && !CONST_DOUBLE_P (x
))
8770 if (i
> 0 && !rtx_equal_p (x
, XVECEXP (vals
, 0, 0)))
8776 rtx constant
= aarch64_simd_make_constant (vals
);
8777 if (constant
!= NULL_RTX
)
8779 emit_move_insn (target
, constant
);
8784 /* Splat a single non-constant element if we can. */
8787 rtx x
= copy_to_mode_reg (inner_mode
, XVECEXP (vals
, 0, 0));
8788 aarch64_emit_move (target
, gen_rtx_VEC_DUPLICATE (mode
, x
));
8792 /* Half the fields (or less) are non-constant. Load constant then overwrite
8793 varying fields. Hope that this is more efficient than using the stack. */
8794 if (n_var
<= n_elts
/2)
8796 rtx copy
= copy_rtx (vals
);
8798 /* Load constant part of vector. We really don't care what goes into the
8799 parts we will overwrite, but we're more likely to be able to load the
8800 constant efficiently if it has fewer, larger, repeating parts
8801 (see aarch64_simd_valid_immediate). */
8802 for (int i
= 0; i
< n_elts
; i
++)
8804 rtx x
= XVECEXP (vals
, 0, i
);
8805 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
8807 rtx subst
= any_const
;
8808 for (int bit
= n_elts
/ 2; bit
> 0; bit
/= 2)
8810 /* Look in the copied vector, as more elements are const. */
8811 rtx test
= XVECEXP (copy
, 0, i
^ bit
);
8812 if (CONST_INT_P (test
) || CONST_DOUBLE_P (test
))
8818 XVECEXP (copy
, 0, i
) = subst
;
8820 aarch64_expand_vector_init (target
, copy
);
8822 /* Insert variables. */
8823 enum insn_code icode
= optab_handler (vec_set_optab
, mode
);
8824 gcc_assert (icode
!= CODE_FOR_nothing
);
8826 for (int i
= 0; i
< n_elts
; i
++)
8828 rtx x
= XVECEXP (vals
, 0, i
);
8829 if (CONST_INT_P (x
) || CONST_DOUBLE_P (x
))
8831 x
= copy_to_mode_reg (inner_mode
, x
);
8832 emit_insn (GEN_FCN (icode
) (target
, x
, GEN_INT (i
)));
8837 /* Construct the vector in memory one field at a time
8838 and load the whole vector. */
8839 rtx mem
= assign_stack_temp (mode
, GET_MODE_SIZE (mode
));
8840 for (int i
= 0; i
< n_elts
; i
++)
8841 emit_move_insn (adjust_address_nv (mem
, inner_mode
,
8842 i
* GET_MODE_SIZE (inner_mode
)),
8843 XVECEXP (vals
, 0, i
));
8844 emit_move_insn (target
, mem
);
8848 static unsigned HOST_WIDE_INT
8849 aarch64_shift_truncation_mask (machine_mode mode
)
8852 (aarch64_vector_mode_supported_p (mode
)
8853 || aarch64_vect_struct_mode_p (mode
)) ? 0 : (GET_MODE_BITSIZE (mode
) - 1);
8856 #ifndef TLS_SECTION_ASM_FLAG
8857 #define TLS_SECTION_ASM_FLAG 'T'
8861 aarch64_elf_asm_named_section (const char *name
, unsigned int flags
,
8862 tree decl ATTRIBUTE_UNUSED
)
8864 char flagchars
[10], *f
= flagchars
;
8866 /* If we have already declared this section, we can use an
8867 abbreviated form to switch back to it -- unless this section is
8868 part of a COMDAT groups, in which case GAS requires the full
8869 declaration every time. */
8870 if (!(HAVE_COMDAT_GROUP
&& (flags
& SECTION_LINKONCE
))
8871 && (flags
& SECTION_DECLARED
))
8873 fprintf (asm_out_file
, "\t.section\t%s\n", name
);
8877 if (!(flags
& SECTION_DEBUG
))
8879 if (flags
& SECTION_WRITE
)
8881 if (flags
& SECTION_CODE
)
8883 if (flags
& SECTION_SMALL
)
8885 if (flags
& SECTION_MERGE
)
8887 if (flags
& SECTION_STRINGS
)
8889 if (flags
& SECTION_TLS
)
8890 *f
++ = TLS_SECTION_ASM_FLAG
;
8891 if (HAVE_COMDAT_GROUP
&& (flags
& SECTION_LINKONCE
))
8895 fprintf (asm_out_file
, "\t.section\t%s,\"%s\"", name
, flagchars
);
8897 if (!(flags
& SECTION_NOTYPE
))
8902 if (flags
& SECTION_BSS
)
8907 #ifdef TYPE_OPERAND_FMT
8908 format
= "," TYPE_OPERAND_FMT
;
8913 fprintf (asm_out_file
, format
, type
);
8915 if (flags
& SECTION_ENTSIZE
)
8916 fprintf (asm_out_file
, ",%d", flags
& SECTION_ENTSIZE
);
8917 if (HAVE_COMDAT_GROUP
&& (flags
& SECTION_LINKONCE
))
8919 if (TREE_CODE (decl
) == IDENTIFIER_NODE
)
8920 fprintf (asm_out_file
, ",%s,comdat", IDENTIFIER_POINTER (decl
));
8922 fprintf (asm_out_file
, ",%s,comdat",
8923 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl
)));
8927 putc ('\n', asm_out_file
);
8930 /* Select a format to encode pointers in exception handling data. */
8932 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED
, int global
)
8935 switch (aarch64_cmodel
)
8937 case AARCH64_CMODEL_TINY
:
8938 case AARCH64_CMODEL_TINY_PIC
:
8939 case AARCH64_CMODEL_SMALL
:
8940 case AARCH64_CMODEL_SMALL_PIC
:
8941 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
8943 type
= DW_EH_PE_sdata4
;
8946 /* No assumptions here. 8-byte relocs required. */
8947 type
= DW_EH_PE_sdata8
;
8950 return (global
? DW_EH_PE_indirect
: 0) | DW_EH_PE_pcrel
| type
;
8953 /* Emit load exclusive. */
8956 aarch64_emit_load_exclusive (machine_mode mode
, rtx rval
,
8957 rtx mem
, rtx model_rtx
)
8959 rtx (*gen
) (rtx
, rtx
, rtx
);
8963 case QImode
: gen
= gen_aarch64_load_exclusiveqi
; break;
8964 case HImode
: gen
= gen_aarch64_load_exclusivehi
; break;
8965 case SImode
: gen
= gen_aarch64_load_exclusivesi
; break;
8966 case DImode
: gen
= gen_aarch64_load_exclusivedi
; break;
8971 emit_insn (gen (rval
, mem
, model_rtx
));
8974 /* Emit store exclusive. */
8977 aarch64_emit_store_exclusive (machine_mode mode
, rtx bval
,
8978 rtx rval
, rtx mem
, rtx model_rtx
)
8980 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
8984 case QImode
: gen
= gen_aarch64_store_exclusiveqi
; break;
8985 case HImode
: gen
= gen_aarch64_store_exclusivehi
; break;
8986 case SImode
: gen
= gen_aarch64_store_exclusivesi
; break;
8987 case DImode
: gen
= gen_aarch64_store_exclusivedi
; break;
8992 emit_insn (gen (bval
, rval
, mem
, model_rtx
));
8995 /* Mark the previous jump instruction as unlikely. */
8998 aarch64_emit_unlikely_jump (rtx insn
)
9000 int very_unlikely
= REG_BR_PROB_BASE
/ 100 - 1;
9002 insn
= emit_jump_insn (insn
);
9003 add_int_reg_note (insn
, REG_BR_PROB
, very_unlikely
);
9006 /* Expand a compare and swap pattern. */
9009 aarch64_expand_compare_and_swap (rtx operands
[])
9011 rtx bval
, rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
, x
;
9012 machine_mode mode
, cmp_mode
;
9013 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
, rtx
, rtx
, rtx
);
9018 oldval
= operands
[3];
9019 newval
= operands
[4];
9020 is_weak
= operands
[5];
9021 mod_s
= operands
[6];
9022 mod_f
= operands
[7];
9023 mode
= GET_MODE (mem
);
9026 /* Normally the succ memory model must be stronger than fail, but in the
9027 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9028 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9030 if (INTVAL (mod_f
) == MEMMODEL_ACQUIRE
9031 && INTVAL (mod_s
) == MEMMODEL_RELEASE
)
9032 mod_s
= GEN_INT (MEMMODEL_ACQ_REL
);
9038 /* For short modes, we're going to perform the comparison in SImode,
9039 so do the zero-extension now. */
9041 rval
= gen_reg_rtx (SImode
);
9042 oldval
= convert_modes (SImode
, mode
, oldval
, true);
9047 /* Force the value into a register if needed. */
9048 if (!aarch64_plus_operand (oldval
, mode
))
9049 oldval
= force_reg (cmp_mode
, oldval
);
9058 case QImode
: gen
= gen_atomic_compare_and_swapqi_1
; break;
9059 case HImode
: gen
= gen_atomic_compare_and_swaphi_1
; break;
9060 case SImode
: gen
= gen_atomic_compare_and_swapsi_1
; break;
9061 case DImode
: gen
= gen_atomic_compare_and_swapdi_1
; break;
9066 emit_insn (gen (rval
, mem
, oldval
, newval
, is_weak
, mod_s
, mod_f
));
9068 if (mode
== QImode
|| mode
== HImode
)
9069 emit_move_insn (operands
[1], gen_lowpart (mode
, rval
));
9071 x
= gen_rtx_REG (CCmode
, CC_REGNUM
);
9072 x
= gen_rtx_EQ (SImode
, x
, const0_rtx
);
9073 emit_insn (gen_rtx_SET (VOIDmode
, bval
, x
));
9076 /* Split a compare and swap pattern. */
9079 aarch64_split_compare_and_swap (rtx operands
[])
9081 rtx rval
, mem
, oldval
, newval
, scratch
;
9084 rtx_code_label
*label1
, *label2
;
9089 oldval
= operands
[2];
9090 newval
= operands
[3];
9091 is_weak
= (operands
[4] != const0_rtx
);
9092 scratch
= operands
[7];
9093 mode
= GET_MODE (mem
);
9098 label1
= gen_label_rtx ();
9099 emit_label (label1
);
9101 label2
= gen_label_rtx ();
9103 aarch64_emit_load_exclusive (mode
, rval
, mem
, operands
[5]);
9105 cond
= aarch64_gen_compare_reg (NE
, rval
, oldval
);
9106 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
9107 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
9108 gen_rtx_LABEL_REF (Pmode
, label2
), pc_rtx
);
9109 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode
, pc_rtx
, x
));
9111 aarch64_emit_store_exclusive (mode
, scratch
, mem
, newval
, operands
[5]);
9115 x
= gen_rtx_NE (VOIDmode
, scratch
, const0_rtx
);
9116 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
9117 gen_rtx_LABEL_REF (Pmode
, label1
), pc_rtx
);
9118 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode
, pc_rtx
, x
));
9122 cond
= gen_rtx_REG (CCmode
, CC_REGNUM
);
9123 x
= gen_rtx_COMPARE (CCmode
, scratch
, const0_rtx
);
9124 emit_insn (gen_rtx_SET (VOIDmode
, cond
, x
));
9127 emit_label (label2
);
9130 /* Split an atomic operation. */
9133 aarch64_split_atomic_op (enum rtx_code code
, rtx old_out
, rtx new_out
, rtx mem
,
9134 rtx value
, rtx model_rtx
, rtx cond
)
9136 machine_mode mode
= GET_MODE (mem
);
9137 machine_mode wmode
= (mode
== DImode
? DImode
: SImode
);
9138 rtx_code_label
*label
;
9141 label
= gen_label_rtx ();
9145 new_out
= gen_lowpart (wmode
, new_out
);
9147 old_out
= gen_lowpart (wmode
, old_out
);
9150 value
= simplify_gen_subreg (wmode
, value
, mode
, 0);
9152 aarch64_emit_load_exclusive (mode
, old_out
, mem
, model_rtx
);
9161 x
= gen_rtx_AND (wmode
, old_out
, value
);
9162 emit_insn (gen_rtx_SET (VOIDmode
, new_out
, x
));
9163 x
= gen_rtx_NOT (wmode
, new_out
);
9164 emit_insn (gen_rtx_SET (VOIDmode
, new_out
, x
));
9168 if (CONST_INT_P (value
))
9170 value
= GEN_INT (-INTVAL (value
));
9176 x
= gen_rtx_fmt_ee (code
, wmode
, old_out
, value
);
9177 emit_insn (gen_rtx_SET (VOIDmode
, new_out
, x
));
9181 aarch64_emit_store_exclusive (mode
, cond
, mem
,
9182 gen_lowpart (mode
, new_out
), model_rtx
);
9184 x
= gen_rtx_NE (VOIDmode
, cond
, const0_rtx
);
9185 x
= gen_rtx_IF_THEN_ELSE (VOIDmode
, x
,
9186 gen_rtx_LABEL_REF (Pmode
, label
), pc_rtx
);
9187 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode
, pc_rtx
, x
));
9191 aarch64_print_extension (void)
9193 const struct aarch64_option_extension
*opt
= NULL
;
9195 for (opt
= all_extensions
; opt
->name
!= NULL
; opt
++)
9196 if ((aarch64_isa_flags
& opt
->flags_on
) == opt
->flags_on
)
9197 asm_fprintf (asm_out_file
, "+%s", opt
->name
);
9199 asm_fprintf (asm_out_file
, "\n");
9203 aarch64_start_file (void)
9207 asm_fprintf (asm_out_file
, "\t.arch %s", selected_arch
->name
);
9208 aarch64_print_extension ();
9210 else if (selected_cpu
)
9212 const char *truncated_name
9213 = aarch64_rewrite_selected_cpu (selected_cpu
->name
);
9214 asm_fprintf (asm_out_file
, "\t.cpu %s", truncated_name
);
9215 aarch64_print_extension ();
9217 default_file_start();
9220 /* Target hook for c_mode_for_suffix. */
9222 aarch64_c_mode_for_suffix (char suffix
)
9230 /* We can only represent floating point constants which will fit in
9231 "quarter-precision" values. These values are characterised by
9232 a sign bit, a 4-bit mantissa and a 3-bit exponent. And are given
9235 (-1)^s * (n/16) * 2^r
9238 's' is the sign bit.
9239 'n' is an integer in the range 16 <= n <= 31.
9240 'r' is an integer in the range -3 <= r <= 4. */
9242 /* Return true iff X can be represented by a quarter-precision
9243 floating point immediate operand X. Note, we cannot represent 0.0. */
9245 aarch64_float_const_representable_p (rtx x
)
9247 /* This represents our current view of how many bits
9248 make up the mantissa. */
9249 int point_pos
= 2 * HOST_BITS_PER_WIDE_INT
- 1;
9251 unsigned HOST_WIDE_INT mantissa
, mask
;
9252 REAL_VALUE_TYPE r
, m
;
9255 if (!CONST_DOUBLE_P (x
))
9258 if (GET_MODE (x
) == VOIDmode
)
9261 REAL_VALUE_FROM_CONST_DOUBLE (r
, x
);
9263 /* We cannot represent infinities, NaNs or +/-zero. We won't
9264 know if we have +zero until we analyse the mantissa, but we
9265 can reject the other invalid values. */
9266 if (REAL_VALUE_ISINF (r
) || REAL_VALUE_ISNAN (r
)
9267 || REAL_VALUE_MINUS_ZERO (r
))
9270 /* Extract exponent. */
9271 r
= real_value_abs (&r
);
9272 exponent
= REAL_EXP (&r
);
9274 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9275 highest (sign) bit, with a fixed binary point at bit point_pos.
9276 m1 holds the low part of the mantissa, m2 the high part.
9277 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9278 bits for the mantissa, this can fail (low bits will be lost). */
9279 real_ldexp (&m
, &r
, point_pos
- exponent
);
9280 wide_int w
= real_to_integer (&m
, &fail
, HOST_BITS_PER_WIDE_INT
* 2);
9282 /* If the low part of the mantissa has bits set we cannot represent
9286 /* We have rejected the lower HOST_WIDE_INT, so update our
9287 understanding of how many bits lie in the mantissa and
9288 look only at the high HOST_WIDE_INT. */
9289 mantissa
= w
.elt (1);
9290 point_pos
-= HOST_BITS_PER_WIDE_INT
;
9292 /* We can only represent values with a mantissa of the form 1.xxxx. */
9293 mask
= ((unsigned HOST_WIDE_INT
)1 << (point_pos
- 5)) - 1;
9294 if ((mantissa
& mask
) != 0)
9297 /* Having filtered unrepresentable values, we may now remove all
9298 but the highest 5 bits. */
9299 mantissa
>>= point_pos
- 5;
9301 /* We cannot represent the value 0.0, so reject it. This is handled
9306 /* Then, as bit 4 is always set, we can mask it off, leaving
9307 the mantissa in the range [0, 15]. */
9308 mantissa
&= ~(1 << 4);
9309 gcc_assert (mantissa
<= 15);
9311 /* GCC internally does not use IEEE754-like encoding (where normalized
9312 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9313 Our mantissa values are shifted 4 places to the left relative to
9314 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9315 by 5 places to correct for GCC's representation. */
9316 exponent
= 5 - exponent
;
9318 return (exponent
>= 0 && exponent
<= 7);
9322 aarch64_output_simd_mov_immediate (rtx const_vector
,
9327 static char templ
[40];
9328 const char *mnemonic
;
9329 const char *shift_op
;
9330 unsigned int lane_count
= 0;
9333 struct simd_immediate_info info
= { NULL_RTX
, 0, 0, false, false };
9335 /* This will return true to show const_vector is legal for use as either
9336 a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9337 also update INFO to show how the immediate should be generated. */
9338 is_valid
= aarch64_simd_valid_immediate (const_vector
, mode
, false, &info
);
9339 gcc_assert (is_valid
);
9341 element_char
= sizetochar (info
.element_width
);
9342 lane_count
= width
/ info
.element_width
;
9344 mode
= GET_MODE_INNER (mode
);
9345 if (mode
== SFmode
|| mode
== DFmode
)
9347 gcc_assert (info
.shift
== 0 && ! info
.mvn
);
9348 if (aarch64_float_const_zero_rtx_p (info
.value
))
9349 info
.value
= GEN_INT (0);
9354 REAL_VALUE_FROM_CONST_DOUBLE (r
, info
.value
);
9355 char float_buf
[buf_size
] = {'\0'};
9356 real_to_decimal_for_mode (float_buf
, &r
, buf_size
, buf_size
, 1, mode
);
9359 if (lane_count
== 1)
9360 snprintf (templ
, sizeof (templ
), "fmov\t%%d0, %s", float_buf
);
9362 snprintf (templ
, sizeof (templ
), "fmov\t%%0.%d%c, %s",
9363 lane_count
, element_char
, float_buf
);
9368 mnemonic
= info
.mvn
? "mvni" : "movi";
9369 shift_op
= info
.msl
? "msl" : "lsl";
9371 if (lane_count
== 1)
9372 snprintf (templ
, sizeof (templ
), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX
,
9373 mnemonic
, UINTVAL (info
.value
));
9374 else if (info
.shift
)
9375 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9376 ", %s %d", mnemonic
, lane_count
, element_char
,
9377 UINTVAL (info
.value
), shift_op
, info
.shift
);
9379 snprintf (templ
, sizeof (templ
), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
,
9380 mnemonic
, lane_count
, element_char
, UINTVAL (info
.value
));
9385 aarch64_output_scalar_simd_mov_immediate (rtx immediate
,
9390 gcc_assert (!VECTOR_MODE_P (mode
));
9391 vmode
= aarch64_simd_container_mode (mode
, 64);
9392 rtx v_op
= aarch64_simd_gen_const_vector_dup (vmode
, INTVAL (immediate
));
9393 return aarch64_output_simd_mov_immediate (v_op
, vmode
, 64);
9396 /* Split operands into moves from op[1] + op[2] into op[0]. */
9399 aarch64_split_combinev16qi (rtx operands
[3])
9401 unsigned int dest
= REGNO (operands
[0]);
9402 unsigned int src1
= REGNO (operands
[1]);
9403 unsigned int src2
= REGNO (operands
[2]);
9404 machine_mode halfmode
= GET_MODE (operands
[1]);
9405 unsigned int halfregs
= HARD_REGNO_NREGS (src1
, halfmode
);
9408 gcc_assert (halfmode
== V16QImode
);
9410 if (src1
== dest
&& src2
== dest
+ halfregs
)
9412 /* No-op move. Can't split to nothing; emit something. */
9413 emit_note (NOTE_INSN_DELETED
);
9417 /* Preserve register attributes for variable tracking. */
9418 destlo
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
, 0);
9419 desthi
= gen_rtx_REG_offset (operands
[0], halfmode
, dest
+ halfregs
,
9420 GET_MODE_SIZE (halfmode
));
9422 /* Special case of reversed high/low parts. */
9423 if (reg_overlap_mentioned_p (operands
[2], destlo
)
9424 && reg_overlap_mentioned_p (operands
[1], desthi
))
9426 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
9427 emit_insn (gen_xorv16qi3 (operands
[2], operands
[1], operands
[2]));
9428 emit_insn (gen_xorv16qi3 (operands
[1], operands
[1], operands
[2]));
9430 else if (!reg_overlap_mentioned_p (operands
[2], destlo
))
9432 /* Try to avoid unnecessary moves if part of the result
9433 is in the right place already. */
9435 emit_move_insn (destlo
, operands
[1]);
9436 if (src2
!= dest
+ halfregs
)
9437 emit_move_insn (desthi
, operands
[2]);
9441 if (src2
!= dest
+ halfregs
)
9442 emit_move_insn (desthi
, operands
[2]);
9444 emit_move_insn (destlo
, operands
[1]);
9448 /* vec_perm support. */
9450 #define MAX_VECT_LEN 16
9452 struct expand_vec_perm_d
9454 rtx target
, op0
, op1
;
9455 unsigned char perm
[MAX_VECT_LEN
];
9462 /* Generate a variable permutation. */
9465 aarch64_expand_vec_perm_1 (rtx target
, rtx op0
, rtx op1
, rtx sel
)
9467 machine_mode vmode
= GET_MODE (target
);
9468 bool one_vector_p
= rtx_equal_p (op0
, op1
);
9470 gcc_checking_assert (vmode
== V8QImode
|| vmode
== V16QImode
);
9471 gcc_checking_assert (GET_MODE (op0
) == vmode
);
9472 gcc_checking_assert (GET_MODE (op1
) == vmode
);
9473 gcc_checking_assert (GET_MODE (sel
) == vmode
);
9474 gcc_checking_assert (TARGET_SIMD
);
9478 if (vmode
== V8QImode
)
9480 /* Expand the argument to a V16QI mode by duplicating it. */
9481 rtx pair
= gen_reg_rtx (V16QImode
);
9482 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op0
));
9483 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
9487 emit_insn (gen_aarch64_tbl1v16qi (target
, op0
, sel
));
9494 if (vmode
== V8QImode
)
9496 pair
= gen_reg_rtx (V16QImode
);
9497 emit_insn (gen_aarch64_combinev8qi (pair
, op0
, op1
));
9498 emit_insn (gen_aarch64_tbl1v8qi (target
, pair
, sel
));
9502 pair
= gen_reg_rtx (OImode
);
9503 emit_insn (gen_aarch64_combinev16qi (pair
, op0
, op1
));
9504 emit_insn (gen_aarch64_tbl2v16qi (target
, pair
, sel
));
9510 aarch64_expand_vec_perm (rtx target
, rtx op0
, rtx op1
, rtx sel
)
9512 machine_mode vmode
= GET_MODE (target
);
9513 unsigned int nelt
= GET_MODE_NUNITS (vmode
);
9514 bool one_vector_p
= rtx_equal_p (op0
, op1
);
9517 /* The TBL instruction does not use a modulo index, so we must take care
9518 of that ourselves. */
9519 mask
= aarch64_simd_gen_const_vector_dup (vmode
,
9520 one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9521 sel
= expand_simple_binop (vmode
, AND
, sel
, mask
, NULL
, 0, OPTAB_LIB_WIDEN
);
9523 /* For big-endian, we also need to reverse the index within the vector
9524 (but not which vector). */
9525 if (BYTES_BIG_ENDIAN
)
9527 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9529 mask
= aarch64_simd_gen_const_vector_dup (vmode
, nelt
- 1);
9530 sel
= expand_simple_binop (vmode
, XOR
, sel
, mask
,
9531 NULL
, 0, OPTAB_LIB_WIDEN
);
9533 aarch64_expand_vec_perm_1 (target
, op0
, op1
, sel
);
9536 /* Recognize patterns suitable for the TRN instructions. */
9538 aarch64_evpc_trn (struct expand_vec_perm_d
*d
)
9540 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
9541 rtx out
, in0
, in1
, x
;
9542 rtx (*gen
) (rtx
, rtx
, rtx
);
9543 machine_mode vmode
= d
->vmode
;
9545 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
9548 /* Note that these are little-endian tests.
9549 We correct for big-endian later. */
9550 if (d
->perm
[0] == 0)
9552 else if (d
->perm
[0] == 1)
9556 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9558 for (i
= 0; i
< nelt
; i
+= 2)
9560 if (d
->perm
[i
] != i
+ odd
)
9562 if (d
->perm
[i
+ 1] != ((i
+ nelt
+ odd
) & mask
))
9572 if (BYTES_BIG_ENDIAN
)
9574 x
= in0
, in0
= in1
, in1
= x
;
9583 case V16QImode
: gen
= gen_aarch64_trn2v16qi
; break;
9584 case V8QImode
: gen
= gen_aarch64_trn2v8qi
; break;
9585 case V8HImode
: gen
= gen_aarch64_trn2v8hi
; break;
9586 case V4HImode
: gen
= gen_aarch64_trn2v4hi
; break;
9587 case V4SImode
: gen
= gen_aarch64_trn2v4si
; break;
9588 case V2SImode
: gen
= gen_aarch64_trn2v2si
; break;
9589 case V2DImode
: gen
= gen_aarch64_trn2v2di
; break;
9590 case V4SFmode
: gen
= gen_aarch64_trn2v4sf
; break;
9591 case V2SFmode
: gen
= gen_aarch64_trn2v2sf
; break;
9592 case V2DFmode
: gen
= gen_aarch64_trn2v2df
; break;
9601 case V16QImode
: gen
= gen_aarch64_trn1v16qi
; break;
9602 case V8QImode
: gen
= gen_aarch64_trn1v8qi
; break;
9603 case V8HImode
: gen
= gen_aarch64_trn1v8hi
; break;
9604 case V4HImode
: gen
= gen_aarch64_trn1v4hi
; break;
9605 case V4SImode
: gen
= gen_aarch64_trn1v4si
; break;
9606 case V2SImode
: gen
= gen_aarch64_trn1v2si
; break;
9607 case V2DImode
: gen
= gen_aarch64_trn1v2di
; break;
9608 case V4SFmode
: gen
= gen_aarch64_trn1v4sf
; break;
9609 case V2SFmode
: gen
= gen_aarch64_trn1v2sf
; break;
9610 case V2DFmode
: gen
= gen_aarch64_trn1v2df
; break;
9616 emit_insn (gen (out
, in0
, in1
));
9620 /* Recognize patterns suitable for the UZP instructions. */
9622 aarch64_evpc_uzp (struct expand_vec_perm_d
*d
)
9624 unsigned int i
, odd
, mask
, nelt
= d
->nelt
;
9625 rtx out
, in0
, in1
, x
;
9626 rtx (*gen
) (rtx
, rtx
, rtx
);
9627 machine_mode vmode
= d
->vmode
;
9629 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
9632 /* Note that these are little-endian tests.
9633 We correct for big-endian later. */
9634 if (d
->perm
[0] == 0)
9636 else if (d
->perm
[0] == 1)
9640 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9642 for (i
= 0; i
< nelt
; i
++)
9644 unsigned elt
= (i
* 2 + odd
) & mask
;
9645 if (d
->perm
[i
] != elt
)
9655 if (BYTES_BIG_ENDIAN
)
9657 x
= in0
, in0
= in1
, in1
= x
;
9666 case V16QImode
: gen
= gen_aarch64_uzp2v16qi
; break;
9667 case V8QImode
: gen
= gen_aarch64_uzp2v8qi
; break;
9668 case V8HImode
: gen
= gen_aarch64_uzp2v8hi
; break;
9669 case V4HImode
: gen
= gen_aarch64_uzp2v4hi
; break;
9670 case V4SImode
: gen
= gen_aarch64_uzp2v4si
; break;
9671 case V2SImode
: gen
= gen_aarch64_uzp2v2si
; break;
9672 case V2DImode
: gen
= gen_aarch64_uzp2v2di
; break;
9673 case V4SFmode
: gen
= gen_aarch64_uzp2v4sf
; break;
9674 case V2SFmode
: gen
= gen_aarch64_uzp2v2sf
; break;
9675 case V2DFmode
: gen
= gen_aarch64_uzp2v2df
; break;
9684 case V16QImode
: gen
= gen_aarch64_uzp1v16qi
; break;
9685 case V8QImode
: gen
= gen_aarch64_uzp1v8qi
; break;
9686 case V8HImode
: gen
= gen_aarch64_uzp1v8hi
; break;
9687 case V4HImode
: gen
= gen_aarch64_uzp1v4hi
; break;
9688 case V4SImode
: gen
= gen_aarch64_uzp1v4si
; break;
9689 case V2SImode
: gen
= gen_aarch64_uzp1v2si
; break;
9690 case V2DImode
: gen
= gen_aarch64_uzp1v2di
; break;
9691 case V4SFmode
: gen
= gen_aarch64_uzp1v4sf
; break;
9692 case V2SFmode
: gen
= gen_aarch64_uzp1v2sf
; break;
9693 case V2DFmode
: gen
= gen_aarch64_uzp1v2df
; break;
9699 emit_insn (gen (out
, in0
, in1
));
9703 /* Recognize patterns suitable for the ZIP instructions. */
9705 aarch64_evpc_zip (struct expand_vec_perm_d
*d
)
9707 unsigned int i
, high
, mask
, nelt
= d
->nelt
;
9708 rtx out
, in0
, in1
, x
;
9709 rtx (*gen
) (rtx
, rtx
, rtx
);
9710 machine_mode vmode
= d
->vmode
;
9712 if (GET_MODE_UNIT_SIZE (vmode
) > 8)
9715 /* Note that these are little-endian tests.
9716 We correct for big-endian later. */
9718 if (d
->perm
[0] == high
)
9721 else if (d
->perm
[0] == 0)
9725 mask
= (d
->one_vector_p
? nelt
- 1 : 2 * nelt
- 1);
9727 for (i
= 0; i
< nelt
/ 2; i
++)
9729 unsigned elt
= (i
+ high
) & mask
;
9730 if (d
->perm
[i
* 2] != elt
)
9732 elt
= (elt
+ nelt
) & mask
;
9733 if (d
->perm
[i
* 2 + 1] != elt
)
9743 if (BYTES_BIG_ENDIAN
)
9745 x
= in0
, in0
= in1
, in1
= x
;
9754 case V16QImode
: gen
= gen_aarch64_zip2v16qi
; break;
9755 case V8QImode
: gen
= gen_aarch64_zip2v8qi
; break;
9756 case V8HImode
: gen
= gen_aarch64_zip2v8hi
; break;
9757 case V4HImode
: gen
= gen_aarch64_zip2v4hi
; break;
9758 case V4SImode
: gen
= gen_aarch64_zip2v4si
; break;
9759 case V2SImode
: gen
= gen_aarch64_zip2v2si
; break;
9760 case V2DImode
: gen
= gen_aarch64_zip2v2di
; break;
9761 case V4SFmode
: gen
= gen_aarch64_zip2v4sf
; break;
9762 case V2SFmode
: gen
= gen_aarch64_zip2v2sf
; break;
9763 case V2DFmode
: gen
= gen_aarch64_zip2v2df
; break;
9772 case V16QImode
: gen
= gen_aarch64_zip1v16qi
; break;
9773 case V8QImode
: gen
= gen_aarch64_zip1v8qi
; break;
9774 case V8HImode
: gen
= gen_aarch64_zip1v8hi
; break;
9775 case V4HImode
: gen
= gen_aarch64_zip1v4hi
; break;
9776 case V4SImode
: gen
= gen_aarch64_zip1v4si
; break;
9777 case V2SImode
: gen
= gen_aarch64_zip1v2si
; break;
9778 case V2DImode
: gen
= gen_aarch64_zip1v2di
; break;
9779 case V4SFmode
: gen
= gen_aarch64_zip1v4sf
; break;
9780 case V2SFmode
: gen
= gen_aarch64_zip1v2sf
; break;
9781 case V2DFmode
: gen
= gen_aarch64_zip1v2df
; break;
9787 emit_insn (gen (out
, in0
, in1
));
9791 /* Recognize patterns for the EXT insn. */
9794 aarch64_evpc_ext (struct expand_vec_perm_d
*d
)
9796 unsigned int i
, nelt
= d
->nelt
;
9797 rtx (*gen
) (rtx
, rtx
, rtx
, rtx
);
9800 unsigned int location
= d
->perm
[0]; /* Always < nelt. */
9802 /* Check if the extracted indices are increasing by one. */
9803 for (i
= 1; i
< nelt
; i
++)
9805 unsigned int required
= location
+ i
;
9806 if (d
->one_vector_p
)
9808 /* We'll pass the same vector in twice, so allow indices to wrap. */
9809 required
&= (nelt
- 1);
9811 if (d
->perm
[i
] != required
)
9817 case V16QImode
: gen
= gen_aarch64_extv16qi
; break;
9818 case V8QImode
: gen
= gen_aarch64_extv8qi
; break;
9819 case V4HImode
: gen
= gen_aarch64_extv4hi
; break;
9820 case V8HImode
: gen
= gen_aarch64_extv8hi
; break;
9821 case V2SImode
: gen
= gen_aarch64_extv2si
; break;
9822 case V4SImode
: gen
= gen_aarch64_extv4si
; break;
9823 case V2SFmode
: gen
= gen_aarch64_extv2sf
; break;
9824 case V4SFmode
: gen
= gen_aarch64_extv4sf
; break;
9825 case V2DImode
: gen
= gen_aarch64_extv2di
; break;
9826 case V2DFmode
: gen
= gen_aarch64_extv2df
; break;
9835 /* The case where (location == 0) is a no-op for both big- and little-endian,
9836 and is removed by the mid-end at optimization levels -O1 and higher. */
9838 if (BYTES_BIG_ENDIAN
&& (location
!= 0))
9840 /* After setup, we want the high elements of the first vector (stored
9841 at the LSB end of the register), and the low elements of the second
9842 vector (stored at the MSB end of the register). So swap. */
9843 std::swap (d
->op0
, d
->op1
);
9844 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9845 location
= nelt
- location
;
9848 offset
= GEN_INT (location
);
9849 emit_insn (gen (d
->target
, d
->op0
, d
->op1
, offset
));
9853 /* Recognize patterns for the REV insns. */
9856 aarch64_evpc_rev (struct expand_vec_perm_d
*d
)
9858 unsigned int i
, j
, diff
, nelt
= d
->nelt
;
9859 rtx (*gen
) (rtx
, rtx
);
9861 if (!d
->one_vector_p
)
9870 case V16QImode
: gen
= gen_aarch64_rev64v16qi
; break;
9871 case V8QImode
: gen
= gen_aarch64_rev64v8qi
; break;
9879 case V16QImode
: gen
= gen_aarch64_rev32v16qi
; break;
9880 case V8QImode
: gen
= gen_aarch64_rev32v8qi
; break;
9881 case V8HImode
: gen
= gen_aarch64_rev64v8hi
; break;
9882 case V4HImode
: gen
= gen_aarch64_rev64v4hi
; break;
9890 case V16QImode
: gen
= gen_aarch64_rev16v16qi
; break;
9891 case V8QImode
: gen
= gen_aarch64_rev16v8qi
; break;
9892 case V8HImode
: gen
= gen_aarch64_rev32v8hi
; break;
9893 case V4HImode
: gen
= gen_aarch64_rev32v4hi
; break;
9894 case V4SImode
: gen
= gen_aarch64_rev64v4si
; break;
9895 case V2SImode
: gen
= gen_aarch64_rev64v2si
; break;
9896 case V4SFmode
: gen
= gen_aarch64_rev64v4sf
; break;
9897 case V2SFmode
: gen
= gen_aarch64_rev64v2sf
; break;
9906 for (i
= 0; i
< nelt
; i
+= diff
+ 1)
9907 for (j
= 0; j
<= diff
; j
+= 1)
9909 /* This is guaranteed to be true as the value of diff
9910 is 7, 3, 1 and we should have enough elements in the
9911 queue to generate this. Getting a vector mask with a
9912 value of diff other than these values implies that
9913 something is wrong by the time we get here. */
9914 gcc_assert (i
+ j
< nelt
);
9915 if (d
->perm
[i
+ j
] != i
+ diff
- j
)
9923 emit_insn (gen (d
->target
, d
->op0
));
9928 aarch64_evpc_dup (struct expand_vec_perm_d
*d
)
9930 rtx (*gen
) (rtx
, rtx
, rtx
);
9931 rtx out
= d
->target
;
9933 machine_mode vmode
= d
->vmode
;
9934 unsigned int i
, elt
, nelt
= d
->nelt
;
9938 for (i
= 1; i
< nelt
; i
++)
9940 if (elt
!= d
->perm
[i
])
9944 /* The generic preparation in aarch64_expand_vec_perm_const_1
9945 swaps the operand order and the permute indices if it finds
9946 d->perm[0] to be in the second operand. Thus, we can always
9947 use d->op0 and need not do any extra arithmetic to get the
9948 correct lane number. */
9950 lane
= GEN_INT (elt
); /* The pattern corrects for big-endian. */
9954 case V16QImode
: gen
= gen_aarch64_dup_lanev16qi
; break;
9955 case V8QImode
: gen
= gen_aarch64_dup_lanev8qi
; break;
9956 case V8HImode
: gen
= gen_aarch64_dup_lanev8hi
; break;
9957 case V4HImode
: gen
= gen_aarch64_dup_lanev4hi
; break;
9958 case V4SImode
: gen
= gen_aarch64_dup_lanev4si
; break;
9959 case V2SImode
: gen
= gen_aarch64_dup_lanev2si
; break;
9960 case V2DImode
: gen
= gen_aarch64_dup_lanev2di
; break;
9961 case V4SFmode
: gen
= gen_aarch64_dup_lanev4sf
; break;
9962 case V2SFmode
: gen
= gen_aarch64_dup_lanev2sf
; break;
9963 case V2DFmode
: gen
= gen_aarch64_dup_lanev2df
; break;
9968 emit_insn (gen (out
, in0
, lane
));
9973 aarch64_evpc_tbl (struct expand_vec_perm_d
*d
)
9975 rtx rperm
[MAX_VECT_LEN
], sel
;
9976 machine_mode vmode
= d
->vmode
;
9977 unsigned int i
, nelt
= d
->nelt
;
9982 /* Generic code will try constant permutation twice. Once with the
9983 original mode and again with the elements lowered to QImode.
9984 So wait and don't do the selector expansion ourselves. */
9985 if (vmode
!= V8QImode
&& vmode
!= V16QImode
)
9988 for (i
= 0; i
< nelt
; ++i
)
9990 int nunits
= GET_MODE_NUNITS (vmode
);
9992 /* If big-endian and two vectors we end up with a weird mixed-endian
9993 mode on NEON. Reverse the index within each word but not the word
9995 rperm
[i
] = GEN_INT (BYTES_BIG_ENDIAN
? d
->perm
[i
] ^ (nunits
- 1)
9998 sel
= gen_rtx_CONST_VECTOR (vmode
, gen_rtvec_v (nelt
, rperm
));
9999 sel
= force_reg (vmode
, sel
);
10001 aarch64_expand_vec_perm_1 (d
->target
, d
->op0
, d
->op1
, sel
);
10006 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d
*d
)
10008 /* The pattern matching functions above are written to look for a small
10009 number to begin the sequence (0, 1, N/2). If we begin with an index
10010 from the second operand, we can swap the operands. */
10011 if (d
->perm
[0] >= d
->nelt
)
10013 unsigned i
, nelt
= d
->nelt
;
10015 gcc_assert (nelt
== (nelt
& -nelt
));
10016 for (i
= 0; i
< nelt
; ++i
)
10017 d
->perm
[i
] ^= nelt
; /* Keep the same index, but in the other vector. */
10019 std::swap (d
->op0
, d
->op1
);
10024 if (aarch64_evpc_rev (d
))
10026 else if (aarch64_evpc_ext (d
))
10028 else if (aarch64_evpc_dup (d
))
10030 else if (aarch64_evpc_zip (d
))
10032 else if (aarch64_evpc_uzp (d
))
10034 else if (aarch64_evpc_trn (d
))
10036 return aarch64_evpc_tbl (d
);
10041 /* Expand a vec_perm_const pattern. */
10044 aarch64_expand_vec_perm_const (rtx target
, rtx op0
, rtx op1
, rtx sel
)
10046 struct expand_vec_perm_d d
;
10047 int i
, nelt
, which
;
10053 d
.vmode
= GET_MODE (target
);
10054 gcc_assert (VECTOR_MODE_P (d
.vmode
));
10055 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
10056 d
.testing_p
= false;
10058 for (i
= which
= 0; i
< nelt
; ++i
)
10060 rtx e
= XVECEXP (sel
, 0, i
);
10061 int ei
= INTVAL (e
) & (2 * nelt
- 1);
10062 which
|= (ei
< nelt
? 1 : 2);
10069 gcc_unreachable ();
10072 d
.one_vector_p
= false;
10073 if (!rtx_equal_p (op0
, op1
))
10076 /* The elements of PERM do not suggest that only the first operand
10077 is used, but both operands are identical. Allow easier matching
10078 of the permutation by folding the permutation into the single
10080 /* Fall Through. */
10082 for (i
= 0; i
< nelt
; ++i
)
10083 d
.perm
[i
] &= nelt
- 1;
10085 d
.one_vector_p
= true;
10090 d
.one_vector_p
= true;
10094 return aarch64_expand_vec_perm_const_1 (&d
);
10098 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode
,
10099 const unsigned char *sel
)
10101 struct expand_vec_perm_d d
;
10102 unsigned int i
, nelt
, which
;
10106 d
.nelt
= nelt
= GET_MODE_NUNITS (d
.vmode
);
10107 d
.testing_p
= true;
10108 memcpy (d
.perm
, sel
, nelt
);
10110 /* Calculate whether all elements are in one vector. */
10111 for (i
= which
= 0; i
< nelt
; ++i
)
10113 unsigned char e
= d
.perm
[i
];
10114 gcc_assert (e
< 2 * nelt
);
10115 which
|= (e
< nelt
? 1 : 2);
10118 /* If all elements are from the second vector, reindex as if from the
10121 for (i
= 0; i
< nelt
; ++i
)
10124 /* Check whether the mask can be applied to a single vector. */
10125 d
.one_vector_p
= (which
!= 3);
10127 d
.target
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 1);
10128 d
.op1
= d
.op0
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 2);
10129 if (!d
.one_vector_p
)
10130 d
.op1
= gen_raw_REG (d
.vmode
, LAST_VIRTUAL_REGISTER
+ 3);
10133 ret
= aarch64_expand_vec_perm_const_1 (&d
);
10140 aarch64_reverse_mask (enum machine_mode mode
)
10142 /* We have to reverse each vector because we dont have
10143 a permuted load that can reverse-load according to ABI rules. */
10145 rtvec v
= rtvec_alloc (16);
10147 int nunits
= GET_MODE_NUNITS (mode
);
10148 int usize
= GET_MODE_UNIT_SIZE (mode
);
10150 gcc_assert (BYTES_BIG_ENDIAN
);
10151 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode
));
10153 for (i
= 0; i
< nunits
; i
++)
10154 for (j
= 0; j
< usize
; j
++)
10155 RTVEC_ELT (v
, i
* usize
+ j
) = GEN_INT ((i
+ 1) * usize
- 1 - j
);
10156 mask
= gen_rtx_CONST_VECTOR (V16QImode
, v
);
10157 return force_reg (V16QImode
, mask
);
10160 /* Implement MODES_TIEABLE_P. */
10163 aarch64_modes_tieable_p (machine_mode mode1
, machine_mode mode2
)
10165 if (GET_MODE_CLASS (mode1
) == GET_MODE_CLASS (mode2
))
10168 /* We specifically want to allow elements of "structure" modes to
10169 be tieable to the structure. This more general condition allows
10170 other rarer situations too. */
10172 && aarch64_vector_mode_p (mode1
)
10173 && aarch64_vector_mode_p (mode2
))
10179 /* Return a new RTX holding the result of moving POINTER forward by
10183 aarch64_move_pointer (rtx pointer
, int amount
)
10185 rtx next
= plus_constant (Pmode
, XEXP (pointer
, 0), amount
);
10187 return adjust_automodify_address (pointer
, GET_MODE (pointer
),
10191 /* Return a new RTX holding the result of moving POINTER forward by the
10192 size of the mode it points to. */
10195 aarch64_progress_pointer (rtx pointer
)
10197 HOST_WIDE_INT amount
= GET_MODE_SIZE (GET_MODE (pointer
));
10199 return aarch64_move_pointer (pointer
, amount
);
10202 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10206 aarch64_copy_one_block_and_progress_pointers (rtx
*src
, rtx
*dst
,
10209 rtx reg
= gen_reg_rtx (mode
);
10211 /* "Cast" the pointers to the correct mode. */
10212 *src
= adjust_address (*src
, mode
, 0);
10213 *dst
= adjust_address (*dst
, mode
, 0);
10214 /* Emit the memcpy. */
10215 emit_move_insn (reg
, *src
);
10216 emit_move_insn (*dst
, reg
);
10217 /* Move the pointers forward. */
10218 *src
= aarch64_progress_pointer (*src
);
10219 *dst
= aarch64_progress_pointer (*dst
);
10222 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10223 we succeed, otherwise return false. */
10226 aarch64_expand_movmem (rtx
*operands
)
10229 rtx dst
= operands
[0];
10230 rtx src
= operands
[1];
10232 bool speed_p
= !optimize_function_for_size_p (cfun
);
10234 /* When optimizing for size, give a better estimate of the length of a
10235 memcpy call, but use the default otherwise. */
10236 unsigned int max_instructions
= (speed_p
? 15 : AARCH64_CALL_RATIO
) / 2;
10238 /* We can't do anything smart if the amount to copy is not constant. */
10239 if (!CONST_INT_P (operands
[2]))
10242 n
= UINTVAL (operands
[2]);
10244 /* Try to keep the number of instructions low. For cases below 16 bytes we
10245 need to make at most two moves. For cases above 16 bytes it will be one
10246 move for each 16 byte chunk, then at most two additional moves. */
10247 if (((n
/ 16) + (n
% 16 ? 2 : 0)) > max_instructions
)
10250 base
= copy_to_mode_reg (Pmode
, XEXP (dst
, 0));
10251 dst
= adjust_automodify_address (dst
, VOIDmode
, base
, 0);
10253 base
= copy_to_mode_reg (Pmode
, XEXP (src
, 0));
10254 src
= adjust_automodify_address (src
, VOIDmode
, base
, 0);
10256 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10262 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
10267 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
10272 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10273 4-byte chunk, partially overlapping with the previously copied chunk. */
10276 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10282 src
= aarch64_move_pointer (src
, move
);
10283 dst
= aarch64_move_pointer (dst
, move
);
10284 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10289 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10290 them, then (if applicable) an 8-byte chunk. */
10295 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, TImode
);
10300 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
10305 /* Finish the final bytes of the copy. We can always do this in one
10306 instruction. We either copy the exact amount we need, or partially
10307 overlap with the previous chunk we copied and copy 8-bytes. */
10311 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, QImode
);
10313 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, HImode
);
10315 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10320 src
= aarch64_move_pointer (src
, -1);
10321 dst
= aarch64_move_pointer (dst
, -1);
10322 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, SImode
);
10328 src
= aarch64_move_pointer (src
, move
);
10329 dst
= aarch64_move_pointer (dst
, move
);
10330 aarch64_copy_one_block_and_progress_pointers (&src
, &dst
, DImode
);
10337 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10339 static unsigned HOST_WIDE_INT
10340 aarch64_asan_shadow_offset (void)
10342 return (HOST_WIDE_INT_1
<< 36);
10346 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size
,
10347 unsigned int align
,
10348 enum by_pieces_operation op
,
10351 /* STORE_BY_PIECES can be used when copying a constant string, but
10352 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10353 For now we always fail this and let the move_by_pieces code copy
10354 the string from read-only memory. */
10355 if (op
== STORE_BY_PIECES
)
10358 return default_use_by_pieces_infrastructure_p (size
, align
, op
, speed_p
);
10361 static enum machine_mode
10362 aarch64_code_to_ccmode (enum rtx_code code
)
10385 return CC_DLEUmode
;
10388 return CC_DLTUmode
;
10391 return CC_DGEUmode
;
10394 return CC_DGTUmode
;
10402 aarch64_gen_ccmp_first (rtx
*prep_seq
, rtx
*gen_seq
,
10403 int code
, tree treeop0
, tree treeop1
)
10405 enum machine_mode op_mode
, cmp_mode
, cc_mode
;
10406 rtx op0
, op1
, cmp
, target
;
10407 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
10408 enum insn_code icode
;
10409 struct expand_operand ops
[4];
10411 cc_mode
= aarch64_code_to_ccmode ((enum rtx_code
) code
);
10412 if (cc_mode
== CCmode
)
10416 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
10418 op_mode
= GET_MODE (op0
);
10419 if (op_mode
== VOIDmode
)
10420 op_mode
= GET_MODE (op1
);
10428 icode
= CODE_FOR_cmpsi
;
10433 icode
= CODE_FOR_cmpdi
;
10441 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
10442 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
10448 *prep_seq
= get_insns ();
10451 cmp
= gen_rtx_fmt_ee ((enum rtx_code
) code
, cmp_mode
, op0
, op1
);
10452 target
= gen_rtx_REG (CCmode
, CC_REGNUM
);
10454 create_output_operand (&ops
[0], target
, CCmode
);
10455 create_fixed_operand (&ops
[1], cmp
);
10456 create_fixed_operand (&ops
[2], op0
);
10457 create_fixed_operand (&ops
[3], op1
);
10460 if (!maybe_expand_insn (icode
, 4, ops
))
10465 *gen_seq
= get_insns ();
10468 return gen_rtx_REG (cc_mode
, CC_REGNUM
);
10472 aarch64_gen_ccmp_next (rtx
*prep_seq
, rtx
*gen_seq
, rtx prev
, int cmp_code
,
10473 tree treeop0
, tree treeop1
, int bit_code
)
10475 rtx op0
, op1
, cmp0
, cmp1
, target
;
10476 enum machine_mode op_mode
, cmp_mode
, cc_mode
;
10477 int unsignedp
= TYPE_UNSIGNED (TREE_TYPE (treeop0
));
10478 enum insn_code icode
= CODE_FOR_ccmp_andsi
;
10479 struct expand_operand ops
[6];
10481 cc_mode
= aarch64_code_to_ccmode ((enum rtx_code
) cmp_code
);
10482 if (cc_mode
== CCmode
)
10485 push_to_sequence ((rtx_insn
*) *prep_seq
);
10486 expand_operands (treeop0
, treeop1
, NULL_RTX
, &op0
, &op1
, EXPAND_NORMAL
);
10488 op_mode
= GET_MODE (op0
);
10489 if (op_mode
== VOIDmode
)
10490 op_mode
= GET_MODE (op1
);
10498 icode
= (enum rtx_code
) bit_code
== AND
? CODE_FOR_ccmp_andsi
10499 : CODE_FOR_ccmp_iorsi
;
10504 icode
= (enum rtx_code
) bit_code
== AND
? CODE_FOR_ccmp_anddi
10505 : CODE_FOR_ccmp_iordi
;
10513 op0
= prepare_operand (icode
, op0
, 2, op_mode
, cmp_mode
, unsignedp
);
10514 op1
= prepare_operand (icode
, op1
, 3, op_mode
, cmp_mode
, unsignedp
);
10520 *prep_seq
= get_insns ();
10523 target
= gen_rtx_REG (cc_mode
, CC_REGNUM
);
10524 cmp1
= gen_rtx_fmt_ee ((enum rtx_code
) cmp_code
, cmp_mode
, op0
, op1
);
10525 cmp0
= gen_rtx_fmt_ee (NE
, cmp_mode
, prev
, const0_rtx
);
10527 create_fixed_operand (&ops
[0], prev
);
10528 create_fixed_operand (&ops
[1], target
);
10529 create_fixed_operand (&ops
[2], op0
);
10530 create_fixed_operand (&ops
[3], op1
);
10531 create_fixed_operand (&ops
[4], cmp0
);
10532 create_fixed_operand (&ops
[5], cmp1
);
10534 push_to_sequence ((rtx_insn
*) *gen_seq
);
10535 if (!maybe_expand_insn (icode
, 6, ops
))
10541 *gen_seq
= get_insns ();
10547 #undef TARGET_GEN_CCMP_FIRST
10548 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10550 #undef TARGET_GEN_CCMP_NEXT
10551 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10553 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
10554 instruction fusion of some sort. */
10557 aarch64_macro_fusion_p (void)
10559 return aarch64_tune_params
->fuseable_ops
!= AARCH64_FUSE_NOTHING
;
10563 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10564 should be kept together during scheduling. */
10567 aarch_macro_fusion_pair_p (rtx_insn
*prev
, rtx_insn
*curr
)
10570 rtx prev_set
= single_set (prev
);
10571 rtx curr_set
= single_set (curr
);
10572 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10573 bool simple_sets_p
= prev_set
&& curr_set
&& !any_condjump_p (curr
);
10575 if (!aarch64_macro_fusion_p ())
10579 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_MOV_MOVK
))
10581 /* We are trying to match:
10582 prev (mov) == (set (reg r0) (const_int imm16))
10583 curr (movk) == (set (zero_extract (reg r0)
10586 (const_int imm16_1)) */
10588 set_dest
= SET_DEST (curr_set
);
10590 if (GET_CODE (set_dest
) == ZERO_EXTRACT
10591 && CONST_INT_P (SET_SRC (curr_set
))
10592 && CONST_INT_P (SET_SRC (prev_set
))
10593 && CONST_INT_P (XEXP (set_dest
, 2))
10594 && INTVAL (XEXP (set_dest
, 2)) == 16
10595 && REG_P (XEXP (set_dest
, 0))
10596 && REG_P (SET_DEST (prev_set
))
10597 && REGNO (XEXP (set_dest
, 0)) == REGNO (SET_DEST (prev_set
)))
10604 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_ADRP_ADD
))
10607 /* We're trying to match:
10608 prev (adrp) == (set (reg r1)
10609 (high (symbol_ref ("SYM"))))
10610 curr (add) == (set (reg r0)
10612 (symbol_ref ("SYM"))))
10613 Note that r0 need not necessarily be the same as r1, especially
10614 during pre-regalloc scheduling. */
10616 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
10617 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
10619 if (GET_CODE (SET_SRC (curr_set
)) == LO_SUM
10620 && REG_P (XEXP (SET_SRC (curr_set
), 0))
10621 && REGNO (XEXP (SET_SRC (curr_set
), 0))
10622 == REGNO (SET_DEST (prev_set
))
10623 && rtx_equal_p (XEXP (SET_SRC (prev_set
), 0),
10624 XEXP (SET_SRC (curr_set
), 1)))
10630 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_MOVK_MOVK
))
10633 /* We're trying to match:
10634 prev (movk) == (set (zero_extract (reg r0)
10637 (const_int imm16_1))
10638 curr (movk) == (set (zero_extract (reg r0)
10641 (const_int imm16_2)) */
10643 if (GET_CODE (SET_DEST (prev_set
)) == ZERO_EXTRACT
10644 && GET_CODE (SET_DEST (curr_set
)) == ZERO_EXTRACT
10645 && REG_P (XEXP (SET_DEST (prev_set
), 0))
10646 && REG_P (XEXP (SET_DEST (curr_set
), 0))
10647 && REGNO (XEXP (SET_DEST (prev_set
), 0))
10648 == REGNO (XEXP (SET_DEST (curr_set
), 0))
10649 && CONST_INT_P (XEXP (SET_DEST (prev_set
), 2))
10650 && CONST_INT_P (XEXP (SET_DEST (curr_set
), 2))
10651 && INTVAL (XEXP (SET_DEST (prev_set
), 2)) == 32
10652 && INTVAL (XEXP (SET_DEST (curr_set
), 2)) == 48
10653 && CONST_INT_P (SET_SRC (prev_set
))
10654 && CONST_INT_P (SET_SRC (curr_set
)))
10659 && (aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_ADRP_LDR
))
10661 /* We're trying to match:
10662 prev (adrp) == (set (reg r0)
10663 (high (symbol_ref ("SYM"))))
10664 curr (ldr) == (set (reg r1)
10665 (mem (lo_sum (reg r0)
10666 (symbol_ref ("SYM")))))
10668 curr (ldr) == (set (reg r1)
10671 (symbol_ref ("SYM")))))) */
10672 if (satisfies_constraint_Ush (SET_SRC (prev_set
))
10673 && REG_P (SET_DEST (prev_set
)) && REG_P (SET_DEST (curr_set
)))
10675 rtx curr_src
= SET_SRC (curr_set
);
10677 if (GET_CODE (curr_src
) == ZERO_EXTEND
)
10678 curr_src
= XEXP (curr_src
, 0);
10680 if (MEM_P (curr_src
) && GET_CODE (XEXP (curr_src
, 0)) == LO_SUM
10681 && REG_P (XEXP (XEXP (curr_src
, 0), 0))
10682 && REGNO (XEXP (XEXP (curr_src
, 0), 0))
10683 == REGNO (SET_DEST (prev_set
))
10684 && rtx_equal_p (XEXP (XEXP (curr_src
, 0), 1),
10685 XEXP (SET_SRC (prev_set
), 0)))
10690 if ((aarch64_tune_params
->fuseable_ops
& AARCH64_FUSE_CMP_BRANCH
)
10691 && any_condjump_p (curr
))
10693 enum attr_type prev_type
= get_attr_type (prev
);
10695 /* FIXME: this misses some which is considered simple arthematic
10696 instructions for ThunderX. Simple shifts are missed here. */
10697 if (prev_type
== TYPE_ALUS_SREG
10698 || prev_type
== TYPE_ALUS_IMM
10699 || prev_type
== TYPE_LOGICS_REG
10700 || prev_type
== TYPE_LOGICS_IMM
)
10707 /* If MEM is in the form of [base+offset], extract the two parts
10708 of address and set to BASE and OFFSET, otherwise return false
10709 after clearing BASE and OFFSET. */
10712 extract_base_offset_in_addr (rtx mem
, rtx
*base
, rtx
*offset
)
10716 gcc_assert (MEM_P (mem
));
10718 addr
= XEXP (mem
, 0);
10723 *offset
= const0_rtx
;
10727 if (GET_CODE (addr
) == PLUS
10728 && REG_P (XEXP (addr
, 0)) && CONST_INT_P (XEXP (addr
, 1)))
10730 *base
= XEXP (addr
, 0);
10731 *offset
= XEXP (addr
, 1);
10736 *offset
= NULL_RTX
;
/* Types for scheduling fusion.  */
enum sched_fusion_type
{
  SCHED_FUSION_NONE = 0,
  SCHED_FUSION_LD_SIGN_EXTEND,
  SCHED_FUSION_LD_ZERO_EXTEND,
  SCHED_FUSION_LD,
  SCHED_FUSION_ST,
  SCHED_FUSION_NUM
};
10752 /* If INSN is a load or store of address in the form of [base+offset],
10753 extract the two parts and set to BASE and OFFSET. Return scheduling
10754 fusion type this INSN is. */
10756 static enum sched_fusion_type
10757 fusion_load_store (rtx_insn
*insn
, rtx
*base
, rtx
*offset
)
10760 enum sched_fusion_type fusion
= SCHED_FUSION_LD
;
10762 gcc_assert (INSN_P (insn
));
10763 x
= PATTERN (insn
);
10764 if (GET_CODE (x
) != SET
)
10765 return SCHED_FUSION_NONE
;
10768 dest
= SET_DEST (x
);
10770 if (GET_MODE (dest
) != SImode
&& GET_MODE (dest
) != DImode
10771 && GET_MODE (dest
) != SFmode
&& GET_MODE (dest
) != DFmode
)
10772 return SCHED_FUSION_NONE
;
10774 if (GET_CODE (src
) == SIGN_EXTEND
)
10776 fusion
= SCHED_FUSION_LD_SIGN_EXTEND
;
10777 src
= XEXP (src
, 0);
10778 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
10779 return SCHED_FUSION_NONE
;
10781 else if (GET_CODE (src
) == ZERO_EXTEND
)
10783 fusion
= SCHED_FUSION_LD_ZERO_EXTEND
;
10784 src
= XEXP (src
, 0);
10785 if (GET_CODE (src
) != MEM
|| GET_MODE (src
) != SImode
)
10786 return SCHED_FUSION_NONE
;
10789 if (GET_CODE (src
) == MEM
&& REG_P (dest
))
10790 extract_base_offset_in_addr (src
, base
, offset
);
10791 else if (GET_CODE (dest
) == MEM
&& (REG_P (src
) || src
== const0_rtx
))
10793 fusion
= SCHED_FUSION_ST
;
10794 extract_base_offset_in_addr (dest
, base
, offset
);
10797 return SCHED_FUSION_NONE
;
10799 if (*base
== NULL_RTX
|| *offset
== NULL_RTX
)
10800 fusion
= SCHED_FUSION_NONE
;
10805 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10807 Currently we only support to fuse ldr or str instructions, so FUSION_PRI
10808 and PRI are only calculated for these instructions. For other instruction,
10809 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
10810 type instruction fusion can be added by returning different priorities.
10812 It's important that irrelevant instructions get the largest FUSION_PRI. */
10815 aarch64_sched_fusion_priority (rtx_insn
*insn
, int max_pri
,
10816 int *fusion_pri
, int *pri
)
10820 enum sched_fusion_type fusion
;
10822 gcc_assert (INSN_P (insn
));
10825 fusion
= fusion_load_store (insn
, &base
, &offset
);
10826 if (fusion
== SCHED_FUSION_NONE
)
10833 /* Set FUSION_PRI according to fusion type and base register. */
10834 *fusion_pri
= tmp
- fusion
* FIRST_PSEUDO_REGISTER
- REGNO (base
);
10836 /* Calculate PRI. */
10839 /* INSN with smaller offset goes first. */
10840 off_val
= (int)(INTVAL (offset
));
10842 tmp
-= (off_val
& 0xfffff);
10844 tmp
+= ((- off_val
) & 0xfffff);
10850 /* Given OPERANDS of consecutive load/store, check if we can merge
10851 them into ldp/stp. LOAD is true if they are load instructions.
10852 MODE is the mode of memory operands. */
10855 aarch64_operands_ok_for_ldpstp (rtx
*operands
, bool load
,
10856 enum machine_mode mode
)
10858 HOST_WIDE_INT offval_1
, offval_2
, msize
;
10859 enum reg_class rclass_1
, rclass_2
;
10860 rtx mem_1
, mem_2
, reg_1
, reg_2
, base_1
, base_2
, offset_1
, offset_2
;
10864 mem_1
= operands
[1];
10865 mem_2
= operands
[3];
10866 reg_1
= operands
[0];
10867 reg_2
= operands
[2];
10868 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
));
10869 if (REGNO (reg_1
) == REGNO (reg_2
))
10874 mem_1
= operands
[0];
10875 mem_2
= operands
[2];
10876 reg_1
= operands
[1];
10877 reg_2
= operands
[3];
10880 /* The mems cannot be volatile. */
10881 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
))
10884 /* Check if the addresses are in the form of [base+offset]. */
10885 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
10886 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
10888 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
10889 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
10892 /* Check if the bases are same. */
10893 if (!rtx_equal_p (base_1
, base_2
))
10896 offval_1
= INTVAL (offset_1
);
10897 offval_2
= INTVAL (offset_2
);
10898 msize
= GET_MODE_SIZE (mode
);
10899 /* Check if the offsets are consecutive. */
10900 if (offval_1
!= (offval_2
+ msize
) && offval_2
!= (offval_1
+ msize
))
10903 /* Check if the addresses are clobbered by load. */
10906 if (reg_mentioned_p (reg_1
, mem_1
))
10909 /* In increasing order, the last load can clobber the address. */
10910 if (offval_1
> offval_2
&& reg_mentioned_p (reg_2
, mem_2
))
10914 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
10915 rclass_1
= FP_REGS
;
10917 rclass_1
= GENERAL_REGS
;
10919 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
10920 rclass_2
= FP_REGS
;
10922 rclass_2
= GENERAL_REGS
;
10924 /* Check if the registers are of same class. */
10925 if (rclass_1
!= rclass_2
)
10931 /* Given OPERANDS of consecutive load/store, check if we can merge
10932 them into ldp/stp by adjusting the offset. LOAD is true if they
10933 are load instructions. MODE is the mode of memory operands.
10935 Given below consecutive stores:
10937 str w1, [xb, 0x100]
10938 str w1, [xb, 0x104]
10939 str w1, [xb, 0x108]
10940 str w1, [xb, 0x10c]
10942 Though the offsets are out of the range supported by stp, we can
10943 still pair them after adjusting the offset, like:
10945 add scratch, xb, 0x100
10946 stp w1, w1, [scratch]
10947 stp w1, w1, [scratch, 0x8]
10949 The peephole patterns detecting this opportunity should guarantee
10950 the scratch register is avaliable. */
10953 aarch64_operands_adjust_ok_for_ldpstp (rtx
*operands
, bool load
,
10954 enum machine_mode mode
)
10956 enum reg_class rclass_1
, rclass_2
, rclass_3
, rclass_4
;
10957 HOST_WIDE_INT offval_1
, offval_2
, offval_3
, offval_4
, msize
;
10958 rtx mem_1
, mem_2
, mem_3
, mem_4
, reg_1
, reg_2
, reg_3
, reg_4
;
10959 rtx base_1
, base_2
, base_3
, base_4
, offset_1
, offset_2
, offset_3
, offset_4
;
10963 reg_1
= operands
[0];
10964 mem_1
= operands
[1];
10965 reg_2
= operands
[2];
10966 mem_2
= operands
[3];
10967 reg_3
= operands
[4];
10968 mem_3
= operands
[5];
10969 reg_4
= operands
[6];
10970 mem_4
= operands
[7];
10971 gcc_assert (REG_P (reg_1
) && REG_P (reg_2
)
10972 && REG_P (reg_3
) && REG_P (reg_4
));
10973 if (REGNO (reg_1
) == REGNO (reg_2
) || REGNO (reg_3
) == REGNO (reg_4
))
10978 mem_1
= operands
[0];
10979 reg_1
= operands
[1];
10980 mem_2
= operands
[2];
10981 reg_2
= operands
[3];
10982 mem_3
= operands
[4];
10983 reg_3
= operands
[5];
10984 mem_4
= operands
[6];
10985 reg_4
= operands
[7];
10987 /* Skip if memory operand is by itslef valid for ldp/stp. */
10988 if (!MEM_P (mem_1
) || aarch64_mem_pair_operand (mem_1
, mode
))
10991 /* The mems cannot be volatile. */
10992 if (MEM_VOLATILE_P (mem_1
) || MEM_VOLATILE_P (mem_2
)
10993 || MEM_VOLATILE_P (mem_3
) ||MEM_VOLATILE_P (mem_4
))
10996 /* Check if the addresses are in the form of [base+offset]. */
10997 extract_base_offset_in_addr (mem_1
, &base_1
, &offset_1
);
10998 if (base_1
== NULL_RTX
|| offset_1
== NULL_RTX
)
11000 extract_base_offset_in_addr (mem_2
, &base_2
, &offset_2
);
11001 if (base_2
== NULL_RTX
|| offset_2
== NULL_RTX
)
11003 extract_base_offset_in_addr (mem_3
, &base_3
, &offset_3
);
11004 if (base_3
== NULL_RTX
|| offset_3
== NULL_RTX
)
11006 extract_base_offset_in_addr (mem_4
, &base_4
, &offset_4
);
11007 if (base_4
== NULL_RTX
|| offset_4
== NULL_RTX
)
11010 /* Check if the bases are same. */
11011 if (!rtx_equal_p (base_1
, base_2
)
11012 || !rtx_equal_p (base_2
, base_3
)
11013 || !rtx_equal_p (base_3
, base_4
))
11016 offval_1
= INTVAL (offset_1
);
11017 offval_2
= INTVAL (offset_2
);
11018 offval_3
= INTVAL (offset_3
);
11019 offval_4
= INTVAL (offset_4
);
11020 msize
= GET_MODE_SIZE (mode
);
11021 /* Check if the offsets are consecutive. */
11022 if ((offval_1
!= (offval_2
+ msize
)
11023 || offval_1
!= (offval_3
+ msize
* 2)
11024 || offval_1
!= (offval_4
+ msize
* 3))
11025 && (offval_4
!= (offval_3
+ msize
)
11026 || offval_4
!= (offval_2
+ msize
* 2)
11027 || offval_4
!= (offval_1
+ msize
* 3)))
11030 /* Check if the addresses are clobbered by load. */
11033 if (reg_mentioned_p (reg_1
, mem_1
)
11034 || reg_mentioned_p (reg_2
, mem_2
)
11035 || reg_mentioned_p (reg_3
, mem_3
))
11038 /* In increasing order, the last load can clobber the address. */
11039 if (offval_1
> offval_2
&& reg_mentioned_p (reg_4
, mem_4
))
11043 if (REG_P (reg_1
) && FP_REGNUM_P (REGNO (reg_1
)))
11044 rclass_1
= FP_REGS
;
11046 rclass_1
= GENERAL_REGS
;
11048 if (REG_P (reg_2
) && FP_REGNUM_P (REGNO (reg_2
)))
11049 rclass_2
= FP_REGS
;
11051 rclass_2
= GENERAL_REGS
;
11053 if (REG_P (reg_3
) && FP_REGNUM_P (REGNO (reg_3
)))
11054 rclass_3
= FP_REGS
;
11056 rclass_3
= GENERAL_REGS
;
11058 if (REG_P (reg_4
) && FP_REGNUM_P (REGNO (reg_4
)))
11059 rclass_4
= FP_REGS
;
11061 rclass_4
= GENERAL_REGS
;
11063 /* Check if the registers are of same class. */
11064 if (rclass_1
!= rclass_2
|| rclass_2
!= rclass_3
|| rclass_3
!= rclass_4
)
11070 /* Given OPERANDS of consecutive load/store, this function pairs them
11071 into ldp/stp after adjusting the offset. It depends on the fact
11072 that addresses of load/store instructions are in increasing order.
11073 MODE is the mode of memory operands. CODE is the rtl operator
11074 which should be applied to all memory operands, it's SIGN_EXTEND,
11075 ZERO_EXTEND or UNKNOWN. */
11078 aarch64_gen_adjusted_ldpstp (rtx
*operands
, bool load
,
11079 enum machine_mode mode
, RTX_CODE code
)
11081 rtx base
, offset
, t1
, t2
;
11082 rtx mem_1
, mem_2
, mem_3
, mem_4
;
11083 HOST_WIDE_INT off_val
, abs_off
, adj_off
, new_off
, stp_off_limit
, msize
;
11087 mem_1
= operands
[1];
11088 mem_2
= operands
[3];
11089 mem_3
= operands
[5];
11090 mem_4
= operands
[7];
11094 mem_1
= operands
[0];
11095 mem_2
= operands
[2];
11096 mem_3
= operands
[4];
11097 mem_4
= operands
[6];
11098 gcc_assert (code
== UNKNOWN
);
11101 extract_base_offset_in_addr (mem_1
, &base
, &offset
);
11102 gcc_assert (base
!= NULL_RTX
&& offset
!= NULL_RTX
);
11104 /* Adjust offset thus it can fit in ldp/stp instruction. */
11105 msize
= GET_MODE_SIZE (mode
);
11106 stp_off_limit
= msize
* 0x40;
11107 off_val
= INTVAL (offset
);
11108 abs_off
= (off_val
< 0) ? -off_val
: off_val
;
11109 new_off
= abs_off
% stp_off_limit
;
11110 adj_off
= abs_off
- new_off
;
11112 /* Further adjust to make sure all offsets are OK. */
11113 if ((new_off
+ msize
* 2) >= stp_off_limit
)
11115 adj_off
+= stp_off_limit
;
11116 new_off
-= stp_off_limit
;
11119 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11120 if (adj_off
>= 0x1000)
11125 adj_off
= -adj_off
;
11126 new_off
= -new_off
;
11129 /* Create new memory references. */
11130 mem_1
= change_address (mem_1
, VOIDmode
,
11131 plus_constant (DImode
, operands
[8], new_off
));
11133 /* Check if the adjusted address is OK for ldp/stp. */
11134 if (!aarch64_mem_pair_operand (mem_1
, mode
))
11137 msize
= GET_MODE_SIZE (mode
);
11138 mem_2
= change_address (mem_2
, VOIDmode
,
11139 plus_constant (DImode
,
11142 mem_3
= change_address (mem_3
, VOIDmode
,
11143 plus_constant (DImode
,
11145 new_off
+ msize
* 2));
11146 mem_4
= change_address (mem_4
, VOIDmode
,
11147 plus_constant (DImode
,
11149 new_off
+ msize
* 3));
11151 if (code
== ZERO_EXTEND
)
11153 mem_1
= gen_rtx_ZERO_EXTEND (DImode
, mem_1
);
11154 mem_2
= gen_rtx_ZERO_EXTEND (DImode
, mem_2
);
11155 mem_3
= gen_rtx_ZERO_EXTEND (DImode
, mem_3
);
11156 mem_4
= gen_rtx_ZERO_EXTEND (DImode
, mem_4
);
11158 else if (code
== SIGN_EXTEND
)
11160 mem_1
= gen_rtx_SIGN_EXTEND (DImode
, mem_1
);
11161 mem_2
= gen_rtx_SIGN_EXTEND (DImode
, mem_2
);
11162 mem_3
= gen_rtx_SIGN_EXTEND (DImode
, mem_3
);
11163 mem_4
= gen_rtx_SIGN_EXTEND (DImode
, mem_4
);
11168 operands
[1] = mem_1
;
11169 operands
[3] = mem_2
;
11170 operands
[5] = mem_3
;
11171 operands
[7] = mem_4
;
11175 operands
[0] = mem_1
;
11176 operands
[2] = mem_2
;
11177 operands
[4] = mem_3
;
11178 operands
[6] = mem_4
;
11181 /* Emit adjusting instruction. */
11182 emit_insn (gen_rtx_SET (VOIDmode
, operands
[8],
11183 plus_constant (DImode
, base
, adj_off
)));
11184 /* Emit ldp/stp instructions. */
11185 t1
= gen_rtx_SET (VOIDmode
, operands
[0], operands
[1]);
11186 t2
= gen_rtx_SET (VOIDmode
, operands
[2], operands
[3]);
11187 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
11188 t1
= gen_rtx_SET (VOIDmode
, operands
[4], operands
[5]);
11189 t2
= gen_rtx_SET (VOIDmode
, operands
[6], operands
[7]);
11190 emit_insn (gen_rtx_PARALLEL (VOIDmode
, gen_rtvec (2, t1
, t2
)));
/* Target hook overrides for AArch64; each pair replaces the default
   from target-def.h before TARGET_INITIALIZER is expanded below.  */

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook will determine whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  aarch64_vectorize_vec_perm_const_ok

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
  aarch64_use_by_pieces_infrastructure_p

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11472 struct gcc_target targetm
= TARGET_INITIALIZER
;
11474 #include "gt-aarch64.h"