[AArch64] Fix aarch64_rtx_costs of PLUS/MINUS
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "hashtab.h"
57 #include "function.h"
58 #include "flags.h"
59 #include "statistics.h"
60 #include "real.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
63 #include "expmed.h"
64 #include "dojump.h"
65 #include "explow.h"
66 #include "emit-rtl.h"
67 #include "stmt.h"
68 #include "expr.h"
69 #include "reload.h"
70 #include "toplev.h"
71 #include "target.h"
72 #include "target-def.h"
73 #include "targhooks.h"
74 #include "ggc.h"
75 #include "tm_p.h"
76 #include "recog.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
83 #include "tree-eh.h"
84 #include "gimple-expr.h"
85 #include "is-a.h"
86 #include "gimple.h"
87 #include "gimplify.h"
88 #include "optabs.h"
89 #include "dwarf2.h"
90 #include "cfgloop.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
93 #include "dumpfile.h"
94 #include "builtins.h"
95 #include "rtl-iter.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
98 #include "cortex-a57-fma-steering.h"
100 /* Defined for convenience. */
101 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
103 /* Classifies an address.
105 ADDRESS_REG_IMM
106 A simple base register plus immediate offset.
108 ADDRESS_REG_WB
109 A base register indexed by immediate offset with writeback.
111 ADDRESS_REG_REG
112 A base register indexed by (optionally scaled) register.
114 ADDRESS_REG_UXTW
115 A base register indexed by (optionally scaled) zero-extended register.
117 ADDRESS_REG_SXTW
118 A base register indexed by (optionally scaled) sign-extended register.
120 ADDRESS_LO_SUM
121 A LO_SUM rtx with a base register and "LO12" symbol relocation.
123 ADDRESS_SYMBOLIC:
124 A constant symbolic address, in pc-relative literal pool. */
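/* Illustrative assembly forms of the address kinds above (examples
   only, not an exhaustive list):
     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!   or   ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:foo]
     ADDRESS_SYMBOLIC   ldr x0, .LC0  (PC-relative literal load).  */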
126 enum aarch64_address_type {
127 ADDRESS_REG_IMM,
128 ADDRESS_REG_WB,
129 ADDRESS_REG_REG,
130 ADDRESS_REG_UXTW,
131 ADDRESS_REG_SXTW,
132 ADDRESS_LO_SUM,
133 ADDRESS_SYMBOLIC
136 struct aarch64_address_info {
137 enum aarch64_address_type type;
138 rtx base;
139 rtx offset;
140 int shift;
141 enum aarch64_symbol_type symbol_type;
144 struct simd_immediate_info
146 rtx value;
147 int shift;
148 int element_width;
149 bool mvn;
150 bool msl;
153 /* The current code model. */
154 enum aarch64_code_model aarch64_cmodel;
156 #ifdef HAVE_AS_TLS
157 #undef TARGET_HAVE_TLS
158 #define TARGET_HAVE_TLS 1
159 #endif
161 static bool aarch64_composite_type_p (const_tree, machine_mode);
162 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
163 const_tree,
164 machine_mode *, int *,
165 bool *);
166 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
167 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
168 static void aarch64_override_options_after_change (void);
169 static bool aarch64_vector_mode_supported_p (machine_mode);
170 static unsigned bit_count (unsigned HOST_WIDE_INT);
171 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
172 const unsigned char *sel);
173 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
175 /* Major revision number of the ARM Architecture implemented by the target. */
176 unsigned aarch64_architecture_version;
178 /* The processor for which instructions should be scheduled. */
179 enum aarch64_processor aarch64_tune = cortexa53;
181 /* The current tuning set. */
182 const struct tune_params *aarch64_tune_params;
184 /* Mask to specify which instructions we are allowed to generate. */
185 unsigned long aarch64_isa_flags = 0;
187 /* Mask to specify which instruction scheduling options should be used. */
188 unsigned long aarch64_tune_flags = 0;
190 /* Tuning parameters. */
192 static const struct cpu_addrcost_table generic_addrcost_table =
195 0, /* hi */
196 0, /* si */
197 0, /* di */
198 0, /* ti */
200 0, /* pre_modify */
201 0, /* post_modify */
202 0, /* register_offset */
203 0, /* register_extend */
204 0 /* imm_offset */
207 static const struct cpu_addrcost_table cortexa57_addrcost_table =
210 1, /* hi */
211 0, /* si */
212 0, /* di */
213 1, /* ti */
215 0, /* pre_modify */
216 0, /* post_modify */
217 0, /* register_offset */
218 0, /* register_extend */
219 0, /* imm_offset */
222 static const struct cpu_addrcost_table xgene1_addrcost_table =
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
230 1, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 1, /* register_extend */
234 0, /* imm_offset */
237 static const struct cpu_regmove_cost generic_regmove_cost =
239 1, /* GP2GP */
240 /* Avoid the use of slow int<->fp moves for spilling by setting
241 their cost higher than memmov_cost. */
242 5, /* GP2FP */
243 5, /* FP2GP */
244 2 /* FP2FP */
247 static const struct cpu_regmove_cost cortexa57_regmove_cost =
249 1, /* GP2GP */
250 /* Avoid the use of slow int<->fp moves for spilling by setting
251 their cost higher than memmov_cost. */
252 5, /* GP2FP */
253 5, /* FP2GP */
254 2 /* FP2FP */
257 static const struct cpu_regmove_cost cortexa53_regmove_cost =
259 1, /* GP2GP */
260 /* Avoid the use of slow int<->fp moves for spilling by setting
261 their cost higher than memmov_cost. */
262 5, /* GP2FP */
263 5, /* FP2GP */
264 2 /* FP2FP */
267 static const struct cpu_regmove_cost thunderx_regmove_cost =
269 2, /* GP2GP */
270 2, /* GP2FP */
271 6, /* FP2GP */
272 4 /* FP2FP */
275 static const struct cpu_regmove_cost xgene1_regmove_cost =
277 1, /* GP2GP */
278 /* Avoid the use of slow int<->fp moves for spilling by setting
279 their cost higher than memmov_cost. */
280 8, /* GP2FP */
281 8, /* FP2GP */
282 2 /* FP2FP */
285 /* Generic costs for vector insn classes. */
286 static const struct cpu_vector_cost generic_vector_cost =
288 1, /* scalar_stmt_cost */
289 1, /* scalar_load_cost */
290 1, /* scalar_store_cost */
291 1, /* vec_stmt_cost */
292 1, /* vec_to_scalar_cost */
293 1, /* scalar_to_vec_cost */
294 1, /* vec_align_load_cost */
295 1, /* vec_unalign_load_cost */
296 1, /* vec_unalign_store_cost */
297 1, /* vec_store_cost */
298 3, /* cond_taken_branch_cost */
299 1 /* cond_not_taken_branch_cost */
302 /* Cortex-A57 costs for vector insn classes. */
303 static const struct cpu_vector_cost cortexa57_vector_cost =
305 1, /* scalar_stmt_cost */
306 4, /* scalar_load_cost */
307 1, /* scalar_store_cost */
308 3, /* vec_stmt_cost */
309 8, /* vec_to_scalar_cost */
310 8, /* scalar_to_vec_cost */
311 5, /* vec_align_load_cost */
312 5, /* vec_unalign_load_cost */
313 1, /* vec_unalign_store_cost */
314 1, /* vec_store_cost */
315 1, /* cond_taken_branch_cost */
316 1 /* cond_not_taken_branch_cost */
319 /* XGene-1 costs for vector insn classes. */
320 static const struct cpu_vector_cost xgene1_vector_cost =
322 1, /* scalar_stmt_cost */
323 5, /* scalar_load_cost */
324 1, /* scalar_store_cost */
325 2, /* vec_stmt_cost */
326 4, /* vec_to_scalar_cost */
327 4, /* scalar_to_vec_cost */
328 10, /* vec_align_load_cost */
329 10, /* vec_unalign_load_cost */
330 2, /* vec_unalign_store_cost */
331 2, /* vec_store_cost */
332 2, /* cond_taken_branch_cost */
333 1 /* cond_not_taken_branch_cost */
336 #define AARCH64_FUSE_NOTHING (0)
337 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
338 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
339 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
340 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
341 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
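/* Each AARCH64_FUSE_* bit names a pair of adjacent instructions that a
   core may fuse into a single macro-op, for example:
     MOV_MOVK    mov  x0, #0x1234   +  movk x0, #0x5678, lsl #16
     ADRP_ADD    adrp x0, sym       +  add  x0, x0, #:lo12:sym
     ADRP_LDR    adrp x0, sym       +  ldr  x1, [x0, #:lo12:sym]
     CMP_BRANCH  cmp  x0, #1        +  b.ne target
   (illustrative sequences only).  */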
343 static const struct tune_params generic_tunings =
345 &cortexa57_extra_costs,
346 &generic_addrcost_table,
347 &generic_regmove_cost,
348 &generic_vector_cost,
349 4, /* memmov_cost */
350 2, /* issue_rate */
351 AARCH64_FUSE_NOTHING, /* fuseable_ops */
352 8, /* function_align. */
353 8, /* jump_align. */
354 4, /* loop_align. */
355 2, /* int_reassoc_width. */
356 4, /* fp_reassoc_width. */
357 1 /* vec_reassoc_width. */
360 static const struct tune_params cortexa53_tunings =
362 &cortexa53_extra_costs,
363 &generic_addrcost_table,
364 &cortexa53_regmove_cost,
365 &generic_vector_cost,
366 4, /* memmov_cost */
367 2, /* issue_rate */
368 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
369 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fuseable_ops */
370 8, /* function_align. */
371 8, /* jump_align. */
372 4, /* loop_align. */
373 2, /* int_reassoc_width. */
374 4, /* fp_reassoc_width. */
375 1 /* vec_reassoc_width. */
378 static const struct tune_params cortexa57_tunings =
380 &cortexa57_extra_costs,
381 &cortexa57_addrcost_table,
382 &cortexa57_regmove_cost,
383 &cortexa57_vector_cost,
384 4, /* memmov_cost */
385 3, /* issue_rate */
386 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
387 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
388 16, /* function_align. */
389 8, /* jump_align. */
390 4, /* loop_align. */
391 2, /* int_reassoc_width. */
392 4, /* fp_reassoc_width. */
393 1 /* vec_reassoc_width. */
396 static const struct tune_params thunderx_tunings =
398 &thunderx_extra_costs,
399 &generic_addrcost_table,
400 &thunderx_regmove_cost,
401 &generic_vector_cost,
402 6, /* memmov_cost */
403 2, /* issue_rate */
404 AARCH64_FUSE_CMP_BRANCH, /* fuseable_ops */
405 8, /* function_align. */
406 8, /* jump_align. */
407 8, /* loop_align. */
408 2, /* int_reassoc_width. */
409 4, /* fp_reassoc_width. */
410 1 /* vec_reassoc_width. */
413 static const struct tune_params xgene1_tunings =
415 &xgene1_extra_costs,
416 &xgene1_addrcost_table,
417 &xgene1_regmove_cost,
418 &xgene1_vector_cost,
419 6, /* memmov_cost */
420 4, /* issue_rate */
421 AARCH64_FUSE_NOTHING, /* fuseable_ops */
422 16, /* function_align. */
423 8, /* jump_align. */
424 16, /* loop_align. */
425 2, /* int_reassoc_width. */
426 4, /* fp_reassoc_width. */
427 1 /* vec_reassoc_width. */
430 /* A processor implementing AArch64. */
431 struct processor
433 const char *const name;
434 enum aarch64_processor core;
435 const char *arch;
436 unsigned architecture_version;
437 const unsigned long flags;
438 const struct tune_params *const tune;
441 /* Processor cores implementing AArch64. */
442 static const struct processor all_cores[] =
444 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
445 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
446 #include "aarch64-cores.def"
447 #undef AARCH64_CORE
448 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
449 {NULL, aarch64_none, NULL, 0, 0, NULL}
452 /* Architectures implementing AArch64. */
453 static const struct processor all_architectures[] =
455 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
456 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
457 #include "aarch64-arches.def"
458 #undef AARCH64_ARCH
459 {NULL, aarch64_none, NULL, 0, 0, NULL}
462 /* Target specification. These are populated as command-line arguments
463 are processed, or NULL if not specified. */
464 static const struct processor *selected_arch;
465 static const struct processor *selected_cpu;
466 static const struct processor *selected_tune;
468 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
470 /* An ISA extension in the co-processor and main instruction set space. */
471 struct aarch64_option_extension
473 const char *const name;
474 const unsigned long flags_on;
475 const unsigned long flags_off;
478 /* ISA extensions in AArch64. */
479 static const struct aarch64_option_extension all_extensions[] =
481 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
482 {NAME, FLAGS_ON, FLAGS_OFF},
483 #include "aarch64-option-extensions.def"
484 #undef AARCH64_OPT_EXTENSION
485 {NULL, 0, 0}
488 /* Used to track the size of an address when generating a pre/post
489 increment address. */
490 static machine_mode aarch64_memory_reference_mode;
492 /* A table of valid AArch64 "bitmask immediate" values for
493 logical instructions. */
495 #define AARCH64_NUM_BITMASKS 5334
496 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
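/* These are the immediates accepted by the logical instructions
   (AND, ORR, EOR and their variants): a contiguous run of set bits,
   replicated across the register with a power-of-two period and then
   rotated, e.g. 0x00ff00ff00ff00ff or 0x3ffc000000000000.  Zero and
   all-ones are not encodable, which is why the count is 5334 rather
   than a round power of two.  */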
498 typedef enum aarch64_cond_code
500 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
501 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
502 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
504 aarch64_cc;
506 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
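/* The enumeration above pairs each condition with its inverse in
   adjacent even/odd slots, so flipping bit 0 inverts a condition, e.g.
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE and
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT.  */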
508 /* The condition codes of the processor, and the inverse function. */
509 static const char * const aarch64_condition_codes[] =
511 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
512 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
515 static unsigned int
516 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
518 return 2;
521 static int
522 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
523 enum machine_mode mode)
525 if (VECTOR_MODE_P (mode))
526 return aarch64_tune_params->vec_reassoc_width;
527 if (INTEGRAL_MODE_P (mode))
528 return aarch64_tune_params->int_reassoc_width;
529 if (FLOAT_MODE_P (mode))
530 return aarch64_tune_params->fp_reassoc_width;
531 return 1;
534 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
535 unsigned
536 aarch64_dbx_register_number (unsigned regno)
538 if (GP_REGNUM_P (regno))
539 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
540 else if (regno == SP_REGNUM)
541 return AARCH64_DWARF_SP;
542 else if (FP_REGNUM_P (regno))
543 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
545 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
546 equivalent DWARF register. */
547 return DWARF_FRAME_REGISTERS;
550 /* Return TRUE if MODE is any of the large INT modes. */
551 static bool
552 aarch64_vect_struct_mode_p (machine_mode mode)
554 return mode == OImode || mode == CImode || mode == XImode;
557 /* Return TRUE if MODE is any of the vector modes. */
558 static bool
559 aarch64_vector_mode_p (machine_mode mode)
561 return aarch64_vector_mode_supported_p (mode)
562 || aarch64_vect_struct_mode_p (mode);
565 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
566 static bool
567 aarch64_array_mode_supported_p (machine_mode mode,
568 unsigned HOST_WIDE_INT nelems)
570 if (TARGET_SIMD
571 && AARCH64_VALID_SIMD_QREG_MODE (mode)
572 && (nelems >= 2 && nelems <= 4))
573 return true;
575 return false;
578 /* Implement HARD_REGNO_NREGS. */
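/* For example, a 16-byte TImode value occupies two X registers but only
   one 128-bit V register, while a 32-byte OImode vector structure
   occupies two V registers.  */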
581 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
583 switch (aarch64_regno_regclass (regno))
585 case FP_REGS:
586 case FP_LO_REGS:
587 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
588 default:
589 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
591 gcc_unreachable ();
594 /* Implement HARD_REGNO_MODE_OK. */
597 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
599 if (GET_MODE_CLASS (mode) == MODE_CC)
600 return regno == CC_REGNUM;
602 if (regno == SP_REGNUM)
603 /* The purpose of comparing with ptr_mode is to support the
604 global register variable associated with the stack pointer
605 register via the syntax of asm ("wsp") in ILP32. */
606 return mode == Pmode || mode == ptr_mode;
608 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
609 return mode == Pmode;
611 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
612 return 1;
614 if (FP_REGNUM_P (regno))
616 if (aarch64_vect_struct_mode_p (mode))
617 return
618 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
619 else
620 return 1;
623 return 0;
626 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
627 machine_mode
628 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
629 machine_mode mode)
631 /* Handle modes that fit within single registers. */
632 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
634 if (GET_MODE_SIZE (mode) >= 4)
635 return mode;
636 else
637 return SImode;
639 /* Fall back to generic for multi-reg and very large modes. */
640 else
641 return choose_hard_reg_mode (regno, nregs, false);
644 /* Return true if calls to DECL should be treated as
645 long-calls (i.e. called via a register). */
646 static bool
647 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
649 return false;
652 /* Return true if calls to symbol-ref SYM should be treated as
653 long-calls (i.e. called via a register). */
654 bool
655 aarch64_is_long_call_p (rtx sym)
657 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
660 /* Return true if the offsets to a zero/sign-extract operation
661 represent an expression that matches an extend operation. The
662 operands represent the parameters from
664 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
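/* For example, in DImode a MULT_IMM of 4 with an EXTRACT_IMM of 34
   selects the low 34 bits of (reg * 4), which is the same value as
   sign- or zero-extending the low 32 bits of REG and shifting the
   result left by 2, i.e. a UXTW #2 / SXTW #2 style operand.  */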
665 bool
666 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
667 rtx extract_imm)
669 HOST_WIDE_INT mult_val, extract_val;
671 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
672 return false;
674 mult_val = INTVAL (mult_imm);
675 extract_val = INTVAL (extract_imm);
677 if (extract_val > 8
678 && extract_val < GET_MODE_BITSIZE (mode)
679 && exact_log2 (extract_val & ~7) > 0
680 && (extract_val & 7) <= 4
681 && mult_val == (1 << (extract_val & 7)))
682 return true;
684 return false;
687 /* Emit an insn that's a simple single-set. Both the operands must be
688 known to be valid. */
689 inline static rtx
690 emit_set_insn (rtx x, rtx y)
692 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
695 /* X and Y are two things to compare using CODE. Emit the compare insn and
696 return the rtx for register 0 in the proper mode. */
698 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
700 machine_mode mode = SELECT_CC_MODE (code, x, y);
701 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
703 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
704 return cc_reg;
707 /* Build the SYMBOL_REF for __tls_get_addr. */
709 static GTY(()) rtx tls_get_addr_libfunc;
712 aarch64_tls_get_addr (void)
714 if (!tls_get_addr_libfunc)
715 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
716 return tls_get_addr_libfunc;
719 /* Return the TLS model to use for ADDR. */
721 static enum tls_model
722 tls_symbolic_operand_type (rtx addr)
724 enum tls_model tls_kind = TLS_MODEL_NONE;
725 rtx sym, addend;
727 if (GET_CODE (addr) == CONST)
729 split_const (addr, &sym, &addend);
730 if (GET_CODE (sym) == SYMBOL_REF)
731 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
733 else if (GET_CODE (addr) == SYMBOL_REF)
734 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
736 return tls_kind;
739 /* We allow LO_SUMs in our legitimate addresses so that combine can
740 take care of combining addresses where necessary, but for generation
741 purposes we generate the address as:
743 RTL Absolute
744 tmp = hi (symbol_ref); adrp x1, foo
745 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
748 PIC TLS
749 adrp x1, :got:foo adrp tmp, :tlsgd:foo
750 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
751 bl __tls_get_addr
754 Load TLS symbol, depending on TLS mechanism and TLS access model.
756 Global Dynamic - Traditional TLS:
757 adrp tmp, :tlsgd:imm
758 add dest, tmp, #:tlsgd_lo12:imm
759 bl __tls_get_addr
761 Global Dynamic - TLS Descriptors:
762 adrp dest, :tlsdesc:imm
763 ldr tmp, [dest, #:tlsdesc_lo12:imm]
764 add dest, dest, #:tlsdesc_lo12:imm
765 blr tmp
766 mrs tp, tpidr_el0
767 add dest, dest, tp
769 Initial Exec:
770 mrs tp, tpidr_el0
771 adrp tmp, :gottprel:imm
772 ldr dest, [tmp, #:gottprel_lo12:imm]
773 add dest, dest, tp
775 Local Exec:
776 mrs tp, tpidr_el0
777 add t0, tp, #:tprel_hi12:imm, lsl #12
778 add t0, t0, #:tprel_lo12_nc:imm
781 static void
782 aarch64_load_symref_appropriately (rtx dest, rtx imm,
783 enum aarch64_symbol_type type)
785 switch (type)
787 case SYMBOL_SMALL_ABSOLUTE:
789 /* In ILP32, the mode of dest can be either SImode or DImode. */
790 rtx tmp_reg = dest;
791 machine_mode mode = GET_MODE (dest);
793 gcc_assert (mode == Pmode || mode == ptr_mode);
795 if (can_create_pseudo_p ())
796 tmp_reg = gen_reg_rtx (mode);
798 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
799 emit_insn (gen_add_losym (dest, tmp_reg, imm));
800 return;
803 case SYMBOL_TINY_ABSOLUTE:
804 emit_insn (gen_rtx_SET (Pmode, dest, imm));
805 return;
807 case SYMBOL_SMALL_GOT:
809 /* In ILP32, the mode of dest can be either SImode or DImode,
810 while the got entry is always of SImode size. The mode of
811 dest depends on how dest is used: if dest is assigned to a
812 pointer (e.g. in the memory), it has SImode; it may have
813 DImode if dest is dereferenced to access the memory.
814 This is why we have to handle three different ldr_got_small
815 patterns here (two patterns for ILP32). */
816 rtx tmp_reg = dest;
817 machine_mode mode = GET_MODE (dest);
819 if (can_create_pseudo_p ())
820 tmp_reg = gen_reg_rtx (mode);
822 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
823 if (mode == ptr_mode)
825 if (mode == DImode)
826 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
827 else
828 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
830 else
832 gcc_assert (mode == Pmode);
833 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
836 return;
839 case SYMBOL_SMALL_TLSGD:
841 rtx_insn *insns;
842 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
844 start_sequence ();
845 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
846 insns = get_insns ();
847 end_sequence ();
849 RTL_CONST_CALL_P (insns) = 1;
850 emit_libcall_block (insns, dest, result, imm);
851 return;
854 case SYMBOL_SMALL_TLSDESC:
856 machine_mode mode = GET_MODE (dest);
857 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
858 rtx tp;
860 gcc_assert (mode == Pmode || mode == ptr_mode);
862 /* In ILP32, the got entry is always of SImode size. Unlike
863 small GOT, the dest is fixed at reg 0. */
864 if (TARGET_ILP32)
865 emit_insn (gen_tlsdesc_small_si (imm));
866 else
867 emit_insn (gen_tlsdesc_small_di (imm));
868 tp = aarch64_load_tp (NULL);
870 if (mode != Pmode)
871 tp = gen_lowpart (mode, tp);
873 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
874 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
875 return;
878 case SYMBOL_SMALL_GOTTPREL:
880 /* In ILP32, the mode of dest can be either SImode or DImode,
881 while the got entry is always of SImode size. The mode of
882 dest depends on how dest is used: if dest is assigned to a
883 pointer (e.g. in the memory), it has SImode; it may have
884 DImode if dest is dereferenced to access the memory.
885 This is why we have to handle three different tlsie_small
886 patterns here (two patterns for ILP32). */
887 machine_mode mode = GET_MODE (dest);
888 rtx tmp_reg = gen_reg_rtx (mode);
889 rtx tp = aarch64_load_tp (NULL);
891 if (mode == ptr_mode)
893 if (mode == DImode)
894 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
895 else
897 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
898 tp = gen_lowpart (mode, tp);
901 else
903 gcc_assert (mode == Pmode);
904 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
907 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
908 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
909 return;
912 case SYMBOL_SMALL_TPREL:
914 rtx tp = aarch64_load_tp (NULL);
916 if (GET_MODE (dest) != Pmode)
917 tp = gen_lowpart (GET_MODE (dest), tp);
919 emit_insn (gen_tlsle_small (dest, tp, imm));
920 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
921 return;
924 case SYMBOL_TINY_GOT:
925 emit_insn (gen_ldr_got_tiny (dest, imm));
926 return;
928 default:
929 gcc_unreachable ();
933 /* Emit a move from SRC to DEST. Assume that the move expanders can
934 handle all moves if !can_create_pseudo_p (). The distinction is
935 important because, unlike emit_move_insn, the move expanders know
936 how to force Pmode objects into the constant pool even when the
937 constant pool address is not itself legitimate. */
938 static rtx
939 aarch64_emit_move (rtx dest, rtx src)
941 return (can_create_pseudo_p ()
942 ? emit_move_insn (dest, src)
943 : emit_move_insn_1 (dest, src));
946 /* Split a 128-bit move operation into two 64-bit move operations,
947 taking care to handle partial overlap of register to register
948 copies. Special cases are needed when moving between GP regs and
949 FP regs. SRC can be a register, constant or memory; DST a register
950 or memory. If either operand is memory it must not have any side
951 effects. */
952 void
953 aarch64_split_128bit_move (rtx dst, rtx src)
955 rtx dst_lo, dst_hi;
956 rtx src_lo, src_hi;
958 machine_mode mode = GET_MODE (dst);
960 gcc_assert (mode == TImode || mode == TFmode);
961 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
962 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
964 if (REG_P (dst) && REG_P (src))
966 int src_regno = REGNO (src);
967 int dst_regno = REGNO (dst);
969 /* Handle FP <-> GP regs. */
970 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
972 src_lo = gen_lowpart (word_mode, src);
973 src_hi = gen_highpart (word_mode, src);
975 if (mode == TImode)
977 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
978 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
980 else
982 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
983 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
985 return;
987 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
989 dst_lo = gen_lowpart (word_mode, dst);
990 dst_hi = gen_highpart (word_mode, dst);
992 if (mode == TImode)
994 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
995 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
997 else
999 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1000 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1002 return;
1006 dst_lo = gen_lowpart (word_mode, dst);
1007 dst_hi = gen_highpart (word_mode, dst);
1008 src_lo = gen_lowpart (word_mode, src);
1009 src_hi = gen_highpart_mode (word_mode, mode, src);
1011 /* At most one pairing may overlap. */
1012 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1014 aarch64_emit_move (dst_hi, src_hi);
1015 aarch64_emit_move (dst_lo, src_lo);
1017 else
1019 aarch64_emit_move (dst_lo, src_lo);
1020 aarch64_emit_move (dst_hi, src_hi);
1024 bool
1025 aarch64_split_128bit_move_p (rtx dst, rtx src)
1027 return (! REG_P (src)
1028 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1031 /* Split a complex SIMD combine. */
1033 void
1034 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1036 machine_mode src_mode = GET_MODE (src1);
1037 machine_mode dst_mode = GET_MODE (dst);
1039 gcc_assert (VECTOR_MODE_P (dst_mode));
1041 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1043 rtx (*gen) (rtx, rtx, rtx);
1045 switch (src_mode)
1047 case V8QImode:
1048 gen = gen_aarch64_simd_combinev8qi;
1049 break;
1050 case V4HImode:
1051 gen = gen_aarch64_simd_combinev4hi;
1052 break;
1053 case V2SImode:
1054 gen = gen_aarch64_simd_combinev2si;
1055 break;
1056 case V2SFmode:
1057 gen = gen_aarch64_simd_combinev2sf;
1058 break;
1059 case DImode:
1060 gen = gen_aarch64_simd_combinedi;
1061 break;
1062 case DFmode:
1063 gen = gen_aarch64_simd_combinedf;
1064 break;
1065 default:
1066 gcc_unreachable ();
1069 emit_insn (gen (dst, src1, src2));
1070 return;
1074 /* Split a complex SIMD move. */
1076 void
1077 aarch64_split_simd_move (rtx dst, rtx src)
1079 machine_mode src_mode = GET_MODE (src);
1080 machine_mode dst_mode = GET_MODE (dst);
1082 gcc_assert (VECTOR_MODE_P (dst_mode));
1084 if (REG_P (dst) && REG_P (src))
1086 rtx (*gen) (rtx, rtx);
1088 gcc_assert (VECTOR_MODE_P (src_mode));
1090 switch (src_mode)
1092 case V16QImode:
1093 gen = gen_aarch64_split_simd_movv16qi;
1094 break;
1095 case V8HImode:
1096 gen = gen_aarch64_split_simd_movv8hi;
1097 break;
1098 case V4SImode:
1099 gen = gen_aarch64_split_simd_movv4si;
1100 break;
1101 case V2DImode:
1102 gen = gen_aarch64_split_simd_movv2di;
1103 break;
1104 case V4SFmode:
1105 gen = gen_aarch64_split_simd_movv4sf;
1106 break;
1107 case V2DFmode:
1108 gen = gen_aarch64_split_simd_movv2df;
1109 break;
1110 default:
1111 gcc_unreachable ();
1114 emit_insn (gen (dst, src));
1115 return;
1119 static rtx
1120 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1122 if (can_create_pseudo_p ())
1123 return force_reg (mode, value);
1124 else
1126 x = aarch64_emit_move (x, value);
1127 return x;
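/* Return an rtx for REG + OFFSET in MODE.  If OFFSET is not a valid
   ADD/SUB immediate (a 12-bit value, optionally shifted left by 12
   bits), it is first loaded into a register (TEMP, or a fresh pseudo
   when one can be created) and added from there; an offset such as
   0x12345, for instance, cannot be encoded directly.  */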
1132 static rtx
1133 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1135 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1137 rtx high;
1138 /* Load the full offset into a register. This
1139 might be improvable in the future. */
1140 high = GEN_INT (offset);
1141 offset = 0;
1142 high = aarch64_force_temporary (mode, temp, high);
1143 reg = aarch64_force_temporary (mode, temp,
1144 gen_rtx_PLUS (mode, high, reg));
1146 return plus_constant (mode, reg, offset);
1149 static int
1150 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1151 machine_mode mode)
1153 unsigned HOST_WIDE_INT mask;
1154 int i;
1155 bool first;
1156 unsigned HOST_WIDE_INT val;
1157 bool subtargets;
1158 rtx subtarget;
1159 int one_match, zero_match, first_not_ffff_match;
1160 int num_insns = 0;
1162 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1164 if (generate)
1165 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1166 num_insns++;
1167 return num_insns;
1170 if (mode == SImode)
1172 /* We know we can't do this in 1 insn, and we must be able to do it
1173 in two; so don't mess around looking for sequences that don't buy
1174 us anything. */
1175 if (generate)
1177 emit_insn (gen_rtx_SET (VOIDmode, dest,
1178 GEN_INT (INTVAL (imm) & 0xffff)));
1179 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1180 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1182 num_insns += 2;
1183 return num_insns;
1186 /* Remaining cases are all for DImode. */
1188 val = INTVAL (imm);
1189 subtargets = optimize && can_create_pseudo_p ();
1191 one_match = 0;
1192 zero_match = 0;
1193 mask = 0xffff;
1194 first_not_ffff_match = -1;
1196 for (i = 0; i < 64; i += 16, mask <<= 16)
1198 if ((val & mask) == mask)
1199 one_match++;
1200 else
1202 if (first_not_ffff_match < 0)
1203 first_not_ffff_match = i;
1204 if ((val & mask) == 0)
1205 zero_match++;
1209 if (one_match == 2)
1211 /* Set one of the quarters and then insert back into result. */
1212 mask = 0xffffll << first_not_ffff_match;
1213 if (generate)
1215 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1216 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1217 GEN_INT ((val >> first_not_ffff_match)
1218 & 0xffff)));
1220 num_insns += 2;
1221 return num_insns;
1224 if (zero_match == 2)
1225 goto simple_sequence;
1227 mask = 0x0ffff0000UL;
1228 for (i = 16; i < 64; i += 16, mask <<= 16)
1230 HOST_WIDE_INT comp = mask & ~(mask - 1);
1232 if (aarch64_uimm12_shift (val - (val & mask)))
1234 if (generate)
1236 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1237 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1238 GEN_INT (val & mask)));
1239 emit_insn (gen_adddi3 (dest, subtarget,
1240 GEN_INT (val - (val & mask))));
1242 num_insns += 2;
1243 return num_insns;
1245 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1247 if (generate)
1249 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1250 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1251 GEN_INT ((val + comp) & mask)));
1252 emit_insn (gen_adddi3 (dest, subtarget,
1253 GEN_INT (val - ((val + comp) & mask))));
1255 num_insns += 2;
1256 return num_insns;
1258 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1260 if (generate)
1262 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1263 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1264 GEN_INT ((val - comp) | ~mask)));
1265 emit_insn (gen_adddi3 (dest, subtarget,
1266 GEN_INT (val - ((val - comp) | ~mask))));
1268 num_insns += 2;
1269 return num_insns;
1271 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1273 if (generate)
1275 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1276 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1277 GEN_INT (val | ~mask)));
1278 emit_insn (gen_adddi3 (dest, subtarget,
1279 GEN_INT (val - (val | ~mask))));
1281 num_insns += 2;
1282 return num_insns;
1286 /* See if we can do it by arithmetically combining two
1287 immediates. */
1288 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1290 int j;
1291 mask = 0xffff;
1293 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1294 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1296 if (generate)
1298 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1299 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1300 GEN_INT (aarch64_bitmasks[i])));
1301 emit_insn (gen_adddi3 (dest, subtarget,
1302 GEN_INT (val - aarch64_bitmasks[i])));
1304 num_insns += 2;
1305 return num_insns;
1308 for (j = 0; j < 64; j += 16, mask <<= 16)
1310 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1312 if (generate)
1314 emit_insn (gen_rtx_SET (VOIDmode, dest,
1315 GEN_INT (aarch64_bitmasks[i])));
1316 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1317 GEN_INT ((val >> j) & 0xffff)));
1319 num_insns += 2;
1320 return num_insns;
1325 /* See if we can do it by logically combining two immediates. */
1326 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1328 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1330 int j;
1332 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1333 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1335 if (generate)
1337 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1338 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1339 GEN_INT (aarch64_bitmasks[i])));
1340 emit_insn (gen_iordi3 (dest, subtarget,
1341 GEN_INT (aarch64_bitmasks[j])));
1343 num_insns += 2;
1344 return num_insns;
1347 else if ((val & aarch64_bitmasks[i]) == val)
1349 int j;
1351 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1352 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1354 if (generate)
1356 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1357 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1358 GEN_INT (aarch64_bitmasks[j])));
1359 emit_insn (gen_anddi3 (dest, subtarget,
1360 GEN_INT (aarch64_bitmasks[i])));
1362 num_insns += 2;
1363 return num_insns;
1368 if (one_match > zero_match)
1370 /* Set either first three quarters or all but the third. */
1371 mask = 0xffffll << (16 - first_not_ffff_match);
1372 if (generate)
1373 emit_insn (gen_rtx_SET (VOIDmode, dest,
1374 GEN_INT (val | mask | 0xffffffff00000000ull)));
1375 num_insns ++;
1377 /* Now insert other two quarters. */
1378 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1379 i < 64; i += 16, mask <<= 16)
1381 if ((val & mask) != mask)
1383 if (generate)
1384 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1385 GEN_INT ((val >> i) & 0xffff)));
1386 num_insns ++;
1389 return num_insns;
1392 simple_sequence:
1393 first = true;
1394 mask = 0xffff;
1395 for (i = 0; i < 64; i += 16, mask <<= 16)
1397 if ((val & mask) != 0)
1399 if (first)
1401 if (generate)
1402 emit_insn (gen_rtx_SET (VOIDmode, dest,
1403 GEN_INT (val & mask)));
1404 num_insns ++;
1405 first = false;
1407 else
1409 if (generate)
1410 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1411 GEN_INT ((val >> i) & 0xffff)));
1412 num_insns ++;
1417 return num_insns;
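/* As a worked example of the simple_sequence fallback above, the
   constant 0x123456789abcdef0 would be built 16 bits at a time:
     mov  dest, #0xdef0
     movk dest, #0x9abc, lsl #16
     movk dest, #0x5678, lsl #32
     movk dest, #0x1234, lsl #48
   Constants whose 16-bit quarters repeat 0x0000 or 0xffff are caught
   by the cheaper special cases earlier in the function.  */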
1421 void
1422 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1424 machine_mode mode = GET_MODE (dest);
1426 gcc_assert (mode == SImode || mode == DImode);
1428 /* Check on what type of symbol it is. */
1429 if (GET_CODE (imm) == SYMBOL_REF
1430 || GET_CODE (imm) == LABEL_REF
1431 || GET_CODE (imm) == CONST)
1433 rtx mem, base, offset;
1434 enum aarch64_symbol_type sty;
1436 /* If we have (const (plus symbol offset)), separate out the offset
1437 before we start classifying the symbol. */
1438 split_const (imm, &base, &offset);
1440 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1441 switch (sty)
1443 case SYMBOL_FORCE_TO_MEM:
1444 if (offset != const0_rtx
1445 && targetm.cannot_force_const_mem (mode, imm))
1447 gcc_assert (can_create_pseudo_p ());
1448 base = aarch64_force_temporary (mode, dest, base);
1449 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1450 aarch64_emit_move (dest, base);
1451 return;
1453 mem = force_const_mem (ptr_mode, imm);
1454 gcc_assert (mem);
1455 if (mode != ptr_mode)
1456 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1457 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1458 return;
1460 case SYMBOL_SMALL_TLSGD:
1461 case SYMBOL_SMALL_TLSDESC:
1462 case SYMBOL_SMALL_GOTTPREL:
1463 case SYMBOL_SMALL_GOT:
1464 case SYMBOL_TINY_GOT:
1465 if (offset != const0_rtx)
1467 gcc_assert(can_create_pseudo_p ());
1468 base = aarch64_force_temporary (mode, dest, base);
1469 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1470 aarch64_emit_move (dest, base);
1471 return;
1473 /* FALLTHRU */
1475 case SYMBOL_SMALL_TPREL:
1476 case SYMBOL_SMALL_ABSOLUTE:
1477 case SYMBOL_TINY_ABSOLUTE:
1478 aarch64_load_symref_appropriately (dest, imm, sty);
1479 return;
1481 default:
1482 gcc_unreachable ();
1486 if (!CONST_INT_P (imm))
1488 if (GET_CODE (imm) == HIGH)
1489 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1490 else
1492 rtx mem = force_const_mem (mode, imm);
1493 gcc_assert (mem);
1494 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1497 return;
1500 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1503 static bool
1504 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1505 tree exp ATTRIBUTE_UNUSED)
1507 /* Currently, always true. */
1508 return true;
1511 /* Implement TARGET_PASS_BY_REFERENCE. */
1513 static bool
1514 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1515 machine_mode mode,
1516 const_tree type,
1517 bool named ATTRIBUTE_UNUSED)
1519 HOST_WIDE_INT size;
1520 machine_mode dummymode;
1521 int nregs;
1523 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1524 size = (mode == BLKmode && type)
1525 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1527 /* Aggregates are passed by reference based on their size. */
1528 if (type && AGGREGATE_TYPE_P (type))
1530 size = int_size_in_bytes (type);
1533 /* Variable sized arguments are always passed by reference. */
1534 if (size < 0)
1535 return true;
1537 /* Can this be a candidate to be passed in fp/simd register(s)? */
1538 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1539 &dummymode, &nregs,
1540 NULL))
1541 return false;
1543 /* Arguments which are variable sized or larger than 2 registers are
1544 passed by reference unless they are a homogeneous floating-point
1545 aggregate. */
1546 return size > 2 * UNITS_PER_WORD;
1549 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1550 static bool
1551 aarch64_return_in_msb (const_tree valtype)
1553 machine_mode dummy_mode;
1554 int dummy_int;
1556 /* Never happens in little-endian mode. */
1557 if (!BYTES_BIG_ENDIAN)
1558 return false;
1560 /* Only composite types smaller than or equal to 16 bytes can
1561 be potentially returned in registers. */
1562 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1563 || int_size_in_bytes (valtype) <= 0
1564 || int_size_in_bytes (valtype) > 16)
1565 return false;
1567 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1568 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1569 is always passed/returned in the least significant bits of fp/simd
1570 register(s). */
1571 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1572 &dummy_mode, &dummy_int, NULL))
1573 return false;
1575 return true;
1578 /* Implement TARGET_FUNCTION_VALUE.
1579 Define how to find the value returned by a function. */
1581 static rtx
1582 aarch64_function_value (const_tree type, const_tree func,
1583 bool outgoing ATTRIBUTE_UNUSED)
1585 machine_mode mode;
1586 int unsignedp;
1587 int count;
1588 machine_mode ag_mode;
1590 mode = TYPE_MODE (type);
1591 if (INTEGRAL_TYPE_P (type))
1592 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1594 if (aarch64_return_in_msb (type))
1596 HOST_WIDE_INT size = int_size_in_bytes (type);
1598 if (size % UNITS_PER_WORD != 0)
1600 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1601 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1605 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1606 &ag_mode, &count, NULL))
1608 if (!aarch64_composite_type_p (type, mode))
1610 gcc_assert (count == 1 && mode == ag_mode);
1611 return gen_rtx_REG (mode, V0_REGNUM);
1613 else
1615 int i;
1616 rtx par;
1618 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1619 for (i = 0; i < count; i++)
1621 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1622 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1623 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1624 XVECEXP (par, 0, i) = tmp;
1626 return par;
1629 else
1630 return gen_rtx_REG (mode, R0_REGNUM);
1633 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1634 Return true if REGNO is the number of a hard register in which the values
1635 of called function may come back. */
1637 static bool
1638 aarch64_function_value_regno_p (const unsigned int regno)
1640 /* Maximum of 16 bytes can be returned in the general registers. Examples
1641 of 16-byte return values are: 128-bit integers and 16-byte small
1642 structures (excluding homogeneous floating-point aggregates). */
1643 if (regno == R0_REGNUM || regno == R1_REGNUM)
1644 return true;
1646 /* Up to four fp/simd registers can return a function value, e.g. a
1647 homogeneous floating-point aggregate having four members. */
1648 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1649 return !TARGET_GENERAL_REGS_ONLY;
1651 return false;
1654 /* Implement TARGET_RETURN_IN_MEMORY.
1656 If the type T of the result of a function is such that
1657 void func (T arg)
1658 would require that arg be passed as a value in a register (or set of
1659 registers) according to the parameter passing rules, then the result
1660 is returned in the same registers as would be used for such an
1661 argument. */
1663 static bool
1664 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1666 HOST_WIDE_INT size;
1667 machine_mode ag_mode;
1668 int count;
1670 if (!AGGREGATE_TYPE_P (type)
1671 && TREE_CODE (type) != COMPLEX_TYPE
1672 && TREE_CODE (type) != VECTOR_TYPE)
1673 /* Simple scalar types are always returned in registers. */
1674 return false;
1676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1677 type,
1678 &ag_mode,
1679 &count,
1680 NULL))
1681 return false;
1683 /* Types larger than 2 registers are returned in memory. */
1684 size = int_size_in_bytes (type);
1685 return (size < 0 || size > 2 * UNITS_PER_WORD);
1688 static bool
1689 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1690 const_tree type, int *nregs)
1692 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1693 return aarch64_vfp_is_call_or_return_candidate (mode,
1694 type,
1695 &pcum->aapcs_vfp_rmode,
1696 nregs,
1697 NULL);
1700 /* Given MODE and TYPE of a function argument, return the alignment in
1701 bits. The idea is to suppress any stronger alignment requested by
1702 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1703 This is a helper function for local use only. */
1705 static unsigned int
1706 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1708 unsigned int alignment;
1710 if (type)
1712 if (!integer_zerop (TYPE_SIZE (type)))
1714 if (TYPE_MODE (type) == mode)
1715 alignment = TYPE_ALIGN (type);
1716 else
1717 alignment = GET_MODE_ALIGNMENT (mode);
1719 else
1720 alignment = 0;
1722 else
1723 alignment = GET_MODE_ALIGNMENT (mode);
1725 return alignment;
1728 /* Layout a function argument according to the AAPCS64 rules. The rule
1729 numbers refer to the rule numbers in the AAPCS64. */
1731 static void
1732 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1733 const_tree type,
1734 bool named ATTRIBUTE_UNUSED)
1736 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1737 int ncrn, nvrn, nregs;
1738 bool allocate_ncrn, allocate_nvrn;
1739 HOST_WIDE_INT size;
1741 /* We need to do this once per argument. */
1742 if (pcum->aapcs_arg_processed)
1743 return;
1745 pcum->aapcs_arg_processed = true;
1747 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
1748 size
1749 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1750 UNITS_PER_WORD);
1752 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1753 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1754 mode,
1755 type,
1756 &nregs);
1758 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1759 The following code thus handles passing by SIMD/FP registers first. */
1761 nvrn = pcum->aapcs_nvrn;
1763 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1764 and homogeneous short-vector aggregates (HVA). */
1765 if (allocate_nvrn)
1767 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1769 pcum->aapcs_nextnvrn = nvrn + nregs;
1770 if (!aarch64_composite_type_p (type, mode))
1772 gcc_assert (nregs == 1);
1773 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1775 else
1777 rtx par;
1778 int i;
1779 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1780 for (i = 0; i < nregs; i++)
1782 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1783 V0_REGNUM + nvrn + i);
1784 tmp = gen_rtx_EXPR_LIST
1785 (VOIDmode, tmp,
1786 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1787 XVECEXP (par, 0, i) = tmp;
1789 pcum->aapcs_reg = par;
1791 return;
1793 else
1795 /* C.3 NSRN is set to 8. */
1796 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1797 goto on_stack;
1801 ncrn = pcum->aapcs_ncrn;
1802 nregs = size / UNITS_PER_WORD;
1804 /* C6 - C9, though the sign and zero extension semantics are
1805 handled elsewhere. This is the case where the argument fits
1806 entirely in general registers. */
1807 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1809 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1811 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1813 /* C.8 if the argument has an alignment of 16 then the NGRN is
1814 rounded up to the next even number. */
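/* For example, a 16-byte-aligned __int128 argument that would
   otherwise start in an odd-numbered register (say x1) skips one
   register and is passed in the x2/x3 pair instead.  */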
1815 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1817 ++ncrn;
1818 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1820 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1821 A reg is still generated for it, but the caller should be smart
1822 enough not to use it. */
1823 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1825 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1827 else
1829 rtx par;
1830 int i;
1832 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1833 for (i = 0; i < nregs; i++)
1835 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1836 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1837 GEN_INT (i * UNITS_PER_WORD));
1838 XVECEXP (par, 0, i) = tmp;
1840 pcum->aapcs_reg = par;
1843 pcum->aapcs_nextncrn = ncrn + nregs;
1844 return;
1847 /* C.11 */
1848 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1850 /* The argument is passed on stack; record the needed number of words for
1851 this argument and align the total size if necessary. */
1852 on_stack:
1853 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1854 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1855 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1856 16 / UNITS_PER_WORD);
1857 return;
1860 /* Implement TARGET_FUNCTION_ARG. */
1862 static rtx
1863 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1864 const_tree type, bool named)
1866 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1867 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1869 if (mode == VOIDmode)
1870 return NULL_RTX;
1872 aarch64_layout_arg (pcum_v, mode, type, named);
1873 return pcum->aapcs_reg;
1876 void
1877 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1878 const_tree fntype ATTRIBUTE_UNUSED,
1879 rtx libname ATTRIBUTE_UNUSED,
1880 const_tree fndecl ATTRIBUTE_UNUSED,
1881 unsigned n_named ATTRIBUTE_UNUSED)
1883 pcum->aapcs_ncrn = 0;
1884 pcum->aapcs_nvrn = 0;
1885 pcum->aapcs_nextncrn = 0;
1886 pcum->aapcs_nextnvrn = 0;
1887 pcum->pcs_variant = ARM_PCS_AAPCS64;
1888 pcum->aapcs_reg = NULL_RTX;
1889 pcum->aapcs_arg_processed = false;
1890 pcum->aapcs_stack_words = 0;
1891 pcum->aapcs_stack_size = 0;
1893 return;
1896 static void
1897 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1898 machine_mode mode,
1899 const_tree type,
1900 bool named)
1902 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1903 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1905 aarch64_layout_arg (pcum_v, mode, type, named);
1906 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1907 != (pcum->aapcs_stack_words != 0));
1908 pcum->aapcs_arg_processed = false;
1909 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1910 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1911 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1912 pcum->aapcs_stack_words = 0;
1913 pcum->aapcs_reg = NULL_RTX;
1917 bool
1918 aarch64_function_arg_regno_p (unsigned regno)
1920 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1921 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1924 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1925 PARM_BOUNDARY bits of alignment, but will be given anything up
1926 to STACK_BOUNDARY bits if the type requires it. This makes sure
1927 that both before and after the layout of each argument, the Next
1928 Stacked Argument Address (NSAA) will have a minimum alignment of
1929 8 bytes. */
1931 static unsigned int
1932 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1934 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1936 if (alignment < PARM_BOUNDARY)
1937 alignment = PARM_BOUNDARY;
1938 if (alignment > STACK_BOUNDARY)
1939 alignment = STACK_BOUNDARY;
1940 return alignment;
1943 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1945 Return true if an argument passed on the stack should be padded upwards,
1946 i.e. if the least-significant byte of the stack slot has useful data.
1948 Small aggregate types are placed in the lowest memory address.
1950 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1952 bool
1953 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1955 /* On little-endian targets, the least significant byte of every stack
1956 argument is passed at the lowest byte address of the stack slot. */
1957 if (!BYTES_BIG_ENDIAN)
1958 return true;
1960 /* Otherwise, integral, floating-point and pointer types are padded downward:
1961 the least significant byte of a stack argument is passed at the highest
1962 byte address of the stack slot. */
1963 if (type
1964 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1965 || POINTER_TYPE_P (type))
1966 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1967 return false;
1969 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1970 return true;
1973 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1975 It specifies padding for the last (and possibly the only)
1976 element of a block move between registers and memory. Viewing
1977 the block as it sits in memory, padding upward means that the
1978 last element is padded after its most significant byte, while
1979 with downward padding the last element is padded on its least
1980 significant byte side.
1982 Small aggregates and small complex types are always padded
1983 upwards.
1985 We don't need to worry about homogeneous floating-point or
1986 short-vector aggregates; their move is not affected by the
1987 padding direction determined here. Regardless of endianness,
1988 each element of such an aggregate is put in the least
1989 significant bits of a fp/simd register.
1991 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1992 register has useful data, and return the opposite if the most
1993 significant byte does. */
1995 bool
1996 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
1997 bool first ATTRIBUTE_UNUSED)
2000 /* Small composite types are always padded upward. */
2001 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2003 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2004 : GET_MODE_SIZE (mode));
2005 if (size < 2 * UNITS_PER_WORD)
2006 return true;
2009 /* Otherwise, use the default padding. */
2010 return !BYTES_BIG_ENDIAN;
2013 static machine_mode
2014 aarch64_libgcc_cmp_return_mode (void)
2016 return SImode;
2019 static bool
2020 aarch64_frame_pointer_required (void)
2022 /* In aarch64_override_options_after_change
2023 flag_omit_leaf_frame_pointer turns off the frame pointer by
2024 default. Turn it back on now if we've not got a leaf
2025 function. */
2026 if (flag_omit_leaf_frame_pointer
2027 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2028 return true;
2030 return false;
2033 /* Mark the registers that need to be saved by the callee and calculate
2034 the size of the callee-saved registers area and frame record (both FP
2035 and LR may be omitted). */
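/* In outline (see the calculations at the end of this function):
   hard_fp_offset is the rounded total of the va_args save area, the
   local frame and the callee-saved register area, and frame_size adds
   the outgoing argument area on top of that, again rounded up to the
   stack boundary.  */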
2036 static void
2037 aarch64_layout_frame (void)
2039 HOST_WIDE_INT offset = 0;
2040 int regno;
2042 if (reload_completed && cfun->machine->frame.laid_out)
2043 return;
2045 #define SLOT_NOT_REQUIRED (-2)
2046 #define SLOT_REQUIRED (-1)
2048 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2049 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2051 /* First mark all the registers that really need to be saved... */
2052 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2053 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2055 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2056 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2058 /* ... that includes the eh data registers (if needed)... */
2059 if (crtl->calls_eh_return)
2060 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2061 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2062 = SLOT_REQUIRED;
2064 /* ... and any callee saved register that dataflow says is live. */
2065 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2066 if (df_regs_ever_live_p (regno)
2067 && (regno == R30_REGNUM
2068 || !call_used_regs[regno]))
2069 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2071 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2072 if (df_regs_ever_live_p (regno)
2073 && !call_used_regs[regno])
2074 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2076 if (frame_pointer_needed)
2078 /* FP and LR are placed in the linkage record. */
2079 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2080 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2081 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2082 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2083 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2084 offset += 2 * UNITS_PER_WORD;
2087 /* Now assign stack slots for them. */
2088 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2089 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2091 cfun->machine->frame.reg_offset[regno] = offset;
2092 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2093 cfun->machine->frame.wb_candidate1 = regno;
2094 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2095 cfun->machine->frame.wb_candidate2 = regno;
2096 offset += UNITS_PER_WORD;
2099 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2100 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2102 cfun->machine->frame.reg_offset[regno] = offset;
2103 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2104 cfun->machine->frame.wb_candidate1 = regno;
2105 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2106 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2107 cfun->machine->frame.wb_candidate2 = regno;
2108 offset += UNITS_PER_WORD;
2111 cfun->machine->frame.padding0 =
2112 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2113 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2115 cfun->machine->frame.saved_regs_size = offset;
2117 cfun->machine->frame.hard_fp_offset
2118 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2119 + get_frame_size ()
2120 + cfun->machine->frame.saved_regs_size,
2121 STACK_BOUNDARY / BITS_PER_UNIT);
2123 cfun->machine->frame.frame_size
2124 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2125 + crtl->outgoing_args_size,
2126 STACK_BOUNDARY / BITS_PER_UNIT);
2128 cfun->machine->frame.laid_out = true;
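/* A minimal standalone sketch of the rounding used above: the callee-save
   area, hard_fp_offset and frame_size are all rounded up to
   STACK_BOUNDARY / BITS_PER_UNIT, which is 16 bytes on AArch64.  The helper
   name and plain C types are illustrative only, not part of the backend.  */
static unsigned long long
example_round_up_to_stack_boundary (unsigned long long bytes)
{
  const unsigned long long align = 16;  /* STACK_BOUNDARY / BITS_PER_UNIT.  */
  /* E.g. 40 -> 48, 48 -> 48; padding0 is the difference between the
     rounded and unrounded sizes.  */
  return (bytes + align - 1) & ~(align - 1);
}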
2131 static bool
2132 aarch64_register_saved_on_entry (int regno)
2134 return cfun->machine->frame.reg_offset[regno] >= 0;
2137 static unsigned
2138 aarch64_next_callee_save (unsigned regno, unsigned limit)
2140 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2141 regno ++;
2142 return regno;
2145 static void
2146 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2147 HOST_WIDE_INT adjustment)
2149 rtx base_rtx = stack_pointer_rtx;
2150 rtx insn, reg, mem;
2152 reg = gen_rtx_REG (mode, regno);
2153 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2154 plus_constant (Pmode, base_rtx, -adjustment));
2155 mem = gen_rtx_MEM (mode, mem);
2157 insn = emit_move_insn (mem, reg);
2158 RTX_FRAME_RELATED_P (insn) = 1;
2161 static rtx
2162 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2163 HOST_WIDE_INT adjustment)
2165 switch (mode)
2167 case DImode:
2168 return gen_storewb_pairdi_di (base, base, reg, reg2,
2169 GEN_INT (-adjustment),
2170 GEN_INT (UNITS_PER_WORD - adjustment));
2171 case DFmode:
2172 return gen_storewb_pairdf_di (base, base, reg, reg2,
2173 GEN_INT (-adjustment),
2174 GEN_INT (UNITS_PER_WORD - adjustment));
2175 default:
2176 gcc_unreachable ();
2180 static void
2181 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2182 unsigned regno2, HOST_WIDE_INT adjustment)
2184 rtx_insn *insn;
2185 rtx reg1 = gen_rtx_REG (mode, regno1);
2186 rtx reg2 = gen_rtx_REG (mode, regno2);
2188 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2189 reg2, adjustment));
2190 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2191 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2192 RTX_FRAME_RELATED_P (insn) = 1;
2195 static rtx
2196 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2197 HOST_WIDE_INT adjustment)
2199 switch (mode)
2201 case DImode:
2202 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2203 GEN_INT (UNITS_PER_WORD));
2204 case DFmode:
2205 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2206 GEN_INT (UNITS_PER_WORD));
2207 default:
2208 gcc_unreachable ();
2212 static rtx
2213 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2214 rtx reg2)
2216 switch (mode)
2218 case DImode:
2219 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2221 case DFmode:
2222 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2224 default:
2225 gcc_unreachable ();
2229 static rtx
2230 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2231 rtx mem2)
2233 switch (mode)
2235 case DImode:
2236 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2238 case DFmode:
2239 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2241 default:
2242 gcc_unreachable ();
2247 static void
2248 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2249 unsigned start, unsigned limit, bool skip_wb)
2251 rtx_insn *insn;
2252 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2253 ? gen_frame_mem : gen_rtx_MEM);
2254 unsigned regno;
2255 unsigned regno2;
2257 for (regno = aarch64_next_callee_save (start, limit);
2258 regno <= limit;
2259 regno = aarch64_next_callee_save (regno + 1, limit))
2261 rtx reg, mem;
2262 HOST_WIDE_INT offset;
2264 if (skip_wb
2265 && (regno == cfun->machine->frame.wb_candidate1
2266 || regno == cfun->machine->frame.wb_candidate2))
2267 continue;
2269 reg = gen_rtx_REG (mode, regno);
2270 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2271 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2272 offset));
2274 regno2 = aarch64_next_callee_save (regno + 1, limit);
2276 if (regno2 <= limit
2277 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2278 == cfun->machine->frame.reg_offset[regno2]))
2281 rtx reg2 = gen_rtx_REG (mode, regno2);
2282 rtx mem2;
2284 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2285 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2286 offset));
2287 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2288 reg2));
2290 /* The first part of a frame-related parallel insn is
2291 always assumed to be relevant to the frame
2292 calculations; subsequent parts are only
2293 frame-related if explicitly marked. */
2294 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2295 regno = regno2;
2297 else
2298 insn = emit_move_insn (mem, reg);
2300 RTX_FRAME_RELATED_P (insn) = 1;
2304 static void
2305 aarch64_restore_callee_saves (machine_mode mode,
2306 HOST_WIDE_INT start_offset, unsigned start,
2307 unsigned limit, bool skip_wb, rtx *cfi_ops)
2309 rtx base_rtx = stack_pointer_rtx;
2310 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2311 ? gen_frame_mem : gen_rtx_MEM);
2312 unsigned regno;
2313 unsigned regno2;
2314 HOST_WIDE_INT offset;
2316 for (regno = aarch64_next_callee_save (start, limit);
2317 regno <= limit;
2318 regno = aarch64_next_callee_save (regno + 1, limit))
2320 rtx reg, mem;
2322 if (skip_wb
2323 && (regno == cfun->machine->frame.wb_candidate1
2324 || regno == cfun->machine->frame.wb_candidate2))
2325 continue;
2327 reg = gen_rtx_REG (mode, regno);
2328 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2329 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2331 regno2 = aarch64_next_callee_save (regno + 1, limit);
2333 if (regno2 <= limit
2334 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2335 == cfun->machine->frame.reg_offset[regno2]))
2337 rtx reg2 = gen_rtx_REG (mode, regno2);
2338 rtx mem2;
2340 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2341 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2342 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2344 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2345 regno = regno2;
2347 else
2348 emit_move_insn (reg, mem);
2349 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2353 /* AArch64 stack frames generated by this compiler look like:
2355 +-------------------------------+
2357 | incoming stack arguments |
2359 +-------------------------------+
2360 | | <-- incoming stack pointer (aligned)
2361 | callee-allocated save area |
2362 | for register varargs |
2364 +-------------------------------+
2365 | local variables | <-- frame_pointer_rtx
2367 +-------------------------------+
2368 | padding0 | \
2369 +-------------------------------+ |
2370 | callee-saved registers | | frame.saved_regs_size
2371 +-------------------------------+ |
2372 | LR' | |
2373 +-------------------------------+ |
2374 | FP' | / <- hard_frame_pointer_rtx (aligned)
2375 +-------------------------------+
2376 | dynamic allocation |
2377 +-------------------------------+
2378 | padding |
2379 +-------------------------------+
2380 | outgoing stack arguments | <-- arg_pointer
2382 +-------------------------------+
2383 | | <-- stack_pointer_rtx (aligned)
2385 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2386 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2387 unchanged. */
2389 /* Generate the prologue instructions for entry into a function.
2390 Establish the stack frame by decreasing the stack pointer with a
2391 properly calculated size and, if necessary, create a frame record
2392 filled with the values of LR and previous frame pointer. The
2393 current FP is also set up if it is in use. */
2395 void
2396 aarch64_expand_prologue (void)
2398 /* sub sp, sp, #<frame_size>
2399 stp {fp, lr}, [sp, #<frame_size> - 16]
2400 add fp, sp, #<frame_size> - hardfp_offset
2401 stp {cs_reg}, [fp, #-16] etc.
2403 sub sp, sp, <final_adjustment_if_any>
2405 HOST_WIDE_INT frame_size, offset;
2406 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2407 HOST_WIDE_INT hard_fp_offset;
2408 rtx_insn *insn;
2410 aarch64_layout_frame ();
2412 offset = frame_size = cfun->machine->frame.frame_size;
2413 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2414 fp_offset = frame_size - hard_fp_offset;
2416 if (flag_stack_usage_info)
2417 current_function_static_stack_size = frame_size;
2419 /* Store pairs and load pairs have a range of only -512 to 504. */
2420 if (offset >= 512)
2422 /* When the frame has a large size, an initial decrease is done on
2423 the stack pointer to jump over the callee-allocated save area for
2424 register varargs, the local variable area and/or the callee-saved
2425 register area. This will allow the pre-index write-back
2426 store pair instructions to be used for setting up the stack frame
2427 efficiently. */
2428 offset = hard_fp_offset;
2429 if (offset >= 512)
2430 offset = cfun->machine->frame.saved_regs_size;
2432 frame_size -= (offset + crtl->outgoing_args_size);
2433 fp_offset = 0;
2435 if (frame_size >= 0x1000000)
2437 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2438 emit_move_insn (op0, GEN_INT (-frame_size));
2439 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2441 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2442 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2443 plus_constant (Pmode, stack_pointer_rtx,
2444 -frame_size)));
2445 RTX_FRAME_RELATED_P (insn) = 1;
2447 else if (frame_size > 0)
2449 int hi_ofs = frame_size & 0xfff000;
2450 int lo_ofs = frame_size & 0x000fff;
2452 if (hi_ofs)
2454 insn = emit_insn (gen_add2_insn
2455 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2456 RTX_FRAME_RELATED_P (insn) = 1;
2458 if (lo_ofs)
2460 insn = emit_insn (gen_add2_insn
2461 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2462 RTX_FRAME_RELATED_P (insn) = 1;
2466 else
2467 frame_size = -1;
2469 if (offset > 0)
2471 bool skip_wb = false;
2473 if (frame_pointer_needed)
2475 skip_wb = true;
2477 if (fp_offset)
2479 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2480 GEN_INT (-offset)));
2481 RTX_FRAME_RELATED_P (insn) = 1;
2483 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2484 R30_REGNUM, false);
2486 else
2487 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2489 /* Set up frame pointer to point to the location of the
2490 previous frame pointer on the stack. */
2491 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2492 stack_pointer_rtx,
2493 GEN_INT (fp_offset)));
2494 RTX_FRAME_RELATED_P (insn) = 1;
2495 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2497 else
2499 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2500 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2502 if (fp_offset
2503 || reg1 == FIRST_PSEUDO_REGISTER
2504 || (reg2 == FIRST_PSEUDO_REGISTER
2505 && offset >= 256))
2507 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2508 GEN_INT (-offset)));
2509 RTX_FRAME_RELATED_P (insn) = 1;
2511 else
2513 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2515 skip_wb = true;
2517 if (reg2 == FIRST_PSEUDO_REGISTER)
2518 aarch64_pushwb_single_reg (mode1, reg1, offset);
2519 else
2520 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2524 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2525 skip_wb);
2526 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2527 skip_wb);
2530 /* When offset >= 512,
2531 sub sp, sp, #<outgoing_args_size> */
2532 if (frame_size > -1)
2534 if (crtl->outgoing_args_size > 0)
2536 insn = emit_insn (gen_add2_insn
2537 (stack_pointer_rtx,
2538 GEN_INT (- crtl->outgoing_args_size)));
2539 RTX_FRAME_RELATED_P (insn) = 1;
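/* A standalone sketch (hypothetical helper, plain C types) of how the
   prologue and epilogue above split a stack adjustment below 0x1000000
   into at most two 12-bit add/sub immediates, one of them shifted by 12.  */
static void
example_split_sp_adjustment (unsigned long long frame_size,
                             unsigned long long *hi_ofs,
                             unsigned long long *lo_ofs)
{
  *hi_ofs = frame_size & 0xfff000;  /* Handled by one SUB/ADD #imm, LSL #12.  */
  *lo_ofs = frame_size & 0x000fff;  /* Handled by a second plain SUB/ADD #imm.  */
}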
2544 /* Return TRUE if we can use a simple_return insn.
2546 This function checks whether the callee saved stack is empty, which
2547 means no restore actions are needed. The pro_and_epilogue pass will use
2548 this to check whether the shrink-wrapping optimization is feasible. */
2550 bool
2551 aarch64_use_return_insn_p (void)
2553 if (!reload_completed)
2554 return false;
2556 if (crtl->profile)
2557 return false;
2559 aarch64_layout_frame ();
2561 return cfun->machine->frame.frame_size == 0;
2564 /* Generate the epilogue instructions for returning from a function. */
2565 void
2566 aarch64_expand_epilogue (bool for_sibcall)
2568 HOST_WIDE_INT frame_size, offset;
2569 HOST_WIDE_INT fp_offset;
2570 HOST_WIDE_INT hard_fp_offset;
2571 rtx_insn *insn;
2572 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2573 bool need_barrier_p = (get_frame_size () != 0
2574 || cfun->machine->frame.saved_varargs_size);
2576 aarch64_layout_frame ();
2578 offset = frame_size = cfun->machine->frame.frame_size;
2579 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2580 fp_offset = frame_size - hard_fp_offset;
2582 /* Store pairs and load pairs have a range of only -512 to 504. */
2583 if (offset >= 512)
2585 offset = hard_fp_offset;
2586 if (offset >= 512)
2587 offset = cfun->machine->frame.saved_regs_size;
2589 frame_size -= (offset + crtl->outgoing_args_size);
2590 fp_offset = 0;
2591 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2593 insn = emit_insn (gen_add2_insn
2594 (stack_pointer_rtx,
2595 GEN_INT (crtl->outgoing_args_size)));
2596 RTX_FRAME_RELATED_P (insn) = 1;
2599 else
2600 frame_size = -1;
2602 /* If there were outgoing arguments or we've done dynamic stack
2603 allocation, then restore the stack pointer from the frame
2604 pointer. This is at most one insn and more efficient than using
2605 GCC's internal mechanism. */
2606 if (frame_pointer_needed
2607 && (crtl->outgoing_args_size || cfun->calls_alloca))
2609 if (cfun->calls_alloca)
2610 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2612 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2613 hard_frame_pointer_rtx,
2614 GEN_INT (0)));
2615 offset = offset - fp_offset;
2618 if (offset > 0)
2620 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2621 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2622 bool skip_wb = true;
2623 rtx cfi_ops = NULL;
2625 if (frame_pointer_needed)
2626 fp_offset = 0;
2627 else if (fp_offset
2628 || reg1 == FIRST_PSEUDO_REGISTER
2629 || (reg2 == FIRST_PSEUDO_REGISTER
2630 && offset >= 256))
2631 skip_wb = false;
2633 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2634 skip_wb, &cfi_ops);
2635 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2636 skip_wb, &cfi_ops);
2638 if (need_barrier_p)
2639 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2641 if (skip_wb)
2643 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2644 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2646 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2647 if (reg2 == FIRST_PSEUDO_REGISTER)
2649 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2650 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2651 mem = gen_rtx_MEM (mode1, mem);
2652 insn = emit_move_insn (rreg1, mem);
2654 else
2656 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2658 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2659 insn = emit_insn (aarch64_gen_loadwb_pair
2660 (mode1, stack_pointer_rtx, rreg1,
2661 rreg2, offset));
2664 else
2666 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2667 GEN_INT (offset)));
2670 /* Reset the CFA to be SP + FRAME_SIZE. */
2671 rtx new_cfa = stack_pointer_rtx;
2672 if (frame_size > 0)
2673 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2674 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2675 REG_NOTES (insn) = cfi_ops;
2676 RTX_FRAME_RELATED_P (insn) = 1;
2679 if (frame_size > 0)
2681 if (need_barrier_p)
2682 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2684 if (frame_size >= 0x1000000)
2686 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2687 emit_move_insn (op0, GEN_INT (frame_size));
2688 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2690 else
2692 int hi_ofs = frame_size & 0xfff000;
2693 int lo_ofs = frame_size & 0x000fff;
2695 if (hi_ofs && lo_ofs)
2697 insn = emit_insn (gen_add2_insn
2698 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2699 RTX_FRAME_RELATED_P (insn) = 1;
2700 frame_size = lo_ofs;
2702 insn = emit_insn (gen_add2_insn
2703 (stack_pointer_rtx, GEN_INT (frame_size)));
2706 /* Reset the CFA to be SP + 0. */
2707 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2708 RTX_FRAME_RELATED_P (insn) = 1;
2711 /* Stack adjustment for exception handler. */
2712 if (crtl->calls_eh_return)
2714 /* We need to unwind the stack by the offset computed by
2715 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2716 to be SP; letting the CFA move during this adjustment
2717 is just as correct as retaining the CFA from the body
2718 of the function. Therefore, do nothing special. */
2719 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2722 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2723 if (!for_sibcall)
2724 emit_jump_insn (ret_rtx);
2727 /* Return the place to copy the exception unwinding return address to.
2728 This will probably be a stack slot, but could (in theory) be the
2729 return register. */
2731 aarch64_final_eh_return_addr (void)
2733 HOST_WIDE_INT fp_offset;
2735 aarch64_layout_frame ();
2737 fp_offset = cfun->machine->frame.frame_size
2738 - cfun->machine->frame.hard_fp_offset;
2740 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2741 return gen_rtx_REG (DImode, LR_REGNUM);
2743 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2744 result in a store to save LR introduced by builtin_eh_return () being
2745 incorrectly deleted because the alias is not detected.
2746 So in the calculation of the address to copy the exception unwinding
2747 return address to, we distinguish two cases.
2748 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2749 we return a SP-relative location since all the addresses are SP-relative
2750 in this case. This prevents the store from being optimized away.
2751 If the fp_offset is not 0, then the addresses will be FP-relative and
2752 therefore we return a FP-relative location. */
2754 if (frame_pointer_needed)
2756 if (fp_offset)
2757 return gen_frame_mem (DImode,
2758 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2759 else
2760 return gen_frame_mem (DImode,
2761 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2764 /* If FP is not needed, we calculate the location of LR, which would be
2765 at the top of the saved registers block. */
2767 return gen_frame_mem (DImode,
2768 plus_constant (Pmode,
2769 stack_pointer_rtx,
2770 fp_offset
2771 + cfun->machine->frame.saved_regs_size
2772 - 2 * UNITS_PER_WORD));
2775 /* Possibly output code to build up a constant in a register. For
2776 the benefit of the costs infrastructure, returns the number of
2777 instructions which would be emitted. GENERATE inhibits or
2778 enables code generation. */
2780 static int
2781 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2783 int insns = 0;
2785 if (aarch64_bitmask_imm (val, DImode))
2787 if (generate)
2788 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2789 insns = 1;
2791 else
2793 int i;
2794 int ncount = 0;
2795 int zcount = 0;
2796 HOST_WIDE_INT valp = val >> 16;
2797 HOST_WIDE_INT valm;
2798 HOST_WIDE_INT tval;
2800 for (i = 16; i < 64; i += 16)
2802 valm = (valp & 0xffff);
2804 if (valm != 0)
2805 ++ zcount;
2807 if (valm != 0xffff)
2808 ++ ncount;
2810 valp >>= 16;
2813 /* zcount contains the number of additional MOVK instructions
2814 required if the constant is built up with an initial MOVZ instruction,
2815 while ncount is the number of MOVK instructions required if starting
2816 with a MOVN instruction. Choose the sequence that yields the fewest
2817 instructions, preferring MOVZ instructions when both counts are
2818 the same. */
2819 if (ncount < zcount)
2821 if (generate)
2822 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2823 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2824 tval = 0xffff;
2825 insns++;
2827 else
2829 if (generate)
2830 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2831 GEN_INT (val & 0xffff));
2832 tval = 0;
2833 insns++;
2836 val >>= 16;
2838 for (i = 16; i < 64; i += 16)
2840 if ((val & 0xffff) != tval)
2842 if (generate)
2843 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2844 GEN_INT (i),
2845 GEN_INT (val & 0xffff)));
2846 insns++;
2848 val >>= 16;
2851 return insns;
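/* A standalone sketch (hypothetical name, plain C types) of the
   instruction-count logic above: one MOVZ or MOVN plus one MOVK for every
   upper 16-bit chunk that differs from the chosen background pattern.  */
static int
example_count_mov_insns (unsigned long long val)
{
  int nonzero = 0, not_all_ones = 0, i;

  for (i = 16; i < 64; i += 16)
    {
      unsigned int chunk = (unsigned int) (val >> i) & 0xffff;
      if (chunk != 0)
        nonzero++;        /* Needs a MOVK after an initial MOVZ.  */
      if (chunk != 0xffff)
        not_all_ones++;   /* Needs a MOVK after an initial MOVN.  */
    }

  /* The low 16 bits are set by the initial MOVZ/MOVN itself.  */
  return 1 + (not_all_ones < nonzero ? not_all_ones : nonzero);
}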
2854 static void
2855 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2857 HOST_WIDE_INT mdelta = delta;
2858 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2859 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2861 if (mdelta < 0)
2862 mdelta = -mdelta;
2864 if (mdelta >= 4096 * 4096)
2866 (void) aarch64_build_constant (scratchreg, delta, true);
2867 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2869 else if (mdelta > 0)
2871 if (mdelta >= 4096)
2873 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2874 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2875 if (delta < 0)
2876 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2877 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2878 else
2879 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2880 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2882 if (mdelta % 4096 != 0)
2884 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2885 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2886 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
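/* A standalone sketch (hypothetical name, plain C types) of the split
   performed above for deltas below 4096 * 4096: one add/sub of
   (|delta| / 4096) shifted left by 12, plus one add/sub of the remaining
   low 12 bits, both carrying the sign of DELTA.  */
static void
example_split_add_delta (long long delta, long long *scaled, long long *low)
{
  long long mag = delta < 0 ? -delta : delta;
  long long sign = delta < 0 ? -1 : 1;

  *scaled = sign * (mag / 4096) * 4096;  /* ADD/SUB #(mag/4096), LSL #12.  */
  *low = sign * (mag % 4096);            /* ADD/SUB #(mag%4096).  */
}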
2891 /* Output code to add DELTA to the first argument, and then jump
2892 to FUNCTION. Used for C++ multiple inheritance. */
2893 static void
2894 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2895 HOST_WIDE_INT delta,
2896 HOST_WIDE_INT vcall_offset,
2897 tree function)
2899 /* The this pointer is always in x0. Note that this differs from
2900 Arm, where the this pointer may be bumped to r1 if r0 is required
2901 to return a pointer to an aggregate. On AArch64 a result value
2902 pointer will be in x8. */
2903 int this_regno = R0_REGNUM;
2904 rtx this_rtx, temp0, temp1, addr, funexp;
2905 rtx_insn *insn;
2907 reload_completed = 1;
2908 emit_note (NOTE_INSN_PROLOGUE_END);
2910 if (vcall_offset == 0)
2911 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2912 else
2914 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2916 this_rtx = gen_rtx_REG (Pmode, this_regno);
2917 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2918 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2920 addr = this_rtx;
2921 if (delta != 0)
2923 if (delta >= -256 && delta < 256)
2924 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2925 plus_constant (Pmode, this_rtx, delta));
2926 else
2927 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2930 if (Pmode == ptr_mode)
2931 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2932 else
2933 aarch64_emit_move (temp0,
2934 gen_rtx_ZERO_EXTEND (Pmode,
2935 gen_rtx_MEM (ptr_mode, addr)));
2937 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2938 addr = plus_constant (Pmode, temp0, vcall_offset);
2939 else
2941 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2942 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2945 if (Pmode == ptr_mode)
2946 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2947 else
2948 aarch64_emit_move (temp1,
2949 gen_rtx_SIGN_EXTEND (Pmode,
2950 gen_rtx_MEM (ptr_mode, addr)));
2952 emit_insn (gen_add2_insn (this_rtx, temp1));
2955 /* Generate a tail call to the target function. */
2956 if (!TREE_USED (function))
2958 assemble_external (function);
2959 TREE_USED (function) = 1;
2961 funexp = XEXP (DECL_RTL (function), 0);
2962 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2963 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2964 SIBLING_CALL_P (insn) = 1;
2966 insn = get_insns ();
2967 shorten_branches (insn);
2968 final_start_function (insn, file, 1);
2969 final (insn, file, 1);
2970 final_end_function ();
2972 /* Stop pretending to be a post-reload pass. */
2973 reload_completed = 0;
2976 static bool
2977 aarch64_tls_referenced_p (rtx x)
2979 if (!TARGET_HAVE_TLS)
2980 return false;
2981 subrtx_iterator::array_type array;
2982 FOR_EACH_SUBRTX (iter, array, x, ALL)
2984 const_rtx x = *iter;
2985 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
2986 return true;
2987 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2988 TLS offsets, not real symbol references. */
2989 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
2990 iter.skip_subrtxes ();
2992 return false;
2996 static int
2997 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2999 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3000 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3002 if (*imm1 < *imm2)
3003 return -1;
3004 if (*imm1 > *imm2)
3005 return +1;
3006 return 0;
3010 static void
3011 aarch64_build_bitmask_table (void)
3013 unsigned HOST_WIDE_INT mask, imm;
3014 unsigned int log_e, e, s, r;
3015 unsigned int nimms = 0;
3017 for (log_e = 1; log_e <= 6; log_e++)
3019 e = 1 << log_e;
3020 if (e == 64)
3021 mask = ~(HOST_WIDE_INT) 0;
3022 else
3023 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3024 for (s = 1; s < e; s++)
3026 for (r = 0; r < e; r++)
3028 /* Set S consecutive bits to 1 (S < 64). */
3029 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3030 /* Rotate right by R. */
3031 if (r != 0)
3032 imm = ((imm >> r) | (imm << (e - r))) & mask;
3033 /* Replicate the constant depending on the SIMD element size. */
3034 switch (log_e) {
3035 case 1: imm |= (imm << 2);
3036 case 2: imm |= (imm << 4);
3037 case 3: imm |= (imm << 8);
3038 case 4: imm |= (imm << 16);
3039 case 5: imm |= (imm << 32);
3040 case 6:
3041 break;
3042 default:
3043 gcc_unreachable ();
3045 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3046 aarch64_bitmasks[nimms++] = imm;
3051 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3052 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3053 aarch64_bitmasks_cmp);
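/* A standalone sketch (hypothetical name, plain C types) of the replication
   step above: the intentional switch fall-through simply copies an E-bit
   element across the full 64-bit immediate.  */
static unsigned long long
example_replicate_element (unsigned long long elt, unsigned int e)
{
  unsigned long long imm = elt;
  unsigned int width;

  for (width = e; width < 64; width *= 2)
    imm |= imm << width;
  return imm;
}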
3057 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3058 a left shift of 0 or 12 bits. */
3059 bool
3060 aarch64_uimm12_shift (HOST_WIDE_INT val)
3062 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3063 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
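/* Equivalently (a standalone sketch with a hypothetical name): VAL is
   encodable when all of its set bits fall within bits [11:0] or within
   bits [23:12].  */
static int
example_uimm12_shift_p (unsigned long long val)
{
  return (val & ~0xfffULL) == 0          /* ADD/SUB #imm12.           */
         || (val & ~0xfff000ULL) == 0;   /* ADD/SUB #imm12, LSL #12.  */
}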
3068 /* Return true if val is an immediate that can be loaded into a
3069 register by a MOVZ instruction. */
3070 static bool
3071 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3073 if (GET_MODE_SIZE (mode) > 4)
3075 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3076 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3077 return 1;
3079 else
3081 /* Ignore sign extension. */
3082 val &= (HOST_WIDE_INT) 0xffffffff;
3084 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3085 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3089 /* Return true if val is a valid bitmask immediate. */
3090 bool
3091 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3093 if (GET_MODE_SIZE (mode) < 8)
3095 /* Replicate bit pattern. */
3096 val &= (HOST_WIDE_INT) 0xffffffff;
3097 val |= val << 32;
3099 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3100 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3104 /* Return true if val is an immediate that can be loaded into a
3105 register in a single instruction. */
3106 bool
3107 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3109 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3110 return 1;
3111 return aarch64_bitmask_imm (val, mode);
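/* A standalone sketch (hypothetical name, plain C types) of the 64-bit
   MOVZ/MOVN part of the test above: VAL is loadable by a single MOVZ when
   at most one 16-bit chunk is non-zero, and by a single MOVN when ~VAL has
   that property.  */
static int
example_movz_or_movn_loadable_p (unsigned long long val)
{
  int i;

  for (i = 0; i < 64; i += 16)
    {
      unsigned long long mask = 0xffffULL << i;
      if ((val & ~mask) == 0 || (~val & ~mask) == 0)
        return 1;
    }
  return 0;
}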
3114 static bool
3115 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3117 rtx base, offset;
3119 if (GET_CODE (x) == HIGH)
3120 return true;
3122 split_const (x, &base, &offset);
3123 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3125 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3126 != SYMBOL_FORCE_TO_MEM)
3127 return true;
3128 else
3129 /* Avoid generating a 64-bit relocation in ILP32; leave
3130 it to aarch64_expand_mov_immediate to handle properly. */
3131 return mode != ptr_mode;
3134 return aarch64_tls_referenced_p (x);
3137 /* Return true if register REGNO is a valid index register.
3138 STRICT_P is true if REG_OK_STRICT is in effect. */
3140 bool
3141 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3143 if (!HARD_REGISTER_NUM_P (regno))
3145 if (!strict_p)
3146 return true;
3148 if (!reg_renumber)
3149 return false;
3151 regno = reg_renumber[regno];
3153 return GP_REGNUM_P (regno);
3156 /* Return true if register REGNO is a valid base register.
3157 STRICT_P is true if REG_OK_STRICT is in effect. */
3159 bool
3160 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3162 if (!HARD_REGISTER_NUM_P (regno))
3164 if (!strict_p)
3165 return true;
3167 if (!reg_renumber)
3168 return false;
3170 regno = reg_renumber[regno];
3173 /* The fake registers will be eliminated to either the stack or
3174 hard frame pointer, both of which are usually valid base registers.
3175 Reload deals with the cases where the eliminated form isn't valid. */
3176 return (GP_REGNUM_P (regno)
3177 || regno == SP_REGNUM
3178 || regno == FRAME_POINTER_REGNUM
3179 || regno == ARG_POINTER_REGNUM);
3182 /* Return true if X is a valid base register.
3183 STRICT_P is true if REG_OK_STRICT is in effect. */
3185 static bool
3186 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3188 if (!strict_p && GET_CODE (x) == SUBREG)
3189 x = SUBREG_REG (x);
3191 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3194 /* Return true if address offset is a valid index. If it is, fill in INFO
3195 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3197 static bool
3198 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3199 machine_mode mode, bool strict_p)
3201 enum aarch64_address_type type;
3202 rtx index;
3203 int shift;
3205 /* (reg:P) */
3206 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3207 && GET_MODE (x) == Pmode)
3209 type = ADDRESS_REG_REG;
3210 index = x;
3211 shift = 0;
3213 /* (sign_extend:DI (reg:SI)) */
3214 else if ((GET_CODE (x) == SIGN_EXTEND
3215 || GET_CODE (x) == ZERO_EXTEND)
3216 && GET_MODE (x) == DImode
3217 && GET_MODE (XEXP (x, 0)) == SImode)
3219 type = (GET_CODE (x) == SIGN_EXTEND)
3220 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3221 index = XEXP (x, 0);
3222 shift = 0;
3224 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3225 else if (GET_CODE (x) == MULT
3226 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3227 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3228 && GET_MODE (XEXP (x, 0)) == DImode
3229 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3230 && CONST_INT_P (XEXP (x, 1)))
3232 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3233 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3234 index = XEXP (XEXP (x, 0), 0);
3235 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3237 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3238 else if (GET_CODE (x) == ASHIFT
3239 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3240 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3241 && GET_MODE (XEXP (x, 0)) == DImode
3242 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3243 && CONST_INT_P (XEXP (x, 1)))
3245 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3246 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3247 index = XEXP (XEXP (x, 0), 0);
3248 shift = INTVAL (XEXP (x, 1));
3250 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3251 else if ((GET_CODE (x) == SIGN_EXTRACT
3252 || GET_CODE (x) == ZERO_EXTRACT)
3253 && GET_MODE (x) == DImode
3254 && GET_CODE (XEXP (x, 0)) == MULT
3255 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3256 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3258 type = (GET_CODE (x) == SIGN_EXTRACT)
3259 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3260 index = XEXP (XEXP (x, 0), 0);
3261 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3262 if (INTVAL (XEXP (x, 1)) != 32 + shift
3263 || INTVAL (XEXP (x, 2)) != 0)
3264 shift = -1;
3266 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3267 (const_int 0xffffffff<<shift)) */
3268 else if (GET_CODE (x) == AND
3269 && GET_MODE (x) == DImode
3270 && GET_CODE (XEXP (x, 0)) == MULT
3271 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3272 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3273 && CONST_INT_P (XEXP (x, 1)))
3275 type = ADDRESS_REG_UXTW;
3276 index = XEXP (XEXP (x, 0), 0);
3277 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3278 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3279 shift = -1;
3281 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3282 else if ((GET_CODE (x) == SIGN_EXTRACT
3283 || GET_CODE (x) == ZERO_EXTRACT)
3284 && GET_MODE (x) == DImode
3285 && GET_CODE (XEXP (x, 0)) == ASHIFT
3286 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3287 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3289 type = (GET_CODE (x) == SIGN_EXTRACT)
3290 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3291 index = XEXP (XEXP (x, 0), 0);
3292 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3293 if (INTVAL (XEXP (x, 1)) != 32 + shift
3294 || INTVAL (XEXP (x, 2)) != 0)
3295 shift = -1;
3297 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3298 (const_int 0xffffffff<<shift)) */
3299 else if (GET_CODE (x) == AND
3300 && GET_MODE (x) == DImode
3301 && GET_CODE (XEXP (x, 0)) == ASHIFT
3302 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3303 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3304 && CONST_INT_P (XEXP (x, 1)))
3306 type = ADDRESS_REG_UXTW;
3307 index = XEXP (XEXP (x, 0), 0);
3308 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3309 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3310 shift = -1;
3312 /* (mult:P (reg:P) (const_int scale)) */
3313 else if (GET_CODE (x) == MULT
3314 && GET_MODE (x) == Pmode
3315 && GET_MODE (XEXP (x, 0)) == Pmode
3316 && CONST_INT_P (XEXP (x, 1)))
3318 type = ADDRESS_REG_REG;
3319 index = XEXP (x, 0);
3320 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3322 /* (ashift:P (reg:P) (const_int shift)) */
3323 else if (GET_CODE (x) == ASHIFT
3324 && GET_MODE (x) == Pmode
3325 && GET_MODE (XEXP (x, 0)) == Pmode
3326 && CONST_INT_P (XEXP (x, 1)))
3328 type = ADDRESS_REG_REG;
3329 index = XEXP (x, 0);
3330 shift = INTVAL (XEXP (x, 1));
3332 else
3333 return false;
3335 if (GET_CODE (index) == SUBREG)
3336 index = SUBREG_REG (index);
3338 if ((shift == 0 ||
3339 (shift > 0 && shift <= 3
3340 && (1 << shift) == GET_MODE_SIZE (mode)))
3341 && REG_P (index)
3342 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3344 info->type = type;
3345 info->offset = index;
3346 info->shift = shift;
3347 return true;
3350 return false;
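/* A standalone sketch (hypothetical name) of the final check above: a
   register index may be unscaled, or scaled by exactly the access size, so
   an 8-byte access accepts [base, Xm] and [base, Xm, LSL #3] but not,
   say, LSL #2.  */
static int
example_index_shift_ok_p (int shift, int access_size)
{
  return shift == 0
         || (shift > 0 && shift <= 3 && (1 << shift) == access_size);
}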
3353 bool
3354 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3356 return (offset >= -64 * GET_MODE_SIZE (mode)
3357 && offset < 64 * GET_MODE_SIZE (mode)
3358 && offset % GET_MODE_SIZE (mode) == 0);
3361 static inline bool
3362 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3363 HOST_WIDE_INT offset)
3365 return offset >= -256 && offset < 256;
3368 static inline bool
3369 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3371 return (offset >= 0
3372 && offset < 4096 * GET_MODE_SIZE (mode)
3373 && offset % GET_MODE_SIZE (mode) == 0);
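/* A standalone sketch (hypothetical name) combining the two single-access
   forms above for an 8-byte (DImode) load or store, as
   aarch64_classify_address does below: either a 9-bit signed unscaled
   offset (LDUR/STUR) or a 12-bit unsigned scaled offset (LDR/STR).  */
static int
example_dimode_single_offset_ok_p (long long offset)
{
  return (offset >= -256 && offset < 256)
         || (offset >= 0 && offset < 4096 * 8 && offset % 8 == 0);
}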
3376 /* Return true if X is a valid address for machine mode MODE. If it is,
3377 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3378 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3380 static bool
3381 aarch64_classify_address (struct aarch64_address_info *info,
3382 rtx x, machine_mode mode,
3383 RTX_CODE outer_code, bool strict_p)
3385 enum rtx_code code = GET_CODE (x);
3386 rtx op0, op1;
3388 /* On BE, we use load/store pair for all large int mode load/stores. */
3389 bool load_store_pair_p = (outer_code == PARALLEL
3390 || (BYTES_BIG_ENDIAN
3391 && aarch64_vect_struct_mode_p (mode)));
3393 bool allow_reg_index_p =
3394 !load_store_pair_p
3395 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3396 && !aarch64_vect_struct_mode_p (mode);
3398 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3399 REG addressing. */
3400 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3401 && (code != POST_INC && code != REG))
3402 return false;
3404 switch (code)
3406 case REG:
3407 case SUBREG:
3408 info->type = ADDRESS_REG_IMM;
3409 info->base = x;
3410 info->offset = const0_rtx;
3411 return aarch64_base_register_rtx_p (x, strict_p);
3413 case PLUS:
3414 op0 = XEXP (x, 0);
3415 op1 = XEXP (x, 1);
3417 if (! strict_p
3418 && REG_P (op0)
3419 && (op0 == virtual_stack_vars_rtx
3420 || op0 == frame_pointer_rtx
3421 || op0 == arg_pointer_rtx)
3422 && CONST_INT_P (op1))
3424 info->type = ADDRESS_REG_IMM;
3425 info->base = op0;
3426 info->offset = op1;
3428 return true;
3431 if (GET_MODE_SIZE (mode) != 0
3432 && CONST_INT_P (op1)
3433 && aarch64_base_register_rtx_p (op0, strict_p))
3435 HOST_WIDE_INT offset = INTVAL (op1);
3437 info->type = ADDRESS_REG_IMM;
3438 info->base = op0;
3439 info->offset = op1;
3441 /* TImode and TFmode values are allowed in both pairs of X
3442 registers and individual Q registers. The available
3443 address modes are:
3444 X,X: 7-bit signed scaled offset
3445 Q: 9-bit signed offset
3446 We conservatively require an offset representable in either mode.
3448 if (mode == TImode || mode == TFmode)
3449 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3450 && offset_9bit_signed_unscaled_p (mode, offset));
3452 /* A 7-bit offset check because OImode will emit an ldp/stp
3453 instruction (only big endian will get here).
3454 For ldp/stp instructions, the offset is scaled for the size of a
3455 single element of the pair. */
3456 if (mode == OImode)
3457 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3459 /* Three 9/12-bit offset checks because CImode will emit three
3460 ldr/str instructions (only big endian will get here). */
3461 if (mode == CImode)
3462 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3463 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3464 || offset_12bit_unsigned_scaled_p (V16QImode,
3465 offset + 32)));
3467 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3468 instructions (only big endian will get here). */
3469 if (mode == XImode)
3470 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3471 && aarch64_offset_7bit_signed_scaled_p (TImode,
3472 offset + 32));
3474 if (load_store_pair_p)
3475 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3476 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3477 else
3478 return (offset_9bit_signed_unscaled_p (mode, offset)
3479 || offset_12bit_unsigned_scaled_p (mode, offset));
3482 if (allow_reg_index_p)
3484 /* Look for base + (scaled/extended) index register. */
3485 if (aarch64_base_register_rtx_p (op0, strict_p)
3486 && aarch64_classify_index (info, op1, mode, strict_p))
3488 info->base = op0;
3489 return true;
3491 if (aarch64_base_register_rtx_p (op1, strict_p)
3492 && aarch64_classify_index (info, op0, mode, strict_p))
3494 info->base = op1;
3495 return true;
3499 return false;
3501 case POST_INC:
3502 case POST_DEC:
3503 case PRE_INC:
3504 case PRE_DEC:
3505 info->type = ADDRESS_REG_WB;
3506 info->base = XEXP (x, 0);
3507 info->offset = NULL_RTX;
3508 return aarch64_base_register_rtx_p (info->base, strict_p);
3510 case POST_MODIFY:
3511 case PRE_MODIFY:
3512 info->type = ADDRESS_REG_WB;
3513 info->base = XEXP (x, 0);
3514 if (GET_CODE (XEXP (x, 1)) == PLUS
3515 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3516 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3517 && aarch64_base_register_rtx_p (info->base, strict_p))
3519 HOST_WIDE_INT offset;
3520 info->offset = XEXP (XEXP (x, 1), 1);
3521 offset = INTVAL (info->offset);
3523 /* TImode and TFmode values are allowed in both pairs of X
3524 registers and individual Q registers. The available
3525 address modes are:
3526 X,X: 7-bit signed scaled offset
3527 Q: 9-bit signed offset
3528 We conservatively require an offset representable in either mode.
3530 if (mode == TImode || mode == TFmode)
3531 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3532 && offset_9bit_signed_unscaled_p (mode, offset));
3534 if (load_store_pair_p)
3535 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3536 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3537 else
3538 return offset_9bit_signed_unscaled_p (mode, offset);
3540 return false;
3542 case CONST:
3543 case SYMBOL_REF:
3544 case LABEL_REF:
3545 /* Load literal: PC-relative constant pool entry. Only supported
3546 for SI mode or larger. */
3547 info->type = ADDRESS_SYMBOLIC;
3549 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3551 rtx sym, addend;
3553 split_const (x, &sym, &addend);
3554 return (GET_CODE (sym) == LABEL_REF
3555 || (GET_CODE (sym) == SYMBOL_REF
3556 && CONSTANT_POOL_ADDRESS_P (sym)));
3558 return false;
3560 case LO_SUM:
3561 info->type = ADDRESS_LO_SUM;
3562 info->base = XEXP (x, 0);
3563 info->offset = XEXP (x, 1);
3564 if (allow_reg_index_p
3565 && aarch64_base_register_rtx_p (info->base, strict_p))
3567 rtx sym, offs;
3568 split_const (info->offset, &sym, &offs);
3569 if (GET_CODE (sym) == SYMBOL_REF
3570 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3571 == SYMBOL_SMALL_ABSOLUTE))
3573 /* The symbol and offset must be aligned to the access size. */
3574 unsigned int align;
3575 unsigned int ref_size;
3577 if (CONSTANT_POOL_ADDRESS_P (sym))
3578 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3579 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3581 tree exp = SYMBOL_REF_DECL (sym);
3582 align = TYPE_ALIGN (TREE_TYPE (exp));
3583 align = CONSTANT_ALIGNMENT (exp, align);
3585 else if (SYMBOL_REF_DECL (sym))
3586 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3587 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3588 && SYMBOL_REF_BLOCK (sym) != NULL)
3589 align = SYMBOL_REF_BLOCK (sym)->alignment;
3590 else
3591 align = BITS_PER_UNIT;
3593 ref_size = GET_MODE_SIZE (mode);
3594 if (ref_size == 0)
3595 ref_size = GET_MODE_SIZE (DImode);
3597 return ((INTVAL (offs) & (ref_size - 1)) == 0
3598 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3601 return false;
3603 default:
3604 return false;
3608 bool
3609 aarch64_symbolic_address_p (rtx x)
3611 rtx offset;
3613 split_const (x, &x, &offset);
3614 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3617 /* Classify the base of symbolic expression X, given that X appears in
3618 context CONTEXT. */
3620 enum aarch64_symbol_type
3621 aarch64_classify_symbolic_expression (rtx x,
3622 enum aarch64_symbol_context context)
3624 rtx offset;
3626 split_const (x, &x, &offset);
3627 return aarch64_classify_symbol (x, offset, context);
3631 /* Return TRUE if X is a legitimate address for accessing memory in
3632 mode MODE. */
3633 static bool
3634 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3636 struct aarch64_address_info addr;
3638 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3641 /* Return TRUE if X is a legitimate address for accessing memory in
3642 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3643 pair operation. */
3644 bool
3645 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3646 RTX_CODE outer_code, bool strict_p)
3648 struct aarch64_address_info addr;
3650 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3653 /* Return TRUE if rtx X is the immediate constant 0.0. */
3654 bool
3655 aarch64_float_const_zero_rtx_p (rtx x)
3657 REAL_VALUE_TYPE r;
3659 if (GET_MODE (x) == VOIDmode)
3660 return false;
3662 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3663 if (REAL_VALUE_MINUS_ZERO (r))
3664 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3665 return REAL_VALUES_EQUAL (r, dconst0);
3668 /* Return the fixed registers used for condition codes. */
3670 static bool
3671 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3673 *p1 = CC_REGNUM;
3674 *p2 = INVALID_REGNUM;
3675 return true;
3678 /* Emit call insn with PAT and do aarch64-specific handling. */
3680 void
3681 aarch64_emit_call_insn (rtx pat)
3683 rtx insn = emit_call_insn (pat);
3685 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3686 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3687 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3690 machine_mode
3691 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3693 /* Floating-point comparisons return CCFPmode, except for the ordered
3694 comparisons LT, LE, GT and GE, which return CCFPEmode. */
3695 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3697 switch (code)
3699 case EQ:
3700 case NE:
3701 case UNORDERED:
3702 case ORDERED:
3703 case UNLT:
3704 case UNLE:
3705 case UNGT:
3706 case UNGE:
3707 case UNEQ:
3708 case LTGT:
3709 return CCFPmode;
3711 case LT:
3712 case LE:
3713 case GT:
3714 case GE:
3715 return CCFPEmode;
3717 default:
3718 gcc_unreachable ();
3722 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3723 && y == const0_rtx
3724 && (code == EQ || code == NE || code == LT || code == GE)
3725 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3726 || GET_CODE (x) == NEG))
3727 return CC_NZmode;
3729 /* A compare with a shifted operand. Because of canonicalization,
3730 the comparison will have to be swapped when we emit the assembly
3731 code. */
3732 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3733 && (REG_P (y) || GET_CODE (y) == SUBREG)
3734 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3735 || GET_CODE (x) == LSHIFTRT
3736 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3737 return CC_SWPmode;
3739 /* Similarly for a negated operand, but we can only do this for
3740 equalities. */
3741 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3742 && (REG_P (y) || GET_CODE (y) == SUBREG)
3743 && (code == EQ || code == NE)
3744 && GET_CODE (x) == NEG)
3745 return CC_Zmode;
3747 /* A compare of a mode narrower than SI mode against zero can be done
3748 by extending the value in the comparison. */
3749 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3750 && y == const0_rtx)
3751 /* Only use sign-extension if we really need it. */
3752 return ((code == GT || code == GE || code == LE || code == LT)
3753 ? CC_SESWPmode : CC_ZESWPmode);
3755 /* For everything else, return CCmode. */
3756 return CCmode;
3759 static int
3760 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3763 aarch64_get_condition_code (rtx x)
3765 machine_mode mode = GET_MODE (XEXP (x, 0));
3766 enum rtx_code comp_code = GET_CODE (x);
3768 if (GET_MODE_CLASS (mode) != MODE_CC)
3769 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3770 return aarch64_get_condition_code_1 (mode, comp_code);
3773 static int
3774 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3776 int ne = -1, eq = -1;
3777 switch (mode)
3779 case CCFPmode:
3780 case CCFPEmode:
3781 switch (comp_code)
3783 case GE: return AARCH64_GE;
3784 case GT: return AARCH64_GT;
3785 case LE: return AARCH64_LS;
3786 case LT: return AARCH64_MI;
3787 case NE: return AARCH64_NE;
3788 case EQ: return AARCH64_EQ;
3789 case ORDERED: return AARCH64_VC;
3790 case UNORDERED: return AARCH64_VS;
3791 case UNLT: return AARCH64_LT;
3792 case UNLE: return AARCH64_LE;
3793 case UNGT: return AARCH64_HI;
3794 case UNGE: return AARCH64_PL;
3795 default: return -1;
3797 break;
3799 case CC_DNEmode:
3800 ne = AARCH64_NE;
3801 eq = AARCH64_EQ;
3802 break;
3804 case CC_DEQmode:
3805 ne = AARCH64_EQ;
3806 eq = AARCH64_NE;
3807 break;
3809 case CC_DGEmode:
3810 ne = AARCH64_GE;
3811 eq = AARCH64_LT;
3812 break;
3814 case CC_DLTmode:
3815 ne = AARCH64_LT;
3816 eq = AARCH64_GE;
3817 break;
3819 case CC_DGTmode:
3820 ne = AARCH64_GT;
3821 eq = AARCH64_LE;
3822 break;
3824 case CC_DLEmode:
3825 ne = AARCH64_LE;
3826 eq = AARCH64_GT;
3827 break;
3829 case CC_DGEUmode:
3830 ne = AARCH64_CS;
3831 eq = AARCH64_CC;
3832 break;
3834 case CC_DLTUmode:
3835 ne = AARCH64_CC;
3836 eq = AARCH64_CS;
3837 break;
3839 case CC_DGTUmode:
3840 ne = AARCH64_HI;
3841 eq = AARCH64_LS;
3842 break;
3844 case CC_DLEUmode:
3845 ne = AARCH64_LS;
3846 eq = AARCH64_HI;
3847 break;
3849 case CCmode:
3850 switch (comp_code)
3852 case NE: return AARCH64_NE;
3853 case EQ: return AARCH64_EQ;
3854 case GE: return AARCH64_GE;
3855 case GT: return AARCH64_GT;
3856 case LE: return AARCH64_LE;
3857 case LT: return AARCH64_LT;
3858 case GEU: return AARCH64_CS;
3859 case GTU: return AARCH64_HI;
3860 case LEU: return AARCH64_LS;
3861 case LTU: return AARCH64_CC;
3862 default: return -1;
3864 break;
3866 case CC_SWPmode:
3867 case CC_ZESWPmode:
3868 case CC_SESWPmode:
3869 switch (comp_code)
3871 case NE: return AARCH64_NE;
3872 case EQ: return AARCH64_EQ;
3873 case GE: return AARCH64_LE;
3874 case GT: return AARCH64_LT;
3875 case LE: return AARCH64_GE;
3876 case LT: return AARCH64_GT;
3877 case GEU: return AARCH64_LS;
3878 case GTU: return AARCH64_CC;
3879 case LEU: return AARCH64_CS;
3880 case LTU: return AARCH64_HI;
3881 default: return -1;
3883 break;
3885 case CC_NZmode:
3886 switch (comp_code)
3888 case NE: return AARCH64_NE;
3889 case EQ: return AARCH64_EQ;
3890 case GE: return AARCH64_PL;
3891 case LT: return AARCH64_MI;
3892 default: return -1;
3894 break;
3896 case CC_Zmode:
3897 switch (comp_code)
3899 case NE: return AARCH64_NE;
3900 case EQ: return AARCH64_EQ;
3901 default: return -1;
3903 break;
3905 default:
3906 return -1;
3907 break;
3910 if (comp_code == NE)
3911 return ne;
3913 if (comp_code == EQ)
3914 return eq;
3916 return -1;
3919 bool
3920 aarch64_const_vec_all_same_in_range_p (rtx x,
3921 HOST_WIDE_INT minval,
3922 HOST_WIDE_INT maxval)
3924 HOST_WIDE_INT firstval;
3925 int count, i;
3927 if (GET_CODE (x) != CONST_VECTOR
3928 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3929 return false;
3931 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3932 if (firstval < minval || firstval > maxval)
3933 return false;
3935 count = CONST_VECTOR_NUNITS (x);
3936 for (i = 1; i < count; i++)
3937 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3938 return false;
3940 return true;
3943 bool
3944 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3946 return aarch64_const_vec_all_same_in_range_p (x, val, val);
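/* Return the number of set bits in VALUE; each loop iteration below clears
   the lowest set bit (Kernighan's method).  */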
3949 static unsigned
3950 bit_count (unsigned HOST_WIDE_INT value)
3952 unsigned count = 0;
3954 while (value)
3956 count++;
3957 value &= value - 1;
3960 return count;
3963 /* N Z C V. */
3964 #define AARCH64_CC_V 1
3965 #define AARCH64_CC_C (1 << 1)
3966 #define AARCH64_CC_Z (1 << 2)
3967 #define AARCH64_CC_N (1 << 3)
3969 /* N Z C V flags for ccmp. The first value is for the AND case and the
3970 second for the IOR case. Indexed by AARCH64_COND_CODE. */
3971 static const int aarch64_nzcv_codes[][2] =
3973 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3974 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3975 {AARCH64_CC_C, 0}, /* CS, C == 1. */
3976 {0, AARCH64_CC_C}, /* CC, C == 0. */
3977 {AARCH64_CC_N, 0}, /* MI, N == 1. */
3978 {0, AARCH64_CC_N}, /* PL, N == 0. */
3979 {AARCH64_CC_V, 0}, /* VS, V == 1. */
3980 {0, AARCH64_CC_V}, /* VC, V == 0. */
3981 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
3982 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
3983 {0, AARCH64_CC_V}, /* GE, N == V. */
3984 {AARCH64_CC_V, 0}, /* LT, N != V. */
3985 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
3986 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
3987 {0, 0}, /* AL, Any. */
3988 {0, 0}, /* NV, Any. */
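/* A standalone sketch (hypothetical name) of how the AARCH64_CC_* bits
   above compose into the 4-bit NZCV immediate of a CCMP instruction:
   N is bit 3, Z bit 2, C bit 1 and V bit 0.  */
static unsigned int
example_nzcv_immediate (int n, int z, int c, int v)
{
  return (n ? 8u : 0) | (z ? 4u : 0) | (c ? 2u : 0) | (v ? 1u : 0);
}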
3992 aarch64_ccmp_mode_to_code (enum machine_mode mode)
3994 switch (mode)
3996 case CC_DNEmode:
3997 return NE;
3999 case CC_DEQmode:
4000 return EQ;
4002 case CC_DLEmode:
4003 return LE;
4005 case CC_DGTmode:
4006 return GT;
4008 case CC_DLTmode:
4009 return LT;
4011 case CC_DGEmode:
4012 return GE;
4014 case CC_DLEUmode:
4015 return LEU;
4017 case CC_DGTUmode:
4018 return GTU;
4020 case CC_DLTUmode:
4021 return LTU;
4023 case CC_DGEUmode:
4024 return GEU;
4026 default:
4027 gcc_unreachable ();
4032 void
4033 aarch64_print_operand (FILE *f, rtx x, char code)
4035 switch (code)
4037 /* An integer or symbol address without a preceding # sign. */
4038 case 'c':
4039 switch (GET_CODE (x))
4041 case CONST_INT:
4042 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4043 break;
4045 case SYMBOL_REF:
4046 output_addr_const (f, x);
4047 break;
4049 case CONST:
4050 if (GET_CODE (XEXP (x, 0)) == PLUS
4051 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4053 output_addr_const (f, x);
4054 break;
4056 /* Fall through. */
4058 default:
4059 output_operand_lossage ("Unsupported operand for code '%c'", code);
4061 break;
4063 case 'e':
4064 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4066 int n;
4068 if (!CONST_INT_P (x)
4069 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4071 output_operand_lossage ("invalid operand for '%%%c'", code);
4072 return;
4075 switch (n)
4077 case 3:
4078 fputc ('b', f);
4079 break;
4080 case 4:
4081 fputc ('h', f);
4082 break;
4083 case 5:
4084 fputc ('w', f);
4085 break;
4086 default:
4087 output_operand_lossage ("invalid operand for '%%%c'", code);
4088 return;
4091 break;
4093 case 'p':
4095 int n;
4097 /* Print N such that 2^N == X. */
4098 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4100 output_operand_lossage ("invalid operand for '%%%c'", code);
4101 return;
4104 asm_fprintf (f, "%d", n);
4106 break;
4108 case 'P':
4109 /* Print the number of non-zero bits in X (a const_int). */
4110 if (!CONST_INT_P (x))
4112 output_operand_lossage ("invalid operand for '%%%c'", code);
4113 return;
4116 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4117 break;
4119 case 'H':
4120 /* Print the higher numbered register of a pair (TImode) of regs. */
4121 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4123 output_operand_lossage ("invalid operand for '%%%c'", code);
4124 return;
4127 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4128 break;
4130 case 'm':
4132 int cond_code;
4133 /* Print a condition (eq, ne, etc). */
4135 /* CONST_TRUE_RTX means always -- that's the default. */
4136 if (x == const_true_rtx)
4137 return;
4139 if (!COMPARISON_P (x))
4141 output_operand_lossage ("invalid operand for '%%%c'", code);
4142 return;
4145 cond_code = aarch64_get_condition_code (x);
4146 gcc_assert (cond_code >= 0);
4147 fputs (aarch64_condition_codes[cond_code], f);
4149 break;
4151 case 'M':
4153 int cond_code;
4154 /* Print the inverse of a condition (eq <-> ne, etc). */
4156 /* CONST_TRUE_RTX means never -- that's the default. */
4157 if (x == const_true_rtx)
4159 fputs ("nv", f);
4160 return;
4163 if (!COMPARISON_P (x))
4165 output_operand_lossage ("invalid operand for '%%%c'", code);
4166 return;
4168 cond_code = aarch64_get_condition_code (x);
4169 gcc_assert (cond_code >= 0);
4170 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4171 (cond_code)], f);
4173 break;
4175 case 'b':
4176 case 'h':
4177 case 's':
4178 case 'd':
4179 case 'q':
4180 /* Print a scalar FP/SIMD register name. */
4181 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4183 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4184 return;
4186 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4187 break;
4189 case 'S':
4190 case 'T':
4191 case 'U':
4192 case 'V':
4193 /* Print the first FP/SIMD register name in a list. */
4194 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4196 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4197 return;
4199 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4200 break;
4202 case 'R':
4203 /* Print a scalar FP/SIMD register name + 1. */
4204 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4206 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4207 return;
4209 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4210 break;
4212 case 'X':
4213 /* Print bottom 16 bits of integer constant in hex. */
4214 if (!CONST_INT_P (x))
4216 output_operand_lossage ("invalid operand for '%%%c'", code);
4217 return;
4219 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4220 break;
4222 case 'w':
4223 case 'x':
4224 /* Print a general register name or the zero register (32-bit or
4225 64-bit). */
4226 if (x == const0_rtx
4227 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4229 asm_fprintf (f, "%czr", code);
4230 break;
4233 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4235 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4236 break;
4239 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4241 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4242 break;
4245 /* Fall through */
4247 case 0:
4248 /* Print a normal operand. If it's a general register, then we
4249 assume DImode. */
4250 if (x == NULL)
4252 output_operand_lossage ("missing operand");
4253 return;
4256 switch (GET_CODE (x))
4258 case REG:
4259 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4260 break;
4262 case MEM:
4263 aarch64_memory_reference_mode = GET_MODE (x);
4264 output_address (XEXP (x, 0));
4265 break;
4267 case LABEL_REF:
4268 case SYMBOL_REF:
4269 output_addr_const (asm_out_file, x);
4270 break;
4272 case CONST_INT:
4273 asm_fprintf (f, "%wd", INTVAL (x));
4274 break;
4276 case CONST_VECTOR:
4277 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4279 gcc_assert (
4280 aarch64_const_vec_all_same_in_range_p (x,
4281 HOST_WIDE_INT_MIN,
4282 HOST_WIDE_INT_MAX));
4283 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4285 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4287 fputc ('0', f);
4289 else
4290 gcc_unreachable ();
4291 break;
4293 case CONST_DOUBLE:
4294 /* CONST_DOUBLE can represent a double-width integer.
4295 In this case, the mode of x is VOIDmode. */
4296 if (GET_MODE (x) == VOIDmode)
4297 ; /* Do Nothing. */
4298 else if (aarch64_float_const_zero_rtx_p (x))
4300 fputc ('0', f);
4301 break;
4303 else if (aarch64_float_const_representable_p (x))
4305 #define buf_size 20
4306 char float_buf[buf_size] = {'\0'};
4307 REAL_VALUE_TYPE r;
4308 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4309 real_to_decimal_for_mode (float_buf, &r,
4310 buf_size, buf_size,
4311 1, GET_MODE (x));
4312 asm_fprintf (asm_out_file, "%s", float_buf);
4313 break;
4314 #undef buf_size
4316 output_operand_lossage ("invalid constant");
4317 return;
4318 default:
4319 output_operand_lossage ("invalid operand");
4320 return;
4322 break;
4324 case 'A':
4325 if (GET_CODE (x) == HIGH)
4326 x = XEXP (x, 0);
4328 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4330 case SYMBOL_SMALL_GOT:
4331 asm_fprintf (asm_out_file, ":got:");
4332 break;
4334 case SYMBOL_SMALL_TLSGD:
4335 asm_fprintf (asm_out_file, ":tlsgd:");
4336 break;
4338 case SYMBOL_SMALL_TLSDESC:
4339 asm_fprintf (asm_out_file, ":tlsdesc:");
4340 break;
4342 case SYMBOL_SMALL_GOTTPREL:
4343 asm_fprintf (asm_out_file, ":gottprel:");
4344 break;
4346 case SYMBOL_SMALL_TPREL:
4347 asm_fprintf (asm_out_file, ":tprel:");
4348 break;
4350 case SYMBOL_TINY_GOT:
4351 gcc_unreachable ();
4352 break;
4354 default:
4355 break;
4357 output_addr_const (asm_out_file, x);
4358 break;
4360 case 'L':
4361 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4363 case SYMBOL_SMALL_GOT:
4364 asm_fprintf (asm_out_file, ":lo12:");
4365 break;
4367 case SYMBOL_SMALL_TLSGD:
4368 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4369 break;
4371 case SYMBOL_SMALL_TLSDESC:
4372 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4373 break;
4375 case SYMBOL_SMALL_GOTTPREL:
4376 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4377 break;
4379 case SYMBOL_SMALL_TPREL:
4380 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4381 break;
4383 case SYMBOL_TINY_GOT:
4384 asm_fprintf (asm_out_file, ":got:");
4385 break;
4387 default:
4388 break;
4390 output_addr_const (asm_out_file, x);
4391 break;
4393 case 'G':
4395 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4397 case SYMBOL_SMALL_TPREL:
4398 asm_fprintf (asm_out_file, ":tprel_hi12:");
4399 break;
4400 default:
4401 break;
4403 output_addr_const (asm_out_file, x);
4404 break;
4406 case 'K':
4408 int cond_code;
4409 /* Print nzcv. */
4411 if (!COMPARISON_P (x))
4413 output_operand_lossage ("invalid operand for '%%%c'", code);
4414 return;
4417 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4418 gcc_assert (cond_code >= 0);
4419 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4421 break;
4423 case 'k':
4425 int cond_code;
4426 /* Print nzcv. */
4428 if (!COMPARISON_P (x))
4430 output_operand_lossage ("invalid operand for '%%%c'", code);
4431 return;
4434 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4435 gcc_assert (cond_code >= 0);
4436 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4438 break;
4440 default:
4441 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4442 return;
4446 void
4447 aarch64_print_operand_address (FILE *f, rtx x)
4449 struct aarch64_address_info addr;
4451 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4452 MEM, true))
4453 switch (addr.type)
4455 case ADDRESS_REG_IMM:
4456 if (addr.offset == const0_rtx)
4457 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4458 else
4459 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4460 INTVAL (addr.offset));
4461 return;
4463 case ADDRESS_REG_REG:
4464 if (addr.shift == 0)
4465 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4466 reg_names [REGNO (addr.offset)]);
4467 else
4468 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4469 reg_names [REGNO (addr.offset)], addr.shift);
4470 return;
4472 case ADDRESS_REG_UXTW:
4473 if (addr.shift == 0)
4474 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4475 REGNO (addr.offset) - R0_REGNUM);
4476 else
4477 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4478 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4479 return;
4481 case ADDRESS_REG_SXTW:
4482 if (addr.shift == 0)
4483 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4484 REGNO (addr.offset) - R0_REGNUM);
4485 else
4486 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4487 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4488 return;
4490 case ADDRESS_REG_WB:
4491 switch (GET_CODE (x))
4493 case PRE_INC:
4494 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4495 GET_MODE_SIZE (aarch64_memory_reference_mode));
4496 return;
4497 case POST_INC:
4498 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4499 GET_MODE_SIZE (aarch64_memory_reference_mode));
4500 return;
4501 case PRE_DEC:
4502 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4503 GET_MODE_SIZE (aarch64_memory_reference_mode));
4504 return;
4505 case POST_DEC:
4506 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4507 GET_MODE_SIZE (aarch64_memory_reference_mode));
4508 return;
4509 case PRE_MODIFY:
4510 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4511 INTVAL (addr.offset));
4512 return;
4513 case POST_MODIFY:
4514 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4515 INTVAL (addr.offset));
4516 return;
4517 default:
4518 break;
4520 break;
4522 case ADDRESS_LO_SUM:
4523 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4524 output_addr_const (f, addr.offset);
4525 asm_fprintf (f, "]");
4526 return;
4528 case ADDRESS_SYMBOLIC:
4529 break;
4532 output_addr_const (f, x);
4535 bool
4536 aarch64_label_mentioned_p (rtx x)
4538 const char *fmt;
4539 int i;
4541 if (GET_CODE (x) == LABEL_REF)
4542 return true;
4544 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4545 referencing instruction, but they are constant offsets, not
4546 symbols. */
4547 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4548 return false;
4550 fmt = GET_RTX_FORMAT (GET_CODE (x));
4551 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4553 if (fmt[i] == 'E')
4555 int j;
4557 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4558 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4559 return 1;
4561 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4562 return 1;
4565 return 0;
4568 /* Implement REGNO_REG_CLASS. */
4570 enum reg_class
4571 aarch64_regno_regclass (unsigned regno)
4573 if (GP_REGNUM_P (regno))
4574 return GENERAL_REGS;
4576 if (regno == SP_REGNUM)
4577 return STACK_REG;
4579 if (regno == FRAME_POINTER_REGNUM
4580 || regno == ARG_POINTER_REGNUM)
4581 return POINTER_REGS;
4583 if (FP_REGNUM_P (regno))
4584 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4586 return NO_REGS;
4589 static rtx
4590 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4592 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4593 where mask is selected by alignment and size of the offset.
4594 We try to pick as large a range for the offset as possible to
4595 maximize the chance of a CSE. However, for aligned addresses
4596 we limit the range to 4k so that structures with different sized
4597 elements are likely to use the same base. */
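/* Illustrative examples (hypothetical offsets, assuming an SImode access
   of size 4): for x + 0x12344 (aligned) the code picks
   base_offset = 0x12344 & ~0xfff = 0x12000, leaving 0x344 as the
   immediate offset of the access itself; for x + 0x12345 (misaligned)
   it picks base_offset = (0x12345 + 0x100) & ~0x1ff = 0x12400, leaving
   an offset of -0xbb, within the -256...255 unscaled range.  */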
4599 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4601 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4602 HOST_WIDE_INT base_offset;
4604 /* Does it look like we'll need a load/store-pair operation? */
4605 if (GET_MODE_SIZE (mode) > 16
4606 || mode == TImode)
4607 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4608 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4609 /* For offsets that aren't a multiple of the access size, the limit is
4610 -256...255. */
4611 else if (offset & (GET_MODE_SIZE (mode) - 1))
4612 base_offset = (offset + 0x100) & ~0x1ff;
4613 else
4614 base_offset = offset & ~0xfff;
4616 if (base_offset == 0)
4617 return x;
4619 offset -= base_offset;
4620 rtx base_reg = gen_reg_rtx (Pmode);
4621 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4622 NULL_RTX);
4623 emit_move_insn (base_reg, val);
4624 x = plus_constant (Pmode, base_reg, offset);
4627 return x;
4630 /* Try a machine-dependent way of reloading an illegitimate address
4631 operand. If we find one, push the reload and return the new rtx. */
4634 aarch64_legitimize_reload_address (rtx *x_p,
4635 machine_mode mode,
4636 int opnum, int type,
4637 int ind_levels ATTRIBUTE_UNUSED)
4639 rtx x = *x_p;
4641 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4642 if (aarch64_vect_struct_mode_p (mode)
4643 && GET_CODE (x) == PLUS
4644 && REG_P (XEXP (x, 0))
4645 && CONST_INT_P (XEXP (x, 1)))
4647 rtx orig_rtx = x;
4648 x = copy_rtx (x);
4649 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4650 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4651 opnum, (enum reload_type) type);
4652 return x;
4655 /* We must recognize output that we have already generated ourselves. */
4656 if (GET_CODE (x) == PLUS
4657 && GET_CODE (XEXP (x, 0)) == PLUS
4658 && REG_P (XEXP (XEXP (x, 0), 0))
4659 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4660 && CONST_INT_P (XEXP (x, 1)))
4662 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4663 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4664 opnum, (enum reload_type) type);
4665 return x;
4668 /* We wish to handle large displacements off a base register by splitting
4669 the addend across an add and the mem insn. This can cut the number of
4670 extra insns needed from 3 to 1. It is only useful for load/store of a
4671 single register with 12 bit offset field. */
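/* Rough illustration (hypothetical SImode access, register names made
   up): for base + 0x4004 this splits into high = 0x4000, a valid
   "uimm12 << 12" add immediate, and low = 4, so reload emits something
   like "add tmp, base, 0x4000" and the access itself becomes
   "ldr w0, [tmp, 4]" rather than building the whole constant first.  */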
4672 if (GET_CODE (x) == PLUS
4673 && REG_P (XEXP (x, 0))
4674 && CONST_INT_P (XEXP (x, 1))
4675 && HARD_REGISTER_P (XEXP (x, 0))
4676 && mode != TImode
4677 && mode != TFmode
4678 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4680 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4681 HOST_WIDE_INT low = val & 0xfff;
4682 HOST_WIDE_INT high = val - low;
4683 HOST_WIDE_INT offs;
4684 rtx cst;
4685 machine_mode xmode = GET_MODE (x);
4687 /* In ILP32, xmode can be either DImode or SImode. */
4688 gcc_assert (xmode == DImode || xmode == SImode);
4690 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4691 BLKmode alignment. */
4692 if (GET_MODE_SIZE (mode) == 0)
4693 return NULL_RTX;
4695 offs = low % GET_MODE_SIZE (mode);
4697 /* Align misaligned offset by adjusting high part to compensate. */
4698 if (offs != 0)
4700 if (aarch64_uimm12_shift (high + offs))
4702 /* Align down. */
4703 low = low - offs;
4704 high = high + offs;
4706 else
4708 /* Align up. */
4709 offs = GET_MODE_SIZE (mode) - offs;
4710 low = low + offs;
4711 high = high + (low & 0x1000) - offs;
4712 low &= 0xfff;
4716 /* Check for overflow. */
4717 if (high + low != val)
4718 return NULL_RTX;
4720 cst = GEN_INT (high);
4721 if (!aarch64_uimm12_shift (high))
4722 cst = force_const_mem (xmode, cst);
4724 /* Reload high part into base reg, leaving the low part
4725 in the mem instruction.
4726 Note that replacing this gen_rtx_PLUS with plus_constant is
4727 wrong in this case because we rely on the
4728 (plus (plus reg c1) c2) structure being preserved so that
4729 XEXP (*p, 0) in push_reload below uses the correct term. */
4730 x = gen_rtx_PLUS (xmode,
4731 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4732 GEN_INT (low));
4734 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4735 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4736 opnum, (enum reload_type) type);
4737 return x;
4740 return NULL_RTX;
4744 static reg_class_t
4745 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4746 reg_class_t rclass,
4747 machine_mode mode,
4748 secondary_reload_info *sri)
4750 /* Without the TARGET_SIMD instructions we cannot move a Q register
4751 to a Q register directly. We need a scratch. */
4752 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4753 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4754 && reg_class_subset_p (rclass, FP_REGS))
4756 if (mode == TFmode)
4757 sri->icode = CODE_FOR_aarch64_reload_movtf;
4758 else if (mode == TImode)
4759 sri->icode = CODE_FOR_aarch64_reload_movti;
4760 return NO_REGS;
4763 /* A TFmode or TImode memory access should be handled via FP_REGS
4764 because AArch64 has richer addressing modes for LDR/STR instructions
4765 than LDP/STP instructions. */
4766 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4767 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4768 return FP_REGS;
4770 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4771 return GENERAL_REGS;
4773 return NO_REGS;
4776 static bool
4777 aarch64_can_eliminate (const int from, const int to)
4779 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4780 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4782 if (frame_pointer_needed)
4784 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4785 return true;
4786 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4787 return false;
4788 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4789 && !cfun->calls_alloca)
4790 return true;
4791 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4792 return true;
4794 return false;
4796 else
4798 /* If we decided that we didn't need a leaf frame pointer but then used
4799 LR in the function, then we'll want a frame pointer after all, so
4800 prevent this elimination to ensure a frame pointer is used. */
4801 if (to == STACK_POINTER_REGNUM
4802 && flag_omit_leaf_frame_pointer
4803 && df_regs_ever_live_p (LR_REGNUM))
4804 return false;
4807 return true;
4810 HOST_WIDE_INT
4811 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4813 aarch64_layout_frame ();
4815 if (to == HARD_FRAME_POINTER_REGNUM)
4817 if (from == ARG_POINTER_REGNUM)
4818 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4820 if (from == FRAME_POINTER_REGNUM)
4821 return (cfun->machine->frame.hard_fp_offset
4822 - cfun->machine->frame.saved_varargs_size);
4825 if (to == STACK_POINTER_REGNUM)
4827 if (from == FRAME_POINTER_REGNUM)
4828 return (cfun->machine->frame.frame_size
4829 - cfun->machine->frame.saved_varargs_size);
4832 return cfun->machine->frame.frame_size;
4835 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4836 previous frame. */
4839 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4841 if (count != 0)
4842 return const0_rtx;
4843 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4847 static void
4848 aarch64_asm_trampoline_template (FILE *f)
4850 if (TARGET_ILP32)
4852 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4853 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4855 else
4857 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4858 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4860 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4861 assemble_aligned_integer (4, const0_rtx);
4862 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4863 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4866 static void
4867 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4869 rtx fnaddr, mem, a_tramp;
4870 const int tramp_code_sz = 16;
4872 /* Don't need to copy the trailing D-words, we fill those in below. */
4873 emit_block_move (m_tramp, assemble_trampoline_template (),
4874 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4875 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4876 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4877 if (GET_MODE (fnaddr) != ptr_mode)
4878 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4879 emit_move_insn (mem, fnaddr);
4881 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4882 emit_move_insn (mem, chain_value);
4884 /* XXX We should really define a "clear_cache" pattern and use
4885 gen_clear_cache(). */
4886 a_tramp = XEXP (m_tramp, 0);
4887 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4888 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4889 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4890 ptr_mode);
4893 static unsigned char
4894 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4896 switch (regclass)
4898 case CALLER_SAVE_REGS:
4899 case POINTER_REGS:
4900 case GENERAL_REGS:
4901 case ALL_REGS:
4902 case FP_REGS:
4903 case FP_LO_REGS:
4904 return
4905 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4906 (GET_MODE_SIZE (mode) + 7) / 8;
4907 case STACK_REG:
4908 return 1;
4910 case NO_REGS:
4911 return 0;
4913 default:
4914 break;
4916 gcc_unreachable ();
4919 static reg_class_t
4920 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4922 if (regclass == POINTER_REGS)
4923 return GENERAL_REGS;
4925 if (regclass == STACK_REG)
4927 if (REG_P(x)
4928 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4929 return regclass;
4931 return NO_REGS;
4934 /* If it's an integer immediate that MOVI can't handle, then
4935 FP_REGS is not an option, so we return NO_REGS instead. */
4936 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4937 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4938 return NO_REGS;
4940 /* Register elimination can result in a request for
4941 SP+constant->FP_REGS. We cannot support such operations, which
4942 use SP as the source and an FP_REG as the destination, so reject
4943 them outright. */
4944 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4946 rtx lhs = XEXP (x, 0);
4948 /* Look through a possible SUBREG introduced by ILP32. */
4949 if (GET_CODE (lhs) == SUBREG)
4950 lhs = SUBREG_REG (lhs);
4952 gcc_assert (REG_P (lhs));
4953 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4954 POINTER_REGS));
4955 return NO_REGS;
4958 return regclass;
4961 void
4962 aarch64_asm_output_labelref (FILE* f, const char *name)
4964 asm_fprintf (f, "%U%s", name);
4967 static void
4968 aarch64_elf_asm_constructor (rtx symbol, int priority)
4970 if (priority == DEFAULT_INIT_PRIORITY)
4971 default_ctor_section_asm_out_constructor (symbol, priority);
4972 else
4974 section *s;
4975 char buf[18];
4976 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4977 s = get_section (buf, SECTION_WRITE, NULL);
4978 switch_to_section (s);
4979 assemble_align (POINTER_SIZE);
4980 assemble_aligned_integer (POINTER_BYTES, symbol);
4984 static void
4985 aarch64_elf_asm_destructor (rtx symbol, int priority)
4987 if (priority == DEFAULT_INIT_PRIORITY)
4988 default_dtor_section_asm_out_destructor (symbol, priority);
4989 else
4991 section *s;
4992 char buf[18];
4993 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4994 s = get_section (buf, SECTION_WRITE, NULL);
4995 switch_to_section (s);
4996 assemble_align (POINTER_SIZE);
4997 assemble_aligned_integer (POINTER_BYTES, symbol);
5001 const char*
5002 aarch64_output_casesi (rtx *operands)
5004 char buf[100];
5005 char label[100];
5006 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5007 int index;
5008 static const char *const patterns[4][2] =
5011 "ldrb\t%w3, [%0,%w1,uxtw]",
5012 "add\t%3, %4, %w3, sxtb #2"
5015 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5016 "add\t%3, %4, %w3, sxth #2"
5019 "ldr\t%w3, [%0,%w1,uxtw #2]",
5020 "add\t%3, %4, %w3, sxtw #2"
5022 /* We assume that DImode is only generated when not optimizing and
5023 that we don't really need 64-bit address offsets. That would
5024 imply an object file with 8GB of code in a single function! */
5026 "ldr\t%w3, [%0,%w1,uxtw #2]",
5027 "add\t%3, %4, %w3, sxtw #2"
5031 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5033 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5035 gcc_assert (index >= 0 && index <= 3);
5037 /* Need to implement table size reduction, by changing the code below. */
5038 output_asm_insn (patterns[index][0], operands);
5039 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5040 snprintf (buf, sizeof (buf),
5041 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5042 output_asm_insn (buf, operands);
5043 output_asm_insn (patterns[index][1], operands);
5044 output_asm_insn ("br\t%3", operands);
5045 assemble_label (asm_out_file, label);
5046 return "";
5050 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5051 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5052 operator. */
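/* Illustrative values: shift = 2 with mask = 0x3fc (0xff << 2)
   corresponds to a UXTB #2 operand and returns 8, while shift = 1 with
   mask = 0x1fffe (0xffff << 1) corresponds to UXTH #1 and returns 16.  */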
5055 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5057 if (shift >= 0 && shift <= 3)
5059 int size;
5060 for (size = 8; size <= 32; size *= 2)
5062 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5063 if (mask == bits << shift)
5064 return size;
5067 return 0;
5070 static bool
5071 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5072 const_rtx x ATTRIBUTE_UNUSED)
5074 /* We can't use blocks for constants when we're using a per-function
5075 constant pool. */
5076 return false;
5079 static section *
5080 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5081 rtx x ATTRIBUTE_UNUSED,
5082 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5084 /* Force all constant pool entries into the current function section. */
5085 return function_section (current_function_decl);
5089 /* Costs. */
5091 /* Helper function for rtx cost calculation. Strip a shift expression
5092 from X. Returns the inner operand if successful, or the original
5093 expression on failure. */
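/* For instance (hypothetical RTL), both (ashift:DI (reg:DI) (const_int 3))
   and (mult:DI (reg:DI) (const_int 8)) strip down to the inner register,
   since a multiply by a power of two is just a left shift here.  */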
5094 static rtx
5095 aarch64_strip_shift (rtx x)
5097 rtx op = x;
5099 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5100 we can convert both to ROR during final output. */
5101 if ((GET_CODE (op) == ASHIFT
5102 || GET_CODE (op) == ASHIFTRT
5103 || GET_CODE (op) == LSHIFTRT
5104 || GET_CODE (op) == ROTATERT
5105 || GET_CODE (op) == ROTATE)
5106 && CONST_INT_P (XEXP (op, 1)))
5107 return XEXP (op, 0);
5109 if (GET_CODE (op) == MULT
5110 && CONST_INT_P (XEXP (op, 1))
5111 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5112 return XEXP (op, 0);
5114 return x;
5117 /* Helper function for rtx cost calculation. Strip an extend
5118 expression from X. Returns the inner operand if successful, or the
5119 original expression on failure. We deal with a number of possible
5120 canonicalization variations here. */
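/* Illustrative forms (hypothetical RTL) that all strip down to the inner
   register: (zero_extend:DI (reg:SI)),
   (ashift:DI (sign_extend:DI (reg:SI)) (const_int 2)), and the
   AND-of-MULT representation of a shifted zero-extend such as
   (and:DI (mult:DI (reg:DI) (const_int 4)) (const_int 0x3fc)).  */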
5121 static rtx
5122 aarch64_strip_extend (rtx x)
5124 rtx op = x;
5126 /* Zero and sign extraction of a widened value. */
5127 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5128 && XEXP (op, 2) == const0_rtx
5129 && GET_CODE (XEXP (op, 0)) == MULT
5130 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5131 XEXP (op, 1)))
5132 return XEXP (XEXP (op, 0), 0);
5134 /* It can also be represented (for zero-extend) as an AND with an
5135 immediate. */
5136 if (GET_CODE (op) == AND
5137 && GET_CODE (XEXP (op, 0)) == MULT
5138 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5139 && CONST_INT_P (XEXP (op, 1))
5140 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5141 INTVAL (XEXP (op, 1))) != 0)
5142 return XEXP (XEXP (op, 0), 0);
5144 /* Now handle extended register, as this may also have an optional
5145 left shift by 1..4. */
5146 if (GET_CODE (op) == ASHIFT
5147 && CONST_INT_P (XEXP (op, 1))
5148 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5149 op = XEXP (op, 0);
5151 if (GET_CODE (op) == ZERO_EXTEND
5152 || GET_CODE (op) == SIGN_EXTEND)
5153 op = XEXP (op, 0);
5155 if (op != x)
5156 return op;
5158 return x;
5161 /* Return true iff CODE is a shift supported in combination
5162 with arithmetic instructions. */
5164 static bool
5165 aarch64_shift_p (enum rtx_code code)
5167 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5170 /* Helper function for rtx cost calculation. Calculate the cost of
5171 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5172 Return the calculated cost of the expression, recursing manually into
5173 operands where needed. */
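/* For example (hypothetical DImode RTL with OUTER == PLUS):
   (mult:DI (reg:DI) (reg:DI)) is costed as an MADD, i.e. the operand
   costs plus extra_cost->mult[mode == DImode].add, whereas
   (ashift:DI (reg:DI) (const_int 2)) is costed as an ADD with a
   shift-by-immediate operand, i.e. extra_cost->alu.arith_shift.  */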
5175 static int
5176 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5178 rtx op0, op1;
5179 const struct cpu_cost_table *extra_cost
5180 = aarch64_tune_params->insn_extra_cost;
5181 int cost = 0;
5182 bool compound_p = (outer == PLUS || outer == MINUS);
5183 machine_mode mode = GET_MODE (x);
5185 gcc_checking_assert (code == MULT);
5187 op0 = XEXP (x, 0);
5188 op1 = XEXP (x, 1);
5190 if (VECTOR_MODE_P (mode))
5191 mode = GET_MODE_INNER (mode);
5193 /* Integer multiply/fma. */
5194 if (GET_MODE_CLASS (mode) == MODE_INT)
5196 /* The multiply will be canonicalized as a shift, cost it as such. */
5197 if (aarch64_shift_p (GET_CODE (x))
5198 || (CONST_INT_P (op1)
5199 && exact_log2 (INTVAL (op1)) > 0))
5201 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5202 || GET_CODE (op0) == SIGN_EXTEND;
5203 if (speed)
5205 if (compound_p)
5207 if (REG_P (op1))
5208 /* ARITH + shift-by-register. */
5209 cost += extra_cost->alu.arith_shift_reg;
5210 else if (is_extend)
5211 /* ARITH + extended register. We don't have a cost field
5212 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5213 cost += extra_cost->alu.extend_arith;
5214 else
5215 /* ARITH + shift-by-immediate. */
5216 cost += extra_cost->alu.arith_shift;
5218 else
5219 /* LSL (immediate). */
5220 cost += extra_cost->alu.shift;
5223 /* Strip extends as we will have costed them in the case above. */
5224 if (is_extend)
5225 op0 = aarch64_strip_extend (op0);
5227 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5229 return cost;
5232 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5233 compound and let the below cases handle it. After all, MNEG is a
5234 special-case alias of MSUB. */
5235 if (GET_CODE (op0) == NEG)
5237 op0 = XEXP (op0, 0);
5238 compound_p = true;
5241 /* Integer multiplies or FMAs have zero/sign extending variants. */
5242 if ((GET_CODE (op0) == ZERO_EXTEND
5243 && GET_CODE (op1) == ZERO_EXTEND)
5244 || (GET_CODE (op0) == SIGN_EXTEND
5245 && GET_CODE (op1) == SIGN_EXTEND))
5247 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5248 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5250 if (speed)
5252 if (compound_p)
5253 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5254 cost += extra_cost->mult[0].extend_add;
5255 else
5256 /* MUL/SMULL/UMULL. */
5257 cost += extra_cost->mult[0].extend;
5260 return cost;
5263 /* This is either an integer multiply or a MADD. In both cases
5264 we want to recurse and cost the operands. */
5265 cost += rtx_cost (op0, MULT, 0, speed)
5266 + rtx_cost (op1, MULT, 1, speed);
5268 if (speed)
5270 if (compound_p)
5271 /* MADD/MSUB. */
5272 cost += extra_cost->mult[mode == DImode].add;
5273 else
5274 /* MUL. */
5275 cost += extra_cost->mult[mode == DImode].simple;
5278 return cost;
5280 else
5282 if (speed)
5284 /* Floating-point FMA/FMUL can also support negations of the
5285 operands. */
5286 if (GET_CODE (op0) == NEG)
5287 op0 = XEXP (op0, 0);
5288 if (GET_CODE (op1) == NEG)
5289 op1 = XEXP (op1, 0);
5291 if (compound_p)
5292 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5293 cost += extra_cost->fp[mode == DFmode].fma;
5294 else
5295 /* FMUL/FNMUL. */
5296 cost += extra_cost->fp[mode == DFmode].mult;
5299 cost += rtx_cost (op0, MULT, 0, speed)
5300 + rtx_cost (op1, MULT, 1, speed);
5301 return cost;
5305 static int
5306 aarch64_address_cost (rtx x,
5307 machine_mode mode,
5308 addr_space_t as ATTRIBUTE_UNUSED,
5309 bool speed)
5311 enum rtx_code c = GET_CODE (x);
5312 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5313 struct aarch64_address_info info;
5314 int cost = 0;
5315 info.shift = 0;
5317 if (!aarch64_classify_address (&info, x, mode, c, false))
5319 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5321 /* This is a CONST or SYMBOL ref which will be split
5322 in a different way depending on the code model in use.
5323 Cost it through the generic infrastructure. */
5324 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5325 /* Divide through by the cost of one instruction to
5326 bring it to the same units as the address costs. */
5327 cost_symbol_ref /= COSTS_N_INSNS (1);
5328 /* The cost is then the cost of preparing the address,
5329 followed by an immediate (possibly 0) offset. */
5330 return cost_symbol_ref + addr_cost->imm_offset;
5332 else
5334 /* This is most likely a jump table from a case
5335 statement. */
5336 return addr_cost->register_offset;
5340 switch (info.type)
5342 case ADDRESS_LO_SUM:
5343 case ADDRESS_SYMBOLIC:
5344 case ADDRESS_REG_IMM:
5345 cost += addr_cost->imm_offset;
5346 break;
5348 case ADDRESS_REG_WB:
5349 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5350 cost += addr_cost->pre_modify;
5351 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5352 cost += addr_cost->post_modify;
5353 else
5354 gcc_unreachable ();
5356 break;
5358 case ADDRESS_REG_REG:
5359 cost += addr_cost->register_offset;
5360 break;
5362 case ADDRESS_REG_UXTW:
5363 case ADDRESS_REG_SXTW:
5364 cost += addr_cost->register_extend;
5365 break;
5367 default:
5368 gcc_unreachable ();
5372 if (info.shift > 0)
5374 /* For the sake of calculating the cost of the shifted register
5375 component, we can treat same sized modes in the same way. */
5376 switch (GET_MODE_BITSIZE (mode))
5378 case 16:
5379 cost += addr_cost->addr_scale_costs.hi;
5380 break;
5382 case 32:
5383 cost += addr_cost->addr_scale_costs.si;
5384 break;
5386 case 64:
5387 cost += addr_cost->addr_scale_costs.di;
5388 break;
5390 /* We can't tell, or this is a 128-bit vector. */
5391 default:
5392 cost += addr_cost->addr_scale_costs.ti;
5393 break;
5397 return cost;
5400 /* Return true if the RTX X in mode MODE is a zero or sign extract
5401 usable in an ADD or SUB (extended register) instruction. */
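/* Schematically (hypothetical RTL, width operand left unspecified), this
   accepts extracts of a multiply by a power of two such as
   (sign_extract:DI (mult:DI (reg:DI) (const_int 4)) (...) (const_int 0)),
   which is how an "add x0, x1, w2, sxtw #2" style extended-register
   operand is represented.  */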
5402 static bool
5403 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5405 /* Catch add with a sign extract.
5406 This is add_<optab><mode>_multp2. */
5407 if (GET_CODE (x) == SIGN_EXTRACT
5408 || GET_CODE (x) == ZERO_EXTRACT)
5410 rtx op0 = XEXP (x, 0);
5411 rtx op1 = XEXP (x, 1);
5412 rtx op2 = XEXP (x, 2);
5414 if (GET_CODE (op0) == MULT
5415 && CONST_INT_P (op1)
5416 && op2 == const0_rtx
5417 && CONST_INT_P (XEXP (op0, 1))
5418 && aarch64_is_extend_from_extract (mode,
5419 XEXP (op0, 1),
5420 op1))
5422 return true;
5426 return false;
5429 static bool
5430 aarch64_frint_unspec_p (unsigned int u)
5432 switch (u)
5434 case UNSPEC_FRINTZ:
5435 case UNSPEC_FRINTP:
5436 case UNSPEC_FRINTM:
5437 case UNSPEC_FRINTA:
5438 case UNSPEC_FRINTN:
5439 case UNSPEC_FRINTX:
5440 case UNSPEC_FRINTI:
5441 return true;
5443 default:
5444 return false;
5448 /* Return true iff X is an rtx that will match an extr instruction
5449 i.e. as described in the *extr<mode>5_insn family of patterns.
5450 OP0 and OP1 will be set to the operands of the shifts involved
5451 on success and will be NULL_RTX otherwise. */
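/* For instance (hypothetical SImode RTL):
   (ior:SI (ashift:SI (reg:SI) (const_int 10))
           (lshiftrt:SI (reg:SI) (const_int 22)))
   matches because 10 + 22 == 32, and corresponds to an
   "extr w0, w1, w2, #22" style instruction.  */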
5453 static bool
5454 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5456 rtx op0, op1;
5457 machine_mode mode = GET_MODE (x);
5459 *res_op0 = NULL_RTX;
5460 *res_op1 = NULL_RTX;
5462 if (GET_CODE (x) != IOR)
5463 return false;
5465 op0 = XEXP (x, 0);
5466 op1 = XEXP (x, 1);
5468 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5469 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5471 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5472 if (GET_CODE (op1) == ASHIFT)
5473 std::swap (op0, op1);
5475 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5476 return false;
5478 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5479 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5481 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5482 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5484 *res_op0 = XEXP (op0, 0);
5485 *res_op1 = XEXP (op1, 0);
5486 return true;
5490 return false;
5493 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5494 storing it in *COST. Result is true if the total cost of the operation
5495 has now been calculated. */
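/* Rough illustration (hypothetical RTL): a branch such as
   (if_then_else (ne (reg) (const_int 0)) (label_ref ...) (pc)) is
   treated as a CBNZ and only the compared register is costed, while a
   CC-based (if_then_else (eq (reg:CC) (const_int 0)) (reg) (reg)) is
   treated as some flavour of CSEL and both value operands are costed.  */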
5496 static bool
5497 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5499 rtx inner;
5500 rtx comparator;
5501 enum rtx_code cmpcode;
5503 if (COMPARISON_P (op0))
5505 inner = XEXP (op0, 0);
5506 comparator = XEXP (op0, 1);
5507 cmpcode = GET_CODE (op0);
5509 else
5511 inner = op0;
5512 comparator = const0_rtx;
5513 cmpcode = NE;
5516 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5518 /* Conditional branch. */
5519 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5520 return true;
5521 else
5523 if (cmpcode == NE || cmpcode == EQ)
5525 if (comparator == const0_rtx)
5527 /* TBZ/TBNZ/CBZ/CBNZ. */
5528 if (GET_CODE (inner) == ZERO_EXTRACT)
5529 /* TBZ/TBNZ. */
5530 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5531 0, speed);
5532 else
5533 /* CBZ/CBNZ. */
5534 *cost += rtx_cost (inner, cmpcode, 0, speed);
5536 return true;
5539 else if (cmpcode == LT || cmpcode == GE)
5541 /* TBZ/TBNZ. */
5542 if (comparator == const0_rtx)
5543 return true;
5547 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5549 /* It's a conditional operation based on the status flags,
5550 so it must be some flavor of CSEL. */
5552 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5553 if (GET_CODE (op1) == NEG
5554 || GET_CODE (op1) == NOT
5555 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5556 op1 = XEXP (op1, 0);
5558 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5559 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5560 return true;
5563 /* We don't know what this is, cost all operands. */
5564 return false;
5567 /* Calculate the cost of calculating X, storing it in *COST. Result
5568 is true if the total cost of the operation has now been calculated. */
5569 static bool
5570 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5571 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5573 rtx op0, op1, op2;
5574 const struct cpu_cost_table *extra_cost
5575 = aarch64_tune_params->insn_extra_cost;
5576 machine_mode mode = GET_MODE (x);
5578 /* By default, assume that everything has equivalent cost to the
5579 cheapest instruction. Any additional costs are applied as a delta
5580 above this default. */
5581 *cost = COSTS_N_INSNS (1);
5583 /* TODO: The cost infrastructure currently does not handle
5584 vector operations. Assume that all vector operations
5585 are equally expensive. */
5586 if (VECTOR_MODE_P (mode))
5588 if (speed)
5589 *cost += extra_cost->vect.alu;
5590 return true;
5593 switch (code)
5595 case SET:
5596 /* The cost depends entirely on the operands to SET. */
5597 *cost = 0;
5598 op0 = SET_DEST (x);
5599 op1 = SET_SRC (x);
5601 switch (GET_CODE (op0))
5603 case MEM:
5604 if (speed)
5606 rtx address = XEXP (op0, 0);
5607 if (GET_MODE_CLASS (mode) == MODE_INT)
5608 *cost += extra_cost->ldst.store;
5609 else if (mode == SFmode)
5610 *cost += extra_cost->ldst.storef;
5611 else if (mode == DFmode)
5612 *cost += extra_cost->ldst.stored;
5614 *cost +=
5615 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5616 0, speed));
5619 *cost += rtx_cost (op1, SET, 1, speed);
5620 return true;
5622 case SUBREG:
5623 if (! REG_P (SUBREG_REG (op0)))
5624 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5626 /* Fall through. */
5627 case REG:
5628 /* const0_rtx is in general free, but we will use an
5629 instruction to set a register to 0. */
5630 if (REG_P (op1) || op1 == const0_rtx)
5632 /* The cost is 1 per register copied. */
5633 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5634 / UNITS_PER_WORD;
5635 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5637 else
5638 /* Cost is just the cost of the RHS of the set. */
5639 *cost += rtx_cost (op1, SET, 1, speed);
5640 return true;
5642 case ZERO_EXTRACT:
5643 case SIGN_EXTRACT:
5644 /* Bit-field insertion. Strip any redundant widening of
5645 the RHS to meet the width of the target. */
5646 if (GET_CODE (op1) == SUBREG)
5647 op1 = SUBREG_REG (op1);
5648 if ((GET_CODE (op1) == ZERO_EXTEND
5649 || GET_CODE (op1) == SIGN_EXTEND)
5650 && CONST_INT_P (XEXP (op0, 1))
5651 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5652 >= INTVAL (XEXP (op0, 1))))
5653 op1 = XEXP (op1, 0);
5655 if (CONST_INT_P (op1))
5657 /* MOV immediate is assumed to always be cheap. */
5658 *cost = COSTS_N_INSNS (1);
5660 else
5662 /* BFM. */
5663 if (speed)
5664 *cost += extra_cost->alu.bfi;
5665 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5668 return true;
5670 default:
5671 /* We can't make sense of this, assume default cost. */
5672 *cost = COSTS_N_INSNS (1);
5673 return false;
5675 return false;
5677 case CONST_INT:
5678 /* If an instruction can incorporate a constant within the
5679 instruction, the instruction's expression avoids calling
5680 rtx_cost() on the constant. If rtx_cost() is called on a
5681 constant, then it is usually because the constant must be
5682 moved into a register by one or more instructions.
5684 The exception is constant 0, which can be expressed
5685 as XZR/WZR and is therefore free. The exception to this is
5686 if we have (set (reg) (const0_rtx)) in which case we must cost
5687 the move. However, we can catch that when we cost the SET, so
5688 we don't need to consider that here. */
5689 if (x == const0_rtx)
5690 *cost = 0;
5691 else
5693 /* To an approximation, the cost of building any other constant is
5694 proportional to the number of instructions required to build
5695 that constant. This is true whether we are compiling for
5696 SPEED or otherwise. */
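/* For instance (hypothetical value): a constant such as 0x123456789 has
   three non-zero 16-bit chunks and so typically takes a MOV plus two
   MOVKs, giving a cost of roughly COSTS_N_INSNS (3).  */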
5697 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5698 (NULL_RTX, x, false, mode));
5700 return true;
5702 case CONST_DOUBLE:
5703 if (speed)
5705 /* mov[df,sf]_aarch64. */
5706 if (aarch64_float_const_representable_p (x))
5707 /* FMOV (scalar immediate). */
5708 *cost += extra_cost->fp[mode == DFmode].fpconst;
5709 else if (!aarch64_float_const_zero_rtx_p (x))
5711 /* This will be a load from memory. */
5712 if (mode == DFmode)
5713 *cost += extra_cost->ldst.loadd;
5714 else
5715 *cost += extra_cost->ldst.loadf;
5717 else
5718 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5719 or MOV v0.s[0], wzr - neither of which are modeled by the
5720 cost tables. Just use the default cost. */
5725 return true;
5727 case MEM:
5728 if (speed)
5730 /* For loads we want the base cost of a load, plus an
5731 approximation for the additional cost of the addressing
5732 mode. */
5733 rtx address = XEXP (x, 0);
5734 if (GET_MODE_CLASS (mode) == MODE_INT)
5735 *cost += extra_cost->ldst.load;
5736 else if (mode == SFmode)
5737 *cost += extra_cost->ldst.loadf;
5738 else if (mode == DFmode)
5739 *cost += extra_cost->ldst.loadd;
5741 *cost +=
5742 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5743 0, speed));
5746 return true;
5748 case NEG:
5749 op0 = XEXP (x, 0);
5751 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5753 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5754 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5756 /* CSETM. */
5757 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5758 return true;
5761 /* Cost this as SUB wzr, X. */
5762 op0 = CONST0_RTX (GET_MODE (x));
5763 op1 = XEXP (x, 0);
5764 goto cost_minus;
5767 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5769 /* Support (neg(fma...)) as a single instruction only if
5770 sign of zeros is unimportant. This matches the decision
5771 making in aarch64.md. */
5772 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5774 /* FNMADD. */
5775 *cost = rtx_cost (op0, NEG, 0, speed);
5776 return true;
5778 if (speed)
5779 /* FNEG. */
5780 *cost += extra_cost->fp[mode == DFmode].neg;
5781 return false;
5784 return false;
5786 case CLRSB:
5787 case CLZ:
5788 if (speed)
5789 *cost += extra_cost->alu.clz;
5791 return false;
5793 case COMPARE:
5794 op0 = XEXP (x, 0);
5795 op1 = XEXP (x, 1);
5797 if (op1 == const0_rtx
5798 && GET_CODE (op0) == AND)
5800 x = op0;
5801 goto cost_logic;
5804 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5806 /* TODO: A write to the CC flags possibly costs extra, this
5807 needs encoding in the cost tables. */
5809 /* CC_ZESWPmode supports zero extend for free. */
5810 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5811 op0 = XEXP (op0, 0);
5813 /* ANDS. */
5814 if (GET_CODE (op0) == AND)
5816 x = op0;
5817 goto cost_logic;
5820 if (GET_CODE (op0) == PLUS)
5822 /* ADDS (and CMN alias). */
5823 x = op0;
5824 goto cost_plus;
5827 if (GET_CODE (op0) == MINUS)
5829 /* SUBS. */
5830 x = op0;
5831 goto cost_minus;
5834 if (GET_CODE (op1) == NEG)
5836 /* CMN. */
5837 if (speed)
5838 *cost += extra_cost->alu.arith;
5840 *cost += rtx_cost (op0, COMPARE, 0, speed);
5841 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5842 return true;
5845 /* CMP.
5847 Compare can freely swap the order of operands, and
5848 canonicalization puts the more complex operation first.
5849 But the integer MINUS logic expects the shift/extend
5850 operation in op1. */
5851 if (! (REG_P (op0)
5852 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5854 op0 = XEXP (x, 1);
5855 op1 = XEXP (x, 0);
5857 goto cost_minus;
5860 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5862 /* FCMP. */
5863 if (speed)
5864 *cost += extra_cost->fp[mode == DFmode].compare;
5866 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5868 *cost += rtx_cost (op0, COMPARE, 0, speed);
5869 /* FCMP supports constant 0.0 for no extra cost. */
5870 return true;
5872 return false;
5875 return false;
5877 case MINUS:
5879 op0 = XEXP (x, 0);
5880 op1 = XEXP (x, 1);
5882 cost_minus:
5883 *cost += rtx_cost (op0, MINUS, 0, speed);
5885 /* Detect valid immediates. */
5886 if ((GET_MODE_CLASS (mode) == MODE_INT
5887 || (GET_MODE_CLASS (mode) == MODE_CC
5888 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5889 && CONST_INT_P (op1)
5890 && aarch64_uimm12_shift (INTVAL (op1)))
5892 if (speed)
5893 /* SUB(S) (immediate). */
5894 *cost += extra_cost->alu.arith;
5895 return true;
5898 /* Look for SUB (extended register). */
5899 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5901 if (speed)
5902 *cost += extra_cost->alu.extend_arith;
5904 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5905 (enum rtx_code) GET_CODE (op1),
5906 0, speed);
5907 return true;
5910 rtx new_op1 = aarch64_strip_extend (op1);
5912 /* Cost this as an FMA-alike operation. */
5913 if ((GET_CODE (new_op1) == MULT
5914 || aarch64_shift_p (GET_CODE (new_op1)))
5915 && code != COMPARE)
5917 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5918 (enum rtx_code) code,
5919 speed);
5920 return true;
5923 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5925 if (speed)
5927 if (GET_MODE_CLASS (mode) == MODE_INT)
5928 /* SUB(S). */
5929 *cost += extra_cost->alu.arith;
5930 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5931 /* FSUB. */
5932 *cost += extra_cost->fp[mode == DFmode].addsub;
5934 return true;
5937 case PLUS:
5939 rtx new_op0;
5941 op0 = XEXP (x, 0);
5942 op1 = XEXP (x, 1);
5944 cost_plus:
5945 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5946 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5948 /* CSINC. */
5949 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5950 *cost += rtx_cost (op1, PLUS, 1, speed);
5951 return true;
5954 if (GET_MODE_CLASS (mode) == MODE_INT
5955 && CONST_INT_P (op1)
5956 && aarch64_uimm12_shift (INTVAL (op1)))
5958 *cost += rtx_cost (op0, PLUS, 0, speed);
5960 if (speed)
5961 /* ADD (immediate). */
5962 *cost += extra_cost->alu.arith;
5963 return true;
5966 *cost += rtx_cost (op1, PLUS, 1, speed);
5968 /* Look for ADD (extended register). */
5969 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5971 if (speed)
5972 *cost += extra_cost->alu.extend_arith;
5974 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5975 (enum rtx_code) GET_CODE (op0),
5976 0, speed);
5977 return true;
5980 /* Strip any extend, leave shifts behind as we will
5981 cost them through mult_cost. */
5982 new_op0 = aarch64_strip_extend (op0);
5984 if (GET_CODE (new_op0) == MULT
5985 || aarch64_shift_p (GET_CODE (new_op0)))
5987 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5988 speed);
5989 return true;
5992 *cost += rtx_cost (new_op0, PLUS, 0, speed);
5994 if (speed)
5996 if (GET_MODE_CLASS (mode) == MODE_INT)
5997 /* ADD. */
5998 *cost += extra_cost->alu.arith;
5999 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6000 /* FADD. */
6001 *cost += extra_cost->fp[mode == DFmode].addsub;
6003 return true;
6006 case BSWAP:
6007 *cost = COSTS_N_INSNS (1);
6009 if (speed)
6010 *cost += extra_cost->alu.rev;
6012 return false;
6014 case IOR:
6015 if (aarch_rev16_p (x))
6017 *cost = COSTS_N_INSNS (1);
6019 if (speed)
6020 *cost += extra_cost->alu.rev;
6022 return true;
6025 if (aarch64_extr_rtx_p (x, &op0, &op1))
6027 *cost += rtx_cost (op0, IOR, 0, speed)
6028 + rtx_cost (op1, IOR, 1, speed);
6029 if (speed)
6030 *cost += extra_cost->alu.shift;
6032 return true;
6034 /* Fall through. */
6035 case XOR:
6036 case AND:
6037 cost_logic:
6038 op0 = XEXP (x, 0);
6039 op1 = XEXP (x, 1);
6041 if (code == AND
6042 && GET_CODE (op0) == MULT
6043 && CONST_INT_P (XEXP (op0, 1))
6044 && CONST_INT_P (op1)
6045 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6046 INTVAL (op1)) != 0)
6048 /* This is a UBFM/SBFM. */
6049 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6050 if (speed)
6051 *cost += extra_cost->alu.bfx;
6052 return true;
6055 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6057 /* We possibly get the immediate for free, this is not
6058 modelled. */
6059 if (CONST_INT_P (op1)
6060 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6062 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6064 if (speed)
6065 *cost += extra_cost->alu.logical;
6067 return true;
6069 else
6071 rtx new_op0 = op0;
6073 /* Handle ORN, EON, or BIC. */
6074 if (GET_CODE (op0) == NOT)
6075 op0 = XEXP (op0, 0);
6077 new_op0 = aarch64_strip_shift (op0);
6079 /* If we had a shift on op0 then this is a logical-shift-
6080 by-register/immediate operation. Otherwise, this is just
6081 a logical operation. */
6082 if (speed)
6084 if (new_op0 != op0)
6086 /* Shift by immediate. */
6087 if (CONST_INT_P (XEXP (op0, 1)))
6088 *cost += extra_cost->alu.log_shift;
6089 else
6090 *cost += extra_cost->alu.log_shift_reg;
6092 else
6093 *cost += extra_cost->alu.logical;
6096 /* In both cases we want to cost both operands. */
6097 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6098 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6100 return true;
6103 return false;
6105 case NOT:
6106 x = XEXP (x, 0);
6107 op0 = aarch64_strip_shift (x);
6109 /* MVN-shifted-reg. */
6110 if (op0 != x)
6112 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6114 if (speed)
6115 *cost += extra_cost->alu.log_shift;
6117 return true;
6119 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6120 Handle the second form here taking care that 'a' in the above can
6121 be a shift. */
6122 else if (GET_CODE (op0) == XOR)
6124 rtx newop0 = XEXP (op0, 0);
6125 rtx newop1 = XEXP (op0, 1);
6126 rtx op0_stripped = aarch64_strip_shift (newop0);
6128 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6129 + rtx_cost (op0_stripped, XOR, 0, speed);
6131 if (speed)
6133 if (op0_stripped != newop0)
6134 *cost += extra_cost->alu.log_shift;
6135 else
6136 *cost += extra_cost->alu.logical;
6139 return true;
6141 /* MVN. */
6142 if (speed)
6143 *cost += extra_cost->alu.logical;
6145 return false;
6147 case ZERO_EXTEND:
6149 op0 = XEXP (x, 0);
6150 /* If a value is written in SI mode, then zero extended to DI
6151 mode, the operation will in general be free as a write to
6152 a 'w' register implicitly zeroes the upper bits of an 'x'
6153 register. However, if this is
6155 (set (reg) (zero_extend (reg)))
6157 we must cost the explicit register move. */
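/* e.g. (hypothetical RTL) (set (reg:DI) (zero_extend:DI (reg:SI))) still
   needs one register move, whose write to the 'w' register clears the
   upper half, so alu.extend is charged; a zero extend of a load, by
   contrast, is costed as just the load.  */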
6158 if (mode == DImode
6159 && GET_MODE (op0) == SImode
6160 && outer == SET)
6162 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6164 if (!op_cost && speed)
6165 /* MOV. */
6166 *cost += extra_cost->alu.extend;
6167 else
6168 /* Free, the cost is that of the SI mode operation. */
6169 *cost = op_cost;
6171 return true;
6173 else if (MEM_P (XEXP (x, 0)))
6175 /* All loads can zero extend to any size for free. */
6176 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6177 return true;
6180 /* UXTB/UXTH. */
6181 if (speed)
6182 *cost += extra_cost->alu.extend;
6184 return false;
6186 case SIGN_EXTEND:
6187 if (MEM_P (XEXP (x, 0)))
6189 /* LDRSH. */
6190 if (speed)
6192 rtx address = XEXP (XEXP (x, 0), 0);
6193 *cost += extra_cost->ldst.load_sign_extend;
6195 *cost +=
6196 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6197 0, speed));
6199 return true;
6202 if (speed)
6203 *cost += extra_cost->alu.extend;
6204 return false;
6206 case ASHIFT:
6207 op0 = XEXP (x, 0);
6208 op1 = XEXP (x, 1);
6210 if (CONST_INT_P (op1))
6212 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
6213 aliases. */
6214 if (speed)
6215 *cost += extra_cost->alu.shift;
6217 /* We can incorporate zero/sign extend for free. */
6218 if (GET_CODE (op0) == ZERO_EXTEND
6219 || GET_CODE (op0) == SIGN_EXTEND)
6220 op0 = XEXP (op0, 0);
6222 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6223 return true;
6225 else
6227 /* LSLV. */
6228 if (speed)
6229 *cost += extra_cost->alu.shift_reg;
6231 return false; /* All arguments need to be in registers. */
6234 case ROTATE:
6235 case ROTATERT:
6236 case LSHIFTRT:
6237 case ASHIFTRT:
6238 op0 = XEXP (x, 0);
6239 op1 = XEXP (x, 1);
6241 if (CONST_INT_P (op1))
6243 /* ASR (immediate) and friends. */
6244 if (speed)
6245 *cost += extra_cost->alu.shift;
6247 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6248 return true;
6250 else
6253 /* ASR (register) and friends. */
6254 if (speed)
6255 *cost += extra_cost->alu.shift_reg;
6257 return false; /* All arguments need to be in registers. */
6260 case SYMBOL_REF:
6262 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6264 /* LDR. */
6265 if (speed)
6266 *cost += extra_cost->ldst.load;
6268 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6269 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6271 /* ADRP, followed by ADD. */
6272 *cost += COSTS_N_INSNS (1);
6273 if (speed)
6274 *cost += 2 * extra_cost->alu.arith;
6276 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6277 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6279 /* ADR. */
6280 if (speed)
6281 *cost += extra_cost->alu.arith;
6284 if (flag_pic)
6286 /* One extra load instruction, after accessing the GOT. */
6287 *cost += COSTS_N_INSNS (1);
6288 if (speed)
6289 *cost += extra_cost->ldst.load;
6291 return true;
6293 case HIGH:
6294 case LO_SUM:
6295 /* ADRP/ADD (immediate). */
6296 if (speed)
6297 *cost += extra_cost->alu.arith;
6298 return true;
6300 case ZERO_EXTRACT:
6301 case SIGN_EXTRACT:
6302 /* UBFX/SBFX. */
6303 if (speed)
6304 *cost += extra_cost->alu.bfx;
6306 /* We can trust that the immediates used will be correct (there
6307 are no by-register forms), so we need only cost op0. */
6308 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6309 return true;
6311 case MULT:
6312 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6313 /* aarch64_rtx_mult_cost always handles recursion to its
6314 operands. */
6315 return true;
6317 case MOD:
6318 case UMOD:
6319 if (speed)
6321 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6322 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6323 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6324 else if (GET_MODE (x) == DFmode)
6325 *cost += (extra_cost->fp[1].mult
6326 + extra_cost->fp[1].div);
6327 else if (GET_MODE (x) == SFmode)
6328 *cost += (extra_cost->fp[0].mult
6329 + extra_cost->fp[0].div);
6331 return false; /* All arguments need to be in registers. */
6333 case DIV:
6334 case UDIV:
6335 case SQRT:
6336 if (speed)
6338 if (GET_MODE_CLASS (mode) == MODE_INT)
6339 /* There is no integer SQRT, so only DIV and UDIV can get
6340 here. */
6341 *cost += extra_cost->mult[mode == DImode].idiv;
6342 else
6343 *cost += extra_cost->fp[mode == DFmode].div;
6345 return false; /* All arguments need to be in registers. */
6347 case IF_THEN_ELSE:
6348 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6349 XEXP (x, 2), cost, speed);
6351 case EQ:
6352 case NE:
6353 case GT:
6354 case GTU:
6355 case LT:
6356 case LTU:
6357 case GE:
6358 case GEU:
6359 case LE:
6360 case LEU:
6362 return false; /* All arguments must be in registers. */
6364 case FMA:
6365 op0 = XEXP (x, 0);
6366 op1 = XEXP (x, 1);
6367 op2 = XEXP (x, 2);
6369 if (speed)
6370 *cost += extra_cost->fp[mode == DFmode].fma;
6372 /* FMSUB, FNMADD, and FNMSUB are free. */
6373 if (GET_CODE (op0) == NEG)
6374 op0 = XEXP (op0, 0);
6376 if (GET_CODE (op2) == NEG)
6377 op2 = XEXP (op2, 0);
6379 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6380 and the by-element operand as operand 0. */
6381 if (GET_CODE (op1) == NEG)
6382 op1 = XEXP (op1, 0);
6384 /* Catch vector-by-element operations. The by-element operand can
6385 either be (vec_duplicate (vec_select (x))) or just
6386 (vec_select (x)), depending on whether we are multiplying by
6387 a vector or a scalar.
6389 Canonicalization is not very good in these cases, FMA4 will put the
6390 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6391 if (GET_CODE (op0) == VEC_DUPLICATE)
6392 op0 = XEXP (op0, 0);
6393 else if (GET_CODE (op1) == VEC_DUPLICATE)
6394 op1 = XEXP (op1, 0);
6396 if (GET_CODE (op0) == VEC_SELECT)
6397 op0 = XEXP (op0, 0);
6398 else if (GET_CODE (op1) == VEC_SELECT)
6399 op1 = XEXP (op1, 0);
6401 /* If the remaining parameters are not registers,
6402 get the cost to put them into registers. */
6403 *cost += rtx_cost (op0, FMA, 0, speed);
6404 *cost += rtx_cost (op1, FMA, 1, speed);
6405 *cost += rtx_cost (op2, FMA, 2, speed);
6406 return true;
6408 case FLOAT_EXTEND:
6409 if (speed)
6410 *cost += extra_cost->fp[mode == DFmode].widen;
6411 return false;
6413 case FLOAT_TRUNCATE:
6414 if (speed)
6415 *cost += extra_cost->fp[mode == DFmode].narrow;
6416 return false;
6418 case FIX:
6419 case UNSIGNED_FIX:
6420 x = XEXP (x, 0);
6421 /* Strip the rounding part. They will all be implemented
6422 by the fcvt* family of instructions anyway. */
6423 if (GET_CODE (x) == UNSPEC)
6425 unsigned int uns_code = XINT (x, 1);
6427 if (uns_code == UNSPEC_FRINTA
6428 || uns_code == UNSPEC_FRINTM
6429 || uns_code == UNSPEC_FRINTN
6430 || uns_code == UNSPEC_FRINTP
6431 || uns_code == UNSPEC_FRINTZ)
6432 x = XVECEXP (x, 0, 0);
6435 if (speed)
6436 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6438 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6439 return true;
6441 case ABS:
6442 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6444 op0 = XEXP (x, 0);
6446 /* FABD, which is analogous to FADD. */
6447 if (GET_CODE (op0) == MINUS)
6449 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed);
6450 *cost += rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6451 if (speed)
6452 *cost += extra_cost->fp[mode == DFmode].addsub;
6454 return true;
6456 /* Simple FABS is analogous to FNEG. */
6457 if (speed)
6458 *cost += extra_cost->fp[mode == DFmode].neg;
6460 else
6462 /* Integer ABS will either be split to
6463 two arithmetic instructions, or will be an ABS
6464 (scalar), which we don't model. */
6465 *cost = COSTS_N_INSNS (2);
6466 if (speed)
6467 *cost += 2 * extra_cost->alu.arith;
6469 return false;
6471 case SMAX:
6472 case SMIN:
6473 if (speed)
6475 /* FMAXNM/FMINNM/FMAX/FMIN.
6476 TODO: This may not be accurate for all implementations, but
6477 we do not model this in the cost tables. */
6478 *cost += extra_cost->fp[mode == DFmode].addsub;
6480 return false;
6482 case UNSPEC:
6483 /* The floating point round to integer frint* instructions. */
6484 if (aarch64_frint_unspec_p (XINT (x, 1)))
6486 if (speed)
6487 *cost += extra_cost->fp[mode == DFmode].roundint;
6489 return false;
6492 if (XINT (x, 1) == UNSPEC_RBIT)
6494 if (speed)
6495 *cost += extra_cost->alu.rev;
6497 return false;
6499 break;
6501 case TRUNCATE:
6503 /* Decompose <su>muldi3_highpart. */
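/* That is, we are looking for the overall shape
     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))
   which the condition below checks piecewise.  */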
6504 if (/* (truncate:DI */
6505 mode == DImode
6506 /* (lshiftrt:TI */
6507 && GET_MODE (XEXP (x, 0)) == TImode
6508 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6509 /* (mult:TI */
6510 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6511 /* (ANY_EXTEND:TI (reg:DI))
6512 (ANY_EXTEND:TI (reg:DI))) */
6513 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6514 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6515 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6516 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6517 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6518 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6519 /* (const_int 64) */
6520 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6521 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6523 /* UMULH/SMULH. */
6524 if (speed)
6525 *cost += extra_cost->mult[mode == DImode].extend;
6526 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6527 MULT, 0, speed);
6528 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6529 MULT, 1, speed);
6530 return true;
6533 /* Fall through. */
6534 default:
6535 break;
6538 if (dump_file && (dump_flags & TDF_DETAILS))
6539 fprintf (dump_file,
6540 "\nFailed to cost RTX. Assuming default cost.\n");
6542 return true;
6545 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6546 calculated for X. This cost is stored in *COST. Returns true
6547 if the total cost of X was calculated. */
6548 static bool
6549 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6550 int param, int *cost, bool speed)
6552 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6554 if (dump_file && (dump_flags & TDF_DETAILS))
6556 print_rtl_single (dump_file, x);
6557 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6558 speed ? "Hot" : "Cold",
6559 *cost, result ? "final" : "partial");
6562 return result;
6565 static int
6566 aarch64_register_move_cost (machine_mode mode,
6567 reg_class_t from_i, reg_class_t to_i)
6569 enum reg_class from = (enum reg_class) from_i;
6570 enum reg_class to = (enum reg_class) to_i;
6571 const struct cpu_regmove_cost *regmove_cost
6572 = aarch64_tune_params->regmove_cost;
6574 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6575 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6576 to = GENERAL_REGS;
6578 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6579 from = GENERAL_REGS;
6581 /* Moving between GPR and stack cost is the same as GP2GP. */
6582 if ((from == GENERAL_REGS && to == STACK_REG)
6583 || (to == GENERAL_REGS && from == STACK_REG))
6584 return regmove_cost->GP2GP;
6586 /* To/From the stack register, we move via the gprs. */
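/* For example, moving a 64-bit value between FP_REGS and STACK_REG
   ends up costed as FP2GP + GP2GP.  */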
6587 if (to == STACK_REG || from == STACK_REG)
6588 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6589 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6591 if (GET_MODE_SIZE (mode) == 16)
6593 /* 128-bit operations on general registers require 2 instructions. */
6594 if (from == GENERAL_REGS && to == GENERAL_REGS)
6595 return regmove_cost->GP2GP * 2;
6596 else if (from == GENERAL_REGS)
6597 return regmove_cost->GP2FP * 2;
6598 else if (to == GENERAL_REGS)
6599 return regmove_cost->FP2GP * 2;
6601 /* When AdvSIMD instructions are disabled it is not possible to move
6602 a 128-bit value directly between Q registers. This is handled in
6603 secondary reload. A general register is used as a scratch to move
6604 the upper DI value and the lower DI value is moved directly,
6605 hence the cost is the sum of three moves. */
6606 if (! TARGET_SIMD)
6607 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6609 return regmove_cost->FP2FP;
6612 if (from == GENERAL_REGS && to == GENERAL_REGS)
6613 return regmove_cost->GP2GP;
6614 else if (from == GENERAL_REGS)
6615 return regmove_cost->GP2FP;
6616 else if (to == GENERAL_REGS)
6617 return regmove_cost->FP2GP;
6619 return regmove_cost->FP2FP;
6622 static int
6623 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6624 reg_class_t rclass ATTRIBUTE_UNUSED,
6625 bool in ATTRIBUTE_UNUSED)
6627 return aarch64_tune_params->memmov_cost;
6630 /* Return the number of instructions that can be issued per cycle. */
6631 static int
6632 aarch64_sched_issue_rate (void)
6634 return aarch64_tune_params->issue_rate;
6637 static int
6638 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6640 int issue_rate = aarch64_sched_issue_rate ();
6642 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6645 /* Vectorizer cost model target hooks. */
6647 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6648 static int
6649 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6650 tree vectype,
6651 int misalign ATTRIBUTE_UNUSED)
6653 unsigned elements;
6655 switch (type_of_cost)
6657 case scalar_stmt:
6658 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6660 case scalar_load:
6661 return aarch64_tune_params->vec_costs->scalar_load_cost;
6663 case scalar_store:
6664 return aarch64_tune_params->vec_costs->scalar_store_cost;
6666 case vector_stmt:
6667 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6669 case vector_load:
6670 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6672 case vector_store:
6673 return aarch64_tune_params->vec_costs->vec_store_cost;
6675 case vec_to_scalar:
6676 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6678 case scalar_to_vec:
6679 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6681 case unaligned_load:
6682 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6684 case unaligned_store:
6685 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6687 case cond_branch_taken:
6688 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6690 case cond_branch_not_taken:
6691 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6693 case vec_perm:
6694 case vec_promote_demote:
6695 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6697 case vec_construct:
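/* Roughly one operation per pair of elements, plus one; e.g. building
   a four-element vector is costed as 4/2 + 1 = 3.  */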
6698 elements = TYPE_VECTOR_SUBPARTS (vectype);
6699 return elements / 2 + 1;
6701 default:
6702 gcc_unreachable ();
6706 /* Implement targetm.vectorize.add_stmt_cost. */
6707 static unsigned
6708 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6709 struct _stmt_vec_info *stmt_info, int misalign,
6710 enum vect_cost_model_location where)
6712 unsigned *cost = (unsigned *) data;
6713 unsigned retval = 0;
6715 if (flag_vect_cost_model)
6717 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6718 int stmt_cost =
6719 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6721 /* Statements in an inner loop relative to the loop being
6722 vectorized are weighted more heavily. The value here is
6723 a function (linear for now) of the loop nest level. */
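/* For example, a statement whose containing loop sits at depth 2 in
   the loop tree contributes twice its base cost.  */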
6724 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6726 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6727 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6728 unsigned nest_level = loop_depth (loop);
6730 count *= nest_level;
6733 retval = (unsigned) (count * stmt_cost);
6734 cost[where] += retval;
6737 return retval;
6740 static void initialize_aarch64_code_model (void);
6742 /* Parse the architecture extension string. */
6744 static void
6745 aarch64_parse_extension (char *str)
6747 /* The extension string is parsed left to right. */
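/* Each extension is introduced by '+'; a leading "no" on the name
   removes the feature instead of adding it, i.e. "+<ext>" versus
   "+no<ext>".  */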
6748 const struct aarch64_option_extension *opt = NULL;
6750 /* Flag to say whether we are adding or removing an extension. */
6751 int adding_ext = -1;
6753 while (str != NULL && *str != 0)
6755 char *ext;
6756 size_t len;
6758 str++;
6759 ext = strchr (str, '+');
6761 if (ext != NULL)
6762 len = ext - str;
6763 else
6764 len = strlen (str);
6766 if (len >= 2 && strncmp (str, "no", 2) == 0)
6768 adding_ext = 0;
6769 len -= 2;
6770 str += 2;
6772 else if (len > 0)
6773 adding_ext = 1;
6775 if (len == 0)
6777 error ("missing feature modifier after %qs", adding_ext ? "+"
6778 : "+no");
6779 return;
6782 /* Scan over the extensions table trying to find an exact match. */
6783 for (opt = all_extensions; opt->name != NULL; opt++)
6785 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6787 /* Add or remove the extension. */
6788 if (adding_ext)
6789 aarch64_isa_flags |= opt->flags_on;
6790 else
6791 aarch64_isa_flags &= ~(opt->flags_off);
6792 break;
6796 if (opt->name == NULL)
6798 /* Extension not found in list. */
6799 error ("unknown feature modifier %qs", str);
6800 return;
6803 str = ext;
6806 return;
6809 /* Parse the ARCH string. */
6811 static void
6812 aarch64_parse_arch (void)
6814 char *ext;
6815 const struct processor *arch;
6816 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6817 size_t len;
6819 strcpy (str, aarch64_arch_string);
6821 ext = strchr (str, '+');
6823 if (ext != NULL)
6824 len = ext - str;
6825 else
6826 len = strlen (str);
6828 if (len == 0)
6830 error ("missing arch name in -march=%qs", str);
6831 return;
6834 /* Loop through the list of supported ARCHs to find a match. */
6835 for (arch = all_architectures; arch->name != NULL; arch++)
6837 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6839 selected_arch = arch;
6840 aarch64_isa_flags = selected_arch->flags;
6842 if (!selected_cpu)
6843 selected_cpu = &all_cores[selected_arch->core];
6845 if (ext != NULL)
6847 /* ARCH string contains at least one extension. */
6848 aarch64_parse_extension (ext);
6851 if (strcmp (selected_arch->arch, selected_cpu->arch))
6853 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6854 selected_cpu->name, selected_arch->name);
6857 return;
6861 /* ARCH name not found in list. */
6862 error ("unknown value %qs for -march", str);
6863 return;
6866 /* Parse the CPU string. */
6868 static void
6869 aarch64_parse_cpu (void)
6871 char *ext;
6872 const struct processor *cpu;
6873 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6874 size_t len;
6876 strcpy (str, aarch64_cpu_string);
6878 ext = strchr (str, '+');
6880 if (ext != NULL)
6881 len = ext - str;
6882 else
6883 len = strlen (str);
6885 if (len == 0)
6887 error ("missing cpu name in -mcpu=%qs", str);
6888 return;
6891 /* Loop through the list of supported CPUs to find a match. */
6892 for (cpu = all_cores; cpu->name != NULL; cpu++)
6894 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6896 selected_cpu = cpu;
6897 aarch64_isa_flags = selected_cpu->flags;
6899 if (ext != NULL)
6901 /* CPU string contains at least one extension. */
6902 aarch64_parse_extension (ext);
6905 return;
6909 /* CPU name not found in list. */
6910 error ("unknown value %qs for -mcpu", str);
6911 return;
6914 /* Parse the TUNE string. */
6916 static void
6917 aarch64_parse_tune (void)
6919 const struct processor *cpu;
6920 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6921 strcpy (str, aarch64_tune_string);
6923 /* Loop through the list of supported CPUs to find a match. */
6924 for (cpu = all_cores; cpu->name != NULL; cpu++)
6926 if (strcmp (cpu->name, str) == 0)
6928 selected_tune = cpu;
6929 return;
6933 /* CPU name not found in list. */
6934 error ("unknown value %qs for -mtune", str);
6935 return;
6939 /* Implement TARGET_OPTION_OVERRIDE. */
6941 static void
6942 aarch64_override_options (void)
6944 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6945 If either of -march or -mtune is given, they override their
6946 respective component of -mcpu.
6948 So, first parse AARCH64_CPU_STRING, then the others. Be careful
6949 with -march: if -mcpu is not present on the command line, -march
6950 must set a sensible default CPU. */
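/* For example, -mcpu=cortex-a57 selects both the Cortex-A57's
   architecture and its tuning, while an explicit -march or -mtune
   overrides the corresponding part.  */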
6951 if (aarch64_cpu_string)
6953 aarch64_parse_cpu ();
6956 if (aarch64_arch_string)
6958 aarch64_parse_arch ();
6961 if (aarch64_tune_string)
6963 aarch64_parse_tune ();
6966 #ifndef HAVE_AS_MABI_OPTION
6967 /* The compiler may have been configured with 2.23.* binutils, which does
6968 not have support for ILP32. */
6969 if (TARGET_ILP32)
6970 error ("Assembler does not support -mabi=ilp32");
6971 #endif
6973 initialize_aarch64_code_model ();
6975 aarch64_build_bitmask_table ();
6977 /* This target defaults to strict volatile bitfields. */
6978 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6979 flag_strict_volatile_bitfields = 1;
6981 /* If the user did not specify a processor, choose the default
6982 one for them. This will be the CPU set during configuration using
6983 --with-cpu, otherwise it is "generic". */
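/* TARGET_CPU_DEFAULT packs the index of the default core into its low
   six bits and the default ISA flags into the bits above, hence the
   mask and shift below.  */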
6984 if (!selected_cpu)
6986 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6987 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6990 gcc_assert (selected_cpu);
6992 if (!selected_tune)
6993 selected_tune = selected_cpu;
6995 aarch64_tune_flags = selected_tune->flags;
6996 aarch64_tune = selected_tune->core;
6997 aarch64_tune_params = selected_tune->tune;
6998 aarch64_architecture_version = selected_cpu->architecture_version;
7000 if (aarch64_fix_a53_err835769 == 2)
7002 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7003 aarch64_fix_a53_err835769 = 1;
7004 #else
7005 aarch64_fix_a53_err835769 = 0;
7006 #endif
7009 /* If not optimizing for size, set the default
7010 alignment to what the target wants. */
7011 if (!optimize_size)
7013 if (align_loops <= 0)
7014 align_loops = aarch64_tune_params->loop_align;
7015 if (align_jumps <= 0)
7016 align_jumps = aarch64_tune_params->jump_align;
7017 if (align_functions <= 0)
7018 align_functions = aarch64_tune_params->function_align;
7021 if (AARCH64_TUNE_FMA_STEERING)
7022 aarch64_register_fma_steering ();
7024 aarch64_override_options_after_change ();
7027 /* Implement targetm.override_options_after_change. */
7029 static void
7030 aarch64_override_options_after_change (void)
7032 if (flag_omit_frame_pointer)
7033 flag_omit_leaf_frame_pointer = false;
7034 else if (flag_omit_leaf_frame_pointer)
7035 flag_omit_frame_pointer = true;
7038 static struct machine_function *
7039 aarch64_init_machine_status (void)
7041 struct machine_function *machine;
7042 machine = ggc_cleared_alloc<machine_function> ();
7043 return machine;
7046 void
7047 aarch64_init_expanders (void)
7049 init_machine_status = aarch64_init_machine_status;
7052 /* A checking mechanism for the implementation of the various code models. */
7053 static void
7054 initialize_aarch64_code_model (void)
7056 if (flag_pic)
7058 switch (aarch64_cmodel_var)
7060 case AARCH64_CMODEL_TINY:
7061 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7062 break;
7063 case AARCH64_CMODEL_SMALL:
7064 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7065 break;
7066 case AARCH64_CMODEL_LARGE:
7067 sorry ("code model %qs with -f%s", "large",
7068 flag_pic > 1 ? "PIC" : "pic");
7069 default:
7070 gcc_unreachable ();
7073 else
7074 aarch64_cmodel = aarch64_cmodel_var;
7077 /* Return true if SYMBOL_REF X binds locally. */
7079 static bool
7080 aarch64_symbol_binds_local_p (const_rtx x)
7082 return (SYMBOL_REF_DECL (x)
7083 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7084 : SYMBOL_REF_LOCAL_P (x));
7087 /* Return true if SYMBOL_REF X is thread local */
7088 static bool
7089 aarch64_tls_symbol_p (rtx x)
7091 if (! TARGET_HAVE_TLS)
7092 return false;
7094 if (GET_CODE (x) != SYMBOL_REF)
7095 return false;
7097 return SYMBOL_REF_TLS_MODEL (x) != 0;
7100 /* Classify a TLS symbol into one of the TLS kinds. */
7101 enum aarch64_symbol_type
7102 aarch64_classify_tls_symbol (rtx x)
7104 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7106 switch (tls_kind)
7108 case TLS_MODEL_GLOBAL_DYNAMIC:
7109 case TLS_MODEL_LOCAL_DYNAMIC:
7110 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7112 case TLS_MODEL_INITIAL_EXEC:
7113 return SYMBOL_SMALL_GOTTPREL;
7115 case TLS_MODEL_LOCAL_EXEC:
7116 return SYMBOL_SMALL_TPREL;
7118 case TLS_MODEL_EMULATED:
7119 case TLS_MODEL_NONE:
7120 return SYMBOL_FORCE_TO_MEM;
7122 default:
7123 gcc_unreachable ();
7127 /* Return the method that should be used to access SYMBOL_REF or
7128 LABEL_REF X in context CONTEXT. */
7130 enum aarch64_symbol_type
7131 aarch64_classify_symbol (rtx x, rtx offset,
7132 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7134 if (GET_CODE (x) == LABEL_REF)
7136 switch (aarch64_cmodel)
7138 case AARCH64_CMODEL_LARGE:
7139 return SYMBOL_FORCE_TO_MEM;
7141 case AARCH64_CMODEL_TINY_PIC:
7142 case AARCH64_CMODEL_TINY:
7143 return SYMBOL_TINY_ABSOLUTE;
7145 case AARCH64_CMODEL_SMALL_PIC:
7146 case AARCH64_CMODEL_SMALL:
7147 return SYMBOL_SMALL_ABSOLUTE;
7149 default:
7150 gcc_unreachable ();
7154 if (GET_CODE (x) == SYMBOL_REF)
7156 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7157 return SYMBOL_FORCE_TO_MEM;
7159 if (aarch64_tls_symbol_p (x))
7160 return aarch64_classify_tls_symbol (x);
7162 switch (aarch64_cmodel)
7164 case AARCH64_CMODEL_TINY:
7165 /* When we retrieve a symbol + offset address, we have to make sure
7166 the offset does not cause overflow of the final address. But
7167 we have no way of knowing the address of symbol at compile time
7168 so we can't accurately say if the distance between the PC and
7169 symbol + offset is outside the addressable range of +/-1M in the
7170 TINY code model. So we rely on images not being greater than
7171 1M, cap the offset at 1M, and anything beyond 1M will have to
7172 be loaded using an alternative mechanism. */
7173 if (SYMBOL_REF_WEAK (x)
7174 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7175 return SYMBOL_FORCE_TO_MEM;
7176 return SYMBOL_TINY_ABSOLUTE;
7178 case AARCH64_CMODEL_SMALL:
7179 /* Same reasoning as the tiny code model, but the offset cap here is
7180 4G. */
7181 if (SYMBOL_REF_WEAK (x)
7182 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7183 HOST_WIDE_INT_C (4294967264)))
7184 return SYMBOL_FORCE_TO_MEM;
7185 return SYMBOL_SMALL_ABSOLUTE;
7187 case AARCH64_CMODEL_TINY_PIC:
7188 if (!aarch64_symbol_binds_local_p (x))
7189 return SYMBOL_TINY_GOT;
7190 return SYMBOL_TINY_ABSOLUTE;
7192 case AARCH64_CMODEL_SMALL_PIC:
7193 if (!aarch64_symbol_binds_local_p (x))
7194 return SYMBOL_SMALL_GOT;
7195 return SYMBOL_SMALL_ABSOLUTE;
7197 default:
7198 gcc_unreachable ();
7202 /* By default push everything into the constant pool. */
7203 return SYMBOL_FORCE_TO_MEM;
7206 bool
7207 aarch64_constant_address_p (rtx x)
7209 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7212 bool
7213 aarch64_legitimate_pic_operand_p (rtx x)
7215 if (GET_CODE (x) == SYMBOL_REF
7216 || (GET_CODE (x) == CONST
7217 && GET_CODE (XEXP (x, 0)) == PLUS
7218 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7219 return false;
7221 return true;
7224 /* Return true if X holds either a quarter-precision or
7225 floating-point +0.0 constant. */
7226 static bool
7227 aarch64_valid_floating_const (machine_mode mode, rtx x)
7229 if (!CONST_DOUBLE_P (x))
7230 return false;
7232 /* TODO: We could handle moving 0.0 to a TFmode register,
7233 but first we would like to refactor the movtf_aarch64
7234 to be more amenable to splitting moves properly and
7235 correctly gating on TARGET_SIMD. For now, reject all
7236 constants which are not destined for SFmode or DFmode registers. */
7237 if (!(mode == SFmode || mode == DFmode))
7238 return false;
7240 if (aarch64_float_const_zero_rtx_p (x))
7241 return true;
7242 return aarch64_float_const_representable_p (x);
7245 static bool
7246 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7248 /* Do not allow vector struct mode constants. We could support
7249 0 and -1 easily, but they need support in aarch64-simd.md. */
7250 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7251 return false;
7253 /* This could probably go away because
7254 we now decompose CONST_INTs according to expand_mov_immediate. */
7255 if ((GET_CODE (x) == CONST_VECTOR
7256 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7257 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7258 return !targetm.cannot_force_const_mem (mode, x);
7260 if (GET_CODE (x) == HIGH
7261 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7262 return true;
7264 return aarch64_constant_address_p (x);
7268 aarch64_load_tp (rtx target)
7270 if (!target
7271 || GET_MODE (target) != Pmode
7272 || !register_operand (target, Pmode))
7273 target = gen_reg_rtx (Pmode);
7275 /* Can return in any reg. */
7276 emit_insn (gen_aarch64_load_tp_hard (target));
7277 return target;
7280 /* On AAPCS systems, this is the "struct __va_list". */
7281 static GTY(()) tree va_list_type;
7283 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7284 Return the type to use as __builtin_va_list.
7286 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7288 struct __va_list
7290 void *__stack;
7291 void *__gr_top;
7292 void *__vr_top;
7293 int __gr_offs;
7294 int __vr_offs;
7295 }; */
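/* __stack points at the next varargs argument passed on the stack;
   __gr_top and __vr_top point at the top of the GP and FP/SIMD
   register save areas; __gr_offs and __vr_offs hold the (negative)
   offsets from those tops to the next saved register argument (see
   aarch64_expand_builtin_va_start below).  */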
7297 static tree
7298 aarch64_build_builtin_va_list (void)
7300 tree va_list_name;
7301 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7303 /* Create the type. */
7304 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7305 /* Give it the required name. */
7306 va_list_name = build_decl (BUILTINS_LOCATION,
7307 TYPE_DECL,
7308 get_identifier ("__va_list"),
7309 va_list_type);
7310 DECL_ARTIFICIAL (va_list_name) = 1;
7311 TYPE_NAME (va_list_type) = va_list_name;
7312 TYPE_STUB_DECL (va_list_type) = va_list_name;
7314 /* Create the fields. */
7315 f_stack = build_decl (BUILTINS_LOCATION,
7316 FIELD_DECL, get_identifier ("__stack"),
7317 ptr_type_node);
7318 f_grtop = build_decl (BUILTINS_LOCATION,
7319 FIELD_DECL, get_identifier ("__gr_top"),
7320 ptr_type_node);
7321 f_vrtop = build_decl (BUILTINS_LOCATION,
7322 FIELD_DECL, get_identifier ("__vr_top"),
7323 ptr_type_node);
7324 f_groff = build_decl (BUILTINS_LOCATION,
7325 FIELD_DECL, get_identifier ("__gr_offs"),
7326 integer_type_node);
7327 f_vroff = build_decl (BUILTINS_LOCATION,
7328 FIELD_DECL, get_identifier ("__vr_offs"),
7329 integer_type_node);
7331 DECL_ARTIFICIAL (f_stack) = 1;
7332 DECL_ARTIFICIAL (f_grtop) = 1;
7333 DECL_ARTIFICIAL (f_vrtop) = 1;
7334 DECL_ARTIFICIAL (f_groff) = 1;
7335 DECL_ARTIFICIAL (f_vroff) = 1;
7337 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7338 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7339 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7340 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7341 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7343 TYPE_FIELDS (va_list_type) = f_stack;
7344 DECL_CHAIN (f_stack) = f_grtop;
7345 DECL_CHAIN (f_grtop) = f_vrtop;
7346 DECL_CHAIN (f_vrtop) = f_groff;
7347 DECL_CHAIN (f_groff) = f_vroff;
7349 /* Compute its layout. */
7350 layout_type (va_list_type);
7352 return va_list_type;
7355 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7356 static void
7357 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7359 const CUMULATIVE_ARGS *cum;
7360 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7361 tree stack, grtop, vrtop, groff, vroff;
7362 tree t;
7363 int gr_save_area_size;
7364 int vr_save_area_size;
7365 int vr_offset;
7367 cum = &crtl->args.info;
7368 gr_save_area_size
7369 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7370 vr_save_area_size
7371 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7373 if (TARGET_GENERAL_REGS_ONLY)
7375 if (cum->aapcs_nvrn > 0)
7376 sorry ("%qs and floating point or vector arguments",
7377 "-mgeneral-regs-only");
7378 vr_save_area_size = 0;
7381 f_stack = TYPE_FIELDS (va_list_type_node);
7382 f_grtop = DECL_CHAIN (f_stack);
7383 f_vrtop = DECL_CHAIN (f_grtop);
7384 f_groff = DECL_CHAIN (f_vrtop);
7385 f_vroff = DECL_CHAIN (f_groff);
7387 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7388 NULL_TREE);
7389 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7390 NULL_TREE);
7391 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7392 NULL_TREE);
7393 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7394 NULL_TREE);
7395 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7396 NULL_TREE);
7398 /* Emit code to initialize STACK, which points to the next varargs stack
7399 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7400 by named arguments. STACK is 8-byte aligned. */
7401 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7402 if (cum->aapcs_stack_size > 0)
7403 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7404 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7405 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7407 /* Emit code to initialize GRTOP, the top of the GR save area.
7408 virtual_incoming_args_rtx should have been 16 byte aligned. */
7409 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7410 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7411 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7413 /* Emit code to initialize VRTOP, the top of the VR save area.
7414 This address is gr_save_area_bytes below GRTOP, rounded
7415 down to the next 16-byte boundary. */
7416 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7417 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7418 STACK_BOUNDARY / BITS_PER_UNIT);
7420 if (vr_offset)
7421 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7422 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7423 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7425 /* Emit code to initialize GROFF, the offset from GRTOP of the
7426 next GPR argument. */
7427 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7428 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7429 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7431 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7432 of the next VR argument. */
7433 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7434 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7435 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7438 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7440 static tree
7441 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7442 gimple_seq *post_p ATTRIBUTE_UNUSED)
7444 tree addr;
7445 bool indirect_p;
7446 bool is_ha; /* is HFA or HVA. */
7447 bool dw_align; /* double-word align. */
7448 machine_mode ag_mode = VOIDmode;
7449 int nregs;
7450 machine_mode mode;
7452 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7453 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7454 HOST_WIDE_INT size, rsize, adjust, align;
7455 tree t, u, cond1, cond2;
7457 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7458 if (indirect_p)
7459 type = build_pointer_type (type);
7461 mode = TYPE_MODE (type);
7463 f_stack = TYPE_FIELDS (va_list_type_node);
7464 f_grtop = DECL_CHAIN (f_stack);
7465 f_vrtop = DECL_CHAIN (f_grtop);
7466 f_groff = DECL_CHAIN (f_vrtop);
7467 f_vroff = DECL_CHAIN (f_groff);
7469 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7470 f_stack, NULL_TREE);
7471 size = int_size_in_bytes (type);
7472 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7474 dw_align = false;
7475 adjust = 0;
7476 if (aarch64_vfp_is_call_or_return_candidate (mode,
7477 type,
7478 &ag_mode,
7479 &nregs,
7480 &is_ha))
7482 /* TYPE passed in fp/simd registers. */
7483 if (TARGET_GENERAL_REGS_ONLY)
7484 sorry ("%qs and floating point or vector arguments",
7485 "-mgeneral-regs-only");
7487 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7488 unshare_expr (valist), f_vrtop, NULL_TREE);
7489 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7490 unshare_expr (valist), f_vroff, NULL_TREE);
7492 rsize = nregs * UNITS_PER_VREG;
7494 if (is_ha)
7496 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7497 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7499 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7500 && size < UNITS_PER_VREG)
7502 adjust = UNITS_PER_VREG - size;
7505 else
7507 /* TYPE passed in general registers. */
7508 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7509 unshare_expr (valist), f_grtop, NULL_TREE);
7510 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7511 unshare_expr (valist), f_groff, NULL_TREE);
7512 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7513 nregs = rsize / UNITS_PER_WORD;
7515 if (align > 8)
7516 dw_align = true;
7518 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7519 && size < UNITS_PER_WORD)
7521 adjust = UNITS_PER_WORD - size;
7525 /* Get a local temporary for the field value. */
7526 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7528 /* Emit code to branch if off >= 0. */
7529 t = build2 (GE_EXPR, boolean_type_node, off,
7530 build_int_cst (TREE_TYPE (off), 0));
7531 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7533 if (dw_align)
7535 /* Emit: offs = (offs + 15) & -16. */
7536 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7537 build_int_cst (TREE_TYPE (off), 15));
7538 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7539 build_int_cst (TREE_TYPE (off), -16));
7540 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7542 else
7543 roundup = NULL;
7545 /* Update ap.__[g|v]r_offs */
7546 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7547 build_int_cst (TREE_TYPE (off), rsize));
7548 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7550 /* String up. */
7551 if (roundup)
7552 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7554 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7555 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7556 build_int_cst (TREE_TYPE (f_off), 0));
7557 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7559 /* String up: make sure the assignment happens before the use. */
7560 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7561 COND_EXPR_ELSE (cond1) = t;
7563 /* Prepare the trees handling the argument that is passed on the stack;
7564 the top level node will store in ON_STACK. */
7565 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7566 if (align > 8)
7568 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7569 t = fold_convert (intDI_type_node, arg);
7570 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7571 build_int_cst (TREE_TYPE (t), 15));
7572 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7573 build_int_cst (TREE_TYPE (t), -16));
7574 t = fold_convert (TREE_TYPE (arg), t);
7575 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7577 else
7578 roundup = NULL;
7579 /* Advance ap.__stack */
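/* The size is rounded up to a multiple of 8 so that __stack stays
   8-byte aligned, as assumed by the va_start code above.  */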
7580 t = fold_convert (intDI_type_node, arg);
7581 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7582 build_int_cst (TREE_TYPE (t), size + 7));
7583 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7584 build_int_cst (TREE_TYPE (t), -8));
7585 t = fold_convert (TREE_TYPE (arg), t);
7586 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7587 /* String up roundup and advance. */
7588 if (roundup)
7589 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7590 /* String up with arg */
7591 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7592 /* Big-endianness related address adjustment. */
7593 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7594 && size < UNITS_PER_WORD)
7596 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7597 size_int (UNITS_PER_WORD - size));
7598 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7601 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7602 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7604 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7605 t = off;
7606 if (adjust)
7607 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7608 build_int_cst (TREE_TYPE (off), adjust));
7610 t = fold_convert (sizetype, t);
7611 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7613 if (is_ha)
7615 /* type ha; // treat as "struct {ftype field[n];}"
7616 ... [computing offs]
7617 for (i = 0; i <nregs; ++i, offs += 16)
7618 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7619 return ha; */
7620 int i;
7621 tree tmp_ha, field_t, field_ptr_t;
7623 /* Declare a local variable. */
7624 tmp_ha = create_tmp_var_raw (type, "ha");
7625 gimple_add_tmp_var (tmp_ha);
7627 /* Establish the base type. */
7628 switch (ag_mode)
7630 case SFmode:
7631 field_t = float_type_node;
7632 field_ptr_t = float_ptr_type_node;
7633 break;
7634 case DFmode:
7635 field_t = double_type_node;
7636 field_ptr_t = double_ptr_type_node;
7637 break;
7638 case TFmode:
7639 field_t = long_double_type_node;
7640 field_ptr_t = long_double_ptr_type_node;
7641 break;
7642 /* The half precision and quad precision are not fully supported yet. Enable
7643 the following code after the support is complete. Need to find the correct
7644 type node for __fp16 *. */
7645 #if 0
7646 case HFmode:
7647 field_t = float_type_node;
7648 field_ptr_t = float_ptr_type_node;
7649 break;
7650 #endif
7651 case V2SImode:
7652 case V4SImode:
7654 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7655 field_t = build_vector_type_for_mode (innertype, ag_mode);
7656 field_ptr_t = build_pointer_type (field_t);
7658 break;
7659 default:
7660 gcc_assert (0);
7663 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
7664 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7665 addr = t;
7666 t = fold_convert (field_ptr_t, addr);
7667 t = build2 (MODIFY_EXPR, field_t,
7668 build1 (INDIRECT_REF, field_t, tmp_ha),
7669 build1 (INDIRECT_REF, field_t, t));
7671 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7672 for (i = 1; i < nregs; ++i)
7674 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7675 u = fold_convert (field_ptr_t, addr);
7676 u = build2 (MODIFY_EXPR, field_t,
7677 build2 (MEM_REF, field_t, tmp_ha,
7678 build_int_cst (field_ptr_t,
7679 (i *
7680 int_size_in_bytes (field_t)))),
7681 build1 (INDIRECT_REF, field_t, u));
7682 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7685 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7686 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7689 COND_EXPR_ELSE (cond2) = t;
7690 addr = fold_convert (build_pointer_type (type), cond1);
7691 addr = build_va_arg_indirect_ref (addr);
7693 if (indirect_p)
7694 addr = build_va_arg_indirect_ref (addr);
7696 return addr;
7699 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7701 static void
7702 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7703 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7704 int no_rtl)
7706 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7707 CUMULATIVE_ARGS local_cum;
7708 int gr_saved, vr_saved;
7710 /* The caller has advanced CUM up to, but not beyond, the last named
7711 argument. Advance a local copy of CUM past the last "real" named
7712 argument, to find out how many registers are left over. */
7713 local_cum = *cum;
7714 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
7716 /* Find out how many registers we need to save. */
7717 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7718 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7720 if (TARGET_GENERAL_REGS_ONLY)
7722 if (local_cum.aapcs_nvrn > 0)
7723 sorry ("%qs and floating point or vector arguments",
7724 "-mgeneral-regs-only");
7725 vr_saved = 0;
7728 if (!no_rtl)
7730 if (gr_saved > 0)
7732 rtx ptr, mem;
7734 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7735 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7736 - gr_saved * UNITS_PER_WORD);
7737 mem = gen_frame_mem (BLKmode, ptr);
7738 set_mem_alias_set (mem, get_varargs_alias_set ());
7740 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7741 mem, gr_saved);
7743 if (vr_saved > 0)
7745 /* We can't use move_block_from_reg, because it will use
7746 the wrong mode, storing D regs only. */
7747 machine_mode mode = TImode;
7748 int off, i;
7750 /* Set OFF to the offset from virtual_incoming_args_rtx of
7751 the first vector register. The VR save area lies below
7752 the GR one, and is aligned to 16 bytes. */
7753 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7754 STACK_BOUNDARY / BITS_PER_UNIT);
7755 off -= vr_saved * UNITS_PER_VREG;
7757 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7759 rtx ptr, mem;
7761 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7762 mem = gen_frame_mem (mode, ptr);
7763 set_mem_alias_set (mem, get_varargs_alias_set ());
7764 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7765 off += UNITS_PER_VREG;
7770 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7771 any complication of having crtl->args.pretend_args_size changed. */
7772 cfun->machine->frame.saved_varargs_size
7773 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7774 STACK_BOUNDARY / BITS_PER_UNIT)
7775 + vr_saved * UNITS_PER_VREG);
7778 static void
7779 aarch64_conditional_register_usage (void)
7781 int i;
7782 if (!TARGET_FLOAT)
7784 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7786 fixed_regs[i] = 1;
7787 call_used_regs[i] = 1;
7792 /* Walk down the type tree of TYPE counting consecutive base elements.
7793 If *MODEP is VOIDmode, then set it to the first valid floating point
7794 type. If a non-floating point type is found, or if a floating point
7795 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7796 otherwise return the count in the sub-tree. */
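/* For example, struct { double x, y; } gives a count of 2 with *MODEP
   set to DFmode, whereas mixing element types, as in
   struct { float f; double d; }, returns -1.  */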
7797 static int
7798 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7800 machine_mode mode;
7801 HOST_WIDE_INT size;
7803 switch (TREE_CODE (type))
7805 case REAL_TYPE:
7806 mode = TYPE_MODE (type);
7807 if (mode != DFmode && mode != SFmode && mode != TFmode)
7808 return -1;
7810 if (*modep == VOIDmode)
7811 *modep = mode;
7813 if (*modep == mode)
7814 return 1;
7816 break;
7818 case COMPLEX_TYPE:
7819 mode = TYPE_MODE (TREE_TYPE (type));
7820 if (mode != DFmode && mode != SFmode && mode != TFmode)
7821 return -1;
7823 if (*modep == VOIDmode)
7824 *modep = mode;
7826 if (*modep == mode)
7827 return 2;
7829 break;
7831 case VECTOR_TYPE:
7832 /* Use V2SImode and V4SImode as representatives of all 64-bit
7833 and 128-bit vector types. */
7834 size = int_size_in_bytes (type);
7835 switch (size)
7837 case 8:
7838 mode = V2SImode;
7839 break;
7840 case 16:
7841 mode = V4SImode;
7842 break;
7843 default:
7844 return -1;
7847 if (*modep == VOIDmode)
7848 *modep = mode;
7850 /* Vector modes are considered to be opaque: two vectors are
7851 equivalent for the purposes of being homogeneous aggregates
7852 if they are the same size. */
7853 if (*modep == mode)
7854 return 1;
7856 break;
7858 case ARRAY_TYPE:
7860 int count;
7861 tree index = TYPE_DOMAIN (type);
7863 /* Can't handle incomplete types nor sizes that are not
7864 fixed. */
7865 if (!COMPLETE_TYPE_P (type)
7866 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7867 return -1;
7869 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7870 if (count == -1
7871 || !index
7872 || !TYPE_MAX_VALUE (index)
7873 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7874 || !TYPE_MIN_VALUE (index)
7875 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7876 || count < 0)
7877 return -1;
7879 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7880 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7882 /* There must be no padding. */
7883 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7884 return -1;
7886 return count;
7889 case RECORD_TYPE:
7891 int count = 0;
7892 int sub_count;
7893 tree field;
7895 /* Can't handle incomplete types nor sizes that are not
7896 fixed. */
7897 if (!COMPLETE_TYPE_P (type)
7898 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7899 return -1;
7901 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7903 if (TREE_CODE (field) != FIELD_DECL)
7904 continue;
7906 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7907 if (sub_count < 0)
7908 return -1;
7909 count += sub_count;
7912 /* There must be no padding. */
7913 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7914 return -1;
7916 return count;
7919 case UNION_TYPE:
7920 case QUAL_UNION_TYPE:
7922 /* These aren't very interesting except in a degenerate case. */
7923 int count = 0;
7924 int sub_count;
7925 tree field;
7927 /* Can't handle incomplete types nor sizes that are not
7928 fixed. */
7929 if (!COMPLETE_TYPE_P (type)
7930 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7931 return -1;
7933 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7935 if (TREE_CODE (field) != FIELD_DECL)
7936 continue;
7938 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7939 if (sub_count < 0)
7940 return -1;
7941 count = count > sub_count ? count : sub_count;
7944 /* There must be no padding. */
7945 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7946 return -1;
7948 return count;
7951 default:
7952 break;
7955 return -1;
7958 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7959 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7960 array types. The C99 floating-point complex types are also considered
7961 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7962 types, which are GCC extensions and out of the scope of AAPCS64, are
7963 treated as composite types here as well.
7965 Note that MODE itself is not sufficient in determining whether a type
7966 is such a composite type or not. This is because
7967 stor-layout.c:compute_record_mode may have already changed the MODE
7968 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7969 structure with only one field may have its MODE set to the mode of the
7970 field. Also an integer mode whose size matches the size of the
7971 RECORD_TYPE type may be used to substitute the original mode
7972 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7973 solely relied on. */
7975 static bool
7976 aarch64_composite_type_p (const_tree type,
7977 machine_mode mode)
7979 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7980 return true;
7982 if (mode == BLKmode
7983 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7984 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7985 return true;
7987 return false;
7990 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7991 type as described in AAPCS64 \S 4.1.2.
7993 See the comment above aarch64_composite_type_p for the notes on MODE. */
7995 static bool
7996 aarch64_short_vector_p (const_tree type,
7997 machine_mode mode)
7999 HOST_WIDE_INT size = -1;
8001 if (type && TREE_CODE (type) == VECTOR_TYPE)
8002 size = int_size_in_bytes (type);
8003 else if (!aarch64_composite_type_p (type, mode)
8004 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8005 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
8006 size = GET_MODE_SIZE (mode);
8008 return (size == 8 || size == 16) ? true : false;
8011 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8012 shall be passed or returned in simd/fp register(s) (providing these
8013 parameter passing registers are available).
8015 Upon successful return, *COUNT returns the number of needed registers,
8016 *BASE_MODE returns the mode of the individual register and when IS_HA
8017 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8018 floating-point aggregate or a homogeneous short-vector aggregate. */
8020 static bool
8021 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8022 const_tree type,
8023 machine_mode *base_mode,
8024 int *count,
8025 bool *is_ha)
8027 machine_mode new_mode = VOIDmode;
8028 bool composite_p = aarch64_composite_type_p (type, mode);
8030 if (is_ha != NULL) *is_ha = false;
8032 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8033 || aarch64_short_vector_p (type, mode))
8035 *count = 1;
8036 new_mode = mode;
8038 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8040 if (is_ha != NULL) *is_ha = true;
8041 *count = 2;
8042 new_mode = GET_MODE_INNER (mode);
8044 else if (type && composite_p)
8046 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8048 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8050 if (is_ha != NULL) *is_ha = true;
8051 *count = ag_count;
8053 else
8054 return false;
8056 else
8057 return false;
8059 *base_mode = new_mode;
8060 return true;
8063 /* Implement TARGET_STRUCT_VALUE_RTX. */
8065 static rtx
8066 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8067 int incoming ATTRIBUTE_UNUSED)
8069 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8072 /* Implements target hook vector_mode_supported_p. */
8073 static bool
8074 aarch64_vector_mode_supported_p (machine_mode mode)
8076 if (TARGET_SIMD
8077 && (mode == V4SImode || mode == V8HImode
8078 || mode == V16QImode || mode == V2DImode
8079 || mode == V2SImode || mode == V4HImode
8080 || mode == V8QImode || mode == V2SFmode
8081 || mode == V4SFmode || mode == V2DFmode
8082 || mode == V1DFmode))
8083 return true;
8085 return false;
8088 /* Return appropriate SIMD container
8089 for MODE within a vector of WIDTH bits. */
8090 static machine_mode
8091 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8093 gcc_assert (width == 64 || width == 128);
8094 if (TARGET_SIMD)
8096 if (width == 128)
8097 switch (mode)
8099 case DFmode:
8100 return V2DFmode;
8101 case SFmode:
8102 return V4SFmode;
8103 case SImode:
8104 return V4SImode;
8105 case HImode:
8106 return V8HImode;
8107 case QImode:
8108 return V16QImode;
8109 case DImode:
8110 return V2DImode;
8111 default:
8112 break;
8114 else
8115 switch (mode)
8117 case SFmode:
8118 return V2SFmode;
8119 case SImode:
8120 return V2SImode;
8121 case HImode:
8122 return V4HImode;
8123 case QImode:
8124 return V8QImode;
8125 default:
8126 break;
8129 return word_mode;
8132 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8133 static machine_mode
8134 aarch64_preferred_simd_mode (machine_mode mode)
8136 return aarch64_simd_container_mode (mode, 128);
8139 /* Return the bitmask of possible vector sizes for the vectorizer
8140 to iterate over. */
8141 static unsigned int
8142 aarch64_autovectorize_vector_sizes (void)
8144 return (16 | 8);
8147 /* Implement TARGET_MANGLE_TYPE. */
8149 static const char *
8150 aarch64_mangle_type (const_tree type)
8152 /* The AArch64 ABI documents say that "__va_list" has to be
8153 mangled as if it is in the "std" namespace. */
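/* "St9__va_list" is the Itanium C++ ABI mangling of "std::__va_list".  */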
8154 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8155 return "St9__va_list";
8157 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8158 builtin types. */
8159 if (TYPE_NAME (type) != NULL)
8160 return aarch64_mangle_builtin_type (type);
8162 /* Use the default mangling. */
8163 return NULL;
8167 /* Return true if the rtx_insn contains a MEM RTX somewhere
8168 in it. */
8170 static bool
8171 has_memory_op (rtx_insn *mem_insn)
8173 subrtx_iterator::array_type array;
8174 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8175 if (MEM_P (*iter))
8176 return true;
8178 return false;
8181 /* Find the first rtx_insn before insn that will generate an assembly
8182 instruction. */
8184 static rtx_insn *
8185 aarch64_prev_real_insn (rtx_insn *insn)
8187 if (!insn)
8188 return NULL;
8192 insn = prev_real_insn (insn);
8194 while (insn && recog_memoized (insn) < 0);
8196 return insn;
8199 static bool
8200 is_madd_op (enum attr_type t1)
8202 unsigned int i;
8203 /* A number of these may be AArch32 only. */
8204 enum attr_type mlatypes[] = {
8205 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8206 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8207 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8210 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8212 if (t1 == mlatypes[i])
8213 return true;
8216 return false;
8219 /* Check if there is a register dependency between a load and the insn
8220 for which we hold recog_data. */
8222 static bool
8223 dep_between_memop_and_curr (rtx memop)
8225 rtx load_reg;
8226 int opno;
8228 gcc_assert (GET_CODE (memop) == SET);
8230 if (!REG_P (SET_DEST (memop)))
8231 return false;
8233 load_reg = SET_DEST (memop);
8234 for (opno = 1; opno < recog_data.n_operands; opno++)
8236 rtx operand = recog_data.operand[opno];
8237 if (REG_P (operand)
8238 && reg_overlap_mentioned_p (load_reg, operand))
8239 return true;
8242 return false;
8246 /* When working around the Cortex-A53 erratum 835769,
8247 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8248 instruction and has a preceding memory instruction such that a NOP
8249 should be inserted between them. */
8251 bool
8252 aarch64_madd_needs_nop (rtx_insn* insn)
8254 enum attr_type attr_type;
8255 rtx_insn *prev;
8256 rtx body;
8258 if (!aarch64_fix_a53_err835769)
8259 return false;
8261 if (recog_memoized (insn) < 0)
8262 return false;
8264 attr_type = get_attr_type (insn);
8265 if (!is_madd_op (attr_type))
8266 return false;
8268 prev = aarch64_prev_real_insn (insn);
8269 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8270 Restore recog state to INSN to avoid state corruption. */
8271 extract_constrain_insn_cached (insn);
8273 if (!prev || !has_memory_op (prev))
8274 return false;
8276 body = single_set (prev);
8278 /* If the previous insn is a memory op and there is no dependency between
8279 it and the DImode madd, emit a NOP between them. If body is NULL then we
8280 have a complex memory operation, probably a load/store pair.
8281 Be conservative for now and emit a NOP. */
8282 if (GET_MODE (recog_data.operand[0]) == DImode
8283 && (!body || !dep_between_memop_and_curr (body)))
8284 return true;
8286 return false;
8291 /* Implement FINAL_PRESCAN_INSN. */
8293 void
8294 aarch64_final_prescan_insn (rtx_insn *insn)
8296 if (aarch64_madd_needs_nop (insn))
8297 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8301 /* Return the equivalent letter for size. */
8302 static char
8303 sizetochar (int size)
8305 switch (size)
8307 case 64: return 'd';
8308 case 32: return 's';
8309 case 16: return 'h';
8310 case 8 : return 'b';
8311 default: gcc_unreachable ();
8315 /* Return true iff x is a uniform vector of floating-point
8316 constants, and the constant can be represented in
8317 quarter-precision form. Note that, as aarch64_float_const_representable_p
8318 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8319 static bool
8320 aarch64_vect_float_const_representable_p (rtx x)
8322 int i = 0;
8323 REAL_VALUE_TYPE r0, ri;
8324 rtx x0, xi;
8326 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8327 return false;
8329 x0 = CONST_VECTOR_ELT (x, 0);
8330 if (!CONST_DOUBLE_P (x0))
8331 return false;
8333 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8335 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8337 xi = CONST_VECTOR_ELT (x, i);
8338 if (!CONST_DOUBLE_P (xi))
8339 return false;
8341 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8342 if (!REAL_VALUES_EQUAL (r0, ri))
8343 return false;
8346 return aarch64_float_const_representable_p (x0);
8349 /* Return true for valid and false for invalid. */
8350 bool
8351 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8352 struct simd_immediate_info *info)
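/* The CHECK macro below tries one candidate immediate encoding: TEST is
   evaluated on every STRIDE'th byte of the splatted constant and, if all
   of them match, the element size (ELSIZE), shift amount (SHIFT) and
   whether the inverted (MVN) form is wanted (NEG) are recorded.  */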
8354 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8355 matches = 1; \
8356 for (i = 0; i < idx; i += (STRIDE)) \
8357 if (!(TEST)) \
8358 matches = 0; \
8359 if (matches) \
8361 immtype = (CLASS); \
8362 elsize = (ELSIZE); \
8363 eshift = (SHIFT); \
8364 emvn = (NEG); \
8365 break; \
8368 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8369 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8370 unsigned char bytes[16];
8371 int immtype = -1, matches;
8372 unsigned int invmask = inverse ? 0xff : 0;
8373 int eshift, emvn;
8375 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8377 if (! (aarch64_simd_imm_zero_p (op, mode)
8378 || aarch64_vect_float_const_representable_p (op)))
8379 return false;
8381 if (info)
8383 info->value = CONST_VECTOR_ELT (op, 0);
8384 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8385 info->mvn = false;
8386 info->shift = 0;
8389 return true;
8392 /* Splat vector constant out into a byte vector. */
8393 for (i = 0; i < n_elts; i++)
8395 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8396 it must be laid out in the vector register in reverse order. */
8397 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8398 unsigned HOST_WIDE_INT elpart;
8399 unsigned int part, parts;
8401 if (CONST_INT_P (el))
8403 elpart = INTVAL (el);
8404 parts = 1;
8406 else if (GET_CODE (el) == CONST_DOUBLE)
8408 elpart = CONST_DOUBLE_LOW (el);
8409 parts = 2;
8411 else
8412 gcc_unreachable ();
8414 for (part = 0; part < parts; part++)
8416 unsigned int byte;
8417 for (byte = 0; byte < innersize; byte++)
8419 bytes[idx++] = (elpart & 0xff) ^ invmask;
8420 elpart >>= BITS_PER_UNIT;
8422 if (GET_CODE (el) == CONST_DOUBLE)
8423 elpart = CONST_DOUBLE_HIGH (el);
8427 /* Sanity check. */
8428 gcc_assert (idx == GET_MODE_SIZE (mode));
8432 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8433 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8435 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8436 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8438 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8439 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8441 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8442 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8444 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8446 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8448 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8449 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8451 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8452 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8454 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8455 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8457 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8458 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8460 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8462 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8464 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8465 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8467 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8468 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8470 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8471 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8473 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8474 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8476 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8478 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8479 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8481 while (0);
8483 if (immtype == -1)
8484 return false;
8486 if (info)
8488 info->element_width = elsize;
8489 info->mvn = emvn != 0;
8490 info->shift = eshift;
8492 unsigned HOST_WIDE_INT imm = 0;
8494 if (immtype >= 12 && immtype <= 15)
8495 info->msl = true;
8497 /* Un-invert bytes of recognized vector, if necessary. */
8498 if (invmask != 0)
8499 for (i = 0; i < idx; i++)
8500 bytes[i] ^= invmask;
8502 if (immtype == 17)
8504 /* FIXME: Broken on 32-bit H_W_I hosts. */
8505 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8507 for (i = 0; i < 8; i++)
8508 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8509 << (i * BITS_PER_UNIT);
8512 info->value = GEN_INT (imm);
8514 else
8516 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8517 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8519 /* Construct 'abcdefgh' because the assembler cannot handle
8520 generic constants. */
8521 if (info->mvn)
8522 imm = ~imm;
8523 imm = (imm >> info->shift) & 0xff;
8524 info->value = GEN_INT (imm);
8528 return true;
8529 #undef CHECK
8532 /* Check whether immediate shift constants are within range. */
8533 bool
8534 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8536 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8537 if (left)
8538 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8539 else
8540 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8543 /* Return true if X is a uniform vector where all elements
8544 are either the floating-point constant 0.0 or the
8545 integer constant 0. */
8546 bool
8547 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8549 return x == CONST0_RTX (mode);
8552 bool
8553 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8555 HOST_WIDE_INT imm = INTVAL (x);
8556 int i;
8558 for (i = 0; i < 8; i++)
8560 unsigned int byte = imm & 0xff;
8561 if (byte != 0xff && byte != 0)
8562 return false;
8563 imm >>= 8;
8566 return true;
8569 bool
8570 aarch64_mov_operand_p (rtx x,
8571 enum aarch64_symbol_context context,
8572 machine_mode mode)
8574 if (GET_CODE (x) == HIGH
8575 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8576 return true;
8578 if (CONST_INT_P (x))
8579 return true;
8581 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8582 return true;
8584 return aarch64_classify_symbolic_expression (x, context)
8585 == SYMBOL_TINY_ABSOLUTE;
8588 /* Return a const_int vector of VAL. */
8590 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8592 int nunits = GET_MODE_NUNITS (mode);
8593 rtvec v = rtvec_alloc (nunits);
8594 int i;
8596 for (i=0; i < nunits; i++)
8597 RTVEC_ELT (v, i) = GEN_INT (val);
8599 return gen_rtx_CONST_VECTOR (mode, v);
8602 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8604 bool
8605 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8607 machine_mode vmode;
8609 gcc_assert (!VECTOR_MODE_P (mode));
8610 vmode = aarch64_preferred_simd_mode (mode);
8611 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8612 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8615 /* Construct and return a PARALLEL RTX vector with elements numbering the
8616 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8617 the vector - from the perspective of the architecture. This does not
8618 line up with GCC's perspective on lane numbers, so we end up with
8619 different masks depending on our target endian-ness. The diagram
8620 below may help. We must draw the distinction when building masks
8621 which select one half of the vector. An instruction selecting
8622 architectural low-lanes for a big-endian target, must be described using
8623 a mask selecting GCC high-lanes.
8625 Big-Endian Little-Endian
8627 GCC 0 1 2 3 3 2 1 0
8628 | x | x | x | x | | x | x | x | x |
8629 Architecture 3 2 1 0 3 2 1 0
8631 Low Mask: { 2, 3 } { 0, 1 }
8632 High Mask: { 0, 1 } { 2, 3 }
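   For example, assuming V4SImode, HIGH == true gives { 2, 3 } on
   little-endian but { 0, 1 } on big-endian; both select the
   architectural high half, as the diagram shows.  */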
8636 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8638 int nunits = GET_MODE_NUNITS (mode);
8639 rtvec v = rtvec_alloc (nunits / 2);
8640 int high_base = nunits / 2;
8641 int low_base = 0;
8642 int base;
8643 rtx t1;
8644 int i;
8646 if (BYTES_BIG_ENDIAN)
8647 base = high ? low_base : high_base;
8648 else
8649 base = high ? high_base : low_base;
8651 for (i = 0; i < nunits / 2; i++)
8652 RTVEC_ELT (v, i) = GEN_INT (base + i);
8654 t1 = gen_rtx_PARALLEL (mode, v);
8655 return t1;
8658 /* Check OP for validity as a PARALLEL RTX vector with elements
8659 numbering either the high (HIGH == TRUE) or the low lanes,
8660 from the perspective of the architecture. See the diagram above
8661 aarch64_simd_vect_par_cnst_half for more details. */
8663 bool
8664 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8665 bool high)
8667 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8668 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8669 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8670 int i = 0;
8672 if (!VECTOR_MODE_P (mode))
8673 return false;
8675 if (count_op != count_ideal)
8676 return false;
8678 for (i = 0; i < count_ideal; i++)
8680 rtx elt_op = XVECEXP (op, 0, i);
8681 rtx elt_ideal = XVECEXP (ideal, 0, i);
8683 if (!CONST_INT_P (elt_op)
8684 || INTVAL (elt_ideal) != INTVAL (elt_op))
8685 return false;
8687 return true;
8690 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8691 HIGH (exclusive). */
8692 void
8693 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8694 const_tree exp)
8696 HOST_WIDE_INT lane;
8697 gcc_assert (CONST_INT_P (operand));
8698 lane = INTVAL (operand);
8700 if (lane < low || lane >= high)
8702 if (exp)
8703 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8704 else
8705 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8709 /* Return TRUE if OP is a valid vector addressing mode. */
8710 bool
8711 aarch64_simd_mem_operand_p (rtx op)
8713 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8714 || REG_P (XEXP (op, 0)));
8717 /* Emit a register copy from operand to operand, taking care not to
8718 early-clobber source registers in the process.
8720 COUNT is the number of components into which the copy needs to be
8721 decomposed. */
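/* For example, when the destination overlaps the source and starts at a
   higher register number, the component moves are emitted from the
   highest register downwards so that no source register is overwritten
   before it has been read; otherwise they are emitted lowest first.  */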
8722 void
8723 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8724 unsigned int count)
8726 unsigned int i;
8727 int rdest = REGNO (operands[0]);
8728 int rsrc = REGNO (operands[1]);
8730 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8731 || rdest < rsrc)
8732 for (i = 0; i < count; i++)
8733 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8734 gen_rtx_REG (mode, rsrc + i));
8735 else
8736 for (i = 0; i < count; i++)
8737 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8738 gen_rtx_REG (mode, rsrc + count - i - 1));
8741 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8742 one of VSTRUCT modes: OI, CI or XI. */
8744 aarch64_simd_attr_length_move (rtx_insn *insn)
8746 machine_mode mode;
8748 extract_insn_cached (insn);
8750 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8752 mode = GET_MODE (recog_data.operand[0]);
8753 switch (mode)
8755 case OImode:
8756 return 8;
8757 case CImode:
8758 return 12;
8759 case XImode:
8760 return 16;
8761 default:
8762 gcc_unreachable ();
8765 return 4;
8768 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8769 one of VSTRUCT modes: OI, CI, EI, or XI. */
8771 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8773 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8776 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8777 alignment of a vector to 128 bits. */
8778 static HOST_WIDE_INT
8779 aarch64_simd_vector_alignment (const_tree type)
8781 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8782 return MIN (align, 128);
8785 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8786 static bool
8787 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8789 if (is_packed)
8790 return false;
8792 /* We guarantee alignment for vectors up to 128-bits. */
8793 if (tree_int_cst_compare (TYPE_SIZE (type),
8794 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8795 return false;
8797 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8798 return true;
8801 /* If VALS is a vector constant that can be loaded into a register
8802 using DUP, generate instructions to do so and return an RTX to
8803 assign to the register. Otherwise return NULL_RTX. */
8804 static rtx
8805 aarch64_simd_dup_constant (rtx vals)
8807 machine_mode mode = GET_MODE (vals);
8808 machine_mode inner_mode = GET_MODE_INNER (mode);
8809 int n_elts = GET_MODE_NUNITS (mode);
8810 bool all_same = true;
8811 rtx x;
8812 int i;
8814 if (GET_CODE (vals) != CONST_VECTOR)
8815 return NULL_RTX;
8817 for (i = 1; i < n_elts; ++i)
8819 x = CONST_VECTOR_ELT (vals, i);
8820 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8821 all_same = false;
8824 if (!all_same)
8825 return NULL_RTX;
8827 /* We can load this constant by using DUP and a constant in a
8828 single ARM register. This will be cheaper than a vector
8829 load. */
8830 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8831 return gen_rtx_VEC_DUPLICATE (mode, x);
8835 /* Generate code to load VALS, which is a PARALLEL containing only
8836 constants (for vec_init) or CONST_VECTOR, efficiently into a
8837 register. Returns an RTX to copy into the register, or NULL_RTX
8838 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
8839 static rtx
8840 aarch64_simd_make_constant (rtx vals)
8842 machine_mode mode = GET_MODE (vals);
8843 rtx const_dup;
8844 rtx const_vec = NULL_RTX;
8845 int n_elts = GET_MODE_NUNITS (mode);
8846 int n_const = 0;
8847 int i;
8849 if (GET_CODE (vals) == CONST_VECTOR)
8850 const_vec = vals;
8851 else if (GET_CODE (vals) == PARALLEL)
8853 /* A CONST_VECTOR must contain only CONST_INTs and
8854 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8855 Only store valid constants in a CONST_VECTOR. */
8856 for (i = 0; i < n_elts; ++i)
8858 rtx x = XVECEXP (vals, 0, i);
8859 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8860 n_const++;
8862 if (n_const == n_elts)
8863 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8865 else
8866 gcc_unreachable ();
8868 if (const_vec != NULL_RTX
8869 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8870 /* Load using MOVI/MVNI. */
8871 return const_vec;
8872 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8873 /* Loaded using DUP. */
8874 return const_dup;
8875 else if (const_vec != NULL_RTX)
8876 /* Load from constant pool. We cannot take advantage of single-cycle
8877 LD1 because we need a PC-relative addressing mode. */
8878 return const_vec;
8879 else
8880 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8881 We cannot construct an initializer. */
8882 return NULL_RTX;
8885 void
8886 aarch64_expand_vector_init (rtx target, rtx vals)
8888 machine_mode mode = GET_MODE (target);
8889 machine_mode inner_mode = GET_MODE_INNER (mode);
8890 int n_elts = GET_MODE_NUNITS (mode);
8891 int n_var = 0;
8892 rtx any_const = NULL_RTX;
8893 bool all_same = true;
8895 for (int i = 0; i < n_elts; ++i)
8897 rtx x = XVECEXP (vals, 0, i);
8898 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8899 ++n_var;
8900 else
8901 any_const = x;
8903 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8904 all_same = false;
8907 if (n_var == 0)
8909 rtx constant = aarch64_simd_make_constant (vals);
8910 if (constant != NULL_RTX)
8912 emit_move_insn (target, constant);
8913 return;
8917 /* Splat a single non-constant element if we can. */
8918 if (all_same)
8920 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8921 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8922 return;
8925 /* Half the fields (or less) are non-constant. Load constant then overwrite
8926 varying fields. Hope that this is more efficient than using the stack. */
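   /* For example, initializing V4SI { x, 1, 2, 3 } first loads the
      all-constant vector { 2, 1, 2, 3 } (each variable lane borrows a
      nearby constant so the constant is more likely to be cheap to
      materialize) and then overwrites lane 0 with x via vec_set.  */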
8927 if (n_var <= n_elts/2)
8929 rtx copy = copy_rtx (vals);
8931 /* Load constant part of vector. We really don't care what goes into the
8932 parts we will overwrite, but we're more likely to be able to load the
8933 constant efficiently if it has fewer, larger, repeating parts
8934 (see aarch64_simd_valid_immediate). */
8935 for (int i = 0; i < n_elts; i++)
8937 rtx x = XVECEXP (vals, 0, i);
8938 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8939 continue;
8940 rtx subst = any_const;
8941 for (int bit = n_elts / 2; bit > 0; bit /= 2)
8943 /* Look in the copied vector, as more elements are const. */
8944 rtx test = XVECEXP (copy, 0, i ^ bit);
8945 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
8947 subst = test;
8948 break;
8951 XVECEXP (copy, 0, i) = subst;
8953 aarch64_expand_vector_init (target, copy);
8955 /* Insert variables. */
8956 enum insn_code icode = optab_handler (vec_set_optab, mode);
8957 gcc_assert (icode != CODE_FOR_nothing);
8959 for (int i = 0; i < n_elts; i++)
8961 rtx x = XVECEXP (vals, 0, i);
8962 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8963 continue;
8964 x = copy_to_mode_reg (inner_mode, x);
8965 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
8967 return;
8970 /* Construct the vector in memory one field at a time
8971 and load the whole vector. */
8972 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8973 for (int i = 0; i < n_elts; i++)
8974 emit_move_insn (adjust_address_nv (mem, inner_mode,
8975 i * GET_MODE_SIZE (inner_mode)),
8976 XVECEXP (vals, 0, i));
8977 emit_move_insn (target, mem);
8981 static unsigned HOST_WIDE_INT
8982 aarch64_shift_truncation_mask (machine_mode mode)
8984 return
8985 (aarch64_vector_mode_supported_p (mode)
8986 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8989 #ifndef TLS_SECTION_ASM_FLAG
8990 #define TLS_SECTION_ASM_FLAG 'T'
8991 #endif
8993 void
8994 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8995 tree decl ATTRIBUTE_UNUSED)
8997 char flagchars[10], *f = flagchars;
8999 /* If we have already declared this section, we can use an
9000 abbreviated form to switch back to it -- unless this section is
9001 part of a COMDAT group, in which case GAS requires the full
9002 declaration every time. */
9003 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9004 && (flags & SECTION_DECLARED))
9006 fprintf (asm_out_file, "\t.section\t%s\n", name);
9007 return;
9010 if (!(flags & SECTION_DEBUG))
9011 *f++ = 'a';
9012 if (flags & SECTION_WRITE)
9013 *f++ = 'w';
9014 if (flags & SECTION_CODE)
9015 *f++ = 'x';
9016 if (flags & SECTION_SMALL)
9017 *f++ = 's';
9018 if (flags & SECTION_MERGE)
9019 *f++ = 'M';
9020 if (flags & SECTION_STRINGS)
9021 *f++ = 'S';
9022 if (flags & SECTION_TLS)
9023 *f++ = TLS_SECTION_ASM_FLAG;
9024 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9025 *f++ = 'G';
9026 *f = '\0';
9028 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9030 if (!(flags & SECTION_NOTYPE))
9032 const char *type;
9033 const char *format;
9035 if (flags & SECTION_BSS)
9036 type = "nobits";
9037 else
9038 type = "progbits";
9040 #ifdef TYPE_OPERAND_FMT
9041 format = "," TYPE_OPERAND_FMT;
9042 #else
9043 format = ",@%s";
9044 #endif
9046 fprintf (asm_out_file, format, type);
9048 if (flags & SECTION_ENTSIZE)
9049 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9050 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9052 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9053 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9054 else
9055 fprintf (asm_out_file, ",%s,comdat",
9056 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9060 putc ('\n', asm_out_file);
9063 /* Select a format to encode pointers in exception handling data. */
9065 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9067 int type;
9068 switch (aarch64_cmodel)
9070 case AARCH64_CMODEL_TINY:
9071 case AARCH64_CMODEL_TINY_PIC:
9072 case AARCH64_CMODEL_SMALL:
9073 case AARCH64_CMODEL_SMALL_PIC:
9074 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9075 for everything. */
9076 type = DW_EH_PE_sdata4;
9077 break;
9078 default:
9079 /* No assumptions here. 8-byte relocs required. */
9080 type = DW_EH_PE_sdata8;
9081 break;
9083 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9086 /* Emit load exclusive. */
9088 static void
9089 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9090 rtx mem, rtx model_rtx)
9092 rtx (*gen) (rtx, rtx, rtx);
9094 switch (mode)
9096 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9097 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9098 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9099 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9100 default:
9101 gcc_unreachable ();
9104 emit_insn (gen (rval, mem, model_rtx));
9107 /* Emit store exclusive. */
9109 static void
9110 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9111 rtx rval, rtx mem, rtx model_rtx)
9113 rtx (*gen) (rtx, rtx, rtx, rtx);
9115 switch (mode)
9117 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9118 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9119 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9120 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9121 default:
9122 gcc_unreachable ();
9125 emit_insn (gen (bval, rval, mem, model_rtx));
9128 /* Emit INSN as a jump and mark it as very unlikely to be taken. */
9130 static void
9131 aarch64_emit_unlikely_jump (rtx insn)
9133 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9135 insn = emit_jump_insn (insn);
9136 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9139 /* Expand a compare and swap pattern. */
9141 void
9142 aarch64_expand_compare_and_swap (rtx operands[])
9144 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9145 machine_mode mode, cmp_mode;
9146 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9148 bval = operands[0];
9149 rval = operands[1];
9150 mem = operands[2];
9151 oldval = operands[3];
9152 newval = operands[4];
9153 is_weak = operands[5];
9154 mod_s = operands[6];
9155 mod_f = operands[7];
9156 mode = GET_MODE (mem);
9157 cmp_mode = mode;
9159 /* Normally the succ memory model must be stronger than fail, but in the
9160 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9161 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9163 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9164 && INTVAL (mod_s) == MEMMODEL_RELEASE)
9165 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9167 switch (mode)
9169 case QImode:
9170 case HImode:
9171 /* For short modes, we're going to perform the comparison in SImode,
9172 so do the zero-extension now. */
9173 cmp_mode = SImode;
9174 rval = gen_reg_rtx (SImode);
9175 oldval = convert_modes (SImode, mode, oldval, true);
9176 /* Fall through. */
9178 case SImode:
9179 case DImode:
9180 /* Force the value into a register if needed. */
9181 if (!aarch64_plus_operand (oldval, mode))
9182 oldval = force_reg (cmp_mode, oldval);
9183 break;
9185 default:
9186 gcc_unreachable ();
9189 switch (mode)
9191 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9192 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9193 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9194 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9195 default:
9196 gcc_unreachable ();
9199 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9201 if (mode == QImode || mode == HImode)
9202 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9204 x = gen_rtx_REG (CCmode, CC_REGNUM);
9205 x = gen_rtx_EQ (SImode, x, const0_rtx);
9206 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
9209 /* Split a compare and swap pattern. */
9211 void
9212 aarch64_split_compare_and_swap (rtx operands[])
9214 rtx rval, mem, oldval, newval, scratch;
9215 machine_mode mode;
9216 bool is_weak;
9217 rtx_code_label *label1, *label2;
9218 rtx x, cond;
9220 rval = operands[0];
9221 mem = operands[1];
9222 oldval = operands[2];
9223 newval = operands[3];
9224 is_weak = (operands[4] != const0_rtx);
9225 scratch = operands[7];
9226 mode = GET_MODE (mem);
9228 label1 = NULL;
9229 if (!is_weak)
9231 label1 = gen_label_rtx ();
9232 emit_label (label1);
9234 label2 = gen_label_rtx ();
9236 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9238 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9239 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9240 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9241 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9242 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9244 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9246 if (!is_weak)
9248 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9249 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9250 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9251 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9253 else
9255 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9256 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9257 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9260 emit_label (label2);
9263 /* Split an atomic operation. */
9265 void
9266 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9267 rtx value, rtx model_rtx, rtx cond)
9269 machine_mode mode = GET_MODE (mem);
9270 machine_mode wmode = (mode == DImode ? DImode : SImode);
9271 rtx_code_label *label;
9272 rtx x;
9274 label = gen_label_rtx ();
9275 emit_label (label);
9277 if (new_out)
9278 new_out = gen_lowpart (wmode, new_out);
9279 if (old_out)
9280 old_out = gen_lowpart (wmode, old_out);
9281 else
9282 old_out = new_out;
9283 value = simplify_gen_subreg (wmode, value, mode, 0);
9285 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9287 switch (code)
9289 case SET:
9290 new_out = value;
9291 break;
9293 case NOT:
9294 x = gen_rtx_AND (wmode, old_out, value);
9295 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9296 x = gen_rtx_NOT (wmode, new_out);
9297 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9298 break;
9300 case MINUS:
9301 if (CONST_INT_P (value))
9303 value = GEN_INT (-INTVAL (value));
9304 code = PLUS;
9306 /* Fall through. */
9308 default:
9309 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9310 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9311 break;
9314 aarch64_emit_store_exclusive (mode, cond, mem,
9315 gen_lowpart (mode, new_out), model_rtx);
9317 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9318 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9319 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9320 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9323 static void
9324 aarch64_print_extension (void)
9326 const struct aarch64_option_extension *opt = NULL;
9328 for (opt = all_extensions; opt->name != NULL; opt++)
9329 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9330 asm_fprintf (asm_out_file, "+%s", opt->name);
9332 asm_fprintf (asm_out_file, "\n");
9335 static void
9336 aarch64_start_file (void)
9338 if (selected_arch)
9340 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9341 aarch64_print_extension ();
9343 else if (selected_cpu)
9345 const char *truncated_name
9346 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9347 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9348 aarch64_print_extension ();
9350 default_file_start();
9353 /* Target hook for c_mode_for_suffix. */
9354 static machine_mode
9355 aarch64_c_mode_for_suffix (char suffix)
9357 if (suffix == 'q')
9358 return TFmode;
9360 return VOIDmode;
9363 /* We can only represent floating point constants which will fit in
9364 "quarter-precision" values. These values are characterised by
9365 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9368 (-1)^s * (n/16) * 2^r
9370 Where:
9371 's' is the sign bit.
9372 'n' is an integer in the range 16 <= n <= 31.
9373 'r' is an integer in the range -3 <= r <= 4. */
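/* For example, 1.0 is (-1)^0 * (16/16) * 2^0 and 31.0 is
   (-1)^0 * (31/16) * 2^4, both representable, while 0.0 and 33.0
   are not.  */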
9375 /* Return true iff X can be represented by a quarter-precision
9376 floating point immediate operand. Note, we cannot represent 0.0. */
9377 bool
9378 aarch64_float_const_representable_p (rtx x)
9380 /* This represents our current view of how many bits
9381 make up the mantissa. */
9382 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9383 int exponent;
9384 unsigned HOST_WIDE_INT mantissa, mask;
9385 REAL_VALUE_TYPE r, m;
9386 bool fail;
9388 if (!CONST_DOUBLE_P (x))
9389 return false;
9391 if (GET_MODE (x) == VOIDmode)
9392 return false;
9394 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9396 /* We cannot represent infinities, NaNs or +/-zero. We won't
9397 know if we have +zero until we analyse the mantissa, but we
9398 can reject the other invalid values. */
9399 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9400 || REAL_VALUE_MINUS_ZERO (r))
9401 return false;
9403 /* Extract exponent. */
9404 r = real_value_abs (&r);
9405 exponent = REAL_EXP (&r);
9407 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9408 highest (sign) bit, with a fixed binary point at bit point_pos.
9409 w.elt (0) holds the low part of the mantissa, w.elt (1) the high part.
9410 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9411 bits for the mantissa, this can fail (low bits will be lost). */
9412 real_ldexp (&m, &r, point_pos - exponent);
9413 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9415 /* If the low part of the mantissa has bits set we cannot represent
9416 the value. */
9417 if (w.elt (0) != 0)
9418 return false;
9419 /* We have rejected the lower HOST_WIDE_INT, so update our
9420 understanding of how many bits lie in the mantissa and
9421 look only at the high HOST_WIDE_INT. */
9422 mantissa = w.elt (1);
9423 point_pos -= HOST_BITS_PER_WIDE_INT;
9425 /* We can only represent values with a mantissa of the form 1.xxxx. */
9426 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9427 if ((mantissa & mask) != 0)
9428 return false;
9430 /* Having filtered unrepresentable values, we may now remove all
9431 but the highest 5 bits. */
9432 mantissa >>= point_pos - 5;
9434 /* We cannot represent the value 0.0, so reject it. This is handled
9435 elsewhere. */
9436 if (mantissa == 0)
9437 return false;
9439 /* Then, as bit 4 is always set, we can mask it off, leaving
9440 the mantissa in the range [0, 15]. */
9441 mantissa &= ~(1 << 4);
9442 gcc_assert (mantissa <= 15);
9444 /* GCC internally does not use IEEE754-like encoding (where normalized
9445 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
9446 Our mantissa values are shifted 4 places to the left relative to
9447 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9448 by 5 places to correct for GCC's representation. */
9449 exponent = 5 - exponent;
9451 return (exponent >= 0 && exponent <= 7);
9454 char*
9455 aarch64_output_simd_mov_immediate (rtx const_vector,
9456 machine_mode mode,
9457 unsigned width)
9459 bool is_valid;
9460 static char templ[40];
9461 const char *mnemonic;
9462 const char *shift_op;
9463 unsigned int lane_count = 0;
9464 char element_char;
9466 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9468 /* This will return true to show const_vector is legal for use as
9469 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
9470 also update INFO to show how the immediate should be generated. */
9471 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9472 gcc_assert (is_valid);
9474 element_char = sizetochar (info.element_width);
9475 lane_count = width / info.element_width;
9477 mode = GET_MODE_INNER (mode);
9478 if (mode == SFmode || mode == DFmode)
9480 gcc_assert (info.shift == 0 && ! info.mvn);
9481 if (aarch64_float_const_zero_rtx_p (info.value))
9482 info.value = GEN_INT (0);
9483 else
9485 #define buf_size 20
9486 REAL_VALUE_TYPE r;
9487 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9488 char float_buf[buf_size] = {'\0'};
9489 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9490 #undef buf_size
9492 if (lane_count == 1)
9493 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9494 else
9495 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9496 lane_count, element_char, float_buf);
9497 return templ;
9501 mnemonic = info.mvn ? "mvni" : "movi";
9502 shift_op = info.msl ? "msl" : "lsl";
9504 if (lane_count == 1)
9505 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9506 mnemonic, UINTVAL (info.value));
9507 else if (info.shift)
9508 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9509 ", %s %d", mnemonic, lane_count, element_char,
9510 UINTVAL (info.value), shift_op, info.shift);
9511 else
9512 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9513 mnemonic, lane_count, element_char, UINTVAL (info.value));
9514 return templ;
9517 char*
9518 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9519 machine_mode mode)
9521 machine_mode vmode;
9523 gcc_assert (!VECTOR_MODE_P (mode));
9524 vmode = aarch64_simd_container_mode (mode, 64);
9525 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9526 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9529 /* Split operands into moves from op[1] + op[2] into op[0]. */
9531 void
9532 aarch64_split_combinev16qi (rtx operands[3])
9534 unsigned int dest = REGNO (operands[0]);
9535 unsigned int src1 = REGNO (operands[1]);
9536 unsigned int src2 = REGNO (operands[2]);
9537 machine_mode halfmode = GET_MODE (operands[1]);
9538 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9539 rtx destlo, desthi;
9541 gcc_assert (halfmode == V16QImode);
9543 if (src1 == dest && src2 == dest + halfregs)
9545 /* No-op move. Can't split to nothing; emit something. */
9546 emit_note (NOTE_INSN_DELETED);
9547 return;
9550 /* Preserve register attributes for variable tracking. */
9551 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9552 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9553 GET_MODE_SIZE (halfmode));
9555 /* Special case of reversed high/low parts. */
9556 if (reg_overlap_mentioned_p (operands[2], destlo)
9557 && reg_overlap_mentioned_p (operands[1], desthi))
9559 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9560 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9561 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9563 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9565 /* Try to avoid unnecessary moves if part of the result
9566 is in the right place already. */
9567 if (src1 != dest)
9568 emit_move_insn (destlo, operands[1]);
9569 if (src2 != dest + halfregs)
9570 emit_move_insn (desthi, operands[2]);
9572 else
9574 if (src2 != dest + halfregs)
9575 emit_move_insn (desthi, operands[2]);
9576 if (src1 != dest)
9577 emit_move_insn (destlo, operands[1]);
9581 /* vec_perm support. */
9583 #define MAX_VECT_LEN 16
9585 struct expand_vec_perm_d
9587 rtx target, op0, op1;
9588 unsigned char perm[MAX_VECT_LEN];
9589 machine_mode vmode;
9590 unsigned char nelt;
9591 bool one_vector_p;
9592 bool testing_p;
9595 /* Generate a variable permutation. */
9597 static void
9598 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9600 machine_mode vmode = GET_MODE (target);
9601 bool one_vector_p = rtx_equal_p (op0, op1);
9603 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9604 gcc_checking_assert (GET_MODE (op0) == vmode);
9605 gcc_checking_assert (GET_MODE (op1) == vmode);
9606 gcc_checking_assert (GET_MODE (sel) == vmode);
9607 gcc_checking_assert (TARGET_SIMD);
9609 if (one_vector_p)
9611 if (vmode == V8QImode)
9613 /* Expand the argument to a V16QI mode by duplicating it. */
9614 rtx pair = gen_reg_rtx (V16QImode);
9615 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9616 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9618 else
9620 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9623 else
9625 rtx pair;
9627 if (vmode == V8QImode)
9629 pair = gen_reg_rtx (V16QImode);
9630 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9631 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9633 else
9635 pair = gen_reg_rtx (OImode);
9636 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9637 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9642 void
9643 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9645 machine_mode vmode = GET_MODE (target);
9646 unsigned int nelt = GET_MODE_NUNITS (vmode);
9647 bool one_vector_p = rtx_equal_p (op0, op1);
9648 rtx mask;
9650 /* The TBL instruction does not use a modulo index, so we must take care
9651 of that ourselves. */
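   /* An out-of-range TBL index yields zero rather than wrapping, so AND
      the selector with nelt - 1 (or 2 * nelt - 1 when two distinct input
      vectors are used) to get the modulo behaviour vec_perm requires.  */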
9652 mask = aarch64_simd_gen_const_vector_dup (vmode,
9653 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9654 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9656 /* For big-endian, we also need to reverse the index within the vector
9657 (but not which vector). */
9658 if (BYTES_BIG_ENDIAN)
9660 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9661 if (!one_vector_p)
9662 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9663 sel = expand_simple_binop (vmode, XOR, sel, mask,
9664 NULL, 0, OPTAB_LIB_WIDEN);
9666 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9669 /* Recognize patterns suitable for the TRN instructions. */
9670 static bool
9671 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9673 unsigned int i, odd, mask, nelt = d->nelt;
9674 rtx out, in0, in1, x;
9675 rtx (*gen) (rtx, rtx, rtx);
9676 machine_mode vmode = d->vmode;
9678 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9679 return false;
9681 /* Note that these are little-endian tests.
9682 We correct for big-endian later. */
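   /* For example, on V4SI the selector { 0, 4, 2, 6 } matches TRN1 and
      { 1, 5, 3, 7 } matches TRN2.  */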
9683 if (d->perm[0] == 0)
9684 odd = 0;
9685 else if (d->perm[0] == 1)
9686 odd = 1;
9687 else
9688 return false;
9689 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9691 for (i = 0; i < nelt; i += 2)
9693 if (d->perm[i] != i + odd)
9694 return false;
9695 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9696 return false;
9699 /* Success! */
9700 if (d->testing_p)
9701 return true;
9703 in0 = d->op0;
9704 in1 = d->op1;
9705 if (BYTES_BIG_ENDIAN)
9707 x = in0, in0 = in1, in1 = x;
9708 odd = !odd;
9710 out = d->target;
9712 if (odd)
9714 switch (vmode)
9716 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9717 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9718 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9719 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9720 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9721 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9722 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9723 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9724 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9725 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9726 default:
9727 return false;
9730 else
9732 switch (vmode)
9734 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9735 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9736 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9737 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9738 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9739 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9740 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9741 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9742 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9743 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9744 default:
9745 return false;
9749 emit_insn (gen (out, in0, in1));
9750 return true;
9753 /* Recognize patterns suitable for the UZP instructions. */
9754 static bool
9755 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9757 unsigned int i, odd, mask, nelt = d->nelt;
9758 rtx out, in0, in1, x;
9759 rtx (*gen) (rtx, rtx, rtx);
9760 machine_mode vmode = d->vmode;
9762 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9763 return false;
9765 /* Note that these are little-endian tests.
9766 We correct for big-endian later. */
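   /* For example, on V4SI the selector { 0, 2, 4, 6 } matches UZP1 and
      { 1, 3, 5, 7 } matches UZP2.  */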
9767 if (d->perm[0] == 0)
9768 odd = 0;
9769 else if (d->perm[0] == 1)
9770 odd = 1;
9771 else
9772 return false;
9773 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9775 for (i = 0; i < nelt; i++)
9777 unsigned elt = (i * 2 + odd) & mask;
9778 if (d->perm[i] != elt)
9779 return false;
9782 /* Success! */
9783 if (d->testing_p)
9784 return true;
9786 in0 = d->op0;
9787 in1 = d->op1;
9788 if (BYTES_BIG_ENDIAN)
9790 x = in0, in0 = in1, in1 = x;
9791 odd = !odd;
9793 out = d->target;
9795 if (odd)
9797 switch (vmode)
9799 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9800 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9801 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9802 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9803 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9804 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9805 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9806 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9807 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9808 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9809 default:
9810 return false;
9813 else
9815 switch (vmode)
9817 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9818 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9819 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9820 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9821 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9822 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9823 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9824 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9825 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9826 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9827 default:
9828 return false;
9832 emit_insn (gen (out, in0, in1));
9833 return true;
9836 /* Recognize patterns suitable for the ZIP instructions. */
9837 static bool
9838 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9840 unsigned int i, high, mask, nelt = d->nelt;
9841 rtx out, in0, in1, x;
9842 rtx (*gen) (rtx, rtx, rtx);
9843 machine_mode vmode = d->vmode;
9845 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9846 return false;
9848 /* Note that these are little-endian tests.
9849 We correct for big-endian later. */
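   /* For example, on V4SI the selector { 0, 4, 1, 5 } matches ZIP1 and
      { 2, 6, 3, 7 } matches ZIP2.  */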
9850 high = nelt / 2;
9851 if (d->perm[0] == high)
9852 /* Do Nothing. */
9854 else if (d->perm[0] == 0)
9855 high = 0;
9856 else
9857 return false;
9858 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9860 for (i = 0; i < nelt / 2; i++)
9862 unsigned elt = (i + high) & mask;
9863 if (d->perm[i * 2] != elt)
9864 return false;
9865 elt = (elt + nelt) & mask;
9866 if (d->perm[i * 2 + 1] != elt)
9867 return false;
9870 /* Success! */
9871 if (d->testing_p)
9872 return true;
9874 in0 = d->op0;
9875 in1 = d->op1;
9876 if (BYTES_BIG_ENDIAN)
9878 x = in0, in0 = in1, in1 = x;
9879 high = !high;
9881 out = d->target;
9883 if (high)
9885 switch (vmode)
9887 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9888 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9889 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9890 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9891 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9892 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9893 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9894 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9895 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9896 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9897 default:
9898 return false;
9901 else
9903 switch (vmode)
9905 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9906 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9907 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9908 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9909 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9910 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9911 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9912 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9913 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9914 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9915 default:
9916 return false;
9920 emit_insn (gen (out, in0, in1));
9921 return true;
9924 /* Recognize patterns for the EXT insn. */
9926 static bool
9927 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9929 unsigned int i, nelt = d->nelt;
9930 rtx (*gen) (rtx, rtx, rtx, rtx);
9931 rtx offset;
9933 unsigned int location = d->perm[0]; /* Always < nelt. */
9935 /* Check if the extracted indices are increasing by one. */
9936 for (i = 1; i < nelt; i++)
9938 unsigned int required = location + i;
9939 if (d->one_vector_p)
9941 /* We'll pass the same vector in twice, so allow indices to wrap. */
9942 required &= (nelt - 1);
9944 if (d->perm[i] != required)
9945 return false;
9948 switch (d->vmode)
9950 case V16QImode: gen = gen_aarch64_extv16qi; break;
9951 case V8QImode: gen = gen_aarch64_extv8qi; break;
9952 case V4HImode: gen = gen_aarch64_extv4hi; break;
9953 case V8HImode: gen = gen_aarch64_extv8hi; break;
9954 case V2SImode: gen = gen_aarch64_extv2si; break;
9955 case V4SImode: gen = gen_aarch64_extv4si; break;
9956 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9957 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9958 case V2DImode: gen = gen_aarch64_extv2di; break;
9959 case V2DFmode: gen = gen_aarch64_extv2df; break;
9960 default:
9961 return false;
9964 /* Success! */
9965 if (d->testing_p)
9966 return true;
9968 /* The case where (location == 0) is a no-op for both big- and little-endian,
9969 and is removed by the mid-end at optimization levels -O1 and higher. */
9971 if (BYTES_BIG_ENDIAN && (location != 0))
9973 /* After setup, we want the high elements of the first vector (stored
9974 at the LSB end of the register), and the low elements of the second
9975 vector (stored at the MSB end of the register). So swap. */
9976 std::swap (d->op0, d->op1);
9977 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9978 location = nelt - location;
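      /* E.g. for V4SI with location == 1 on big-endian, the operands are
         swapped and location becomes nelt - 1 == 3.  */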
9981 offset = GEN_INT (location);
9982 emit_insn (gen (d->target, d->op0, d->op1, offset));
9983 return true;
9986 /* Recognize patterns for the REV insns. */
9988 static bool
9989 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9991 unsigned int i, j, diff, nelt = d->nelt;
9992 rtx (*gen) (rtx, rtx);
9994 if (!d->one_vector_p)
9995 return false;
9997 diff = d->perm[0];
9998 switch (diff)
10000 case 7:
10001 switch (d->vmode)
10003 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10004 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10005 default:
10006 return false;
10008 break;
10009 case 3:
10010 switch (d->vmode)
10012 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10013 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10014 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10015 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10016 default:
10017 return false;
10019 break;
10020 case 1:
10021 switch (d->vmode)
10023 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10024 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10025 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10026 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10027 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10028 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10029 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10030 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10031 default:
10032 return false;
10034 break;
10035 default:
10036 return false;
10039 for (i = 0; i < nelt ; i += diff + 1)
10040 for (j = 0; j <= diff; j += 1)
10042 /* This is guaranteed to be true as the value of diff
10043 is 7, 3 or 1 and we should have enough elements in the
10044 queue to generate this. Getting a vector mask with a
10045 value of diff other than these values implies that
10046 something is wrong by the time we get here. */
10047 gcc_assert (i + j < nelt);
10048 if (d->perm[i + j] != i + diff - j)
10049 return false;
10052 /* Success! */
10053 if (d->testing_p)
10054 return true;
10056 emit_insn (gen (d->target, d->op0));
10057 return true;
10060 static bool
10061 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10063 rtx (*gen) (rtx, rtx, rtx);
10064 rtx out = d->target;
10065 rtx in0;
10066 machine_mode vmode = d->vmode;
10067 unsigned int i, elt, nelt = d->nelt;
10068 rtx lane;
10070 elt = d->perm[0];
10071 for (i = 1; i < nelt; i++)
10073 if (elt != d->perm[i])
10074 return false;
10077 /* The generic preparation in aarch64_expand_vec_perm_const_1
10078 swaps the operand order and the permute indices if it finds
10079 d->perm[0] to be in the second operand. Thus, we can always
10080 use d->op0 and need not do any extra arithmetic to get the
10081 correct lane number. */
10082 in0 = d->op0;
10083 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10085 switch (vmode)
10087 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10088 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10089 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10090 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10091 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10092 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10093 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10094 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10095 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10096 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10097 default:
10098 return false;
10101 emit_insn (gen (out, in0, lane));
10102 return true;
10105 static bool
10106 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10108 rtx rperm[MAX_VECT_LEN], sel;
10109 machine_mode vmode = d->vmode;
10110 unsigned int i, nelt = d->nelt;
10112 if (d->testing_p)
10113 return true;
10115 /* Generic code will try constant permutation twice: once with the
10116 original mode and again with the elements lowered to QImode.
10117 So wait and don't do the selector expansion ourselves. */
10118 if (vmode != V8QImode && vmode != V16QImode)
10119 return false;
10121 for (i = 0; i < nelt; ++i)
10123 int nunits = GET_MODE_NUNITS (vmode);
10125 /* If big-endian and two vectors we end up with a weird mixed-endian
10126 mode on NEON. Reverse the index within each word but not the word
10127 itself. */
10128 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10129 : d->perm[i]);
10131 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10132 sel = force_reg (vmode, sel);
10134 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10135 return true;
10138 static bool
10139 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10141 /* The pattern matching functions above are written to look for a small
10142 number to begin the sequence (0, 1, N/2). If we begin with an index
10143 from the second operand, we can swap the operands. */
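   /* For example, the V4SI selector { 4, 5, 6, 7 } uses only the second
      operand; after the XOR below it becomes { 0, 1, 2, 3 } applied to
      the swapped operands.  */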
10144 if (d->perm[0] >= d->nelt)
10146 unsigned i, nelt = d->nelt;
10148 gcc_assert (nelt == (nelt & -nelt));
10149 for (i = 0; i < nelt; ++i)
10150 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10152 std::swap (d->op0, d->op1);
10155 if (TARGET_SIMD)
10157 if (aarch64_evpc_rev (d))
10158 return true;
10159 else if (aarch64_evpc_ext (d))
10160 return true;
10161 else if (aarch64_evpc_dup (d))
10162 return true;
10163 else if (aarch64_evpc_zip (d))
10164 return true;
10165 else if (aarch64_evpc_uzp (d))
10166 return true;
10167 else if (aarch64_evpc_trn (d))
10168 return true;
10169 return aarch64_evpc_tbl (d);
10171 return false;
10174 /* Expand a vec_perm_const pattern. */
10176 bool
10177 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10179 struct expand_vec_perm_d d;
10180 int i, nelt, which;
10182 d.target = target;
10183 d.op0 = op0;
10184 d.op1 = op1;
10186 d.vmode = GET_MODE (target);
10187 gcc_assert (VECTOR_MODE_P (d.vmode));
10188 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10189 d.testing_p = false;
10191 for (i = which = 0; i < nelt; ++i)
10193 rtx e = XVECEXP (sel, 0, i);
10194 int ei = INTVAL (e) & (2 * nelt - 1);
10195 which |= (ei < nelt ? 1 : 2);
10196 d.perm[i] = ei;
10199 switch (which)
10201 default:
10202 gcc_unreachable ();
10204 case 3:
10205 d.one_vector_p = false;
10206 if (!rtx_equal_p (op0, op1))
10207 break;
10209 /* The elements of PERM do not suggest that only the first operand
10210 is used, but both operands are identical. Allow easier matching
10211 of the permutation by folding the permutation into the single
10212 input vector. */
10213 /* Fall Through. */
10214 case 2:
10215 for (i = 0; i < nelt; ++i)
10216 d.perm[i] &= nelt - 1;
10217 d.op0 = op1;
10218 d.one_vector_p = true;
10219 break;
10221 case 1:
10222 d.op1 = op0;
10223 d.one_vector_p = true;
10224 break;
10227 return aarch64_expand_vec_perm_const_1 (&d);
10230 static bool
10231 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10232 const unsigned char *sel)
10234 struct expand_vec_perm_d d;
10235 unsigned int i, nelt, which;
10236 bool ret;
10238 d.vmode = vmode;
10239 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10240 d.testing_p = true;
10241 memcpy (d.perm, sel, nelt);
10243 /* Calculate whether all elements are in one vector. */
10244 for (i = which = 0; i < nelt; ++i)
10246 unsigned char e = d.perm[i];
10247 gcc_assert (e < 2 * nelt);
10248 which |= (e < nelt ? 1 : 2);
10251 /* If all elements are from the second vector, reindex as if from the
10252 first vector. */
10253 if (which == 2)
10254 for (i = 0; i < nelt; ++i)
10255 d.perm[i] -= nelt;
10257 /* Check whether the mask can be applied to a single vector. */
10258 d.one_vector_p = (which != 3);
10260 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10261 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10262 if (!d.one_vector_p)
10263 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10265 start_sequence ();
10266 ret = aarch64_expand_vec_perm_const_1 (&d);
10267 end_sequence ();
10269 return ret;
10273 aarch64_reverse_mask (enum machine_mode mode)
10275 /* We have to reverse each vector because we don't have
10276 a permuted load that can reverse-load according to ABI rules. */
10277 rtx mask;
10278 rtvec v = rtvec_alloc (16);
10279 int i, j;
10280 int nunits = GET_MODE_NUNITS (mode);
10281 int usize = GET_MODE_UNIT_SIZE (mode);
10283 gcc_assert (BYTES_BIG_ENDIAN);
10284 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10286 for (i = 0; i < nunits; i++)
10287 for (j = 0; j < usize; j++)
10288 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10289 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10290 return force_reg (V16QImode, mask);
10293 /* Implement MODES_TIEABLE_P. */
10295 bool
10296 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10298 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10299 return true;
10301 /* We specifically want to allow elements of "structure" modes to
10302 be tieable to the structure. This more general condition allows
10303 other rarer situations too. */
10304 if (TARGET_SIMD
10305 && aarch64_vector_mode_p (mode1)
10306 && aarch64_vector_mode_p (mode2))
10307 return true;
10309 return false;
10312 /* Return a new RTX holding the result of moving POINTER forward by
10313 AMOUNT bytes. */
10315 static rtx
10316 aarch64_move_pointer (rtx pointer, int amount)
10318 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10320 return adjust_automodify_address (pointer, GET_MODE (pointer),
10321 next, amount);
10324 /* Return a new RTX holding the result of moving POINTER forward by the
10325 size of the mode it points to. */
10327 static rtx
10328 aarch64_progress_pointer (rtx pointer)
10330 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10332 return aarch64_move_pointer (pointer, amount);
10335 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10336 MODE bytes. */
10338 static void
10339 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10340 machine_mode mode)
10342 rtx reg = gen_reg_rtx (mode);
10344 /* "Cast" the pointers to the correct mode. */
10345 *src = adjust_address (*src, mode, 0);
10346 *dst = adjust_address (*dst, mode, 0);
10347 /* Emit the memcpy. */
10348 emit_move_insn (reg, *src);
10349 emit_move_insn (*dst, reg);
10350 /* Move the pointers forward. */
10351 *src = aarch64_progress_pointer (*src);
10352 *dst = aarch64_progress_pointer (*dst);
10355 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10356 we succeed, otherwise return false. */
10358 bool
10359 aarch64_expand_movmem (rtx *operands)
10361 unsigned int n;
10362 rtx dst = operands[0];
10363 rtx src = operands[1];
10364 rtx base;
10365 bool speed_p = !optimize_function_for_size_p (cfun);
10367 /* When optimizing for size, give a better estimate of the length of a
10368 memcpy call, but use the default otherwise. */
10369 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10371 /* We can't do anything smart if the amount to copy is not constant. */
10372 if (!CONST_INT_P (operands[2]))
10373 return false;
10375 n = UINTVAL (operands[2]);
10377 /* Try to keep the number of instructions low. For cases below 16 bytes we
10378 need to make at most two moves. For cases above 16 bytes it will be one
10379 move for each 16 byte chunk, then at most two additional moves. */
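   /* For example, a 35-byte copy is costed as 35 / 16 + 2 == 4 moves:
      two TImode chunks plus at most two further moves for the 3-byte
      tail.  */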
10380 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10381 return false;
10383 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10384 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10386 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10387 src = adjust_automodify_address (src, VOIDmode, base, 0);
10389 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10390 1-byte chunk. */
10391 if (n < 4)
10393 if (n >= 2)
10395 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10396 n -= 2;
10399 if (n == 1)
10400 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10402 return true;
10405 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10406 4-byte chunk, partially overlapping with the previously copied chunk. */
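   /* E.g. for n == 7 this copies bytes 0-3 and then bytes 3-6; byte 3 is
      written twice, but only two SImode moves are needed.  */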
10407 if (n < 8)
10409 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10410 n -= 4;
10411 if (n > 0)
10413 int move = n - 4;
10415 src = aarch64_move_pointer (src, move);
10416 dst = aarch64_move_pointer (dst, move);
10417 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10419 return true;
10422 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10423 them, then (if applicable) an 8-byte chunk. */
10424 while (n >= 8)
10426 if (n / 16)
10428 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10429 n -= 16;
10431 else
10433 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10434 n -= 8;
10438 /* Finish the final bytes of the copy. We can always do this in one
10439 instruction. We either copy the exact amount we need, or partially
10440 overlap with the previous chunk we copied and copy 8 bytes. */
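   /* E.g. with 5 bytes remaining we back the pointers up by 3 and emit a
      single DImode move that overlaps the tail of the previous chunk.  */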
10441 if (n == 0)
10442 return true;
10443 else if (n == 1)
10444 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10445 else if (n == 2)
10446 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10447 else if (n == 4)
10448 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10449 else
10451 if (n == 3)
10453 src = aarch64_move_pointer (src, -1);
10454 dst = aarch64_move_pointer (dst, -1);
10455 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10457 else
10459 int move = n - 8;
10461 src = aarch64_move_pointer (src, move);
10462 dst = aarch64_move_pointer (dst, move);
10463 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10467 return true;
10470 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10472 static unsigned HOST_WIDE_INT
10473 aarch64_asan_shadow_offset (void)
10475 return (HOST_WIDE_INT_1 << 36);
10478 static bool
10479 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10480 unsigned int align,
10481 enum by_pieces_operation op,
10482 bool speed_p)
10484 /* STORE_BY_PIECES can be used when copying a constant string, but
10485 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10486 For now we always fail this and let the move_by_pieces code copy
10487 the string from read-only memory. */
10488 if (op == STORE_BY_PIECES)
10489 return false;
10491 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10494 static enum machine_mode
10495 aarch64_code_to_ccmode (enum rtx_code code)
10497 switch (code)
10499 case NE:
10500 return CC_DNEmode;
10502 case EQ:
10503 return CC_DEQmode;
10505 case LE:
10506 return CC_DLEmode;
10508 case LT:
10509 return CC_DLTmode;
10511 case GE:
10512 return CC_DGEmode;
10514 case GT:
10515 return CC_DGTmode;
10517 case LEU:
10518 return CC_DLEUmode;
10520 case LTU:
10521 return CC_DLTUmode;
10523 case GEU:
10524 return CC_DGEUmode;
10526 case GTU:
10527 return CC_DGTUmode;
10529 default:
10530 return CCmode;
10534 static rtx
10535 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10536 int code, tree treeop0, tree treeop1)
10538 enum machine_mode op_mode, cmp_mode, cc_mode;
10539 rtx op0, op1, cmp, target;
10540 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10541 enum insn_code icode;
10542 struct expand_operand ops[4];
10544 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10545 if (cc_mode == CCmode)
10546 return NULL_RTX;
10548 start_sequence ();
10549 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10551 op_mode = GET_MODE (op0);
10552 if (op_mode == VOIDmode)
10553 op_mode = GET_MODE (op1);
10555 switch (op_mode)
10557 case QImode:
10558 case HImode:
10559 case SImode:
10560 cmp_mode = SImode;
10561 icode = CODE_FOR_cmpsi;
10562 break;
10564 case DImode:
10565 cmp_mode = DImode;
10566 icode = CODE_FOR_cmpdi;
10567 break;
10569 default:
10570 end_sequence ();
10571 return NULL_RTX;
10574 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10575 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10576 if (!op0 || !op1)
10578 end_sequence ();
10579 return NULL_RTX;
10581 *prep_seq = get_insns ();
10582 end_sequence ();
10584 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10585 target = gen_rtx_REG (CCmode, CC_REGNUM);
10587 create_output_operand (&ops[0], target, CCmode);
10588 create_fixed_operand (&ops[1], cmp);
10589 create_fixed_operand (&ops[2], op0);
10590 create_fixed_operand (&ops[3], op1);
10592 start_sequence ();
10593 if (!maybe_expand_insn (icode, 4, ops))
10595 end_sequence ();
10596 return NULL_RTX;
10598 *gen_seq = get_insns ();
10599 end_sequence ();
10601 return gen_rtx_REG (cc_mode, CC_REGNUM);
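/* Sketch of the expected result, based on the code above: for an integer
   comparison such as "a == b", *PREP_SEQ receives the insns that prepare
   the two operands, *GEN_SEQ receives the compare that writes the CC
   register, and the returned rtx is CC_REGNUM in CC_DEQmode, ready to be
   consumed by aarch64_gen_ccmp_next below.  */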
10604 static rtx
10605 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10606 tree treeop0, tree treeop1, int bit_code)
10608 rtx op0, op1, cmp0, cmp1, target;
10609 enum machine_mode op_mode, cmp_mode, cc_mode;
10610 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10611 enum insn_code icode = CODE_FOR_ccmp_andsi;
10612 struct expand_operand ops[6];
10614 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10615 if (cc_mode == CCmode)
10616 return NULL_RTX;
10618 push_to_sequence ((rtx_insn*) *prep_seq);
10619 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10621 op_mode = GET_MODE (op0);
10622 if (op_mode == VOIDmode)
10623 op_mode = GET_MODE (op1);
10625 switch (op_mode)
10627 case QImode:
10628 case HImode:
10629 case SImode:
10630 cmp_mode = SImode;
10631 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10632 : CODE_FOR_ccmp_iorsi;
10633 break;
10635 case DImode:
10636 cmp_mode = DImode;
10637 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10638 : CODE_FOR_ccmp_iordi;
10639 break;
10641 default:
10642 end_sequence ();
10643 return NULL_RTX;
10646 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10647 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10648 if (!op0 || !op1)
10650 end_sequence ();
10651 return NULL_RTX;
10653 *prep_seq = get_insns ();
10654 end_sequence ();
10656 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10657 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10658 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10660 create_fixed_operand (&ops[0], prev);
10661 create_fixed_operand (&ops[1], target);
10662 create_fixed_operand (&ops[2], op0);
10663 create_fixed_operand (&ops[3], op1);
10664 create_fixed_operand (&ops[4], cmp0);
10665 create_fixed_operand (&ops[5], cmp1);
10667 push_to_sequence ((rtx_insn*) *gen_seq);
10668 if (!maybe_expand_insn (icode, 6, ops))
10670 end_sequence ();
10671 return NULL_RTX;
10674 *gen_seq = get_insns ();
10675 end_sequence ();
10677 return target;
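/* Roughly, for the second half of a condition such as "a == 0 && b == 4"
   this emits a conditional compare, so the final assembly is expected to
   look something like:
     cmp  w0, #0
     ccmp w1, #4, #0, eq
   with the combined result left in the CC register (register numbers are
   illustrative only).  */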
10680 #undef TARGET_GEN_CCMP_FIRST
10681 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10683 #undef TARGET_GEN_CCMP_NEXT
10684 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
10686 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
10687 instruction fusion of some sort. */
10689 static bool
10690 aarch64_macro_fusion_p (void)
10692 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10696 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10697 should be kept together during scheduling. */
10699 static bool
10700 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10702 rtx set_dest;
10703 rtx prev_set = single_set (prev);
10704 rtx curr_set = single_set (curr);
10705 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10706 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10708 if (!aarch64_macro_fusion_p ())
10709 return false;
10711 if (simple_sets_p
10712 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10714 /* We are trying to match:
10715 prev (mov) == (set (reg r0) (const_int imm16))
10716 curr (movk) == (set (zero_extract (reg r0)
10717 (const_int 16)
10718 (const_int 16))
10719 (const_int imm16_1)) */
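/* In assembly terms this corresponds to a pair along the lines of:
     mov  w0, #0x1234
     movk w0, #0x5678, lsl 16
   (illustrative values).  */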
10721 set_dest = SET_DEST (curr_set);
10723 if (GET_CODE (set_dest) == ZERO_EXTRACT
10724 && CONST_INT_P (SET_SRC (curr_set))
10725 && CONST_INT_P (SET_SRC (prev_set))
10726 && CONST_INT_P (XEXP (set_dest, 2))
10727 && INTVAL (XEXP (set_dest, 2)) == 16
10728 && REG_P (XEXP (set_dest, 0))
10729 && REG_P (SET_DEST (prev_set))
10730 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10732 return true;
10736 if (simple_sets_p
10737 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10740 /* We're trying to match:
10741 prev (adrp) == (set (reg r1)
10742 (high (symbol_ref ("SYM"))))
10743 curr (add) == (set (reg r0)
10744 (lo_sum (reg r1)
10745 (symbol_ref ("SYM"))))
10746 Note that r0 need not necessarily be the same as r1, especially
10747 during pre-regalloc scheduling. */
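/* In assembly terms, an illustrative pair would be:
     adrp x1, SYM
     add  x0, x1, :lo12:SYM  */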
10749 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10750 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10752 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10753 && REG_P (XEXP (SET_SRC (curr_set), 0))
10754 && REGNO (XEXP (SET_SRC (curr_set), 0))
10755 == REGNO (SET_DEST (prev_set))
10756 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10757 XEXP (SET_SRC (curr_set), 1)))
10758 return true;
10762 if (simple_sets_p
10763 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10766 /* We're trying to match:
10767 prev (movk) == (set (zero_extract (reg r0)
10768 (const_int 16)
10769 (const_int 32))
10770 (const_int imm16_1))
10771 curr (movk) == (set (zero_extract (reg r0)
10772 (const_int 16)
10773 (const_int 48))
10774 (const_int imm16_2)) */
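/* In assembly terms, an illustrative pair would be:
     movk x0, #0x1234, lsl 32
     movk x0, #0x5678, lsl 48  */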
10776 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10777 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10778 && REG_P (XEXP (SET_DEST (prev_set), 0))
10779 && REG_P (XEXP (SET_DEST (curr_set), 0))
10780 && REGNO (XEXP (SET_DEST (prev_set), 0))
10781 == REGNO (XEXP (SET_DEST (curr_set), 0))
10782 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10783 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10784 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10785 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10786 && CONST_INT_P (SET_SRC (prev_set))
10787 && CONST_INT_P (SET_SRC (curr_set)))
10788 return true;
10791 if (simple_sets_p
10792 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10794 /* We're trying to match:
10795 prev (adrp) == (set (reg r0)
10796 (high (symbol_ref ("SYM"))))
10797 curr (ldr) == (set (reg r1)
10798 (mem (lo_sum (reg r0)
10799 (symbol_ref ("SYM")))))
10801 curr (ldr) == (set (reg r1)
10802 (zero_extend (mem
10803 (lo_sum (reg r0)
10804 (symbol_ref ("SYM")))))) */
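/* In assembly terms, an illustrative pair would be:
     adrp x0, SYM
     ldr  w1, [x0, #:lo12:SYM]  */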
10805 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10806 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10808 rtx curr_src = SET_SRC (curr_set);
10810 if (GET_CODE (curr_src) == ZERO_EXTEND)
10811 curr_src = XEXP (curr_src, 0);
10813 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10814 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10815 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10816 == REGNO (SET_DEST (prev_set))
10817 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10818 XEXP (SET_SRC (prev_set), 0)))
10819 return true;
10823 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10824 && any_condjump_p (curr))
10826 enum attr_type prev_type = get_attr_type (prev);
10828 /* FIXME: this misses some cases that are considered simple arithmetic
10829 instructions for ThunderX. Simple shifts are missed here. */
10830 if (prev_type == TYPE_ALUS_SREG
10831 || prev_type == TYPE_ALUS_IMM
10832 || prev_type == TYPE_LOGICS_REG
10833 || prev_type == TYPE_LOGICS_IMM)
10834 return true;
10837 return false;
10840 /* If MEM is in the form of [base+offset], extract the two parts
10841 of the address and set BASE and OFFSET, otherwise return false
10842 after clearing BASE and OFFSET. */
10844 bool
10845 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10847 rtx addr;
10849 gcc_assert (MEM_P (mem));
10851 addr = XEXP (mem, 0);
10853 if (REG_P (addr))
10855 *base = addr;
10856 *offset = const0_rtx;
10857 return true;
10860 if (GET_CODE (addr) == PLUS
10861 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10863 *base = XEXP (addr, 0);
10864 *offset = XEXP (addr, 1);
10865 return true;
10868 *base = NULL_RTX;
10869 *offset = NULL_RTX;
10871 return false;
10874 /* Types for scheduling fusion. */
10875 enum sched_fusion_type
10877 SCHED_FUSION_NONE = 0,
10878 SCHED_FUSION_LD_SIGN_EXTEND,
10879 SCHED_FUSION_LD_ZERO_EXTEND,
10880 SCHED_FUSION_LD,
10881 SCHED_FUSION_ST,
10882 SCHED_FUSION_NUM
10885 /* If INSN is a load or store with an address in the form of [base+offset],
10886 extract the two parts and set BASE and OFFSET. Return the scheduling
10887 fusion type of this INSN. */
10889 static enum sched_fusion_type
10890 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10892 rtx x, dest, src;
10893 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10895 gcc_assert (INSN_P (insn));
10896 x = PATTERN (insn);
10897 if (GET_CODE (x) != SET)
10898 return SCHED_FUSION_NONE;
10900 src = SET_SRC (x);
10901 dest = SET_DEST (x);
10903 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10904 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10905 return SCHED_FUSION_NONE;
10907 if (GET_CODE (src) == SIGN_EXTEND)
10909 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10910 src = XEXP (src, 0);
10911 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10912 return SCHED_FUSION_NONE;
10914 else if (GET_CODE (src) == ZERO_EXTEND)
10916 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10917 src = XEXP (src, 0);
10918 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10919 return SCHED_FUSION_NONE;
10922 if (GET_CODE (src) == MEM && REG_P (dest))
10923 extract_base_offset_in_addr (src, base, offset);
10924 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10926 fusion = SCHED_FUSION_ST;
10927 extract_base_offset_in_addr (dest, base, offset);
10929 else
10930 return SCHED_FUSION_NONE;
10932 if (*base == NULL_RTX || *offset == NULL_RTX)
10933 fusion = SCHED_FUSION_NONE;
10935 return fusion;
10938 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10940 Currently we only support fusing ldr and str instructions, so FUSION_PRI
10941 and PRI are only calculated for these instructions. For other instructions,
10942 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
10943 other instruction types can be added by returning different priorities.
10945 It's important that irrelevant instructions get the largest FUSION_PRI. */
10947 static void
10948 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10949 int *fusion_pri, int *pri)
10951 int tmp, off_val;
10952 rtx base, offset;
10953 enum sched_fusion_type fusion;
10955 gcc_assert (INSN_P (insn));
10957 tmp = max_pri - 1;
10958 fusion = fusion_load_store (insn, &base, &offset);
10959 if (fusion == SCHED_FUSION_NONE)
10961 *pri = tmp;
10962 *fusion_pri = tmp;
10963 return;
10966 /* Set FUSION_PRI according to fusion type and base register. */
10967 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
10969 /* Calculate PRI. */
10970 tmp /= 2;
10972 /* INSN with smaller offset goes first. */
10973 off_val = (int)(INTVAL (offset));
10974 if (off_val >= 0)
10975 tmp -= (off_val & 0xfffff);
10976 else
10977 tmp += ((- off_val) & 0xfffff);
10979 *pri = tmp;
10980 return;
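/* Worked example of the intent, assuming two SImode loads from [x1, 4] and
   [x1, 8]: both get the same FUSION_PRI (same fusion type and base
   register), while the load with the smaller offset gets the larger PRI and
   is therefore preferred first, keeping candidate ldp pairs adjacent.  */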
10983 /* Given OPERANDS of consecutive load/store, check if we can merge
10984 them into ldp/stp. LOAD is true if they are load instructions.
10985 MODE is the mode of memory operands. */
10987 bool
10988 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10989 enum machine_mode mode)
10991 HOST_WIDE_INT offval_1, offval_2, msize;
10992 enum reg_class rclass_1, rclass_2;
10993 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10995 if (load)
10997 mem_1 = operands[1];
10998 mem_2 = operands[3];
10999 reg_1 = operands[0];
11000 reg_2 = operands[2];
11001 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11002 if (REGNO (reg_1) == REGNO (reg_2))
11003 return false;
11005 else
11007 mem_1 = operands[0];
11008 mem_2 = operands[2];
11009 reg_1 = operands[1];
11010 reg_2 = operands[3];
11013 /* The mems cannot be volatile. */
11014 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11015 return false;
11017 /* Check if the addresses are in the form of [base+offset]. */
11018 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11019 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11020 return false;
11021 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11022 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11023 return false;
11025 /* Check if the bases are the same. */
11026 if (!rtx_equal_p (base_1, base_2))
11027 return false;
11029 offval_1 = INTVAL (offset_1);
11030 offval_2 = INTVAL (offset_2);
11031 msize = GET_MODE_SIZE (mode);
11032 /* Check if the offsets are consecutive. */
11033 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11034 return false;
11036 /* Check if the addresses are clobbered by load. */
11037 if (load)
11039 if (reg_mentioned_p (reg_1, mem_1))
11040 return false;
11042 /* In increasing order, the last load can clobber the address. */
11043 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11044 return false;
11047 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11048 rclass_1 = FP_REGS;
11049 else
11050 rclass_1 = GENERAL_REGS;
11052 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11053 rclass_2 = FP_REGS;
11054 else
11055 rclass_2 = GENERAL_REGS;
11057 /* Check if the registers are of the same class. */
11058 if (rclass_1 != rclass_2)
11059 return false;
11061 return true;
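/* For instance, the checks above are presumably what allows
     ldr w0, [x2, 4]
     ldr w1, [x2, 8]
   to be merged into "ldp w0, w1, [x2, 4]" (illustrative operands).  */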
11064 /* Given OPERANDS of consecutive load/store, check if we can merge
11065 them into ldp/stp by adjusting the offset. LOAD is true if they
11066 are load instructions. MODE is the mode of memory operands.
11068 Given the following consecutive stores:
11070 str w1, [xb, 0x100]
11071 str w1, [xb, 0x104]
11072 str w1, [xb, 0x108]
11073 str w1, [xb, 0x10c]
11075 Though the offsets are out of the range supported by stp, we can
11076 still pair them after adjusting the offset, like:
11078 add scratch, xb, 0x100
11079 stp w1, w1, [scratch]
11080 stp w1, w1, [scratch, 0x8]
11082 The peephole patterns detecting this opportunity should guarantee
11083 the scratch register is available. */
11085 bool
11086 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11087 enum machine_mode mode)
11089 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11090 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11091 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11092 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11094 if (load)
11096 reg_1 = operands[0];
11097 mem_1 = operands[1];
11098 reg_2 = operands[2];
11099 mem_2 = operands[3];
11100 reg_3 = operands[4];
11101 mem_3 = operands[5];
11102 reg_4 = operands[6];
11103 mem_4 = operands[7];
11104 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11105 && REG_P (reg_3) && REG_P (reg_4));
11106 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11107 return false;
11109 else
11111 mem_1 = operands[0];
11112 reg_1 = operands[1];
11113 mem_2 = operands[2];
11114 reg_2 = operands[3];
11115 mem_3 = operands[4];
11116 reg_3 = operands[5];
11117 mem_4 = operands[6];
11118 reg_4 = operands[7];
11120 /* Skip if the memory operand is by itself valid for ldp/stp. */
11121 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11122 return false;
11124 /* The mems cannot be volatile. */
11125 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11126 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11127 return false;
11129 /* Check if the addresses are in the form of [base+offset]. */
11130 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11131 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11132 return false;
11133 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11134 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11135 return false;
11136 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11137 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11138 return false;
11139 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11140 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11141 return false;
11143 /* Check if the bases are the same. */
11144 if (!rtx_equal_p (base_1, base_2)
11145 || !rtx_equal_p (base_2, base_3)
11146 || !rtx_equal_p (base_3, base_4))
11147 return false;
11149 offval_1 = INTVAL (offset_1);
11150 offval_2 = INTVAL (offset_2);
11151 offval_3 = INTVAL (offset_3);
11152 offval_4 = INTVAL (offset_4);
11153 msize = GET_MODE_SIZE (mode);
11154 /* Check if the offsets are consecutive. */
11155 if ((offval_1 != (offval_2 + msize)
11156 || offval_1 != (offval_3 + msize * 2)
11157 || offval_1 != (offval_4 + msize * 3))
11158 && (offval_4 != (offval_3 + msize)
11159 || offval_4 != (offval_2 + msize * 2)
11160 || offval_4 != (offval_1 + msize * 3)))
11161 return false;
11163 /* Check if the addresses are clobbered by load. */
11164 if (load)
11166 if (reg_mentioned_p (reg_1, mem_1)
11167 || reg_mentioned_p (reg_2, mem_2)
11168 || reg_mentioned_p (reg_3, mem_3))
11169 return false;
11171 /* In increasing order, the last load can clobber the address. */
11172 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11173 return false;
11176 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11177 rclass_1 = FP_REGS;
11178 else
11179 rclass_1 = GENERAL_REGS;
11181 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11182 rclass_2 = FP_REGS;
11183 else
11184 rclass_2 = GENERAL_REGS;
11186 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11187 rclass_3 = FP_REGS;
11188 else
11189 rclass_3 = GENERAL_REGS;
11191 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11192 rclass_4 = FP_REGS;
11193 else
11194 rclass_4 = GENERAL_REGS;
11196 /* Check if the registers are of the same class. */
11197 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11198 return false;
11200 return true;
11203 /* Given OPERANDS of consecutive load/store, this function pairs them
11204 into ldp/stp after adjusting the offset. It depends on the fact
11205 that addresses of load/store instructions are in increasing order.
11206 MODE is the mode of memory operands. CODE is the rtl operator
11207 which should be applied to all memory operands; it is SIGN_EXTEND,
11208 ZERO_EXTEND or UNKNOWN. */
11210 bool
11211 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11212 enum machine_mode mode, RTX_CODE code)
11214 rtx base, offset, t1, t2;
11215 rtx mem_1, mem_2, mem_3, mem_4;
11216 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11218 if (load)
11220 mem_1 = operands[1];
11221 mem_2 = operands[3];
11222 mem_3 = operands[5];
11223 mem_4 = operands[7];
11225 else
11227 mem_1 = operands[0];
11228 mem_2 = operands[2];
11229 mem_3 = operands[4];
11230 mem_4 = operands[6];
11231 gcc_assert (code == UNKNOWN);
11234 extract_base_offset_in_addr (mem_1, &base, &offset);
11235 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11237 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
11238 msize = GET_MODE_SIZE (mode);
11239 stp_off_limit = msize * 0x40;
11240 off_val = INTVAL (offset);
11241 abs_off = (off_val < 0) ? -off_val : off_val;
11242 new_off = abs_off % stp_off_limit;
11243 adj_off = abs_off - new_off;
11245 /* Further adjust to make sure all offsets are OK. */
11246 if ((new_off + msize * 2) >= stp_off_limit)
11248 adj_off += stp_off_limit;
11249 new_off -= stp_off_limit;
11252 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11253 if (adj_off >= 0x1000)
11254 return false;
11256 if (off_val < 0)
11258 adj_off = -adj_off;
11259 new_off = -new_off;
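/* Worked example matching the comment before
   aarch64_operands_adjust_ok_for_ldpstp: for SImode, msize == 4, so
   stp_off_limit == 0x100; an original offset of 0x100 gives adj_off == 0x100
   and new_off == 0, i.e. one "add scratch, base, 0x100" followed by stp
   accesses at [scratch] and [scratch, 8].  */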
11262 /* Create new memory references. */
11263 mem_1 = change_address (mem_1, VOIDmode,
11264 plus_constant (DImode, operands[8], new_off));
11266 /* Check if the adjusted address is OK for ldp/stp. */
11267 if (!aarch64_mem_pair_operand (mem_1, mode))
11268 return false;
11270 msize = GET_MODE_SIZE (mode);
11271 mem_2 = change_address (mem_2, VOIDmode,
11272 plus_constant (DImode,
11273 operands[8],
11274 new_off + msize));
11275 mem_3 = change_address (mem_3, VOIDmode,
11276 plus_constant (DImode,
11277 operands[8],
11278 new_off + msize * 2));
11279 mem_4 = change_address (mem_4, VOIDmode,
11280 plus_constant (DImode,
11281 operands[8],
11282 new_off + msize * 3));
11284 if (code == ZERO_EXTEND)
11286 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11287 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11288 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11289 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11291 else if (code == SIGN_EXTEND)
11293 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11294 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11295 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11296 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11299 if (load)
11301 operands[1] = mem_1;
11302 operands[3] = mem_2;
11303 operands[5] = mem_3;
11304 operands[7] = mem_4;
11306 else
11308 operands[0] = mem_1;
11309 operands[2] = mem_2;
11310 operands[4] = mem_3;
11311 operands[6] = mem_4;
11314 /* Emit adjusting instruction. */
11315 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11316 plus_constant (DImode, base, adj_off)));
11317 /* Emit ldp/stp instructions. */
11318 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11319 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11320 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11321 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11322 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11323 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11324 return true;
11327 #undef TARGET_ADDRESS_COST
11328 #define TARGET_ADDRESS_COST aarch64_address_cost
11330 /* This hook determines whether unnamed bitfields affect the alignment
11331 of the containing structure. The hook returns true if the structure
11332 should inherit the alignment requirements of an unnamed bitfield's
11333 type. */
11334 #undef TARGET_ALIGN_ANON_BITFIELD
11335 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11337 #undef TARGET_ASM_ALIGNED_DI_OP
11338 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11340 #undef TARGET_ASM_ALIGNED_HI_OP
11341 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11343 #undef TARGET_ASM_ALIGNED_SI_OP
11344 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11346 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11347 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11348 hook_bool_const_tree_hwi_hwi_const_tree_true
11350 #undef TARGET_ASM_FILE_START
11351 #define TARGET_ASM_FILE_START aarch64_start_file
11353 #undef TARGET_ASM_OUTPUT_MI_THUNK
11354 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11356 #undef TARGET_ASM_SELECT_RTX_SECTION
11357 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11359 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11360 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11362 #undef TARGET_BUILD_BUILTIN_VA_LIST
11363 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11365 #undef TARGET_CALLEE_COPIES
11366 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11368 #undef TARGET_CAN_ELIMINATE
11369 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11371 #undef TARGET_CANNOT_FORCE_CONST_MEM
11372 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11374 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11375 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11377 /* Only the least significant bit is used for initialization guard
11378 variables. */
11379 #undef TARGET_CXX_GUARD_MASK_BIT
11380 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11382 #undef TARGET_C_MODE_FOR_SUFFIX
11383 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11385 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11386 #undef TARGET_DEFAULT_TARGET_FLAGS
11387 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11388 #endif
11390 #undef TARGET_CLASS_MAX_NREGS
11391 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11393 #undef TARGET_BUILTIN_DECL
11394 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11396 #undef TARGET_EXPAND_BUILTIN
11397 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11399 #undef TARGET_EXPAND_BUILTIN_VA_START
11400 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11402 #undef TARGET_FOLD_BUILTIN
11403 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11405 #undef TARGET_FUNCTION_ARG
11406 #define TARGET_FUNCTION_ARG aarch64_function_arg
11408 #undef TARGET_FUNCTION_ARG_ADVANCE
11409 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11411 #undef TARGET_FUNCTION_ARG_BOUNDARY
11412 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11414 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11415 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11417 #undef TARGET_FUNCTION_VALUE
11418 #define TARGET_FUNCTION_VALUE aarch64_function_value
11420 #undef TARGET_FUNCTION_VALUE_REGNO_P
11421 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11423 #undef TARGET_FRAME_POINTER_REQUIRED
11424 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11426 #undef TARGET_GIMPLE_FOLD_BUILTIN
11427 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11429 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11430 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11432 #undef TARGET_INIT_BUILTINS
11433 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11435 #undef TARGET_LEGITIMATE_ADDRESS_P
11436 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11438 #undef TARGET_LEGITIMATE_CONSTANT_P
11439 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11441 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11442 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11444 #undef TARGET_LRA_P
11445 #define TARGET_LRA_P hook_bool_void_true
11447 #undef TARGET_MANGLE_TYPE
11448 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11450 #undef TARGET_MEMORY_MOVE_COST
11451 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11453 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11454 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11456 #undef TARGET_MUST_PASS_IN_STACK
11457 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11459 /* This target hook should return true if accesses to volatile bitfields
11460 should use the narrowest mode possible. It should return false if these
11461 accesses should use the bitfield container type. */
11462 #undef TARGET_NARROW_VOLATILE_BITFIELD
11463 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11465 #undef TARGET_OPTION_OVERRIDE
11466 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11468 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11469 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11470 aarch64_override_options_after_change
11472 #undef TARGET_PASS_BY_REFERENCE
11473 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11475 #undef TARGET_PREFERRED_RELOAD_CLASS
11476 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11478 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11479 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11481 #undef TARGET_SECONDARY_RELOAD
11482 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11484 #undef TARGET_SHIFT_TRUNCATION_MASK
11485 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11487 #undef TARGET_SETUP_INCOMING_VARARGS
11488 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11490 #undef TARGET_STRUCT_VALUE_RTX
11491 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11493 #undef TARGET_REGISTER_MOVE_COST
11494 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11496 #undef TARGET_RETURN_IN_MEMORY
11497 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11499 #undef TARGET_RETURN_IN_MSB
11500 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11502 #undef TARGET_RTX_COSTS
11503 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11505 #undef TARGET_SCHED_ISSUE_RATE
11506 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11508 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11509 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11510 aarch64_sched_first_cycle_multipass_dfa_lookahead
11512 #undef TARGET_TRAMPOLINE_INIT
11513 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11515 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11516 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11518 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11519 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11521 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11522 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11524 #undef TARGET_VECTORIZE_ADD_STMT_COST
11525 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11527 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11528 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11529 aarch64_builtin_vectorization_cost
11531 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11532 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11534 #undef TARGET_VECTORIZE_BUILTINS
11535 #define TARGET_VECTORIZE_BUILTINS
11537 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11538 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11539 aarch64_builtin_vectorized_function
11541 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11542 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11543 aarch64_autovectorize_vector_sizes
11545 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11546 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11547 aarch64_atomic_assign_expand_fenv
11549 /* Section anchor support. */
11551 #undef TARGET_MIN_ANCHOR_OFFSET
11552 #define TARGET_MIN_ANCHOR_OFFSET -256
11554 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11555 byte offset; we can do much more for larger data types, but have no way
11556 to determine the size of the access. We assume accesses are aligned. */
11557 #undef TARGET_MAX_ANCHOR_OFFSET
11558 #define TARGET_MAX_ANCHOR_OFFSET 4095
11560 #undef TARGET_VECTOR_ALIGNMENT
11561 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11563 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11564 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11565 aarch64_simd_vector_alignment_reachable
11567 /* vec_perm support. */
11569 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11570 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11571 aarch64_vectorize_vec_perm_const_ok
11574 #undef TARGET_FIXED_CONDITION_CODE_REGS
11575 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11577 #undef TARGET_FLAGS_REGNUM
11578 #define TARGET_FLAGS_REGNUM CC_REGNUM
11580 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11581 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11583 #undef TARGET_ASAN_SHADOW_OFFSET
11584 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11586 #undef TARGET_LEGITIMIZE_ADDRESS
11587 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11589 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11590 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11591 aarch64_use_by_pieces_infrastructure_p
11593 #undef TARGET_CAN_USE_DOLOOP_P
11594 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11596 #undef TARGET_SCHED_MACRO_FUSION_P
11597 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11599 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11600 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11602 #undef TARGET_SCHED_FUSION_PRIORITY
11603 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11605 struct gcc_target targetm = TARGET_INITIALIZER;
11607 #include "gt-aarch64.h"