[AArch64] Properly handle mvn-register and add EON+shift pattern and cost appropriately
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "hashtab.h"
57 #include "function.h"
58 #include "flags.h"
59 #include "statistics.h"
60 #include "real.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
63 #include "expmed.h"
64 #include "dojump.h"
65 #include "explow.h"
66 #include "emit-rtl.h"
67 #include "stmt.h"
68 #include "expr.h"
69 #include "reload.h"
70 #include "toplev.h"
71 #include "target.h"
72 #include "target-def.h"
73 #include "targhooks.h"
74 #include "ggc.h"
75 #include "tm_p.h"
76 #include "recog.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
83 #include "tree-eh.h"
84 #include "gimple-expr.h"
85 #include "is-a.h"
86 #include "gimple.h"
87 #include "gimplify.h"
88 #include "optabs.h"
89 #include "dwarf2.h"
90 #include "cfgloop.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
93 #include "dumpfile.h"
94 #include "builtins.h"
95 #include "rtl-iter.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
98 #include "cortex-a57-fma-steering.h"
100 /* Defined for convenience. */
101 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
103 /* Classifies an address.
105 ADDRESS_REG_IMM
106 A simple base register plus immediate offset.
108 ADDRESS_REG_WB
109 A base register indexed by immediate offset with writeback.
111 ADDRESS_REG_REG
112 A base register indexed by (optionally scaled) register.
114 ADDRESS_REG_UXTW
115 A base register indexed by (optionally scaled) zero-extended register.
117 ADDRESS_REG_SXTW
118 A base register indexed by (optionally scaled) sign-extended register.
120 ADDRESS_LO_SUM
121 A LO_SUM rtx with a base register and "LO12" symbol relocation.
123 ADDRESS_SYMBOLIC:
124 A constant symbolic address, in pc-relative literal pool. */
126 enum aarch64_address_type {
127 ADDRESS_REG_IMM,
128 ADDRESS_REG_WB,
129 ADDRESS_REG_REG,
130 ADDRESS_REG_UXTW,
131 ADDRESS_REG_SXTW,
132 ADDRESS_LO_SUM,
133 ADDRESS_SYMBOLIC
136 struct aarch64_address_info {
137 enum aarch64_address_type type;
138 rtx base;
139 rtx offset;
140 int shift;
141 enum aarch64_symbol_type symbol_type;
144 struct simd_immediate_info
146 rtx value;
147 int shift;
148 int element_width;
149 bool mvn;
150 bool msl;
153 /* The current code model. */
154 enum aarch64_code_model aarch64_cmodel;
156 #ifdef HAVE_AS_TLS
157 #undef TARGET_HAVE_TLS
158 #define TARGET_HAVE_TLS 1
159 #endif
161 static bool aarch64_composite_type_p (const_tree, machine_mode);
162 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
163 const_tree,
164 machine_mode *, int *,
165 bool *);
166 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
167 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
168 static void aarch64_override_options_after_change (void);
169 static bool aarch64_vector_mode_supported_p (machine_mode);
170 static unsigned bit_count (unsigned HOST_WIDE_INT);
171 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
172 const unsigned char *sel);
173 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
175 /* Major revision number of the ARM Architecture implemented by the target. */
176 unsigned aarch64_architecture_version;
178 /* The processor for which instructions should be scheduled. */
179 enum aarch64_processor aarch64_tune = cortexa53;
181 /* The current tuning set. */
182 const struct tune_params *aarch64_tune_params;
184 /* Mask to specify which instructions we are allowed to generate. */
185 unsigned long aarch64_isa_flags = 0;
187 /* Mask to specify which instruction scheduling options should be used. */
188 unsigned long aarch64_tune_flags = 0;
190 /* Tuning parameters. */
192 static const struct cpu_addrcost_table generic_addrcost_table =
195 0, /* hi */
196 0, /* si */
197 0, /* di */
198 0, /* ti */
200 0, /* pre_modify */
201 0, /* post_modify */
202 0, /* register_offset */
203 0, /* register_extend */
204 0 /* imm_offset */
207 static const struct cpu_addrcost_table cortexa57_addrcost_table =
210 1, /* hi */
211 0, /* si */
212 0, /* di */
213 1, /* ti */
215 0, /* pre_modify */
216 0, /* post_modify */
217 0, /* register_offset */
218 0, /* register_extend */
219 0, /* imm_offset */
222 static const struct cpu_addrcost_table xgene1_addrcost_table =
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
230 1, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 1, /* register_extend */
234 0, /* imm_offset */
237 static const struct cpu_regmove_cost generic_regmove_cost =
239 1, /* GP2GP */
240 /* Avoid the use of slow int<->fp moves for spilling by setting
241 their cost higher than memmov_cost. */
242 5, /* GP2FP */
243 5, /* FP2GP */
244 2 /* FP2FP */
247 static const struct cpu_regmove_cost cortexa57_regmove_cost =
249 1, /* GP2GP */
250 /* Avoid the use of slow int<->fp moves for spilling by setting
251 their cost higher than memmov_cost. */
252 5, /* GP2FP */
253 5, /* FP2GP */
254 2 /* FP2FP */
257 static const struct cpu_regmove_cost cortexa53_regmove_cost =
259 1, /* GP2GP */
260 /* Avoid the use of slow int<->fp moves for spilling by setting
261 their cost higher than memmov_cost. */
262 5, /* GP2FP */
263 5, /* FP2GP */
264 2 /* FP2FP */
267 static const struct cpu_regmove_cost thunderx_regmove_cost =
269 2, /* GP2GP */
270 2, /* GP2FP */
271 6, /* FP2GP */
272 4 /* FP2FP */
275 static const struct cpu_regmove_cost xgene1_regmove_cost =
277 1, /* GP2GP */
278 /* Avoid the use of slow int<->fp moves for spilling by setting
279 their cost higher than memmov_cost. */
280 8, /* GP2FP */
281 8, /* FP2GP */
282 2 /* FP2FP */
285 /* Generic costs for vector insn classes. */
286 static const struct cpu_vector_cost generic_vector_cost =
288 1, /* scalar_stmt_cost */
289 1, /* scalar_load_cost */
290 1, /* scalar_store_cost */
291 1, /* vec_stmt_cost */
292 1, /* vec_to_scalar_cost */
293 1, /* scalar_to_vec_cost */
294 1, /* vec_align_load_cost */
295 1, /* vec_unalign_load_cost */
296 1, /* vec_unalign_store_cost */
297 1, /* vec_store_cost */
298 3, /* cond_taken_branch_cost */
299 1 /* cond_not_taken_branch_cost */
302 /* Generic costs for vector insn classes. */
303 static const struct cpu_vector_cost cortexa57_vector_cost =
305 1, /* scalar_stmt_cost */
306 4, /* scalar_load_cost */
307 1, /* scalar_store_cost */
308 3, /* vec_stmt_cost */
309 8, /* vec_to_scalar_cost */
310 8, /* scalar_to_vec_cost */
311 5, /* vec_align_load_cost */
312 5, /* vec_unalign_load_cost */
313 1, /* vec_unalign_store_cost */
314 1, /* vec_store_cost */
315 1, /* cond_taken_branch_cost */
316 1 /* cond_not_taken_branch_cost */
319 /* Generic costs for vector insn classes. */
320 static const struct cpu_vector_cost xgene1_vector_cost =
322 1, /* scalar_stmt_cost */
323 5, /* scalar_load_cost */
324 1, /* scalar_store_cost */
325 2, /* vec_stmt_cost */
326 4, /* vec_to_scalar_cost */
327 4, /* scalar_to_vec_cost */
328 10, /* vec_align_load_cost */
329 10, /* vec_unalign_load_cost */
330 2, /* vec_unalign_store_cost */
331 2, /* vec_store_cost */
332 2, /* cond_taken_branch_cost */
333 1 /* cond_not_taken_branch_cost */
336 #define AARCH64_FUSE_NOTHING (0)
337 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
338 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
339 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
340 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
341 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
343 static const struct tune_params generic_tunings =
345 &cortexa57_extra_costs,
346 &generic_addrcost_table,
347 &generic_regmove_cost,
348 &generic_vector_cost,
349 4, /* memmov_cost */
350 2, /* issue_rate */
351 AARCH64_FUSE_NOTHING, /* fuseable_ops */
352 8, /* function_align. */
353 8, /* jump_align. */
354 4, /* loop_align. */
355 2, /* int_reassoc_width. */
356 4, /* fp_reassoc_width. */
357 1 /* vec_reassoc_width. */
360 static const struct tune_params cortexa53_tunings =
362 &cortexa53_extra_costs,
363 &generic_addrcost_table,
364 &cortexa53_regmove_cost,
365 &generic_vector_cost,
366 4, /* memmov_cost */
367 2, /* issue_rate */
368 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
369 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fuseable_ops */
370 8, /* function_align. */
371 8, /* jump_align. */
372 4, /* loop_align. */
373 2, /* int_reassoc_width. */
374 4, /* fp_reassoc_width. */
375 1 /* vec_reassoc_width. */
378 static const struct tune_params cortexa57_tunings =
380 &cortexa57_extra_costs,
381 &cortexa57_addrcost_table,
382 &cortexa57_regmove_cost,
383 &cortexa57_vector_cost,
384 4, /* memmov_cost */
385 3, /* issue_rate */
386 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
387 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
388 16, /* function_align. */
389 8, /* jump_align. */
390 4, /* loop_align. */
391 2, /* int_reassoc_width. */
392 4, /* fp_reassoc_width. */
393 1 /* vec_reassoc_width. */
396 static const struct tune_params thunderx_tunings =
398 &thunderx_extra_costs,
399 &generic_addrcost_table,
400 &thunderx_regmove_cost,
401 &generic_vector_cost,
402 6, /* memmov_cost */
403 2, /* issue_rate */
404 AARCH64_FUSE_CMP_BRANCH, /* fuseable_ops */
405 8, /* function_align. */
406 8, /* jump_align. */
407 8, /* loop_align. */
408 2, /* int_reassoc_width. */
409 4, /* fp_reassoc_width. */
410 1 /* vec_reassoc_width. */
413 static const struct tune_params xgene1_tunings =
415 &xgene1_extra_costs,
416 &xgene1_addrcost_table,
417 &xgene1_regmove_cost,
418 &xgene1_vector_cost,
419 6, /* memmov_cost */
420 4, /* issue_rate */
421 AARCH64_FUSE_NOTHING, /* fuseable_ops */
422 16, /* function_align. */
423 8, /* jump_align. */
424 16, /* loop_align. */
425 2, /* int_reassoc_width. */
426 4, /* fp_reassoc_width. */
427 1 /* vec_reassoc_width. */
430 /* A processor implementing AArch64. */
431 struct processor
433 const char *const name;
434 enum aarch64_processor core;
435 const char *arch;
436 unsigned architecture_version;
437 const unsigned long flags;
438 const struct tune_params *const tune;
441 /* Processor cores implementing AArch64. */
442 static const struct processor all_cores[] =
444 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
445 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
446 #include "aarch64-cores.def"
447 #undef AARCH64_CORE
448 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
449 {NULL, aarch64_none, NULL, 0, 0, NULL}
452 /* Architectures implementing AArch64. */
453 static const struct processor all_architectures[] =
455 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
456 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
457 #include "aarch64-arches.def"
458 #undef AARCH64_ARCH
459 {NULL, aarch64_none, NULL, 0, 0, NULL}
462 /* Target specification. These are populated as commandline arguments
463 are processed, or NULL if not specified. */
464 static const struct processor *selected_arch;
465 static const struct processor *selected_cpu;
466 static const struct processor *selected_tune;
468 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
470 /* An ISA extension in the co-processor and main instruction set space. */
471 struct aarch64_option_extension
473 const char *const name;
474 const unsigned long flags_on;
475 const unsigned long flags_off;
478 /* ISA extensions in AArch64. */
479 static const struct aarch64_option_extension all_extensions[] =
481 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
482 {NAME, FLAGS_ON, FLAGS_OFF},
483 #include "aarch64-option-extensions.def"
484 #undef AARCH64_OPT_EXTENSION
485 {NULL, 0, 0}
488 /* Used to track the size of an address when generating a pre/post
489 increment address. */
490 static machine_mode aarch64_memory_reference_mode;
492 /* A table of valid AArch64 "bitmask immediate" values for
493 logical instructions. */
495 #define AARCH64_NUM_BITMASKS 5334
496 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
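/* For example, 0x00ff00ff00ff00ff (a run of eight set bits replicated in
   every 16-bit element) is a valid bitmask immediate and can be used
   directly by AND, ORR and EOR, whereas 0x0000000012345678 is not a
   rotated run of contiguous bits and has to be built with MOVZ/MOVK
   before it can take part in a logical operation.  */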
498 typedef enum aarch64_cond_code
500 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
501 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
502 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
504 aarch64_cc;
506 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
508 /* The condition codes of the processor, and the inverse function. */
509 static const char * const aarch64_condition_codes[] =
511 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
512 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
515 static unsigned int
516 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
518 return 2;
521 static int
522 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
523 enum machine_mode mode)
525 if (VECTOR_MODE_P (mode))
526 return aarch64_tune_params->vec_reassoc_width;
527 if (INTEGRAL_MODE_P (mode))
528 return aarch64_tune_params->int_reassoc_width;
529 if (FLOAT_MODE_P (mode))
530 return aarch64_tune_params->fp_reassoc_width;
531 return 1;
534 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
535 unsigned
536 aarch64_dbx_register_number (unsigned regno)
538 if (GP_REGNUM_P (regno))
539 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
540 else if (regno == SP_REGNUM)
541 return AARCH64_DWARF_SP;
542 else if (FP_REGNUM_P (regno))
543 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
545 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
546 equivalent DWARF register. */
547 return DWARF_FRAME_REGISTERS;
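/* Under the standard AArch64 DWARF numbering (r0-r30 = 0-30, sp = 31,
   v0-v31 = 64-95) this maps, for instance, x5 to 5, sp to 31 and v3 to
   AARCH64_DWARF_V0 + 3 = 67, while registers such as the condition flags,
   which have no DWARF equivalent, fall through to DWARF_FRAME_REGISTERS.  */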
550 /* Return TRUE if MODE is any of the large INT modes. */
551 static bool
552 aarch64_vect_struct_mode_p (machine_mode mode)
554 return mode == OImode || mode == CImode || mode == XImode;
557 /* Return TRUE if MODE is any of the vector modes. */
558 static bool
559 aarch64_vector_mode_p (machine_mode mode)
561 return aarch64_vector_mode_supported_p (mode)
562 || aarch64_vect_struct_mode_p (mode);
565 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
566 static bool
567 aarch64_array_mode_supported_p (machine_mode mode,
568 unsigned HOST_WIDE_INT nelems)
570 if (TARGET_SIMD
571 && AARCH64_VALID_SIMD_QREG_MODE (mode)
572 && (nelems >= 2 && nelems <= 4))
573 return true;
575 return false;
578 /* Implement HARD_REGNO_NREGS. */
581 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
583 switch (aarch64_regno_regclass (regno))
585 case FP_REGS:
586 case FP_LO_REGS:
587 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
588 default:
589 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
591 gcc_unreachable ();
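/* For instance, a TImode (16-byte) value needs two X registers but only
   one V register:
     aarch64_hard_regno_nregs (R0_REGNUM, TImode) == 2
     aarch64_hard_regno_nregs (V0_REGNUM, TImode) == 1
   since UNITS_PER_VREG is 16 while UNITS_PER_WORD is 8.  */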
594 /* Implement HARD_REGNO_MODE_OK. */
597 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
599 if (GET_MODE_CLASS (mode) == MODE_CC)
600 return regno == CC_REGNUM;
602 if (regno == SP_REGNUM)
603 /* The purpose of comparing with ptr_mode is to support the
604 global register variable associated with the stack pointer
605 register via the syntax of asm ("wsp") in ILP32. */
606 return mode == Pmode || mode == ptr_mode;
608 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
609 return mode == Pmode;
611 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
612 return 1;
614 if (FP_REGNUM_P (regno))
616 if (aarch64_vect_struct_mode_p (mode))
617 return
618 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
619 else
620 return 1;
623 return 0;
626 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
627 machine_mode
628 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
629 machine_mode mode)
631 /* Handle modes that fit within single registers. */
632 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
634 if (GET_MODE_SIZE (mode) >= 4)
635 return mode;
636 else
637 return SImode;
639 /* Fall back to generic for multi-reg and very large modes. */
640 else
641 return choose_hard_reg_mode (regno, nregs, false);
644 /* Return true if calls to DECL should be treated as
645 long-calls (i.e. called via a register). */
646 static bool
647 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
649 return false;
652 /* Return true if calls to symbol-ref SYM should be treated as
653 long-calls (i.e. called via a register). */
654 bool
655 aarch64_is_long_call_p (rtx sym)
657 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
660 /* Return true if the offsets to a zero/sign-extract operation
661 represent an expression that matches an extend operation. The
662 operands represent the parameters from
664 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
665 bool
666 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
667 rtx extract_imm)
669 HOST_WIDE_INT mult_val, extract_val;
671 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
672 return false;
674 mult_val = INTVAL (mult_imm);
675 extract_val = INTVAL (extract_imm);
677 if (extract_val > 8
678 && extract_val < GET_MODE_BITSIZE (mode)
679 && exact_log2 (extract_val & ~7) > 0
680 && (extract_val & 7) <= 4
681 && mult_val == (1 << (extract_val & 7)))
682 return true;
684 return false;
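/* As a worked example, the RTL
     (zero_extract:DI (mult:DI (reg:DI) (const_int 4))
		      (const_int 18) (const_int 0))
   passes the checks above: 18 & ~7 == 16 is a power of two, 18 & 7 == 2,
   and the multiplier 4 == 1 << 2.  Extracting the low 18 bits of reg * 4
   is the same as zero-extending the low 16 bits of reg and shifting the
   result left by 2, i.e. the extended-register "uxth ... lsl #2" operand
   form, which is why such extracts are treated like extend operations.  */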
687 /* Emit an insn that's a simple single-set. Both the operands must be
688 known to be valid. */
689 inline static rtx
690 emit_set_insn (rtx x, rtx y)
692 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
695 /* X and Y are two things to compare using CODE. Emit the compare insn and
696 return the rtx for register 0 in the proper mode. */
698 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
700 machine_mode mode = SELECT_CC_MODE (code, x, y);
701 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
703 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
704 return cc_reg;
707 /* Build the SYMBOL_REF for __tls_get_addr. */
709 static GTY(()) rtx tls_get_addr_libfunc;
712 aarch64_tls_get_addr (void)
714 if (!tls_get_addr_libfunc)
715 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
716 return tls_get_addr_libfunc;
719 /* Return the TLS model to use for ADDR. */
721 static enum tls_model
722 tls_symbolic_operand_type (rtx addr)
724 enum tls_model tls_kind = TLS_MODEL_NONE;
725 rtx sym, addend;
727 if (GET_CODE (addr) == CONST)
729 split_const (addr, &sym, &addend);
730 if (GET_CODE (sym) == SYMBOL_REF)
731 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
733 else if (GET_CODE (addr) == SYMBOL_REF)
734 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
736 return tls_kind;
739 /* We'll allow lo_sum's in addresses in our legitimate addresses
740 so that combine would take care of combining addresses where
741 necessary, but for generation purposes we'll generate the address
742 as follows:
743 RTL Absolute
744 tmp = hi (symbol_ref); adrp x1, foo
745 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
748 PIC TLS
749 adrp x1, :got:foo adrp tmp, :tlsgd:foo
750 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
751 bl __tls_get_addr
754 Load TLS symbol, depending on TLS mechanism and TLS access model.
756 Global Dynamic - Traditional TLS:
757 adrp tmp, :tlsgd:imm
758 add dest, tmp, #:tlsgd_lo12:imm
759 bl __tls_get_addr
761 Global Dynamic - TLS Descriptors:
762 adrp dest, :tlsdesc:imm
763 ldr tmp, [dest, #:tlsdesc_lo12:imm]
764 add dest, dest, #:tlsdesc_lo12:imm
765 blr tmp
766 mrs tp, tpidr_el0
767 add dest, dest, tp
769 Initial Exec:
770 mrs tp, tpidr_el0
771 adrp tmp, :gottprel:imm
772 ldr dest, [tmp, #:gottprel_lo12:imm]
773 add dest, dest, tp
775 Local Exec:
776 mrs tp, tpidr_el0
777 add t0, tp, #:tprel_hi12:imm, lsl #12
778 add t0, t0, #:tprel_lo12_nc:imm
781 static void
782 aarch64_load_symref_appropriately (rtx dest, rtx imm,
783 enum aarch64_symbol_type type)
785 switch (type)
787 case SYMBOL_SMALL_ABSOLUTE:
789 /* In ILP32, the mode of dest can be either SImode or DImode. */
790 rtx tmp_reg = dest;
791 machine_mode mode = GET_MODE (dest);
793 gcc_assert (mode == Pmode || mode == ptr_mode);
795 if (can_create_pseudo_p ())
796 tmp_reg = gen_reg_rtx (mode);
798 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
799 emit_insn (gen_add_losym (dest, tmp_reg, imm));
800 return;
803 case SYMBOL_TINY_ABSOLUTE:
804 emit_insn (gen_rtx_SET (Pmode, dest, imm));
805 return;
807 case SYMBOL_SMALL_GOT:
809 /* In ILP32, the mode of dest can be either SImode or DImode,
810 while the got entry is always of SImode size. The mode of
811 dest depends on how dest is used: if dest is assigned to a
812 pointer (e.g. stored in memory), it has SImode; it may have
813 DImode if dest is dereferenced to access the memory.
814 This is why we have to handle three different ldr_got_small
815 patterns here (two patterns for ILP32). */
816 rtx tmp_reg = dest;
817 machine_mode mode = GET_MODE (dest);
819 if (can_create_pseudo_p ())
820 tmp_reg = gen_reg_rtx (mode);
822 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
823 if (mode == ptr_mode)
825 if (mode == DImode)
826 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
827 else
828 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
830 else
832 gcc_assert (mode == Pmode);
833 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
836 return;
839 case SYMBOL_SMALL_TLSGD:
841 rtx_insn *insns;
842 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
844 start_sequence ();
845 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
846 insns = get_insns ();
847 end_sequence ();
849 RTL_CONST_CALL_P (insns) = 1;
850 emit_libcall_block (insns, dest, result, imm);
851 return;
854 case SYMBOL_SMALL_TLSDESC:
856 machine_mode mode = GET_MODE (dest);
857 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
858 rtx tp;
860 gcc_assert (mode == Pmode || mode == ptr_mode);
862 /* In ILP32, the got entry is always of SImode size. Unlike
863 small GOT, the dest is fixed at reg 0. */
864 if (TARGET_ILP32)
865 emit_insn (gen_tlsdesc_small_si (imm));
866 else
867 emit_insn (gen_tlsdesc_small_di (imm));
868 tp = aarch64_load_tp (NULL);
870 if (mode != Pmode)
871 tp = gen_lowpart (mode, tp);
873 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
874 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
875 return;
878 case SYMBOL_SMALL_GOTTPREL:
880 /* In ILP32, the mode of dest can be either SImode or DImode,
881 while the got entry is always of SImode size. The mode of
882 dest depends on how dest is used: if dest is assigned to a
883 pointer (e.g. stored in memory), it has SImode; it may have
884 DImode if dest is dereferenced to access the memory.
885 This is why we have to handle three different tlsie_small
886 patterns here (two patterns for ILP32). */
887 machine_mode mode = GET_MODE (dest);
888 rtx tmp_reg = gen_reg_rtx (mode);
889 rtx tp = aarch64_load_tp (NULL);
891 if (mode == ptr_mode)
893 if (mode == DImode)
894 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
895 else
897 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
898 tp = gen_lowpart (mode, tp);
901 else
903 gcc_assert (mode == Pmode);
904 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
907 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
908 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
909 return;
912 case SYMBOL_SMALL_TPREL:
914 rtx tp = aarch64_load_tp (NULL);
916 if (GET_MODE (dest) != Pmode)
917 tp = gen_lowpart (GET_MODE (dest), tp);
919 emit_insn (gen_tlsle_small (dest, tp, imm));
920 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
921 return;
924 case SYMBOL_TINY_GOT:
925 emit_insn (gen_ldr_got_tiny (dest, imm));
926 return;
928 default:
929 gcc_unreachable ();
933 /* Emit a move from SRC to DEST. Assume that the move expanders can
934 handle all moves if !can_create_pseudo_p (). The distinction is
935 important because, unlike emit_move_insn, the move expanders know
936 how to force Pmode objects into the constant pool even when the
937 constant pool address is not itself legitimate. */
938 static rtx
939 aarch64_emit_move (rtx dest, rtx src)
941 return (can_create_pseudo_p ()
942 ? emit_move_insn (dest, src)
943 : emit_move_insn_1 (dest, src));
946 /* Split a 128-bit move operation into two 64-bit move operations,
947 taking care to handle partial overlap of register to register
948 copies. Special cases are needed when moving between GP regs and
949 FP regs. SRC can be a register, constant or memory; DST a register
950 or memory. If either operand is memory it must not have any side
951 effects. */
952 void
953 aarch64_split_128bit_move (rtx dst, rtx src)
955 rtx dst_lo, dst_hi;
956 rtx src_lo, src_hi;
958 machine_mode mode = GET_MODE (dst);
960 gcc_assert (mode == TImode || mode == TFmode);
961 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
962 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
964 if (REG_P (dst) && REG_P (src))
966 int src_regno = REGNO (src);
967 int dst_regno = REGNO (dst);
969 /* Handle FP <-> GP regs. */
970 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
972 src_lo = gen_lowpart (word_mode, src);
973 src_hi = gen_highpart (word_mode, src);
975 if (mode == TImode)
977 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
978 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
980 else
982 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
983 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
985 return;
987 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
989 dst_lo = gen_lowpart (word_mode, dst);
990 dst_hi = gen_highpart (word_mode, dst);
992 if (mode == TImode)
994 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
995 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
997 else
999 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1000 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1002 return;
1006 dst_lo = gen_lowpart (word_mode, dst);
1007 dst_hi = gen_highpart (word_mode, dst);
1008 src_lo = gen_lowpart (word_mode, src);
1009 src_hi = gen_highpart_mode (word_mode, mode, src);
1011 /* At most one pairing may overlap. */
1012 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1014 aarch64_emit_move (dst_hi, src_hi);
1015 aarch64_emit_move (dst_lo, src_lo);
1017 else
1019 aarch64_emit_move (dst_lo, src_lo);
1020 aarch64_emit_move (dst_hi, src_hi);
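/* For example, when copying a TImode value held in {x0, x1} into {x1, x2},
   dst_lo (x1) overlaps src_hi (x1), so the high halves are moved first
   (x2 <- x1) and only then the low halves (x1 <- x0); moving in the other
   order would clobber the high half of the source before it is read.  */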
1024 bool
1025 aarch64_split_128bit_move_p (rtx dst, rtx src)
1027 return (! REG_P (src)
1028 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1031 /* Split a complex SIMD combine. */
1033 void
1034 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1036 machine_mode src_mode = GET_MODE (src1);
1037 machine_mode dst_mode = GET_MODE (dst);
1039 gcc_assert (VECTOR_MODE_P (dst_mode));
1041 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1043 rtx (*gen) (rtx, rtx, rtx);
1045 switch (src_mode)
1047 case V8QImode:
1048 gen = gen_aarch64_simd_combinev8qi;
1049 break;
1050 case V4HImode:
1051 gen = gen_aarch64_simd_combinev4hi;
1052 break;
1053 case V2SImode:
1054 gen = gen_aarch64_simd_combinev2si;
1055 break;
1056 case V2SFmode:
1057 gen = gen_aarch64_simd_combinev2sf;
1058 break;
1059 case DImode:
1060 gen = gen_aarch64_simd_combinedi;
1061 break;
1062 case DFmode:
1063 gen = gen_aarch64_simd_combinedf;
1064 break;
1065 default:
1066 gcc_unreachable ();
1069 emit_insn (gen (dst, src1, src2));
1070 return;
1074 /* Split a complex SIMD move. */
1076 void
1077 aarch64_split_simd_move (rtx dst, rtx src)
1079 machine_mode src_mode = GET_MODE (src);
1080 machine_mode dst_mode = GET_MODE (dst);
1082 gcc_assert (VECTOR_MODE_P (dst_mode));
1084 if (REG_P (dst) && REG_P (src))
1086 rtx (*gen) (rtx, rtx);
1088 gcc_assert (VECTOR_MODE_P (src_mode));
1090 switch (src_mode)
1092 case V16QImode:
1093 gen = gen_aarch64_split_simd_movv16qi;
1094 break;
1095 case V8HImode:
1096 gen = gen_aarch64_split_simd_movv8hi;
1097 break;
1098 case V4SImode:
1099 gen = gen_aarch64_split_simd_movv4si;
1100 break;
1101 case V2DImode:
1102 gen = gen_aarch64_split_simd_movv2di;
1103 break;
1104 case V4SFmode:
1105 gen = gen_aarch64_split_simd_movv4sf;
1106 break;
1107 case V2DFmode:
1108 gen = gen_aarch64_split_simd_movv2df;
1109 break;
1110 default:
1111 gcc_unreachable ();
1114 emit_insn (gen (dst, src));
1115 return;
1119 static rtx
1120 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1122 if (can_create_pseudo_p ())
1123 return force_reg (mode, value);
1124 else
1126 x = aarch64_emit_move (x, value);
1127 return x;
1132 static rtx
1133 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1135 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1137 rtx high;
1138 /* Load the full offset into a register. This
1139 might be improvable in the future. */
1140 high = GEN_INT (offset);
1141 offset = 0;
1142 high = aarch64_force_temporary (mode, temp, high);
1143 reg = aarch64_force_temporary (mode, temp,
1144 gen_rtx_PLUS (mode, high, reg));
1146 return plus_constant (mode, reg, offset);
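/* For example, an offset of 0x12345 is neither a 12-bit immediate nor a
   12-bit immediate shifted left by 12, so it is first loaded into TEMP
   and added with a register-register add, whereas an offset such as
   0x12000 is folded directly into the returned PLUS.  */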
1149 static int
1150 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1151 machine_mode mode)
1153 unsigned HOST_WIDE_INT mask;
1154 int i;
1155 bool first;
1156 unsigned HOST_WIDE_INT val;
1157 bool subtargets;
1158 rtx subtarget;
1159 int one_match, zero_match, first_not_ffff_match;
1160 int num_insns = 0;
1162 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1164 if (generate)
1165 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1166 num_insns++;
1167 return num_insns;
1170 if (mode == SImode)
1172 /* We know we can't do this in 1 insn, and we must be able to do it
1173 in two; so don't mess around looking for sequences that don't buy
1174 us anything. */
1175 if (generate)
1177 emit_insn (gen_rtx_SET (VOIDmode, dest,
1178 GEN_INT (INTVAL (imm) & 0xffff)));
1179 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1180 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1182 num_insns += 2;
1183 return num_insns;
1186 /* Remaining cases are all for DImode. */
1188 val = INTVAL (imm);
1189 subtargets = optimize && can_create_pseudo_p ();
1191 one_match = 0;
1192 zero_match = 0;
1193 mask = 0xffff;
1194 first_not_ffff_match = -1;
1196 for (i = 0; i < 64; i += 16, mask <<= 16)
1198 if ((val & mask) == mask)
1199 one_match++;
1200 else
1202 if (first_not_ffff_match < 0)
1203 first_not_ffff_match = i;
1204 if ((val & mask) == 0)
1205 zero_match++;
1209 if (one_match == 2)
1211 /* Set one of the quarters and then insert back into result. */
1212 mask = 0xffffll << first_not_ffff_match;
1213 if (generate)
1215 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1216 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1217 GEN_INT ((val >> first_not_ffff_match)
1218 & 0xffff)));
1220 num_insns += 2;
1221 return num_insns;
1224 if (zero_match == 2)
1225 goto simple_sequence;
1227 mask = 0x0ffff0000UL;
1228 for (i = 16; i < 64; i += 16, mask <<= 16)
1230 HOST_WIDE_INT comp = mask & ~(mask - 1);
1232 if (aarch64_uimm12_shift (val - (val & mask)))
1234 if (generate)
1236 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1237 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1238 GEN_INT (val & mask)));
1239 emit_insn (gen_adddi3 (dest, subtarget,
1240 GEN_INT (val - (val & mask))));
1242 num_insns += 2;
1243 return num_insns;
1245 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1247 if (generate)
1249 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1250 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1251 GEN_INT ((val + comp) & mask)));
1252 emit_insn (gen_adddi3 (dest, subtarget,
1253 GEN_INT (val - ((val + comp) & mask))));
1255 num_insns += 2;
1256 return num_insns;
1258 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1260 if (generate)
1262 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1263 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1264 GEN_INT ((val - comp) | ~mask)));
1265 emit_insn (gen_adddi3 (dest, subtarget,
1266 GEN_INT (val - ((val - comp) | ~mask))));
1268 num_insns += 2;
1269 return num_insns;
1271 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1273 if (generate)
1275 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1276 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1277 GEN_INT (val | ~mask)));
1278 emit_insn (gen_adddi3 (dest, subtarget,
1279 GEN_INT (val - (val | ~mask))));
1281 num_insns += 2;
1282 return num_insns;
1286 /* See if we can do it by arithmetically combining two
1287 immediates. */
1288 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1290 int j;
1291 mask = 0xffff;
1293 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1294 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1296 if (generate)
1298 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1299 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1300 GEN_INT (aarch64_bitmasks[i])));
1301 emit_insn (gen_adddi3 (dest, subtarget,
1302 GEN_INT (val - aarch64_bitmasks[i])));
1304 num_insns += 2;
1305 return num_insns;
1308 for (j = 0; j < 64; j += 16, mask <<= 16)
1310 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1312 if (generate)
1314 emit_insn (gen_rtx_SET (VOIDmode, dest,
1315 GEN_INT (aarch64_bitmasks[i])));
1316 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1317 GEN_INT ((val >> j) & 0xffff)));
1319 num_insns += 2;
1320 return num_insns;
1325 /* See if we can do it by logically combining two immediates. */
1326 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1328 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1330 int j;
1332 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1333 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1335 if (generate)
1337 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1338 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1339 GEN_INT (aarch64_bitmasks[i])));
1340 emit_insn (gen_iordi3 (dest, subtarget,
1341 GEN_INT (aarch64_bitmasks[j])));
1343 num_insns += 2;
1344 return num_insns;
1347 else if ((val & aarch64_bitmasks[i]) == val)
1349 int j;
1351 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1352 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1354 if (generate)
1356 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1357 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1358 GEN_INT (aarch64_bitmasks[j])));
1359 emit_insn (gen_anddi3 (dest, subtarget,
1360 GEN_INT (aarch64_bitmasks[i])));
1362 num_insns += 2;
1363 return num_insns;
1368 if (one_match > zero_match)
1370 /* Set either first three quarters or all but the third. */
1371 mask = 0xffffll << (16 - first_not_ffff_match);
1372 if (generate)
1373 emit_insn (gen_rtx_SET (VOIDmode, dest,
1374 GEN_INT (val | mask | 0xffffffff00000000ull)));
1375 num_insns ++;
1377 /* Now insert other two quarters. */
1378 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1379 i < 64; i += 16, mask <<= 16)
1381 if ((val & mask) != mask)
1383 if (generate)
1384 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1385 GEN_INT ((val >> i) & 0xffff)));
1386 num_insns ++;
1389 return num_insns;
1392 simple_sequence:
1393 first = true;
1394 mask = 0xffff;
1395 for (i = 0; i < 64; i += 16, mask <<= 16)
1397 if ((val & mask) != 0)
1399 if (first)
1401 if (generate)
1402 emit_insn (gen_rtx_SET (VOIDmode, dest,
1403 GEN_INT (val & mask)));
1404 num_insns ++;
1405 first = false;
1407 else
1409 if (generate)
1410 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1411 GEN_INT ((val >> i) & 0xffff)));
1412 num_insns ++;
1417 return num_insns;
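/* As an illustration, for DImode the constant 0x1234567800000000 has two
   zero 16-bit quarters, so it takes the simple_sequence path and is
   synthesized in two instructions, in effect:
     movz	x0, #0x5678, lsl #32
     movk	x0, #0x1234, lsl #48
   Calling this function with GENERATE false just returns that count (2),
   which is how callers such as the rtx cost code can size an immediate
   sequence without emitting it.  */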
1421 void
1422 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1424 machine_mode mode = GET_MODE (dest);
1426 gcc_assert (mode == SImode || mode == DImode);
1428 /* Check on what type of symbol it is. */
1429 if (GET_CODE (imm) == SYMBOL_REF
1430 || GET_CODE (imm) == LABEL_REF
1431 || GET_CODE (imm) == CONST)
1433 rtx mem, base, offset;
1434 enum aarch64_symbol_type sty;
1436 /* If we have (const (plus symbol offset)), separate out the offset
1437 before we start classifying the symbol. */
1438 split_const (imm, &base, &offset);
1440 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1441 switch (sty)
1443 case SYMBOL_FORCE_TO_MEM:
1444 if (offset != const0_rtx
1445 && targetm.cannot_force_const_mem (mode, imm))
1447 gcc_assert (can_create_pseudo_p ());
1448 base = aarch64_force_temporary (mode, dest, base);
1449 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1450 aarch64_emit_move (dest, base);
1451 return;
1453 mem = force_const_mem (ptr_mode, imm);
1454 gcc_assert (mem);
1455 if (mode != ptr_mode)
1456 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1457 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1458 return;
1460 case SYMBOL_SMALL_TLSGD:
1461 case SYMBOL_SMALL_TLSDESC:
1462 case SYMBOL_SMALL_GOTTPREL:
1463 case SYMBOL_SMALL_GOT:
1464 case SYMBOL_TINY_GOT:
1465 if (offset != const0_rtx)
1467 gcc_assert(can_create_pseudo_p ());
1468 base = aarch64_force_temporary (mode, dest, base);
1469 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1470 aarch64_emit_move (dest, base);
1471 return;
1473 /* FALLTHRU */
1475 case SYMBOL_SMALL_TPREL:
1476 case SYMBOL_SMALL_ABSOLUTE:
1477 case SYMBOL_TINY_ABSOLUTE:
1478 aarch64_load_symref_appropriately (dest, imm, sty);
1479 return;
1481 default:
1482 gcc_unreachable ();
1486 if (!CONST_INT_P (imm))
1488 if (GET_CODE (imm) == HIGH)
1489 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1490 else
1492 rtx mem = force_const_mem (mode, imm);
1493 gcc_assert (mem);
1494 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1497 return;
1500 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1503 static bool
1504 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1505 tree exp ATTRIBUTE_UNUSED)
1507 /* Currently, always true. */
1508 return true;
1511 /* Implement TARGET_PASS_BY_REFERENCE. */
1513 static bool
1514 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1515 machine_mode mode,
1516 const_tree type,
1517 bool named ATTRIBUTE_UNUSED)
1519 HOST_WIDE_INT size;
1520 machine_mode dummymode;
1521 int nregs;
1523 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1524 size = (mode == BLKmode && type)
1525 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1527 /* Aggregates are passed by reference based on their size. */
1528 if (type && AGGREGATE_TYPE_P (type))
1530 size = int_size_in_bytes (type);
1533 /* Variable sized arguments are always returned by reference. */
1534 if (size < 0)
1535 return true;
1537 /* Can this be a candidate to be passed in fp/simd register(s)? */
1538 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1539 &dummymode, &nregs,
1540 NULL))
1541 return false;
1543 /* Arguments which are variable sized or larger than 2 registers are
1544 passed by reference unless they are a homogeneous floating-point
1545 aggregate. */
1546 return size > 2 * UNITS_PER_WORD;
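/* Under these rules a 24-byte structure of integers is passed by
   reference (it is larger than two registers and is not an HFA), while
   struct { double x, y, z; }, although also 24 bytes, is an HFA and is
   therefore passed by value in SIMD/FP registers when enough of them
   are available.  */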
1549 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1550 static bool
1551 aarch64_return_in_msb (const_tree valtype)
1553 machine_mode dummy_mode;
1554 int dummy_int;
1556 /* Never happens in little-endian mode. */
1557 if (!BYTES_BIG_ENDIAN)
1558 return false;
1560 /* Only composite types smaller than or equal to 16 bytes can
1561 be potentially returned in registers. */
1562 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1563 || int_size_in_bytes (valtype) <= 0
1564 || int_size_in_bytes (valtype) > 16)
1565 return false;
1567 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1568 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1569 is always passed/returned in the least significant bits of fp/simd
1570 register(s). */
1571 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1572 &dummy_mode, &dummy_int, NULL))
1573 return false;
1575 return true;
1578 /* Implement TARGET_FUNCTION_VALUE.
1579 Define how to find the value returned by a function. */
1581 static rtx
1582 aarch64_function_value (const_tree type, const_tree func,
1583 bool outgoing ATTRIBUTE_UNUSED)
1585 machine_mode mode;
1586 int unsignedp;
1587 int count;
1588 machine_mode ag_mode;
1590 mode = TYPE_MODE (type);
1591 if (INTEGRAL_TYPE_P (type))
1592 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1594 if (aarch64_return_in_msb (type))
1596 HOST_WIDE_INT size = int_size_in_bytes (type);
1598 if (size % UNITS_PER_WORD != 0)
1600 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1601 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1605 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1606 &ag_mode, &count, NULL))
1608 if (!aarch64_composite_type_p (type, mode))
1610 gcc_assert (count == 1 && mode == ag_mode);
1611 return gen_rtx_REG (mode, V0_REGNUM);
1613 else
1615 int i;
1616 rtx par;
1618 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1619 for (i = 0; i < count; i++)
1621 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1622 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1623 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1624 XVECEXP (par, 0, i) = tmp;
1626 return par;
1629 else
1630 return gen_rtx_REG (mode, R0_REGNUM);
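/* For example, struct { float x, y; } is an HFA, so its value comes back
   as a PARALLEL of two SFmode pieces in s0 and s1 (byte offsets 0 and 4),
   whereas a 16-byte structure that is not an HFA is returned in the
   general registers x0 and x1.  */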
1633 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1634 Return true if REGNO is the number of a hard register in which the values
1635 of called function may come back. */
1637 static bool
1638 aarch64_function_value_regno_p (const unsigned int regno)
1640 /* Maximum of 16 bytes can be returned in the general registers. Examples
1641 of 16-byte return values are: 128-bit integers and 16-byte small
1642 structures (excluding homogeneous floating-point aggregates). */
1643 if (regno == R0_REGNUM || regno == R1_REGNUM)
1644 return true;
1646 /* Up to four fp/simd registers can return a function value, e.g. a
1647 homogeneous floating-point aggregate having four members. */
1648 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1649 return !TARGET_GENERAL_REGS_ONLY;
1651 return false;
1654 /* Implement TARGET_RETURN_IN_MEMORY.
1656 If the type T of the result of a function is such that
1657 void func (T arg)
1658 would require that arg be passed as a value in a register (or set of
1659 registers) according to the parameter passing rules, then the result
1660 is returned in the same registers as would be used for such an
1661 argument. */
1663 static bool
1664 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1666 HOST_WIDE_INT size;
1667 machine_mode ag_mode;
1668 int count;
1670 if (!AGGREGATE_TYPE_P (type)
1671 && TREE_CODE (type) != COMPLEX_TYPE
1672 && TREE_CODE (type) != VECTOR_TYPE)
1673 /* Simple scalar types always returned in registers. */
1674 return false;
1676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1677 type,
1678 &ag_mode,
1679 &count,
1680 NULL))
1681 return false;
1683 /* Types larger than 2 registers returned in memory. */
1684 size = int_size_in_bytes (type);
1685 return (size < 0 || size > 2 * UNITS_PER_WORD);
1688 static bool
1689 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1690 const_tree type, int *nregs)
1692 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1693 return aarch64_vfp_is_call_or_return_candidate (mode,
1694 type,
1695 &pcum->aapcs_vfp_rmode,
1696 nregs,
1697 NULL);
1700 /* Given MODE and TYPE of a function argument, return the alignment in
1701 bits. The idea is to suppress any stronger alignment requested by
1702 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1703 This is a helper function for local use only. */
1705 static unsigned int
1706 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1708 unsigned int alignment;
1710 if (type)
1712 if (!integer_zerop (TYPE_SIZE (type)))
1714 if (TYPE_MODE (type) == mode)
1715 alignment = TYPE_ALIGN (type);
1716 else
1717 alignment = GET_MODE_ALIGNMENT (mode);
1719 else
1720 alignment = 0;
1722 else
1723 alignment = GET_MODE_ALIGNMENT (mode);
1725 return alignment;
1728 /* Layout a function argument according to the AAPCS64 rules. The rule
1729 numbers refer to the rule numbers in the AAPCS64. */
1731 static void
1732 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1733 const_tree type,
1734 bool named ATTRIBUTE_UNUSED)
1736 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1737 int ncrn, nvrn, nregs;
1738 bool allocate_ncrn, allocate_nvrn;
1739 HOST_WIDE_INT size;
1741 /* We need to do this once per argument. */
1742 if (pcum->aapcs_arg_processed)
1743 return;
1745 pcum->aapcs_arg_processed = true;
1747 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1748 size
1749 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1750 UNITS_PER_WORD);
1752 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1753 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1754 mode,
1755 type,
1756 &nregs);
1758 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1759 The following code thus handles passing by SIMD/FP registers first. */
1761 nvrn = pcum->aapcs_nvrn;
1763 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1764 and homogeneous short-vector aggregates (HVA). */
1765 if (allocate_nvrn)
1767 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1769 pcum->aapcs_nextnvrn = nvrn + nregs;
1770 if (!aarch64_composite_type_p (type, mode))
1772 gcc_assert (nregs == 1);
1773 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1775 else
1777 rtx par;
1778 int i;
1779 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1780 for (i = 0; i < nregs; i++)
1782 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1783 V0_REGNUM + nvrn + i);
1784 tmp = gen_rtx_EXPR_LIST
1785 (VOIDmode, tmp,
1786 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1787 XVECEXP (par, 0, i) = tmp;
1789 pcum->aapcs_reg = par;
1791 return;
1793 else
1795 /* C.3 NSRN is set to 8. */
1796 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1797 goto on_stack;
1801 ncrn = pcum->aapcs_ncrn;
1802 nregs = size / UNITS_PER_WORD;
1804 /* C6 - C9, though the sign and zero extension semantics are
1805 handled elsewhere. This is the case where the argument fits
1806 entirely in general registers. */
1807 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1809 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1811 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1813 /* C.8 if the argument has an alignment of 16 then the NGRN is
1814 rounded up to the next even number. */
1815 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1817 ++ncrn;
1818 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1820 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1821 A reg is still generated for it, but the caller should be smart
1822 enough not to use it. */
1823 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1825 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1827 else
1829 rtx par;
1830 int i;
1832 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1833 for (i = 0; i < nregs; i++)
1835 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1836 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1837 GEN_INT (i * UNITS_PER_WORD));
1838 XVECEXP (par, 0, i) = tmp;
1840 pcum->aapcs_reg = par;
1843 pcum->aapcs_nextncrn = ncrn + nregs;
1844 return;
1847 /* C.11 */
1848 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1850 /* The argument is passed on stack; record the needed number of words for
1851 this argument and align the total size if necessary. */
1852 on_stack:
1853 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1854 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1855 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1856 16 / UNITS_PER_WORD);
1857 return;
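/* As an example of rule C.8: for f (int a, __int128 b), a is assigned w0,
   leaving NGRN == 1; b needs two registers and has 16-byte alignment, so
   NGRN is first rounded up to 2 and b is passed in the even/odd pair
   x2/x3 rather than straddling x1/x2.  */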
1860 /* Implement TARGET_FUNCTION_ARG. */
1862 static rtx
1863 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1864 const_tree type, bool named)
1866 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1867 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1869 if (mode == VOIDmode)
1870 return NULL_RTX;
1872 aarch64_layout_arg (pcum_v, mode, type, named);
1873 return pcum->aapcs_reg;
1876 void
1877 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1878 const_tree fntype ATTRIBUTE_UNUSED,
1879 rtx libname ATTRIBUTE_UNUSED,
1880 const_tree fndecl ATTRIBUTE_UNUSED,
1881 unsigned n_named ATTRIBUTE_UNUSED)
1883 pcum->aapcs_ncrn = 0;
1884 pcum->aapcs_nvrn = 0;
1885 pcum->aapcs_nextncrn = 0;
1886 pcum->aapcs_nextnvrn = 0;
1887 pcum->pcs_variant = ARM_PCS_AAPCS64;
1888 pcum->aapcs_reg = NULL_RTX;
1889 pcum->aapcs_arg_processed = false;
1890 pcum->aapcs_stack_words = 0;
1891 pcum->aapcs_stack_size = 0;
1893 return;
1896 static void
1897 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1898 machine_mode mode,
1899 const_tree type,
1900 bool named)
1902 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1903 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1905 aarch64_layout_arg (pcum_v, mode, type, named);
1906 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1907 != (pcum->aapcs_stack_words != 0));
1908 pcum->aapcs_arg_processed = false;
1909 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1910 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1911 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1912 pcum->aapcs_stack_words = 0;
1913 pcum->aapcs_reg = NULL_RTX;
1917 bool
1918 aarch64_function_arg_regno_p (unsigned regno)
1920 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1921 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1924 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1925 PARM_BOUNDARY bits of alignment, but will be given anything up
1926 to STACK_BOUNDARY bits if the type requires it. This makes sure
1927 that both before and after the layout of each argument, the Next
1928 Stacked Argument Address (NSAA) will have a minimum alignment of
1929 8 bytes. */
1931 static unsigned int
1932 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1934 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1936 if (alignment < PARM_BOUNDARY)
1937 alignment = PARM_BOUNDARY;
1938 if (alignment > STACK_BOUNDARY)
1939 alignment = STACK_BOUNDARY;
1940 return alignment;
1943 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1945 Return true if an argument passed on the stack should be padded upwards,
1946 i.e. if the least-significant byte of the stack slot has useful data.
1948 Small aggregate types are placed in the lowest memory address.
1950 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
1952 bool
1953 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1955 /* On little-endian targets, the least significant byte of every stack
1956 argument is passed at the lowest byte address of the stack slot. */
1957 if (!BYTES_BIG_ENDIAN)
1958 return true;
1960 /* Otherwise, integral, floating-point and pointer types are padded downward:
1961 the least significant byte of a stack argument is passed at the highest
1962 byte address of the stack slot. */
1963 if (type
1964 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1965 || POINTER_TYPE_P (type))
1966 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1967 return false;
1969 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1970 return true;
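/* Concretely, on a big-endian target a char passed on the stack is padded
   downward and so occupies the highest-addressed byte of its 8-byte slot,
   whereas a 3-byte structure is padded upward and starts at the lowest
   byte address of the slot.  On little-endian targets everything is
   padded upward.  */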
1973 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1975 It specifies padding for the last (may also be the only)
1976 element of a block move between registers and memory. If
1977 the block is assumed to be in memory, padding upward means that
1978 the last element is padded after its most significant byte, while
1979 with downward padding the last element is padded at its least
1980 significant byte side.
1982 Small aggregates and small complex types are always padded
1983 upwards.
1985 We don't need to worry about homogeneous floating-point or
1986 short-vector aggregates; their move is not affected by the
1987 padding direction determined here. Regardless of endianness,
1988 each element of such an aggregate is put in the least
1989 significant bits of a fp/simd register.
1991 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1992 register has useful data, and return the opposite if the most
1993 significant byte does. */
1995 bool
1996 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
1997 bool first ATTRIBUTE_UNUSED)
2000 /* Small composite types are always padded upward. */
2001 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2003 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2004 : GET_MODE_SIZE (mode));
2005 if (size < 2 * UNITS_PER_WORD)
2006 return true;
2009 /* Otherwise, use the default padding. */
2010 return !BYTES_BIG_ENDIAN;
2013 static machine_mode
2014 aarch64_libgcc_cmp_return_mode (void)
2016 return SImode;
2019 static bool
2020 aarch64_frame_pointer_required (void)
2022 /* In aarch64_override_options_after_change
2023 flag_omit_leaf_frame_pointer turns off the frame pointer by
2024 default. Turn it back on now if we've not got a leaf
2025 function. */
2026 if (flag_omit_leaf_frame_pointer
2027 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2028 return true;
2030 return false;
2033 /* Mark the registers that need to be saved by the callee and calculate
2034 the size of the callee-saved registers area and frame record (both FP
2035 and LR may be omitted). */
2036 static void
2037 aarch64_layout_frame (void)
2039 HOST_WIDE_INT offset = 0;
2040 int regno;
2042 if (reload_completed && cfun->machine->frame.laid_out)
2043 return;
2045 #define SLOT_NOT_REQUIRED (-2)
2046 #define SLOT_REQUIRED (-1)
2048 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2049 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2051 /* First mark all the registers that really need to be saved... */
2052 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2053 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2055 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2056 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2058 /* ... that includes the eh data registers (if needed)... */
2059 if (crtl->calls_eh_return)
2060 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2061 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2062 = SLOT_REQUIRED;
2064 /* ... and any callee saved register that dataflow says is live. */
2065 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2066 if (df_regs_ever_live_p (regno)
2067 && (regno == R30_REGNUM
2068 || !call_used_regs[regno]))
2069 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2071 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2072 if (df_regs_ever_live_p (regno)
2073 && !call_used_regs[regno])
2074 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2076 if (frame_pointer_needed)
2078 /* FP and LR are placed in the linkage record. */
2079 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2080 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2081 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2082 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2083 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2084 offset += 2 * UNITS_PER_WORD;
2087 /* Now assign stack slots for them. */
2088 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2089 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2091 cfun->machine->frame.reg_offset[regno] = offset;
2092 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2093 cfun->machine->frame.wb_candidate1 = regno;
2094 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2095 cfun->machine->frame.wb_candidate2 = regno;
2096 offset += UNITS_PER_WORD;
2099 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2100 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2102 cfun->machine->frame.reg_offset[regno] = offset;
2103 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2104 cfun->machine->frame.wb_candidate1 = regno;
2105 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2106 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2107 cfun->machine->frame.wb_candidate2 = regno;
2108 offset += UNITS_PER_WORD;
2111 cfun->machine->frame.padding0 =
2112 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2113 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2115 cfun->machine->frame.saved_regs_size = offset;
2117 cfun->machine->frame.hard_fp_offset
2118 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2119 + get_frame_size ()
2120 + cfun->machine->frame.saved_regs_size,
2121 STACK_BOUNDARY / BITS_PER_UNIT);
2123 cfun->machine->frame.frame_size
2124 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2125 + crtl->outgoing_args_size,
2126 STACK_BOUNDARY / BITS_PER_UNIT);
2128 cfun->machine->frame.laid_out = true;
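/* Worked example (illustrative only; the concrete numbers below are
   assumptions, not taken from any particular function).  For a function
   with no varargs save area, 24 bytes of locals, 16 bytes of outgoing
   arguments, a frame pointer, and x19/x20 live across calls, the code
   above lays the frame out as:

     reg_offset[x29] = 0,  reg_offset[x30] = 8      (frame record)
     reg_offset[x19] = 16, reg_offset[x20] = 24
     saved_regs_size = 32                (16-byte aligned, padding0 = 0)
     hard_fp_offset  = ROUND_UP (0 + 24 + 32, 16) = 64
     frame_size      = ROUND_UP (64 + 16, 16)      = 80.  */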
2131 static bool
2132 aarch64_register_saved_on_entry (int regno)
2134 return cfun->machine->frame.reg_offset[regno] >= 0;
2137 static unsigned
2138 aarch64_next_callee_save (unsigned regno, unsigned limit)
2140 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2141 regno ++;
2142 return regno;
2145 static void
2146 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2147 HOST_WIDE_INT adjustment)
2149 rtx base_rtx = stack_pointer_rtx;
2150 rtx insn, reg, mem;
2152 reg = gen_rtx_REG (mode, regno);
2153 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2154 plus_constant (Pmode, base_rtx, -adjustment));
2155 mem = gen_rtx_MEM (mode, mem);
2157 insn = emit_move_insn (mem, reg);
2158 RTX_FRAME_RELATED_P (insn) = 1;
2161 static rtx
2162 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2163 HOST_WIDE_INT adjustment)
2165 switch (mode)
2167 case DImode:
2168 return gen_storewb_pairdi_di (base, base, reg, reg2,
2169 GEN_INT (-adjustment),
2170 GEN_INT (UNITS_PER_WORD - adjustment));
2171 case DFmode:
2172 return gen_storewb_pairdf_di (base, base, reg, reg2,
2173 GEN_INT (-adjustment),
2174 GEN_INT (UNITS_PER_WORD - adjustment));
2175 default:
2176 gcc_unreachable ();
2180 static void
2181 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2182 unsigned regno2, HOST_WIDE_INT adjustment)
2184 rtx_insn *insn;
2185 rtx reg1 = gen_rtx_REG (mode, regno1);
2186 rtx reg2 = gen_rtx_REG (mode, regno2);
2188 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2189 reg2, adjustment));
2190 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2191 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2192 RTX_FRAME_RELATED_P (insn) = 1;
2195 static rtx
2196 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2197 HOST_WIDE_INT adjustment)
2199 switch (mode)
2201 case DImode:
2202 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2203 GEN_INT (UNITS_PER_WORD));
2204 case DFmode:
2205 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2206 GEN_INT (UNITS_PER_WORD));
2207 default:
2208 gcc_unreachable ();
2212 static rtx
2213 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2214 rtx reg2)
2216 switch (mode)
2218 case DImode:
2219 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2221 case DFmode:
2222 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2224 default:
2225 gcc_unreachable ();
2229 static rtx
2230 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2231 rtx mem2)
2233 switch (mode)
2235 case DImode:
2236 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2238 case DFmode:
2239 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2241 default:
2242 gcc_unreachable ();
2247 static void
2248 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2249 unsigned start, unsigned limit, bool skip_wb)
2251 rtx_insn *insn;
2252 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2253 ? gen_frame_mem : gen_rtx_MEM);
2254 unsigned regno;
2255 unsigned regno2;
2257 for (regno = aarch64_next_callee_save (start, limit);
2258 regno <= limit;
2259 regno = aarch64_next_callee_save (regno + 1, limit))
2261 rtx reg, mem;
2262 HOST_WIDE_INT offset;
2264 if (skip_wb
2265 && (regno == cfun->machine->frame.wb_candidate1
2266 || regno == cfun->machine->frame.wb_candidate2))
2267 continue;
2269 reg = gen_rtx_REG (mode, regno);
2270 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2271 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2272 offset));
2274 regno2 = aarch64_next_callee_save (regno + 1, limit);
2276 if (regno2 <= limit
2277 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2278 == cfun->machine->frame.reg_offset[regno2]))
2281 rtx reg2 = gen_rtx_REG (mode, regno2);
2282 rtx mem2;
2284 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2285 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2286 offset));
2287 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2288 reg2));
2290 /* The first part of a frame-related parallel insn is
2291 always assumed to be relevant to the frame
2292 calculations; subsequent parts are only
2293 frame-related if explicitly marked. */
2294 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2295 regno = regno2;
2297 else
2298 insn = emit_move_insn (mem, reg);
2300 RTX_FRAME_RELATED_P (insn) = 1;
2304 static void
2305 aarch64_restore_callee_saves (machine_mode mode,
2306 HOST_WIDE_INT start_offset, unsigned start,
2307 unsigned limit, bool skip_wb, rtx *cfi_ops)
2309 rtx base_rtx = stack_pointer_rtx;
2310 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2311 ? gen_frame_mem : gen_rtx_MEM);
2312 unsigned regno;
2313 unsigned regno2;
2314 HOST_WIDE_INT offset;
2316 for (regno = aarch64_next_callee_save (start, limit);
2317 regno <= limit;
2318 regno = aarch64_next_callee_save (regno + 1, limit))
2320 rtx reg, mem;
2322 if (skip_wb
2323 && (regno == cfun->machine->frame.wb_candidate1
2324 || regno == cfun->machine->frame.wb_candidate2))
2325 continue;
2327 reg = gen_rtx_REG (mode, regno);
2328 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2329 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2331 regno2 = aarch64_next_callee_save (regno + 1, limit);
2333 if (regno2 <= limit
2334 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2335 == cfun->machine->frame.reg_offset[regno2]))
2337 rtx reg2 = gen_rtx_REG (mode, regno2);
2338 rtx mem2;
2340 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2341 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2342 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2344 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2345 regno = regno2;
2347 else
2348 emit_move_insn (reg, mem);
2349 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2353 /* AArch64 stack frames generated by this compiler look like:
2355 +-------------------------------+
2357 | incoming stack arguments |
2359 +-------------------------------+
2360 | | <-- incoming stack pointer (aligned)
2361 | callee-allocated save area |
2362 | for register varargs |
2364 +-------------------------------+
2365 | local variables | <-- frame_pointer_rtx
2367 +-------------------------------+
2368 | padding0 | \
2369 +-------------------------------+ |
2370 | callee-saved registers | | frame.saved_regs_size
2371 +-------------------------------+ |
2372 | LR' | |
2373 +-------------------------------+ |
2374 | FP' | / <- hard_frame_pointer_rtx (aligned)
2375 +-------------------------------+
2376 | dynamic allocation |
2377 +-------------------------------+
2378 | padding |
2379 +-------------------------------+
2380 | outgoing stack arguments | <-- arg_pointer
2382 +-------------------------------+
2383 | | <-- stack_pointer_rtx (aligned)
2385 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2386 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2387 unchanged. */
2389 /* Generate the prologue instructions for entry into a function.
2390 Establish the stack frame by decreasing the stack pointer with a
2391 properly calculated size and, if necessary, create a frame record
2392 filled with the values of LR and previous frame pointer. The
2393 current FP is also set up if it is in use. */
2395 void
2396 aarch64_expand_prologue (void)
2398 /* sub sp, sp, #<frame_size>
2399 stp {fp, lr}, [sp, #<frame_size> - 16]
2400 add fp, sp, #<frame_size> - hardfp_offset
2401 stp {cs_reg}, [fp, #-16] etc.
2403 sub sp, sp, <final_adjustment_if_any>
2405 HOST_WIDE_INT frame_size, offset;
2406 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2407 HOST_WIDE_INT hard_fp_offset;
2408 rtx_insn *insn;
2410 aarch64_layout_frame ();
2412 offset = frame_size = cfun->machine->frame.frame_size;
2413 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2414 fp_offset = frame_size - hard_fp_offset;
2416 if (flag_stack_usage_info)
2417 current_function_static_stack_size = frame_size;
2419 /* Store pairs and load pairs have a range of only -512 to 504. */
2420 if (offset >= 512)
2422 /* When the frame has a large size, an initial decrease is done on
2423 the stack pointer to jump over the callee-allocated save area for
2424 register varargs, the local variable area and/or the callee-saved
2425 register area. This will allow the pre-index write-back
2426 store pair instructions to be used for setting up the stack frame
2427 efficiently. */
2428 offset = hard_fp_offset;
2429 if (offset >= 512)
2430 offset = cfun->machine->frame.saved_regs_size;
2432 frame_size -= (offset + crtl->outgoing_args_size);
2433 fp_offset = 0;
2435 if (frame_size >= 0x1000000)
2437 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2438 emit_move_insn (op0, GEN_INT (-frame_size));
2439 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2441 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2442 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2443 plus_constant (Pmode, stack_pointer_rtx,
2444 -frame_size)));
2445 RTX_FRAME_RELATED_P (insn) = 1;
2447 else if (frame_size > 0)
2449 int hi_ofs = frame_size & 0xfff000;
2450 int lo_ofs = frame_size & 0x000fff;
2452 if (hi_ofs)
2454 insn = emit_insn (gen_add2_insn
2455 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2456 RTX_FRAME_RELATED_P (insn) = 1;
2458 if (lo_ofs)
2460 insn = emit_insn (gen_add2_insn
2461 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2462 RTX_FRAME_RELATED_P (insn) = 1;
2466 else
2467 frame_size = -1;
2469 if (offset > 0)
2471 bool skip_wb = false;
2473 if (frame_pointer_needed)
2475 skip_wb = true;
2477 if (fp_offset)
2479 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2480 GEN_INT (-offset)));
2481 RTX_FRAME_RELATED_P (insn) = 1;
2483 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2484 R30_REGNUM, false);
2486 else
2487 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2489 /* Set up frame pointer to point to the location of the
2490 previous frame pointer on the stack. */
2491 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2492 stack_pointer_rtx,
2493 GEN_INT (fp_offset)));
2494 RTX_FRAME_RELATED_P (insn) = 1;
2495 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2497 else
2499 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2500 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2502 if (fp_offset
2503 || reg1 == FIRST_PSEUDO_REGISTER
2504 || (reg2 == FIRST_PSEUDO_REGISTER
2505 && offset >= 256))
2507 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2508 GEN_INT (-offset)));
2509 RTX_FRAME_RELATED_P (insn) = 1;
2511 else
2513 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2515 skip_wb = true;
2517 if (reg2 == FIRST_PSEUDO_REGISTER)
2518 aarch64_pushwb_single_reg (mode1, reg1, offset);
2519 else
2520 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2524 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2525 skip_wb);
2526 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2527 skip_wb);
2530 /* when offset >= 512,
2531 sub sp, sp, #<outgoing_args_size> */
2532 if (frame_size > -1)
2534 if (crtl->outgoing_args_size > 0)
2536 insn = emit_insn (gen_add2_insn
2537 (stack_pointer_rtx,
2538 GEN_INT (- crtl->outgoing_args_size)));
2539 RTX_FRAME_RELATED_P (insn) = 1;
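/* Illustrative sketch of the initial stack adjustment for a large frame
   (exact register choice and scheduling may differ).  Once the
   callee-save area and outgoing-argument space have been peeled off, a
   remaining adjustment of, say, 0x12340 is split into hi_ofs = 0x12000
   and lo_ofs = 0x340:

     sub  sp, sp, #0x12000
     sub  sp, sp, #0x340

   while a remainder of 0x1000000 or more is first built (negated) in
   IP0 and added to SP:

     mov  x16, #-<adjustment>        // possibly a movn/movk sequence
     add  sp, sp, x16  */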
2544 /* Return TRUE if we can use a simple_return insn.
2546 This function checks whether the callee saved stack is empty, which
2547 means no restore actions are needed. The pro_and_epilogue pass uses
2548 this to check whether the shrink-wrapping optimization is feasible. */
2550 bool
2551 aarch64_use_return_insn_p (void)
2553 if (!reload_completed)
2554 return false;
2556 if (crtl->profile)
2557 return false;
2559 aarch64_layout_frame ();
2561 return cfun->machine->frame.frame_size == 0;
2564 /* Generate the epilogue instructions for returning from a function. */
2565 void
2566 aarch64_expand_epilogue (bool for_sibcall)
2568 HOST_WIDE_INT frame_size, offset;
2569 HOST_WIDE_INT fp_offset;
2570 HOST_WIDE_INT hard_fp_offset;
2571 rtx_insn *insn;
2572 /* We need a memory barrier to prevent reads from the deallocated stack. */
2573 bool need_barrier_p = (get_frame_size () != 0
2574 || cfun->machine->frame.saved_varargs_size);
2576 aarch64_layout_frame ();
2578 offset = frame_size = cfun->machine->frame.frame_size;
2579 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2580 fp_offset = frame_size - hard_fp_offset;
2582 /* Store pairs and load pairs have a range of only -512 to 504. */
2583 if (offset >= 512)
2585 offset = hard_fp_offset;
2586 if (offset >= 512)
2587 offset = cfun->machine->frame.saved_regs_size;
2589 frame_size -= (offset + crtl->outgoing_args_size);
2590 fp_offset = 0;
2591 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2593 insn = emit_insn (gen_add2_insn
2594 (stack_pointer_rtx,
2595 GEN_INT (crtl->outgoing_args_size)));
2596 RTX_FRAME_RELATED_P (insn) = 1;
2599 else
2600 frame_size = -1;
2602 /* If there were outgoing arguments or we've done dynamic stack
2603 allocation, then restore the stack pointer from the frame
2604 pointer. This is at most one insn and more efficient than using
2605 GCC's internal mechanism. */
2606 if (frame_pointer_needed
2607 && (crtl->outgoing_args_size || cfun->calls_alloca))
2609 if (cfun->calls_alloca)
2610 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2612 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2613 hard_frame_pointer_rtx,
2614 GEN_INT (0)));
2615 offset = offset - fp_offset;
2618 if (offset > 0)
2620 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2621 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2622 bool skip_wb = true;
2623 rtx cfi_ops = NULL;
2625 if (frame_pointer_needed)
2626 fp_offset = 0;
2627 else if (fp_offset
2628 || reg1 == FIRST_PSEUDO_REGISTER
2629 || (reg2 == FIRST_PSEUDO_REGISTER
2630 && offset >= 256))
2631 skip_wb = false;
2633 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2634 skip_wb, &cfi_ops);
2635 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2636 skip_wb, &cfi_ops);
2638 if (need_barrier_p)
2639 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2641 if (skip_wb)
2643 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2644 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2646 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2647 if (reg2 == FIRST_PSEUDO_REGISTER)
2649 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2650 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2651 mem = gen_rtx_MEM (mode1, mem);
2652 insn = emit_move_insn (rreg1, mem);
2654 else
2656 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2658 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2659 insn = emit_insn (aarch64_gen_loadwb_pair
2660 (mode1, stack_pointer_rtx, rreg1,
2661 rreg2, offset));
2664 else
2666 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2667 GEN_INT (offset)));
2670 /* Reset the CFA to be SP + FRAME_SIZE. */
2671 rtx new_cfa = stack_pointer_rtx;
2672 if (frame_size > 0)
2673 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2674 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2675 REG_NOTES (insn) = cfi_ops;
2676 RTX_FRAME_RELATED_P (insn) = 1;
2679 if (frame_size > 0)
2681 if (need_barrier_p)
2682 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2684 if (frame_size >= 0x1000000)
2686 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2687 emit_move_insn (op0, GEN_INT (frame_size));
2688 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2690 else
2692 int hi_ofs = frame_size & 0xfff000;
2693 int lo_ofs = frame_size & 0x000fff;
2695 if (hi_ofs && lo_ofs)
2697 insn = emit_insn (gen_add2_insn
2698 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2699 RTX_FRAME_RELATED_P (insn) = 1;
2700 frame_size = lo_ofs;
2702 insn = emit_insn (gen_add2_insn
2703 (stack_pointer_rtx, GEN_INT (frame_size)));
2706 /* Reset the CFA to be SP + 0. */
2707 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2708 RTX_FRAME_RELATED_P (insn) = 1;
2711 /* Stack adjustment for exception handler. */
2712 if (crtl->calls_eh_return)
2714 /* We need to unwind the stack by the offset computed by
2715 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2716 to be SP; letting the CFA move during this adjustment
2717 is just as correct as retaining the CFA from the body
2718 of the function. Therefore, do nothing special. */
2719 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2722 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2723 if (!for_sibcall)
2724 emit_jump_insn (ret_rtx);
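/* Illustrative epilogue for a small frame with a frame record and
   x19/x20 saved (instruction selection may differ):

     ldp  x19, x20, [sp, #16]
     ldp  x29, x30, [sp], #<offset>   // writeback pops the frame
     ret

   The REG_CFA_RESTORE notes collected in cfi_ops are attached to the
   deallocating instruction so the unwind info tracks the CFA
   correctly.  */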
2727 /* Return the place to copy the exception unwinding return address to.
2728 This will probably be a stack slot, but could (in theory) be the
2729 return register. */
2730 rtx
2731 aarch64_final_eh_return_addr (void)
2733 HOST_WIDE_INT fp_offset;
2735 aarch64_layout_frame ();
2737 fp_offset = cfun->machine->frame.frame_size
2738 - cfun->machine->frame.hard_fp_offset;
2740 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2741 return gen_rtx_REG (DImode, LR_REGNUM);
2743 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2744 result in a store to save LR introduced by builtin_eh_return () being
2745 incorrectly deleted because the alias is not detected.
2746 So in the calculation of the address to copy the exception unwinding
2747 return address to, we note 2 cases.
2748 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2749 we return a SP-relative location since all the addresses are SP-relative
2750 in this case. This prevents the store from being optimized away.
2751 If the fp_offset is not 0, then the addresses will be FP-relative and
2752 therefore we return a FP-relative location. */
2754 if (frame_pointer_needed)
2756 if (fp_offset)
2757 return gen_frame_mem (DImode,
2758 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2759 else
2760 return gen_frame_mem (DImode,
2761 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2764 /* If FP is not needed, we calculate the location of LR, which would be
2765 at the top of the saved registers block. */
2767 return gen_frame_mem (DImode,
2768 plus_constant (Pmode,
2769 stack_pointer_rtx,
2770 fp_offset
2771 + cfun->machine->frame.saved_regs_size
2772 - 2 * UNITS_PER_WORD));
2775 /* Possibly output code to build up a constant in a register. For
2776 the benefit of the costs infrastructure, returns the number of
2777 instructions which would be emitted. GENERATE inhibits or
2778 enables code generation. */
2780 static int
2781 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2783 int insns = 0;
2785 if (aarch64_bitmask_imm (val, DImode))
2787 if (generate)
2788 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2789 insns = 1;
2791 else
2793 int i;
2794 int ncount = 0;
2795 int zcount = 0;
2796 HOST_WIDE_INT valp = val >> 16;
2797 HOST_WIDE_INT valm;
2798 HOST_WIDE_INT tval;
2800 for (i = 16; i < 64; i += 16)
2802 valm = (valp & 0xffff);
2804 if (valm != 0)
2805 ++ zcount;
2807 if (valm != 0xffff)
2808 ++ ncount;
2810 valp >>= 16;
2813 /* zcount contains the number of additional MOVK instructions
2814 required if the constant is built up with an initial MOVZ instruction,
2815 while ncount is the number of MOVK instructions required if starting
2816 with a MOVN instruction. Choose the sequence that yields the smaller
2817 number of instructions, preferring MOVZ instructions when the counts
2818 are equal. */
2819 if (ncount < zcount)
2821 if (generate)
2822 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2823 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2824 tval = 0xffff;
2825 insns++;
2827 else
2829 if (generate)
2830 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2831 GEN_INT (val & 0xffff));
2832 tval = 0;
2833 insns++;
2836 val >>= 16;
2838 for (i = 16; i < 64; i += 16)
2840 if ((val & 0xffff) != tval)
2842 if (generate)
2843 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2844 GEN_INT (i),
2845 GEN_INT (val & 0xffff)));
2846 insns++;
2848 val >>= 16;
2851 return insns;
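/* Worked example (illustrative): for val = 0xffffffff12345678 the three
   16-bit chunks above bit 15 are 0x1234, 0xffff and 0xffff, so
   zcount == 3 (MOVKs needed after a MOVZ) and ncount == 1 (MOVKs needed
   after a MOVN).  Since ncount < zcount the MOVN route is chosen,
   giving something like:

     movn x0, #0xa987                 // x0 = 0xffffffffffff5678
     movk x0, #0x1234, lsl #16        // patch the one non-0xffff chunk

   i.e. two instructions instead of four.  */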
2854 static void
2855 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2857 HOST_WIDE_INT mdelta = delta;
2858 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2859 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2861 if (mdelta < 0)
2862 mdelta = -mdelta;
2864 if (mdelta >= 4096 * 4096)
2866 (void) aarch64_build_constant (scratchreg, delta, true);
2867 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2869 else if (mdelta > 0)
2871 if (mdelta >= 4096)
2873 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2874 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2875 if (delta < 0)
2876 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2877 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2878 else
2879 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2880 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2882 if (mdelta % 4096 != 0)
2884 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2885 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2886 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
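/* Worked example (illustrative; the destination and scratch registers
   depend on the caller): for delta = 5000 the code above splits the
   addition into a shifted-register part and an immediate part, roughly:

     mov  <scratch>, #1               // 5000 / 4096
     add  x0, x0, <scratch>, lsl #12  // adds 4096
     add  x0, x0, #904                // adds 5000 % 4096

   while |delta| >= 4096 * 4096 falls back to building the full constant
   with aarch64_build_constant and a single register add.  */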
2891 /* Output code to add DELTA to the first argument, and then jump
2892 to FUNCTION. Used for C++ multiple inheritance. */
2893 static void
2894 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2895 HOST_WIDE_INT delta,
2896 HOST_WIDE_INT vcall_offset,
2897 tree function)
2899 /* The this pointer is always in x0. Note that this differs from
2900 Arm where the this pointer may be bumped to r1 if r0 is required
2901 to return a pointer to an aggregate. On AArch64 a result value
2902 pointer will be in x8. */
2903 int this_regno = R0_REGNUM;
2904 rtx this_rtx, temp0, temp1, addr, funexp;
2905 rtx_insn *insn;
2907 reload_completed = 1;
2908 emit_note (NOTE_INSN_PROLOGUE_END);
2910 if (vcall_offset == 0)
2911 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2912 else
2914 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2916 this_rtx = gen_rtx_REG (Pmode, this_regno);
2917 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2918 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2920 addr = this_rtx;
2921 if (delta != 0)
2923 if (delta >= -256 && delta < 256)
2924 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2925 plus_constant (Pmode, this_rtx, delta));
2926 else
2927 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2930 if (Pmode == ptr_mode)
2931 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2932 else
2933 aarch64_emit_move (temp0,
2934 gen_rtx_ZERO_EXTEND (Pmode,
2935 gen_rtx_MEM (ptr_mode, addr)));
2937 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2938 addr = plus_constant (Pmode, temp0, vcall_offset);
2939 else
2941 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2942 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2945 if (Pmode == ptr_mode)
2946 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
2947 else
2948 aarch64_emit_move (temp1,
2949 gen_rtx_SIGN_EXTEND (Pmode,
2950 gen_rtx_MEM (ptr_mode, addr)));
2952 emit_insn (gen_add2_insn (this_rtx, temp1));
2955 /* Generate a tail call to the target function. */
2956 if (!TREE_USED (function))
2958 assemble_external (function);
2959 TREE_USED (function) = 1;
2961 funexp = XEXP (DECL_RTL (function), 0);
2962 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2963 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2964 SIBLING_CALL_P (insn) = 1;
2966 insn = get_insns ();
2967 shorten_branches (insn);
2968 final_start_function (insn, file, 1);
2969 final (insn, file, 1);
2970 final_end_function ();
2972 /* Stop pretending to be a post-reload pass. */
2973 reload_completed = 0;
2976 static bool
2977 aarch64_tls_referenced_p (rtx x)
2979 if (!TARGET_HAVE_TLS)
2980 return false;
2981 subrtx_iterator::array_type array;
2982 FOR_EACH_SUBRTX (iter, array, x, ALL)
2984 const_rtx x = *iter;
2985 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
2986 return true;
2987 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2988 TLS offsets, not real symbol references. */
2989 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
2990 iter.skip_subrtxes ();
2992 return false;
2996 static int
2997 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2999 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3000 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3002 if (*imm1 < *imm2)
3003 return -1;
3004 if (*imm1 > *imm2)
3005 return +1;
3006 return 0;
3010 static void
3011 aarch64_build_bitmask_table (void)
3013 unsigned HOST_WIDE_INT mask, imm;
3014 unsigned int log_e, e, s, r;
3015 unsigned int nimms = 0;
3017 for (log_e = 1; log_e <= 6; log_e++)
3019 e = 1 << log_e;
3020 if (e == 64)
3021 mask = ~(HOST_WIDE_INT) 0;
3022 else
3023 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3024 for (s = 1; s < e; s++)
3026 for (r = 0; r < e; r++)
3028 /* set s consecutive bits to 1 (s < 64) */
3029 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3030 /* rotate right by r */
3031 if (r != 0)
3032 imm = ((imm >> r) | (imm << (e - r))) & mask;
3033 /* replicate the constant depending on SIMD size */
3034 switch (log_e) {
3035 case 1: imm |= (imm << 2);
3036 case 2: imm |= (imm << 4);
3037 case 3: imm |= (imm << 8);
3038 case 4: imm |= (imm << 16);
3039 case 5: imm |= (imm << 32);
3040 case 6:
3041 break;
3042 default:
3043 gcc_unreachable ();
3045 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3046 aarch64_bitmasks[nimms++] = imm;
3051 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3052 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3053 aarch64_bitmasks_cmp);
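/* Worked example (illustrative) of one table entry: with element size
   e = 8, s = 2 set bits and rotation r = 1, the loop above forms
   0b00000011, rotates it to 0b10000001 (0x81) and then replicates it
   across all eight bytes, yielding the bitmask immediate
   0x8181818181818181.  Entries such as 0x0000ffff0000ffff (e = 32,
   s = 16, r = 0) are produced the same way.  */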
3057 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3058 a left shift of 0 or 12 bits. */
3059 bool
3060 aarch64_uimm12_shift (HOST_WIDE_INT val)
3062 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3063 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
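/* For instance (illustrative), 0xabc and 0xabc000 both satisfy the test
   above and map onto "add x0, x1, #0xabc" and
   "add x0, x1, #0xabc, lsl #12" respectively, whereas 0xabc0 straddles
   the two 12-bit windows and is rejected.  */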
3068 /* Return true if val is an immediate that can be loaded into a
3069 register by a MOVZ instruction. */
3070 static bool
3071 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3073 if (GET_MODE_SIZE (mode) > 4)
3075 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3076 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3077 return 1;
3079 else
3081 /* Ignore sign extension. */
3082 val &= (HOST_WIDE_INT) 0xffffffff;
3084 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3085 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
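/* E.g. (illustrative) 0x12340000 satisfies the SImode test above and is
   loaded with a single "movz w0, #0x1234, lsl #16", whereas 0x12340001
   fails both 16-bit window tests and has to be built with more than one
   instruction.  */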
3089 /* Return true if val is a valid bitmask immediate. */
3090 bool
3091 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3093 if (GET_MODE_SIZE (mode) < 8)
3095 /* Replicate bit pattern. */
3096 val &= (HOST_WIDE_INT) 0xffffffff;
3097 val |= val << 32;
3099 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3100 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3104 /* Return true if val is an immediate that can be loaded into a
3105 register in a single instruction. */
3106 bool
3107 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3109 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3110 return 1;
3111 return aarch64_bitmask_imm (val, mode);
3114 static bool
3115 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3117 rtx base, offset;
3119 if (GET_CODE (x) == HIGH)
3120 return true;
3122 split_const (x, &base, &offset);
3123 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3125 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3126 != SYMBOL_FORCE_TO_MEM)
3127 return true;
3128 else
3129 /* Avoid generating a 64-bit relocation in ILP32; leave it for
3130 aarch64_expand_mov_immediate to handle properly. */
3131 return mode != ptr_mode;
3134 return aarch64_tls_referenced_p (x);
3137 /* Return true if register REGNO is a valid index register.
3138 STRICT_P is true if REG_OK_STRICT is in effect. */
3140 bool
3141 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3143 if (!HARD_REGISTER_NUM_P (regno))
3145 if (!strict_p)
3146 return true;
3148 if (!reg_renumber)
3149 return false;
3151 regno = reg_renumber[regno];
3153 return GP_REGNUM_P (regno);
3156 /* Return true if register REGNO is a valid base register.
3157 STRICT_P is true if REG_OK_STRICT is in effect. */
3159 bool
3160 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3162 if (!HARD_REGISTER_NUM_P (regno))
3164 if (!strict_p)
3165 return true;
3167 if (!reg_renumber)
3168 return false;
3170 regno = reg_renumber[regno];
3173 /* The fake registers will be eliminated to either the stack or
3174 hard frame pointer, both of which are usually valid base registers.
3175 Reload deals with the cases where the eliminated form isn't valid. */
3176 return (GP_REGNUM_P (regno)
3177 || regno == SP_REGNUM
3178 || regno == FRAME_POINTER_REGNUM
3179 || regno == ARG_POINTER_REGNUM);
3182 /* Return true if X is a valid base register.
3183 STRICT_P is true if REG_OK_STRICT is in effect. */
3185 static bool
3186 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3188 if (!strict_p && GET_CODE (x) == SUBREG)
3189 x = SUBREG_REG (x);
3191 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3194 /* Return true if address offset is a valid index. If it is, fill in INFO
3195 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3197 static bool
3198 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3199 machine_mode mode, bool strict_p)
3201 enum aarch64_address_type type;
3202 rtx index;
3203 int shift;
3205 /* (reg:P) */
3206 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3207 && GET_MODE (x) == Pmode)
3209 type = ADDRESS_REG_REG;
3210 index = x;
3211 shift = 0;
3213 /* (sign_extend:DI (reg:SI)) */
3214 else if ((GET_CODE (x) == SIGN_EXTEND
3215 || GET_CODE (x) == ZERO_EXTEND)
3216 && GET_MODE (x) == DImode
3217 && GET_MODE (XEXP (x, 0)) == SImode)
3219 type = (GET_CODE (x) == SIGN_EXTEND)
3220 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3221 index = XEXP (x, 0);
3222 shift = 0;
3224 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3225 else if (GET_CODE (x) == MULT
3226 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3227 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3228 && GET_MODE (XEXP (x, 0)) == DImode
3229 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3230 && CONST_INT_P (XEXP (x, 1)))
3232 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3233 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3234 index = XEXP (XEXP (x, 0), 0);
3235 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3237 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3238 else if (GET_CODE (x) == ASHIFT
3239 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3240 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3241 && GET_MODE (XEXP (x, 0)) == DImode
3242 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3243 && CONST_INT_P (XEXP (x, 1)))
3245 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3246 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3247 index = XEXP (XEXP (x, 0), 0);
3248 shift = INTVAL (XEXP (x, 1));
3250 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3251 else if ((GET_CODE (x) == SIGN_EXTRACT
3252 || GET_CODE (x) == ZERO_EXTRACT)
3253 && GET_MODE (x) == DImode
3254 && GET_CODE (XEXP (x, 0)) == MULT
3255 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3256 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3258 type = (GET_CODE (x) == SIGN_EXTRACT)
3259 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3260 index = XEXP (XEXP (x, 0), 0);
3261 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3262 if (INTVAL (XEXP (x, 1)) != 32 + shift
3263 || INTVAL (XEXP (x, 2)) != 0)
3264 shift = -1;
3266 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3267 (const_int 0xffffffff<<shift)) */
3268 else if (GET_CODE (x) == AND
3269 && GET_MODE (x) == DImode
3270 && GET_CODE (XEXP (x, 0)) == MULT
3271 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3272 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3273 && CONST_INT_P (XEXP (x, 1)))
3275 type = ADDRESS_REG_UXTW;
3276 index = XEXP (XEXP (x, 0), 0);
3277 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3278 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3279 shift = -1;
3281 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3282 else if ((GET_CODE (x) == SIGN_EXTRACT
3283 || GET_CODE (x) == ZERO_EXTRACT)
3284 && GET_MODE (x) == DImode
3285 && GET_CODE (XEXP (x, 0)) == ASHIFT
3286 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3287 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3289 type = (GET_CODE (x) == SIGN_EXTRACT)
3290 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3291 index = XEXP (XEXP (x, 0), 0);
3292 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3293 if (INTVAL (XEXP (x, 1)) != 32 + shift
3294 || INTVAL (XEXP (x, 2)) != 0)
3295 shift = -1;
3297 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3298 (const_int 0xffffffff<<shift)) */
3299 else if (GET_CODE (x) == AND
3300 && GET_MODE (x) == DImode
3301 && GET_CODE (XEXP (x, 0)) == ASHIFT
3302 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3303 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3304 && CONST_INT_P (XEXP (x, 1)))
3306 type = ADDRESS_REG_UXTW;
3307 index = XEXP (XEXP (x, 0), 0);
3308 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3309 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3310 shift = -1;
3312 /* (mult:P (reg:P) (const_int scale)) */
3313 else if (GET_CODE (x) == MULT
3314 && GET_MODE (x) == Pmode
3315 && GET_MODE (XEXP (x, 0)) == Pmode
3316 && CONST_INT_P (XEXP (x, 1)))
3318 type = ADDRESS_REG_REG;
3319 index = XEXP (x, 0);
3320 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3322 /* (ashift:P (reg:P) (const_int shift)) */
3323 else if (GET_CODE (x) == ASHIFT
3324 && GET_MODE (x) == Pmode
3325 && GET_MODE (XEXP (x, 0)) == Pmode
3326 && CONST_INT_P (XEXP (x, 1)))
3328 type = ADDRESS_REG_REG;
3329 index = XEXP (x, 0);
3330 shift = INTVAL (XEXP (x, 1));
3332 else
3333 return false;
3335 if (GET_CODE (index) == SUBREG)
3336 index = SUBREG_REG (index);
3338 if ((shift == 0 ||
3339 (shift > 0 && shift <= 3
3340 && (1 << shift) == GET_MODE_SIZE (mode)))
3341 && REG_P (index)
3342 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3344 info->type = type;
3345 info->offset = index;
3346 info->shift = shift;
3347 return true;
3350 return false;
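/* The patterns above correspond to assembler index forms such as
   (illustrative):

     ldr x0, [x1, x2]               // REG_REG, shift 0
     ldr x0, [x1, x2, lsl #3]       // REG_REG, shift matching DImode size
     ldr w0, [x1, w2, sxtw #2]      // REG_SXTW, sign-extended 32-bit index
     ldr w0, [x1, w2, uxtw #2]      // REG_UXTW, zero-extended 32-bit index

   where a non-zero shift is only accepted when 1 << shift equals the
   access size of MODE.  */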
3353 bool
3354 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3356 return (offset >= -64 * GET_MODE_SIZE (mode)
3357 && offset < 64 * GET_MODE_SIZE (mode)
3358 && offset % GET_MODE_SIZE (mode) == 0);
3361 static inline bool
3362 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3363 HOST_WIDE_INT offset)
3365 return offset >= -256 && offset < 256;
3368 static inline bool
3369 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3371 return (offset >= 0
3372 && offset < 4096 * GET_MODE_SIZE (mode)
3373 && offset % GET_MODE_SIZE (mode) == 0);
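/* For a DImode (8-byte) access these three predicates accept, in order
   (illustrative summary):
     7-bit signed scaled:     multiples of 8 in [-512, 504]   (ldp/stp)
     9-bit signed unscaled:   any offset in [-256, 255]       (ldur/stur)
     12-bit unsigned scaled:  multiples of 8 in [0, 32760]    (ldr/str).  */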
3376 /* Return true if X is a valid address for machine mode MODE. If it is,
3377 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3378 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3380 static bool
3381 aarch64_classify_address (struct aarch64_address_info *info,
3382 rtx x, machine_mode mode,
3383 RTX_CODE outer_code, bool strict_p)
3385 enum rtx_code code = GET_CODE (x);
3386 rtx op0, op1;
3388 /* On BE, we use load/store pair for all large int mode load/stores. */
3389 bool load_store_pair_p = (outer_code == PARALLEL
3390 || (BYTES_BIG_ENDIAN
3391 && aarch64_vect_struct_mode_p (mode)));
3393 bool allow_reg_index_p =
3394 !load_store_pair_p
3395 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3396 && !aarch64_vect_struct_mode_p (mode);
3398 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3399 REG addressing. */
3400 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3401 && (code != POST_INC && code != REG))
3402 return false;
3404 switch (code)
3406 case REG:
3407 case SUBREG:
3408 info->type = ADDRESS_REG_IMM;
3409 info->base = x;
3410 info->offset = const0_rtx;
3411 return aarch64_base_register_rtx_p (x, strict_p);
3413 case PLUS:
3414 op0 = XEXP (x, 0);
3415 op1 = XEXP (x, 1);
3417 if (! strict_p
3418 && REG_P (op0)
3419 && (op0 == virtual_stack_vars_rtx
3420 || op0 == frame_pointer_rtx
3421 || op0 == arg_pointer_rtx)
3422 && CONST_INT_P (op1))
3424 info->type = ADDRESS_REG_IMM;
3425 info->base = op0;
3426 info->offset = op1;
3428 return true;
3431 if (GET_MODE_SIZE (mode) != 0
3432 && CONST_INT_P (op1)
3433 && aarch64_base_register_rtx_p (op0, strict_p))
3435 HOST_WIDE_INT offset = INTVAL (op1);
3437 info->type = ADDRESS_REG_IMM;
3438 info->base = op0;
3439 info->offset = op1;
3441 /* TImode and TFmode values are allowed in both pairs of X
3442 registers and individual Q registers. The available
3443 address modes are:
3444 X,X: 7-bit signed scaled offset
3445 Q: 9-bit signed offset
3446 We conservatively require an offset representable in both modes. */
3448 if (mode == TImode || mode == TFmode)
3449 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3450 && offset_9bit_signed_unscaled_p (mode, offset));
3452 /* A 7-bit offset check because OImode will emit an ldp/stp
3453 instruction (only big endian will get here).
3454 For ldp/stp instructions, the offset is scaled for the size of a
3455 single element of the pair. */
3456 if (mode == OImode)
3457 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3459 /* Three 9/12-bit offset checks because CImode will emit three
3460 ldr/str instructions (only big endian will get here). */
3461 if (mode == CImode)
3462 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3463 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3464 || offset_12bit_unsigned_scaled_p (V16QImode,
3465 offset + 32)));
3467 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3468 instructions (only big endian will get here). */
3469 if (mode == XImode)
3470 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3471 && aarch64_offset_7bit_signed_scaled_p (TImode,
3472 offset + 32));
3474 if (load_store_pair_p)
3475 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3476 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3477 else
3478 return (offset_9bit_signed_unscaled_p (mode, offset)
3479 || offset_12bit_unsigned_scaled_p (mode, offset));
3482 if (allow_reg_index_p)
3484 /* Look for base + (scaled/extended) index register. */
3485 if (aarch64_base_register_rtx_p (op0, strict_p)
3486 && aarch64_classify_index (info, op1, mode, strict_p))
3488 info->base = op0;
3489 return true;
3491 if (aarch64_base_register_rtx_p (op1, strict_p)
3492 && aarch64_classify_index (info, op0, mode, strict_p))
3494 info->base = op1;
3495 return true;
3499 return false;
3501 case POST_INC:
3502 case POST_DEC:
3503 case PRE_INC:
3504 case PRE_DEC:
3505 info->type = ADDRESS_REG_WB;
3506 info->base = XEXP (x, 0);
3507 info->offset = NULL_RTX;
3508 return aarch64_base_register_rtx_p (info->base, strict_p);
3510 case POST_MODIFY:
3511 case PRE_MODIFY:
3512 info->type = ADDRESS_REG_WB;
3513 info->base = XEXP (x, 0);
3514 if (GET_CODE (XEXP (x, 1)) == PLUS
3515 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3516 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3517 && aarch64_base_register_rtx_p (info->base, strict_p))
3519 HOST_WIDE_INT offset;
3520 info->offset = XEXP (XEXP (x, 1), 1);
3521 offset = INTVAL (info->offset);
3523 /* TImode and TFmode values are allowed in both pairs of X
3524 registers and individual Q registers. The available
3525 address modes are:
3526 X,X: 7-bit signed scaled offset
3527 Q: 9-bit signed offset
3528 We conservatively require an offset representable in both modes. */
3530 if (mode == TImode || mode == TFmode)
3531 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3532 && offset_9bit_signed_unscaled_p (mode, offset));
3534 if (load_store_pair_p)
3535 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3536 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3537 else
3538 return offset_9bit_signed_unscaled_p (mode, offset);
3540 return false;
3542 case CONST:
3543 case SYMBOL_REF:
3544 case LABEL_REF:
3545 /* load literal: pc-relative constant pool entry. Only supported
3546 for SI mode or larger. */
3547 info->type = ADDRESS_SYMBOLIC;
3549 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3551 rtx sym, addend;
3553 split_const (x, &sym, &addend);
3554 return (GET_CODE (sym) == LABEL_REF
3555 || (GET_CODE (sym) == SYMBOL_REF
3556 && CONSTANT_POOL_ADDRESS_P (sym)));
3558 return false;
3560 case LO_SUM:
3561 info->type = ADDRESS_LO_SUM;
3562 info->base = XEXP (x, 0);
3563 info->offset = XEXP (x, 1);
3564 if (allow_reg_index_p
3565 && aarch64_base_register_rtx_p (info->base, strict_p))
3567 rtx sym, offs;
3568 split_const (info->offset, &sym, &offs);
3569 if (GET_CODE (sym) == SYMBOL_REF
3570 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3571 == SYMBOL_SMALL_ABSOLUTE))
3573 /* The symbol and offset must be aligned to the access size. */
3574 unsigned int align;
3575 unsigned int ref_size;
3577 if (CONSTANT_POOL_ADDRESS_P (sym))
3578 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3579 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3581 tree exp = SYMBOL_REF_DECL (sym);
3582 align = TYPE_ALIGN (TREE_TYPE (exp));
3583 align = CONSTANT_ALIGNMENT (exp, align);
3585 else if (SYMBOL_REF_DECL (sym))
3586 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3587 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3588 && SYMBOL_REF_BLOCK (sym) != NULL)
3589 align = SYMBOL_REF_BLOCK (sym)->alignment;
3590 else
3591 align = BITS_PER_UNIT;
3593 ref_size = GET_MODE_SIZE (mode);
3594 if (ref_size == 0)
3595 ref_size = GET_MODE_SIZE (DImode);
3597 return ((INTVAL (offs) & (ref_size - 1)) == 0
3598 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3601 return false;
3603 default:
3604 return false;
3608 bool
3609 aarch64_symbolic_address_p (rtx x)
3611 rtx offset;
3613 split_const (x, &x, &offset);
3614 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3617 /* Classify the base of symbolic expression X, given that X appears in
3618 context CONTEXT. */
3620 enum aarch64_symbol_type
3621 aarch64_classify_symbolic_expression (rtx x,
3622 enum aarch64_symbol_context context)
3624 rtx offset;
3626 split_const (x, &x, &offset);
3627 return aarch64_classify_symbol (x, offset, context);
3631 /* Return TRUE if X is a legitimate address for accessing memory in
3632 mode MODE. */
3633 static bool
3634 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3636 struct aarch64_address_info addr;
3638 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3641 /* Return TRUE if X is a legitimate address for accessing memory in
3642 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3643 pair operation. */
3644 bool
3645 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3646 RTX_CODE outer_code, bool strict_p)
3648 struct aarch64_address_info addr;
3650 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3653 /* Return TRUE if rtx X is immediate constant 0.0 */
3654 bool
3655 aarch64_float_const_zero_rtx_p (rtx x)
3657 REAL_VALUE_TYPE r;
3659 if (GET_MODE (x) == VOIDmode)
3660 return false;
3662 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3663 if (REAL_VALUE_MINUS_ZERO (r))
3664 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3665 return REAL_VALUES_EQUAL (r, dconst0);
3668 /* Return the fixed registers used for condition codes. */
3670 static bool
3671 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3673 *p1 = CC_REGNUM;
3674 *p2 = INVALID_REGNUM;
3675 return true;
3678 /* Emit call insn with PAT and do aarch64-specific handling. */
3680 void
3681 aarch64_emit_call_insn (rtx pat)
3683 rtx insn = emit_call_insn (pat);
3685 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3686 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3687 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3690 machine_mode
3691 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3693 /* All floating point compares return CCFP if it is an equality
3694 comparison, and CCFPE otherwise. */
3695 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3697 switch (code)
3699 case EQ:
3700 case NE:
3701 case UNORDERED:
3702 case ORDERED:
3703 case UNLT:
3704 case UNLE:
3705 case UNGT:
3706 case UNGE:
3707 case UNEQ:
3708 case LTGT:
3709 return CCFPmode;
3711 case LT:
3712 case LE:
3713 case GT:
3714 case GE:
3715 return CCFPEmode;
3717 default:
3718 gcc_unreachable ();
3722 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3723 && y == const0_rtx
3724 && (code == EQ || code == NE || code == LT || code == GE)
3725 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3726 || GET_CODE (x) == NEG))
3727 return CC_NZmode;
3729 /* A compare with a shifted operand. Because of canonicalization,
3730 the comparison will have to be swapped when we emit the assembly
3731 code. */
3732 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3733 && (REG_P (y) || GET_CODE (y) == SUBREG)
3734 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3735 || GET_CODE (x) == LSHIFTRT
3736 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3737 return CC_SWPmode;
3739 /* Similarly for a negated operand, but we can only do this for
3740 equalities. */
3741 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3742 && (REG_P (y) || GET_CODE (y) == SUBREG)
3743 && (code == EQ || code == NE)
3744 && GET_CODE (x) == NEG)
3745 return CC_Zmode;
3747 /* A compare of a mode narrower than SI mode against zero can be done
3748 by extending the value in the comparison. */
3749 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3750 && y == const0_rtx)
3751 /* Only use sign-extension if we really need it. */
3752 return ((code == GT || code == GE || code == LE || code == LT)
3753 ? CC_SESWPmode : CC_ZESWPmode);
3755 /* For everything else, return CCmode. */
3756 return CCmode;
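/* Example (illustrative): for the RTL comparison
   (compare (ashift:SI x 2) y) the function returns CC_SWPmode, because
   AArch64 can only apply a shift to the second source operand of a
   compare.  The insn is emitted with the operands swapped, roughly
   "cmp w1, w0, lsl #2" with x in w0 and y in w1, and the condition that
   is finally tested is the swapped one selected for CC_SWPmode in
   aarch64_get_condition_code_1 below.  */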
3759 static int
3760 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3762 int
3763 aarch64_get_condition_code (rtx x)
3765 machine_mode mode = GET_MODE (XEXP (x, 0));
3766 enum rtx_code comp_code = GET_CODE (x);
3768 if (GET_MODE_CLASS (mode) != MODE_CC)
3769 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3770 return aarch64_get_condition_code_1 (mode, comp_code);
3773 static int
3774 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3776 int ne = -1, eq = -1;
3777 switch (mode)
3779 case CCFPmode:
3780 case CCFPEmode:
3781 switch (comp_code)
3783 case GE: return AARCH64_GE;
3784 case GT: return AARCH64_GT;
3785 case LE: return AARCH64_LS;
3786 case LT: return AARCH64_MI;
3787 case NE: return AARCH64_NE;
3788 case EQ: return AARCH64_EQ;
3789 case ORDERED: return AARCH64_VC;
3790 case UNORDERED: return AARCH64_VS;
3791 case UNLT: return AARCH64_LT;
3792 case UNLE: return AARCH64_LE;
3793 case UNGT: return AARCH64_HI;
3794 case UNGE: return AARCH64_PL;
3795 default: return -1;
3797 break;
3799 case CC_DNEmode:
3800 ne = AARCH64_NE;
3801 eq = AARCH64_EQ;
3802 break;
3804 case CC_DEQmode:
3805 ne = AARCH64_EQ;
3806 eq = AARCH64_NE;
3807 break;
3809 case CC_DGEmode:
3810 ne = AARCH64_GE;
3811 eq = AARCH64_LT;
3812 break;
3814 case CC_DLTmode:
3815 ne = AARCH64_LT;
3816 eq = AARCH64_GE;
3817 break;
3819 case CC_DGTmode:
3820 ne = AARCH64_GT;
3821 eq = AARCH64_LE;
3822 break;
3824 case CC_DLEmode:
3825 ne = AARCH64_LE;
3826 eq = AARCH64_GT;
3827 break;
3829 case CC_DGEUmode:
3830 ne = AARCH64_CS;
3831 eq = AARCH64_CC;
3832 break;
3834 case CC_DLTUmode:
3835 ne = AARCH64_CC;
3836 eq = AARCH64_CS;
3837 break;
3839 case CC_DGTUmode:
3840 ne = AARCH64_HI;
3841 eq = AARCH64_LS;
3842 break;
3844 case CC_DLEUmode:
3845 ne = AARCH64_LS;
3846 eq = AARCH64_HI;
3847 break;
3849 case CCmode:
3850 switch (comp_code)
3852 case NE: return AARCH64_NE;
3853 case EQ: return AARCH64_EQ;
3854 case GE: return AARCH64_GE;
3855 case GT: return AARCH64_GT;
3856 case LE: return AARCH64_LE;
3857 case LT: return AARCH64_LT;
3858 case GEU: return AARCH64_CS;
3859 case GTU: return AARCH64_HI;
3860 case LEU: return AARCH64_LS;
3861 case LTU: return AARCH64_CC;
3862 default: return -1;
3864 break;
3866 case CC_SWPmode:
3867 case CC_ZESWPmode:
3868 case CC_SESWPmode:
3869 switch (comp_code)
3871 case NE: return AARCH64_NE;
3872 case EQ: return AARCH64_EQ;
3873 case GE: return AARCH64_LE;
3874 case GT: return AARCH64_LT;
3875 case LE: return AARCH64_GE;
3876 case LT: return AARCH64_GT;
3877 case GEU: return AARCH64_LS;
3878 case GTU: return AARCH64_CC;
3879 case LEU: return AARCH64_CS;
3880 case LTU: return AARCH64_HI;
3881 default: return -1;
3883 break;
3885 case CC_NZmode:
3886 switch (comp_code)
3888 case NE: return AARCH64_NE;
3889 case EQ: return AARCH64_EQ;
3890 case GE: return AARCH64_PL;
3891 case LT: return AARCH64_MI;
3892 default: return -1;
3894 break;
3896 case CC_Zmode:
3897 switch (comp_code)
3899 case NE: return AARCH64_NE;
3900 case EQ: return AARCH64_EQ;
3901 default: return -1;
3903 break;
3905 default:
3906 return -1;
3907 break;
3910 if (comp_code == NE)
3911 return ne;
3913 if (comp_code == EQ)
3914 return eq;
3916 return -1;
3919 bool
3920 aarch64_const_vec_all_same_in_range_p (rtx x,
3921 HOST_WIDE_INT minval,
3922 HOST_WIDE_INT maxval)
3924 HOST_WIDE_INT firstval;
3925 int count, i;
3927 if (GET_CODE (x) != CONST_VECTOR
3928 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3929 return false;
3931 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3932 if (firstval < minval || firstval > maxval)
3933 return false;
3935 count = CONST_VECTOR_NUNITS (x);
3936 for (i = 1; i < count; i++)
3937 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3938 return false;
3940 return true;
3943 bool
3944 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3946 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3949 static unsigned
3950 bit_count (unsigned HOST_WIDE_INT value)
3952 unsigned count = 0;
3954 while (value)
3956 count++;
3957 value &= value - 1;
3960 return count;
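/* bit_count uses the classic "clear the lowest set bit" step
   (value &= value - 1), so e.g. 0b101100 -> 0b101000 -> 0b100000 -> 0
   takes three iterations and the function returns 3.  It backs the '%P'
   output modifier below.  */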
3963 /* N Z C V. */
3964 #define AARCH64_CC_V 1
3965 #define AARCH64_CC_C (1 << 1)
3966 #define AARCH64_CC_Z (1 << 2)
3967 #define AARCH64_CC_N (1 << 3)
3969 /* N Z C V flags for ccmp. The first code is for AND op and the other
3970 is for IOR op. Indexed by AARCH64_COND_CODE. */
3971 static const int aarch64_nzcv_codes[][2] =
3973 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3974 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3975 {AARCH64_CC_C, 0}, /* CS, C == 1. */
3976 {0, AARCH64_CC_C}, /* CC, C == 0. */
3977 {AARCH64_CC_N, 0}, /* MI, N == 1. */
3978 {0, AARCH64_CC_N}, /* PL, N == 0. */
3979 {AARCH64_CC_V, 0}, /* VS, V == 1. */
3980 {0, AARCH64_CC_V}, /* VC, V == 0. */
3981 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
3982 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
3983 {0, AARCH64_CC_V}, /* GE, N == V. */
3984 {AARCH64_CC_V, 0}, /* LT, N != V. */
3985 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
3986 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
3987 {0, 0}, /* AL, Any. */
3988 {0, 0}, /* NV, Any. */
3991 int
3992 aarch64_ccmp_mode_to_code (enum machine_mode mode)
3994 switch (mode)
3996 case CC_DNEmode:
3997 return NE;
3999 case CC_DEQmode:
4000 return EQ;
4002 case CC_DLEmode:
4003 return LE;
4005 case CC_DGTmode:
4006 return GT;
4008 case CC_DLTmode:
4009 return LT;
4011 case CC_DGEmode:
4012 return GE;
4014 case CC_DLEUmode:
4015 return LEU;
4017 case CC_DGTUmode:
4018 return GTU;
4020 case CC_DLTUmode:
4021 return LTU;
4023 case CC_DGEUmode:
4024 return GEU;
4026 default:
4027 gcc_unreachable ();
4032 void
4033 aarch64_print_operand (FILE *f, rtx x, char code)
4035 switch (code)
4037 /* An integer or symbol address without a preceding # sign. */
4038 case 'c':
4039 switch (GET_CODE (x))
4041 case CONST_INT:
4042 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4043 break;
4045 case SYMBOL_REF:
4046 output_addr_const (f, x);
4047 break;
4049 case CONST:
4050 if (GET_CODE (XEXP (x, 0)) == PLUS
4051 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4053 output_addr_const (f, x);
4054 break;
4056 /* Fall through. */
4058 default:
4059 output_operand_lossage ("Unsupported operand for code '%c'", code);
4061 break;
4063 case 'e':
4064 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4066 int n;
4068 if (!CONST_INT_P (x)
4069 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4071 output_operand_lossage ("invalid operand for '%%%c'", code);
4072 return;
4075 switch (n)
4077 case 3:
4078 fputc ('b', f);
4079 break;
4080 case 4:
4081 fputc ('h', f);
4082 break;
4083 case 5:
4084 fputc ('w', f);
4085 break;
4086 default:
4087 output_operand_lossage ("invalid operand for '%%%c'", code);
4088 return;
4091 break;
4093 case 'p':
4095 int n;
4097 /* Print N such that 2^N == X. */
4098 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4100 output_operand_lossage ("invalid operand for '%%%c'", code);
4101 return;
4104 asm_fprintf (f, "%d", n);
4106 break;
4108 case 'P':
4109 /* Print the number of non-zero bits in X (a const_int). */
4110 if (!CONST_INT_P (x))
4112 output_operand_lossage ("invalid operand for '%%%c'", code);
4113 return;
4116 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4117 break;
4119 case 'H':
4120 /* Print the higher numbered register of a pair (TImode) of regs. */
4121 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4123 output_operand_lossage ("invalid operand for '%%%c'", code);
4124 return;
4127 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4128 break;
4130 case 'm':
4132 int cond_code;
4133 /* Print a condition (eq, ne, etc). */
4135 /* CONST_TRUE_RTX means always -- that's the default. */
4136 if (x == const_true_rtx)
4137 return;
4139 if (!COMPARISON_P (x))
4141 output_operand_lossage ("invalid operand for '%%%c'", code);
4142 return;
4145 cond_code = aarch64_get_condition_code (x);
4146 gcc_assert (cond_code >= 0);
4147 fputs (aarch64_condition_codes[cond_code], f);
4149 break;
4151 case 'M':
4153 int cond_code;
4154 /* Print the inverse of a condition (eq <-> ne, etc). */
4156 /* CONST_TRUE_RTX means never -- that's the default. */
4157 if (x == const_true_rtx)
4159 fputs ("nv", f);
4160 return;
4163 if (!COMPARISON_P (x))
4165 output_operand_lossage ("invalid operand for '%%%c'", code);
4166 return;
4168 cond_code = aarch64_get_condition_code (x);
4169 gcc_assert (cond_code >= 0);
4170 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4171 (cond_code)], f);
4173 break;
4175 case 'b':
4176 case 'h':
4177 case 's':
4178 case 'd':
4179 case 'q':
4180 /* Print a scalar FP/SIMD register name. */
4181 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4183 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4184 return;
4186 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4187 break;
4189 case 'S':
4190 case 'T':
4191 case 'U':
4192 case 'V':
4193 /* Print the first FP/SIMD register name in a list. */
4194 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4196 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4197 return;
4199 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4200 break;
4202 case 'R':
4203 /* Print a scalar FP/SIMD register name + 1. */
4204 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4206 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4207 return;
4209 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4210 break;
4212 case 'X':
4213 /* Print bottom 16 bits of integer constant in hex. */
4214 if (!CONST_INT_P (x))
4216 output_operand_lossage ("invalid operand for '%%%c'", code);
4217 return;
4219 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4220 break;
4222 case 'w':
4223 case 'x':
4224 /* Print a general register name or the zero register (32-bit or
4225 64-bit). */
4226 if (x == const0_rtx
4227 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4229 asm_fprintf (f, "%czr", code);
4230 break;
4233 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4235 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4236 break;
4239 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4241 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4242 break;
4245 /* Fall through */
4247 case 0:
4248 /* Print a normal operand. If it's a general register, then we
4249 assume DImode. */
4250 if (x == NULL)
4252 output_operand_lossage ("missing operand");
4253 return;
4256 switch (GET_CODE (x))
4258 case REG:
4259 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4260 break;
4262 case MEM:
4263 aarch64_memory_reference_mode = GET_MODE (x);
4264 output_address (XEXP (x, 0));
4265 break;
4267 case LABEL_REF:
4268 case SYMBOL_REF:
4269 output_addr_const (asm_out_file, x);
4270 break;
4272 case CONST_INT:
4273 asm_fprintf (f, "%wd", INTVAL (x));
4274 break;
4276 case CONST_VECTOR:
4277 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4279 gcc_assert (
4280 aarch64_const_vec_all_same_in_range_p (x,
4281 HOST_WIDE_INT_MIN,
4282 HOST_WIDE_INT_MAX));
4283 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4285 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4287 fputc ('0', f);
4289 else
4290 gcc_unreachable ();
4291 break;
4293 case CONST_DOUBLE:
4294 /* CONST_DOUBLE can represent a double-width integer.
4295 In this case, the mode of x is VOIDmode. */
4296 if (GET_MODE (x) == VOIDmode)
4297 ; /* Do Nothing. */
4298 else if (aarch64_float_const_zero_rtx_p (x))
4300 fputc ('0', f);
4301 break;
4303 else if (aarch64_float_const_representable_p (x))
4305 #define buf_size 20
4306 char float_buf[buf_size] = {'\0'};
4307 REAL_VALUE_TYPE r;
4308 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4309 real_to_decimal_for_mode (float_buf, &r,
4310 buf_size, buf_size,
4311 1, GET_MODE (x));
4312 asm_fprintf (asm_out_file, "%s", float_buf);
4313 break;
4314 #undef buf_size
4316 output_operand_lossage ("invalid constant");
4317 return;
4318 default:
4319 output_operand_lossage ("invalid operand");
4320 return;
4322 break;
4324 case 'A':
4325 if (GET_CODE (x) == HIGH)
4326 x = XEXP (x, 0);
4328 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4330 case SYMBOL_SMALL_GOT:
4331 asm_fprintf (asm_out_file, ":got:");
4332 break;
4334 case SYMBOL_SMALL_TLSGD:
4335 asm_fprintf (asm_out_file, ":tlsgd:");
4336 break;
4338 case SYMBOL_SMALL_TLSDESC:
4339 asm_fprintf (asm_out_file, ":tlsdesc:");
4340 break;
4342 case SYMBOL_SMALL_GOTTPREL:
4343 asm_fprintf (asm_out_file, ":gottprel:");
4344 break;
4346 case SYMBOL_SMALL_TPREL:
4347 asm_fprintf (asm_out_file, ":tprel:");
4348 break;
4350 case SYMBOL_TINY_GOT:
4351 gcc_unreachable ();
4352 break;
4354 default:
4355 break;
4357 output_addr_const (asm_out_file, x);
4358 break;
4360 case 'L':
4361 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4363 case SYMBOL_SMALL_GOT:
4364 asm_fprintf (asm_out_file, ":lo12:");
4365 break;
4367 case SYMBOL_SMALL_TLSGD:
4368 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4369 break;
4371 case SYMBOL_SMALL_TLSDESC:
4372 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4373 break;
4375 case SYMBOL_SMALL_GOTTPREL:
4376 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4377 break;
4379 case SYMBOL_SMALL_TPREL:
4380 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4381 break;
4383 case SYMBOL_TINY_GOT:
4384 asm_fprintf (asm_out_file, ":got:");
4385 break;
4387 default:
4388 break;
4390 output_addr_const (asm_out_file, x);
4391 break;
4393 case 'G':
4395 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4397 case SYMBOL_SMALL_TPREL:
4398 asm_fprintf (asm_out_file, ":tprel_hi12:");
4399 break;
4400 default:
4401 break;
4403 output_addr_const (asm_out_file, x);
4404 break;
4406 case 'K':
4408 int cond_code;
4409 /* Print nzcv. */
4411 if (!COMPARISON_P (x))
4413 output_operand_lossage ("invalid operand for '%%%c'", code);
4414 return;
4417 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4418 gcc_assert (cond_code >= 0);
4419 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4421 break;
4423 case 'k':
4425 int cond_code;
4426 /* Print nzcv. */
4428 if (!COMPARISON_P (x))
4430 output_operand_lossage ("invalid operand for '%%%c'", code);
4431 return;
4434 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4435 gcc_assert (cond_code >= 0);
4436 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4438 break;
4440 default:
4441 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4442 return;
4446 void
4447 aarch64_print_operand_address (FILE *f, rtx x)
4449 struct aarch64_address_info addr;
4451 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4452 MEM, true))
4453 switch (addr.type)
4455 case ADDRESS_REG_IMM:
4456 if (addr.offset == const0_rtx)
4457 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4458 else
4459 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4460 INTVAL (addr.offset));
4461 return;
4463 case ADDRESS_REG_REG:
4464 if (addr.shift == 0)
4465 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4466 reg_names [REGNO (addr.offset)]);
4467 else
4468 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4469 reg_names [REGNO (addr.offset)], addr.shift);
4470 return;
4472 case ADDRESS_REG_UXTW:
4473 if (addr.shift == 0)
4474 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4475 REGNO (addr.offset) - R0_REGNUM);
4476 else
4477 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4478 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4479 return;
4481 case ADDRESS_REG_SXTW:
4482 if (addr.shift == 0)
4483 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4484 REGNO (addr.offset) - R0_REGNUM);
4485 else
4486 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4487 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4488 return;
4490 case ADDRESS_REG_WB:
4491 switch (GET_CODE (x))
4493 case PRE_INC:
4494 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4495 GET_MODE_SIZE (aarch64_memory_reference_mode));
4496 return;
4497 case POST_INC:
4498 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4499 GET_MODE_SIZE (aarch64_memory_reference_mode));
4500 return;
4501 case PRE_DEC:
4502 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4503 GET_MODE_SIZE (aarch64_memory_reference_mode));
4504 return;
4505 case POST_DEC:
4506 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4507 GET_MODE_SIZE (aarch64_memory_reference_mode));
4508 return;
4509 case PRE_MODIFY:
4510 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4511 INTVAL (addr.offset));
4512 return;
4513 case POST_MODIFY:
4514 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4515 INTVAL (addr.offset));
4516 return;
4517 default:
4518 break;
4520 break;
4522 case ADDRESS_LO_SUM:
4523 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4524 output_addr_const (f, addr.offset);
4525 asm_fprintf (f, "]");
4526 return;
4528 case ADDRESS_SYMBOLIC:
4529 break;
4532 output_addr_const (f, x);
4535 bool
4536 aarch64_label_mentioned_p (rtx x)
4538 const char *fmt;
4539 int i;
4541 if (GET_CODE (x) == LABEL_REF)
4542 return true;
4544 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4545 referencing instruction, but they are constant offsets, not
4546 symbols. */
4547 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4548 return false;
4550 fmt = GET_RTX_FORMAT (GET_CODE (x));
4551 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4553 if (fmt[i] == 'E')
4555 int j;
4557 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4558 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4559 return 1;
4561 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4562 return 1;
4565 return 0;
4568 /* Implement REGNO_REG_CLASS. */
4570 enum reg_class
4571 aarch64_regno_regclass (unsigned regno)
4573 if (GP_REGNUM_P (regno))
4574 return GENERAL_REGS;
4576 if (regno == SP_REGNUM)
4577 return STACK_REG;
4579 if (regno == FRAME_POINTER_REGNUM
4580 || regno == ARG_POINTER_REGNUM)
4581 return POINTER_REGS;
4583 if (FP_REGNUM_P (regno))
4584 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4586 return NO_REGS;
4589 static rtx
4590 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4592 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4593 where mask is selected by alignment and size of the offset.
4594 We try to pick as large a range for the offset as possible to
4595 maximize the chance of a CSE. However, for aligned addresses
4596 we limit the range to 4k so that structures with different sized
4597 elements are likely to use the same base. */
4599 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4601 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4602 HOST_WIDE_INT base_offset;
4604 /* Does it look like we'll need a load/store-pair operation? */
4605 if (GET_MODE_SIZE (mode) > 16
4606 || mode == TImode)
4607 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4608 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4609 /* For offsets that aren't a multiple of the access size, the limit is
4610 -256...255. */
4611 else if (offset & (GET_MODE_SIZE (mode) - 1))
4612 base_offset = (offset + 0x100) & ~0x1ff;
4613 else
4614 base_offset = offset & ~0xfff;
4616 if (base_offset == 0)
4617 return x;
4619 offset -= base_offset;
4620 rtx base_reg = gen_reg_rtx (Pmode);
4621 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4622 NULL_RTX);
4623 emit_move_insn (base_reg, val);
4624 x = plus_constant (Pmode, base_reg, offset);
4627 return x;
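/* Editor's illustrative sketch, not part of GCC (names are hypothetical):
   the anchor-splitting arithmetic used above for the common aligned case.
   The offset is split into a 4k-aligned base, which nearby accesses can
   share and CSE, and a small residual that fits the load/store immediate
   field.  */

static void
aarch64_example_split_aligned_offset (long long offset,
                                      long long *base_offset,
                                      long long *residual)
{
  /* Same mask as the aligned case in aarch64_legitimize_address.  */
  *base_offset = offset & ~0xfffLL;
  *residual = offset - *base_offset;
  /* For example, offset 0x12348 gives base_offset 0x12000 and residual
     0x348, so the address is formed as (base + 0x12000) + 0x348.  */
}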
4630 /* Try a machine-dependent way of reloading an illegitimate address
4631 operand. If we find one, push the reload and return the new rtx. */
4634 aarch64_legitimize_reload_address (rtx *x_p,
4635 machine_mode mode,
4636 int opnum, int type,
4637 int ind_levels ATTRIBUTE_UNUSED)
4639 rtx x = *x_p;
4641 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4642 if (aarch64_vect_struct_mode_p (mode)
4643 && GET_CODE (x) == PLUS
4644 && REG_P (XEXP (x, 0))
4645 && CONST_INT_P (XEXP (x, 1)))
4647 rtx orig_rtx = x;
4648 x = copy_rtx (x);
4649 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4650 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4651 opnum, (enum reload_type) type);
4652 return x;
4655 /* We must recognize output that we have already generated ourselves. */
4656 if (GET_CODE (x) == PLUS
4657 && GET_CODE (XEXP (x, 0)) == PLUS
4658 && REG_P (XEXP (XEXP (x, 0), 0))
4659 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4660 && CONST_INT_P (XEXP (x, 1)))
4662 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4663 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4664 opnum, (enum reload_type) type);
4665 return x;
4668 /* We wish to handle large displacements off a base register by splitting
4669 the addend across an add and the mem insn. This can cut the number of
4670 extra insns needed from 3 to 1. It is only useful for load/store of a
4671 single register with a 12-bit offset field.  */
4672 if (GET_CODE (x) == PLUS
4673 && REG_P (XEXP (x, 0))
4674 && CONST_INT_P (XEXP (x, 1))
4675 && HARD_REGISTER_P (XEXP (x, 0))
4676 && mode != TImode
4677 && mode != TFmode
4678 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4680 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4681 HOST_WIDE_INT low = val & 0xfff;
4682 HOST_WIDE_INT high = val - low;
4683 HOST_WIDE_INT offs;
4684 rtx cst;
4685 machine_mode xmode = GET_MODE (x);
4687 /* In ILP32, xmode can be either DImode or SImode. */
4688 gcc_assert (xmode == DImode || xmode == SImode);
4690 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4691 BLKmode alignment. */
4692 if (GET_MODE_SIZE (mode) == 0)
4693 return NULL_RTX;
4695 offs = low % GET_MODE_SIZE (mode);
4697 /* Align misaligned offset by adjusting high part to compensate. */
4698 if (offs != 0)
4700 if (aarch64_uimm12_shift (high + offs))
4702 /* Align down. */
4703 low = low - offs;
4704 high = high + offs;
4706 else
4708 /* Align up. */
4709 offs = GET_MODE_SIZE (mode) - offs;
4710 low = low + offs;
4711 high = high + (low & 0x1000) - offs;
4712 low &= 0xfff;
4716 /* Check for overflow. */
4717 if (high + low != val)
4718 return NULL_RTX;
4720 cst = GEN_INT (high);
4721 if (!aarch64_uimm12_shift (high))
4722 cst = force_const_mem (xmode, cst);
4724 /* Reload high part into base reg, leaving the low part
4725 in the mem instruction.
4726 Note that replacing this gen_rtx_PLUS with plus_constant is
4727 wrong in this case because we rely on the
4728 (plus (plus reg c1) c2) structure being preserved so that
4729 XEXP (*p, 0) in push_reload below uses the correct term. */
4730 x = gen_rtx_PLUS (xmode,
4731 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4732 GEN_INT (low));
4734 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4735 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4736 opnum, (enum reload_type) type);
4737 return x;
4740 return NULL_RTX;
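/* Editor's illustrative sketch, not part of GCC (names are hypothetical),
   of the high/low split performed above.  The low 12 bits stay in the
   memory access and the rest is reloaded into the base register, so a
   DImode access at base + 0x12348 can typically become an ADD of the
   shifted 12-bit immediate 0x12000 followed by an LDR with offset 0x348.  */

static void
aarch64_example_split_reload_offset (long long val,
                                     long long *high, long long *low)
{
  *low = val & 0xfff;
  *high = val - *low;
  /* e.g. val 0x12348 -> high 0x12000 (representable as a shifted 12-bit
     immediate), low 0x348 (a multiple of 8, so it fits the scaled DImode
     LDR/STR offset field).  Misaligned offsets are further adjusted by
     the align-up / align-down logic in the function above.  */
}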
4744 static reg_class_t
4745 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4746 reg_class_t rclass,
4747 machine_mode mode,
4748 secondary_reload_info *sri)
4750 /* Without the TARGET_SIMD instructions we cannot move a Q register
4751 to a Q register directly. We need a scratch. */
4752 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4753 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4754 && reg_class_subset_p (rclass, FP_REGS))
4756 if (mode == TFmode)
4757 sri->icode = CODE_FOR_aarch64_reload_movtf;
4758 else if (mode == TImode)
4759 sri->icode = CODE_FOR_aarch64_reload_movti;
4760 return NO_REGS;
4763 /* A TFmode or TImode memory access should be handled via FP_REGS
4764 because AArch64 has richer addressing modes for LDR/STR instructions
4765 than LDP/STP instructions. */
4766 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4767 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4768 return FP_REGS;
4770 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4771 return GENERAL_REGS;
4773 return NO_REGS;
4776 static bool
4777 aarch64_can_eliminate (const int from, const int to)
4779 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4780 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4782 if (frame_pointer_needed)
4784 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4785 return true;
4786 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4787 return false;
4788 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4789 && !cfun->calls_alloca)
4790 return true;
4791 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4792 return true;
4794 return false;
4796 else
4798 /* If we decided that we didn't need a leaf frame pointer but then used
4799 LR in the function, then we'll want a frame pointer after all, so
4800 prevent this elimination to ensure a frame pointer is used. */
4801 if (to == STACK_POINTER_REGNUM
4802 && flag_omit_leaf_frame_pointer
4803 && df_regs_ever_live_p (LR_REGNUM))
4804 return false;
4807 return true;
4810 HOST_WIDE_INT
4811 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4813 aarch64_layout_frame ();
4815 if (to == HARD_FRAME_POINTER_REGNUM)
4817 if (from == ARG_POINTER_REGNUM)
4818 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4820 if (from == FRAME_POINTER_REGNUM)
4821 return (cfun->machine->frame.hard_fp_offset
4822 - cfun->machine->frame.saved_varargs_size);
4825 if (to == STACK_POINTER_REGNUM)
4827 if (from == FRAME_POINTER_REGNUM)
4828 return (cfun->machine->frame.frame_size
4829 - cfun->machine->frame.saved_varargs_size);
4832 return cfun->machine->frame.frame_size;
4835 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4836 previous frame. */
4839 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4841 if (count != 0)
4842 return const0_rtx;
4843 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4847 static void
4848 aarch64_asm_trampoline_template (FILE *f)
4850 if (TARGET_ILP32)
4852 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4853 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4855 else
4857 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4858 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4860 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4861 assemble_aligned_integer (4, const0_rtx);
4862 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4863 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4866 static void
4867 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4869 rtx fnaddr, mem, a_tramp;
4870 const int tramp_code_sz = 16;
4872 /* Don't need to copy the trailing D-words; we fill those in below.  */
4873 emit_block_move (m_tramp, assemble_trampoline_template (),
4874 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4875 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4876 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4877 if (GET_MODE (fnaddr) != ptr_mode)
4878 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4879 emit_move_insn (mem, fnaddr);
4881 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4882 emit_move_insn (mem, chain_value);
4884 /* XXX We should really define a "clear_cache" pattern and use
4885 gen_clear_cache(). */
4886 a_tramp = XEXP (m_tramp, 0);
4887 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4888 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4889 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4890 ptr_mode);
4893 static unsigned char
4894 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4896 switch (regclass)
4898 case CALLER_SAVE_REGS:
4899 case POINTER_REGS:
4900 case GENERAL_REGS:
4901 case ALL_REGS:
4902 case FP_REGS:
4903 case FP_LO_REGS:
4904 return
4905 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4906 (GET_MODE_SIZE (mode) + 7) / 8;
4907 case STACK_REG:
4908 return 1;
4910 case NO_REGS:
4911 return 0;
4913 default:
4914 break;
4916 gcc_unreachable ();
4919 static reg_class_t
4920 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4922 if (regclass == POINTER_REGS)
4923 return GENERAL_REGS;
4925 if (regclass == STACK_REG)
4927 if (REG_P(x)
4928 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4929 return regclass;
4931 return NO_REGS;
4934 /* If it's an integer immediate that MOVI can't handle, then
4935 FP_REGS is not an option, so we return NO_REGS instead. */
4936 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4937 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4938 return NO_REGS;
4940 /* Register elimination can result in a request for
4941 SP+constant->FP_REGS.  We cannot support such operations, which
4942 use SP as source and an FP_REG as destination, so reject them
4943 right now.  */
4944 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4946 rtx lhs = XEXP (x, 0);
4948 /* Look through a possible SUBREG introduced by ILP32. */
4949 if (GET_CODE (lhs) == SUBREG)
4950 lhs = SUBREG_REG (lhs);
4952 gcc_assert (REG_P (lhs));
4953 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4954 POINTER_REGS));
4955 return NO_REGS;
4958 return regclass;
4961 void
4962 aarch64_asm_output_labelref (FILE* f, const char *name)
4964 asm_fprintf (f, "%U%s", name);
4967 static void
4968 aarch64_elf_asm_constructor (rtx symbol, int priority)
4970 if (priority == DEFAULT_INIT_PRIORITY)
4971 default_ctor_section_asm_out_constructor (symbol, priority);
4972 else
4974 section *s;
4975 char buf[18];
4976 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4977 s = get_section (buf, SECTION_WRITE, NULL);
4978 switch_to_section (s);
4979 assemble_align (POINTER_SIZE);
4980 assemble_aligned_integer (POINTER_BYTES, symbol);
4984 static void
4985 aarch64_elf_asm_destructor (rtx symbol, int priority)
4987 if (priority == DEFAULT_INIT_PRIORITY)
4988 default_dtor_section_asm_out_destructor (symbol, priority);
4989 else
4991 section *s;
4992 char buf[18];
4993 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4994 s = get_section (buf, SECTION_WRITE, NULL);
4995 switch_to_section (s);
4996 assemble_align (POINTER_SIZE);
4997 assemble_aligned_integer (POINTER_BYTES, symbol);
5001 const char*
5002 aarch64_output_casesi (rtx *operands)
5004 char buf[100];
5005 char label[100];
5006 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5007 int index;
5008 static const char *const patterns[4][2] =
5011 "ldrb\t%w3, [%0,%w1,uxtw]",
5012 "add\t%3, %4, %w3, sxtb #2"
5015 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5016 "add\t%3, %4, %w3, sxth #2"
5019 "ldr\t%w3, [%0,%w1,uxtw #2]",
5020 "add\t%3, %4, %w3, sxtw #2"
5022 /* We assume that DImode is only generated when not optimizing and
5023 that we don't really need 64-bit address offsets. That would
5024 imply an object file with 8GB of code in a single function! */
5026 "ldr\t%w3, [%0,%w1,uxtw #2]",
5027 "add\t%3, %4, %w3, sxtw #2"
5031 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5033 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5035 gcc_assert (index >= 0 && index <= 3);
5037 /* Need to implement table size reduction, by changing the code below.  */
5038 output_asm_insn (patterns[index][0], operands);
5039 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5040 snprintf (buf, sizeof (buf),
5041 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5042 output_asm_insn (buf, operands);
5043 output_asm_insn (patterns[index][1], operands);
5044 output_asm_insn ("br\t%3", operands);
5045 assemble_label (asm_out_file, label);
5046 return "";
5050 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5051 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5052 operator. */
5055 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5057 if (shift >= 0 && shift <= 3)
5059 int size;
5060 for (size = 8; size <= 32; size *= 2)
5062 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5063 if (mask == bits << shift)
5064 return size;
5067 return 0;
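/* Editor's illustrative sketch, not part of GCC (names are hypothetical):
   the masks recognised above are simply the 8/16/32-bit all-ones patterns
   shifted left by the address-scaling amount.  */

static long long
aarch64_example_uxt_mask (int shift, int size)
{
  /* The SIZE-bit all-ones pattern, moved up by SHIFT.  */
  return (((long long) 1 << size) - 1) << shift;
}

/* For example, aarch64_example_uxt_mask (2, 8) == 0x3fc, which
   aarch64_uxt_size maps back to 8: a UXTB of an operand scaled by 4.
   A shift greater than 3 always yields 0 from aarch64_uxt_size, since
   the extended-register forms only allow scaling by up to 8.  */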
5070 static bool
5071 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5072 const_rtx x ATTRIBUTE_UNUSED)
5074 /* We can't use blocks for constants when we're using a per-function
5075 constant pool. */
5076 return false;
5079 static section *
5080 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5081 rtx x ATTRIBUTE_UNUSED,
5082 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5084 /* Force all constant pool entries into the current function section. */
5085 return function_section (current_function_decl);
5089 /* Costs. */
5091 /* Helper function for rtx cost calculation. Strip a shift expression
5092 from X. Returns the inner operand if successful, or the original
5093 expression on failure. */
5094 static rtx
5095 aarch64_strip_shift (rtx x)
5097 rtx op = x;
5099 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5100 we can convert both to ROR during final output. */
5101 if ((GET_CODE (op) == ASHIFT
5102 || GET_CODE (op) == ASHIFTRT
5103 || GET_CODE (op) == LSHIFTRT
5104 || GET_CODE (op) == ROTATERT
5105 || GET_CODE (op) == ROTATE)
5106 && CONST_INT_P (XEXP (op, 1)))
5107 return XEXP (op, 0);
5109 if (GET_CODE (op) == MULT
5110 && CONST_INT_P (XEXP (op, 1))
5111 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5112 return XEXP (op, 0);
5114 return x;
5117 /* Helper function for rtx cost calculation. Strip an extend
5118 expression from X. Returns the inner operand if successful, or the
5119 original expression on failure. We deal with a number of possible
5120 canonicalization variations here. */
5121 static rtx
5122 aarch64_strip_extend (rtx x)
5124 rtx op = x;
5126 /* Zero and sign extraction of a widened value. */
5127 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5128 && XEXP (op, 2) == const0_rtx
5129 && GET_CODE (XEXP (op, 0)) == MULT
5130 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5131 XEXP (op, 1)))
5132 return XEXP (XEXP (op, 0), 0);
5134 /* It can also be represented (for zero-extend) as an AND with an
5135 immediate. */
5136 if (GET_CODE (op) == AND
5137 && GET_CODE (XEXP (op, 0)) == MULT
5138 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5139 && CONST_INT_P (XEXP (op, 1))
5140 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5141 INTVAL (XEXP (op, 1))) != 0)
5142 return XEXP (XEXP (op, 0), 0);
5144 /* Now handle extended register, as this may also have an optional
5145 left shift by 1..4. */
5146 if (GET_CODE (op) == ASHIFT
5147 && CONST_INT_P (XEXP (op, 1))
5148 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5149 op = XEXP (op, 0);
5151 if (GET_CODE (op) == ZERO_EXTEND
5152 || GET_CODE (op) == SIGN_EXTEND)
5153 op = XEXP (op, 0);
5155 if (op != x)
5156 return op;
5158 return x;
5161 /* Return true iff CODE is a shift supported in combination
5162 with arithmetic instructions. */
5164 static bool
5165 aarch64_shift_p (enum rtx_code code)
5167 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5170 /* Helper function for rtx cost calculation. Calculate the cost of
5171 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5172 Return the calculated cost of the expression, recursing manually into
5173 operands where needed. */
5175 static int
5176 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5178 rtx op0, op1;
5179 const struct cpu_cost_table *extra_cost
5180 = aarch64_tune_params->insn_extra_cost;
5181 int cost = 0;
5182 bool compound_p = (outer == PLUS || outer == MINUS);
5183 machine_mode mode = GET_MODE (x);
5185 gcc_checking_assert (code == MULT);
5187 op0 = XEXP (x, 0);
5188 op1 = XEXP (x, 1);
5190 if (VECTOR_MODE_P (mode))
5191 mode = GET_MODE_INNER (mode);
5193 /* Integer multiply/fma. */
5194 if (GET_MODE_CLASS (mode) == MODE_INT)
5196 /* The multiply will be canonicalized as a shift, cost it as such. */
5197 if (aarch64_shift_p (GET_CODE (x))
5198 || (CONST_INT_P (op1)
5199 && exact_log2 (INTVAL (op1)) > 0))
5201 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5202 || GET_CODE (op0) == SIGN_EXTEND;
5203 if (speed)
5205 if (compound_p)
5207 if (REG_P (op1))
5208 /* ARITH + shift-by-register. */
5209 cost += extra_cost->alu.arith_shift_reg;
5210 else if (is_extend)
5211 /* ARITH + extended register. We don't have a cost field
5212 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5213 cost += extra_cost->alu.extend_arith;
5214 else
5215 /* ARITH + shift-by-immediate. */
5216 cost += extra_cost->alu.arith_shift;
5218 else
5219 /* LSL (immediate). */
5220 cost += extra_cost->alu.shift;
5223 /* Strip extends as we will have costed them in the case above. */
5224 if (is_extend)
5225 op0 = aarch64_strip_extend (op0);
5227 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5229 return cost;
5232 /* MNEG or [US]MNEGL.  Extract the NEG operand, mark the operation as
5233 compound, and let the cases below handle it.  After all, MNEG is a
5234 special-case alias of MSUB. */
5235 if (GET_CODE (op0) == NEG)
5237 op0 = XEXP (op0, 0);
5238 compound_p = true;
5241 /* Integer multiplies or FMAs have zero/sign extending variants. */
5242 if ((GET_CODE (op0) == ZERO_EXTEND
5243 && GET_CODE (op1) == ZERO_EXTEND)
5244 || (GET_CODE (op0) == SIGN_EXTEND
5245 && GET_CODE (op1) == SIGN_EXTEND))
5247 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5248 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5250 if (speed)
5252 if (compound_p)
5253 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5254 cost += extra_cost->mult[0].extend_add;
5255 else
5256 /* MUL/SMULL/UMULL. */
5257 cost += extra_cost->mult[0].extend;
5260 return cost;
5263 /* This is either an integer multiply or a MADD. In both cases
5264 we want to recurse and cost the operands. */
5265 cost += rtx_cost (op0, MULT, 0, speed)
5266 + rtx_cost (op1, MULT, 1, speed);
5268 if (speed)
5270 if (compound_p)
5271 /* MADD/MSUB. */
5272 cost += extra_cost->mult[mode == DImode].add;
5273 else
5274 /* MUL. */
5275 cost += extra_cost->mult[mode == DImode].simple;
5278 return cost;
5280 else
5282 if (speed)
5284 /* Floating-point FMA/FMUL can also support negations of the
5285 operands. */
5286 if (GET_CODE (op0) == NEG)
5287 op0 = XEXP (op0, 0);
5288 if (GET_CODE (op1) == NEG)
5289 op1 = XEXP (op1, 0);
5291 if (compound_p)
5292 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5293 cost += extra_cost->fp[mode == DFmode].fma;
5294 else
5295 /* FMUL/FNMUL. */
5296 cost += extra_cost->fp[mode == DFmode].mult;
5299 cost += rtx_cost (op0, MULT, 0, speed)
5300 + rtx_cost (op1, MULT, 1, speed);
5301 return cost;
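/* Editor's illustrative sketch, not part of GCC (the name is hypothetical):
   why a multiply by a power of two is costed as a shift above.  The two
   expressions below are equivalent, the canonical RTL uses the shift form,
   and exact_log2 (8) == 3 is the test used to recognise it, so only a
   single LSL-class cost is charged.  */

static unsigned long long
aarch64_example_mult_as_shift (unsigned long long x)
{
  return x << 3;	/* The same value as x * 8.  */
}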
5305 static int
5306 aarch64_address_cost (rtx x,
5307 machine_mode mode,
5308 addr_space_t as ATTRIBUTE_UNUSED,
5309 bool speed)
5311 enum rtx_code c = GET_CODE (x);
5312 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5313 struct aarch64_address_info info;
5314 int cost = 0;
5315 info.shift = 0;
5317 if (!aarch64_classify_address (&info, x, mode, c, false))
5319 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5321 /* This is a CONST or SYMBOL ref which will be split
5322 in a different way depending on the code model in use.
5323 Cost it through the generic infrastructure. */
5324 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5325 /* Divide through by the cost of one instruction to
5326 bring it to the same units as the address costs. */
5327 cost_symbol_ref /= COSTS_N_INSNS (1);
5328 /* The cost is then the cost of preparing the address,
5329 followed by an immediate (possibly 0) offset. */
5330 return cost_symbol_ref + addr_cost->imm_offset;
5332 else
5334 /* This is most likely a jump table from a case
5335 statement. */
5336 return addr_cost->register_offset;
5340 switch (info.type)
5342 case ADDRESS_LO_SUM:
5343 case ADDRESS_SYMBOLIC:
5344 case ADDRESS_REG_IMM:
5345 cost += addr_cost->imm_offset;
5346 break;
5348 case ADDRESS_REG_WB:
5349 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5350 cost += addr_cost->pre_modify;
5351 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5352 cost += addr_cost->post_modify;
5353 else
5354 gcc_unreachable ();
5356 break;
5358 case ADDRESS_REG_REG:
5359 cost += addr_cost->register_offset;
5360 break;
5362 case ADDRESS_REG_UXTW:
5363 case ADDRESS_REG_SXTW:
5364 cost += addr_cost->register_extend;
5365 break;
5367 default:
5368 gcc_unreachable ();
5372 if (info.shift > 0)
5374 /* For the sake of calculating the cost of the shifted register
5375 component, we can treat same sized modes in the same way. */
5376 switch (GET_MODE_BITSIZE (mode))
5378 case 16:
5379 cost += addr_cost->addr_scale_costs.hi;
5380 break;
5382 case 32:
5383 cost += addr_cost->addr_scale_costs.si;
5384 break;
5386 case 64:
5387 cost += addr_cost->addr_scale_costs.di;
5388 break;
5390 /* We can't tell, or this is a 128-bit vector. */
5391 default:
5392 cost += addr_cost->addr_scale_costs.ti;
5393 break;
5397 return cost;
5400 /* Return true if the RTX X in mode MODE is a zero or sign extract
5401 usable in an ADD or SUB (extended register) instruction. */
5402 static bool
5403 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5405 /* Catch add with a sign extract.
5406 This is add_<optab><mode>_multp2. */
5407 if (GET_CODE (x) == SIGN_EXTRACT
5408 || GET_CODE (x) == ZERO_EXTRACT)
5410 rtx op0 = XEXP (x, 0);
5411 rtx op1 = XEXP (x, 1);
5412 rtx op2 = XEXP (x, 2);
5414 if (GET_CODE (op0) == MULT
5415 && CONST_INT_P (op1)
5416 && op2 == const0_rtx
5417 && CONST_INT_P (XEXP (op0, 1))
5418 && aarch64_is_extend_from_extract (mode,
5419 XEXP (op0, 1),
5420 op1))
5422 return true;
5426 return false;
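/* Editor's illustrative sketch, not part of GCC (names are hypothetical):
   the shape recognised above typically comes from scaled-index address
   arithmetic such as the following, which AArch64 compilers can usually
   fold into a single ADD (extended register), e.g.
   "add x0, x0, w1, sxtw #2", depending on optimization level.  */

static long
aarch64_example_add_extended (long base, int idx)
{
  /* Sign-extend IDX to 64 bits and scale it by 4 before the add.  */
  return base + (long) idx * 4;
}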
5429 static bool
5430 aarch64_frint_unspec_p (unsigned int u)
5432 switch (u)
5434 case UNSPEC_FRINTZ:
5435 case UNSPEC_FRINTP:
5436 case UNSPEC_FRINTM:
5437 case UNSPEC_FRINTA:
5438 case UNSPEC_FRINTN:
5439 case UNSPEC_FRINTX:
5440 case UNSPEC_FRINTI:
5441 return true;
5443 default:
5444 return false;
5448 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5449 storing it in *COST. Result is true if the total cost of the operation
5450 has now been calculated. */
5451 static bool
5452 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5454 rtx inner;
5455 rtx comparator;
5456 enum rtx_code cmpcode;
5458 if (COMPARISON_P (op0))
5460 inner = XEXP (op0, 0);
5461 comparator = XEXP (op0, 1);
5462 cmpcode = GET_CODE (op0);
5464 else
5466 inner = op0;
5467 comparator = const0_rtx;
5468 cmpcode = NE;
5471 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5473 /* Conditional branch. */
5474 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5475 return true;
5476 else
5478 if (cmpcode == NE || cmpcode == EQ)
5480 if (comparator == const0_rtx)
5482 /* TBZ/TBNZ/CBZ/CBNZ. */
5483 if (GET_CODE (inner) == ZERO_EXTRACT)
5484 /* TBZ/TBNZ. */
5485 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5486 0, speed);
5487 else
5488 /* CBZ/CBNZ. */
5489 *cost += rtx_cost (inner, cmpcode, 0, speed);
5491 return true;
5494 else if (cmpcode == LT || cmpcode == GE)
5496 /* TBZ/TBNZ. */
5497 if (comparator == const0_rtx)
5498 return true;
5502 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5504 /* It's a conditional operation based on the status flags,
5505 so it must be some flavor of CSEL. */
5507 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5508 if (GET_CODE (op1) == NEG
5509 || GET_CODE (op1) == NOT
5510 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5511 op1 = XEXP (op1, 0);
5513 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5514 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5515 return true;
5518 /* We don't know what this is, cost all operands. */
5519 return false;
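/* Editor's illustrative sketch, not part of GCC (the name is hypothetical):
   the CSINC case treated as free above typically arises from source like
   the following, where the "+ 1" arm is folded into the conditional
   select rather than needing a separate add.  */

static long
aarch64_example_csinc (long a, long b, int c)
{
  return c ? a : b + 1;	/* Usually a compare plus a single CSINC.  */
}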
5522 /* Calculate the cost of calculating X, storing it in *COST. Result
5523 is true if the total cost of the operation has now been calculated. */
5524 static bool
5525 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5526 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5528 rtx op0, op1, op2;
5529 const struct cpu_cost_table *extra_cost
5530 = aarch64_tune_params->insn_extra_cost;
5531 machine_mode mode = GET_MODE (x);
5533 /* By default, assume that everything has equivalent cost to the
5534 cheapest instruction. Any additional costs are applied as a delta
5535 above this default. */
5536 *cost = COSTS_N_INSNS (1);
5538 /* TODO: The cost infrastructure currently does not handle
5539 vector operations. Assume that all vector operations
5540 are equally expensive. */
5541 if (VECTOR_MODE_P (mode))
5543 if (speed)
5544 *cost += extra_cost->vect.alu;
5545 return true;
5548 switch (code)
5550 case SET:
5551 /* The cost depends entirely on the operands to SET. */
5552 *cost = 0;
5553 op0 = SET_DEST (x);
5554 op1 = SET_SRC (x);
5556 switch (GET_CODE (op0))
5558 case MEM:
5559 if (speed)
5561 rtx address = XEXP (op0, 0);
5562 if (GET_MODE_CLASS (mode) == MODE_INT)
5563 *cost += extra_cost->ldst.store;
5564 else if (mode == SFmode)
5565 *cost += extra_cost->ldst.storef;
5566 else if (mode == DFmode)
5567 *cost += extra_cost->ldst.stored;
5569 *cost +=
5570 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5571 0, speed));
5574 *cost += rtx_cost (op1, SET, 1, speed);
5575 return true;
5577 case SUBREG:
5578 if (! REG_P (SUBREG_REG (op0)))
5579 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5581 /* Fall through. */
5582 case REG:
5583 /* const0_rtx is in general free, but we will use an
5584 instruction to set a register to 0. */
5585 if (REG_P (op1) || op1 == const0_rtx)
5587 /* The cost is 1 per register copied. */
5588 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5589 / UNITS_PER_WORD;
5590 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5592 else
5593 /* Cost is just the cost of the RHS of the set. */
5594 *cost += rtx_cost (op1, SET, 1, speed);
5595 return true;
5597 case ZERO_EXTRACT:
5598 case SIGN_EXTRACT:
5599 /* Bit-field insertion. Strip any redundant widening of
5600 the RHS to meet the width of the target. */
5601 if (GET_CODE (op1) == SUBREG)
5602 op1 = SUBREG_REG (op1);
5603 if ((GET_CODE (op1) == ZERO_EXTEND
5604 || GET_CODE (op1) == SIGN_EXTEND)
5605 && CONST_INT_P (XEXP (op0, 1))
5606 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5607 >= INTVAL (XEXP (op0, 1))))
5608 op1 = XEXP (op1, 0);
5610 if (CONST_INT_P (op1))
5612 /* MOV immediate is assumed to always be cheap. */
5613 *cost = COSTS_N_INSNS (1);
5615 else
5617 /* BFM. */
5618 if (speed)
5619 *cost += extra_cost->alu.bfi;
5620 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5623 return true;
5625 default:
5626 /* We can't make sense of this, assume default cost. */
5627 *cost = COSTS_N_INSNS (1);
5628 return false;
5630 return false;
5632 case CONST_INT:
5633 /* If an instruction can incorporate a constant within the
5634 instruction, the instruction's expression avoids calling
5635 rtx_cost() on the constant. If rtx_cost() is called on a
5636 constant, then it is usually because the constant must be
5637 moved into a register by one or more instructions.
5639 The exception is constant 0, which can be expressed
5640 as XZR/WZR and is therefore free. The exception to this is
5641 if we have (set (reg) (const0_rtx)) in which case we must cost
5642 the move. However, we can catch that when we cost the SET, so
5643 we don't need to consider that here. */
5644 if (x == const0_rtx)
5645 *cost = 0;
5646 else
5648 /* To an approximation, building any other constant is
5649 proportionally expensive to the number of instructions
5650 required to build that constant. This is true whether we
5651 are compiling for SPEED or otherwise. */
5652 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5653 (NULL_RTX, x, false, mode));
5655 return true;
5657 case CONST_DOUBLE:
5658 if (speed)
5660 /* mov[df,sf]_aarch64. */
5661 if (aarch64_float_const_representable_p (x))
5662 /* FMOV (scalar immediate). */
5663 *cost += extra_cost->fp[mode == DFmode].fpconst;
5664 else if (!aarch64_float_const_zero_rtx_p (x))
5666 /* This will be a load from memory. */
5667 if (mode == DFmode)
5668 *cost += extra_cost->ldst.loadd;
5669 else
5670 *cost += extra_cost->ldst.loadf;
5672 else
5673 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5674 or MOV v0.s[0], wzr - neither of which is modeled by the
5675 cost tables. Just use the default cost. */
5680 return true;
5682 case MEM:
5683 if (speed)
5685 /* For loads we want the base cost of a load, plus an
5686 approximation for the additional cost of the addressing
5687 mode. */
5688 rtx address = XEXP (x, 0);
5689 if (GET_MODE_CLASS (mode) == MODE_INT)
5690 *cost += extra_cost->ldst.load;
5691 else if (mode == SFmode)
5692 *cost += extra_cost->ldst.loadf;
5693 else if (mode == DFmode)
5694 *cost += extra_cost->ldst.loadd;
5696 *cost +=
5697 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5698 0, speed));
5701 return true;
5703 case NEG:
5704 op0 = XEXP (x, 0);
5706 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5708 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5709 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5711 /* CSETM. */
5712 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5713 return true;
5716 /* Cost this as SUB wzr, X. */
5717 op0 = CONST0_RTX (GET_MODE (x));
5718 op1 = XEXP (x, 0);
5719 goto cost_minus;
5722 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5724 /* Support (neg(fma...)) as a single instruction only if
5725 sign of zeros is unimportant. This matches the decision
5726 making in aarch64.md. */
5727 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5729 /* FNMADD. */
5730 *cost = rtx_cost (op0, NEG, 0, speed);
5731 return true;
5733 if (speed)
5734 /* FNEG. */
5735 *cost += extra_cost->fp[mode == DFmode].neg;
5736 return false;
5739 return false;
5741 case CLRSB:
5742 case CLZ:
5743 if (speed)
5744 *cost += extra_cost->alu.clz;
5746 return false;
5748 case COMPARE:
5749 op0 = XEXP (x, 0);
5750 op1 = XEXP (x, 1);
5752 if (op1 == const0_rtx
5753 && GET_CODE (op0) == AND)
5755 x = op0;
5756 goto cost_logic;
5759 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5761 /* TODO: A write to the CC flags possibly costs extra, this
5762 needs encoding in the cost tables. */
5764 /* CC_ZESWPmode supports zero extend for free. */
5765 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5766 op0 = XEXP (op0, 0);
5768 /* ANDS. */
5769 if (GET_CODE (op0) == AND)
5771 x = op0;
5772 goto cost_logic;
5775 if (GET_CODE (op0) == PLUS)
5777 /* ADDS (and CMN alias). */
5778 x = op0;
5779 goto cost_plus;
5782 if (GET_CODE (op0) == MINUS)
5784 /* SUBS. */
5785 x = op0;
5786 goto cost_minus;
5789 if (GET_CODE (op1) == NEG)
5791 /* CMN. */
5792 if (speed)
5793 *cost += extra_cost->alu.arith;
5795 *cost += rtx_cost (op0, COMPARE, 0, speed);
5796 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5797 return true;
5800 /* CMP.
5802 Compare can freely swap the order of operands, and
5803 canonicalization puts the more complex operation first.
5804 But the integer MINUS logic expects the shift/extend
5805 operation in op1. */
5806 if (! (REG_P (op0)
5807 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5809 op0 = XEXP (x, 1);
5810 op1 = XEXP (x, 0);
5812 goto cost_minus;
5815 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5817 /* FCMP. */
5818 if (speed)
5819 *cost += extra_cost->fp[mode == DFmode].compare;
5821 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5823 /* FCMP supports constant 0.0 for no extra cost. */
5824 return true;
5826 return false;
5829 return false;
5831 case MINUS:
5833 op0 = XEXP (x, 0);
5834 op1 = XEXP (x, 1);
5836 cost_minus:
5837 /* Detect valid immediates. */
5838 if ((GET_MODE_CLASS (mode) == MODE_INT
5839 || (GET_MODE_CLASS (mode) == MODE_CC
5840 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5841 && CONST_INT_P (op1)
5842 && aarch64_uimm12_shift (INTVAL (op1)))
5844 *cost += rtx_cost (op0, MINUS, 0, speed);
5846 if (speed)
5847 /* SUB(S) (immediate). */
5848 *cost += extra_cost->alu.arith;
5849 return true;
5853 /* Look for SUB (extended register). */
5854 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5856 if (speed)
5857 *cost += extra_cost->alu.extend_arith;
5859 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5860 (enum rtx_code) GET_CODE (op1),
5861 0, speed);
5862 return true;
5865 rtx new_op1 = aarch64_strip_extend (op1);
5867 /* Cost this as an FMA-alike operation. */
5868 if ((GET_CODE (new_op1) == MULT
5869 || aarch64_shift_p (GET_CODE (new_op1)))
5870 && code != COMPARE)
5872 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5873 (enum rtx_code) code,
5874 speed);
5875 *cost += rtx_cost (op0, MINUS, 0, speed);
5876 return true;
5879 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5881 if (speed)
5883 if (GET_MODE_CLASS (mode) == MODE_INT)
5884 /* SUB(S). */
5885 *cost += extra_cost->alu.arith;
5886 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5887 /* FSUB. */
5888 *cost += extra_cost->fp[mode == DFmode].addsub;
5890 return true;
5893 case PLUS:
5895 rtx new_op0;
5897 op0 = XEXP (x, 0);
5898 op1 = XEXP (x, 1);
5900 cost_plus:
5901 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5902 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5904 /* CSINC. */
5905 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5906 *cost += rtx_cost (op1, PLUS, 1, speed);
5907 return true;
5910 if (GET_MODE_CLASS (mode) == MODE_INT
5911 && CONST_INT_P (op1)
5912 && aarch64_uimm12_shift (INTVAL (op1)))
5914 *cost += rtx_cost (op0, PLUS, 0, speed);
5916 if (speed)
5917 /* ADD (immediate). */
5918 *cost += extra_cost->alu.arith;
5919 return true;
5922 /* Look for ADD (extended register). */
5923 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5925 if (speed)
5926 *cost += extra_cost->alu.extend_arith;
5928 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5929 (enum rtx_code) GET_CODE (op0),
5930 0, speed);
5931 return true;
5934 /* Strip any extend; leave shifts behind, as we will
5935 cost them through mult_cost. */
5936 new_op0 = aarch64_strip_extend (op0);
5938 if (GET_CODE (new_op0) == MULT
5939 || aarch64_shift_p (GET_CODE (new_op0)))
5941 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5942 speed);
5943 *cost += rtx_cost (op1, PLUS, 1, speed);
5944 return true;
5947 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5948 + rtx_cost (op1, PLUS, 1, speed));
5950 if (speed)
5952 if (GET_MODE_CLASS (mode) == MODE_INT)
5953 /* ADD. */
5954 *cost += extra_cost->alu.arith;
5955 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5956 /* FADD. */
5957 *cost += extra_cost->fp[mode == DFmode].addsub;
5959 return true;
5962 case BSWAP:
5963 *cost = COSTS_N_INSNS (1);
5965 if (speed)
5966 *cost += extra_cost->alu.rev;
5968 return false;
5970 case IOR:
5971 if (aarch_rev16_p (x))
5973 *cost = COSTS_N_INSNS (1);
5975 if (speed)
5976 *cost += extra_cost->alu.rev;
5978 return true;
5980 /* Fall through. */
5981 case XOR:
5982 case AND:
5983 cost_logic:
5984 op0 = XEXP (x, 0);
5985 op1 = XEXP (x, 1);
5987 if (code == AND
5988 && GET_CODE (op0) == MULT
5989 && CONST_INT_P (XEXP (op0, 1))
5990 && CONST_INT_P (op1)
5991 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
5992 INTVAL (op1)) != 0)
5994 /* This is a UBFM/SBFM. */
5995 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
5996 if (speed)
5997 *cost += extra_cost->alu.bfx;
5998 return true;
6001 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6003 /* We possibly get the immediate for free; this is not
6004 modelled. */
6005 if (CONST_INT_P (op1)
6006 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6008 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6010 if (speed)
6011 *cost += extra_cost->alu.logical;
6013 return true;
6015 else
6017 rtx new_op0 = op0;
6019 /* Handle ORN, EON, or BIC. */
6020 if (GET_CODE (op0) == NOT)
6021 op0 = XEXP (op0, 0);
6023 new_op0 = aarch64_strip_shift (op0);
6025 /* If we had a shift on op0 then this is a logical-shift-
6026 by-register/immediate operation. Otherwise, this is just
6027 a logical operation. */
6028 if (speed)
6030 if (new_op0 != op0)
6032 /* Shift by immediate. */
6033 if (CONST_INT_P (XEXP (op0, 1)))
6034 *cost += extra_cost->alu.log_shift;
6035 else
6036 *cost += extra_cost->alu.log_shift_reg;
6038 else
6039 *cost += extra_cost->alu.logical;
6042 /* In both cases we want to cost both operands. */
6043 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6044 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6046 return true;
6049 return false;
6051 case NOT:
6052 x = XEXP (x, 0);
6053 op0 = aarch64_strip_shift (x);
6055 /* MVN-shifted-reg. */
6056 if (op0 != x)
6058 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6060 if (speed)
6061 *cost += extra_cost->alu.log_shift;
6063 return true;
6065 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6066 Handle the second form here, taking care that 'a' above can
6067 be a shift. */
6068 else if (GET_CODE (op0) == XOR)
6070 rtx newop0 = XEXP (op0, 0);
6071 rtx newop1 = XEXP (op0, 1);
6072 rtx op0_stripped = aarch64_strip_shift (newop0);
6074 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6075 + rtx_cost (op0_stripped, XOR, 0, speed);
6077 if (speed)
6079 if (op0_stripped != newop0)
6080 *cost += extra_cost->alu.log_shift;
6081 else
6082 *cost += extra_cost->alu.logical;
6085 return true;
6087 /* MVN. */
6088 if (speed)
6089 *cost += extra_cost->alu.logical;
6091 return false;
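/* Editor's illustrative note, not part of GCC: the two EON forms costed
   in the NOT and XOR cases correspond to source such as

     unsigned long eon1 (unsigned long a, unsigned long b)
     { return ~(a ^ b); }		-- (not (xor a b))

     unsigned long eon2 (unsigned long a, unsigned long b)
     { return ~(a ^ (b << 3)); }	-- (not (xor a (ashift b 3)))

   With the EON+shift pattern this change adds, both can typically be
   emitted as a single EON, the second with a shifted-register operand
   (e.g. "eon x0, x0, x1, lsl 3"), so only one logical/log_shift cost
   is charged above.  */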
6093 case ZERO_EXTEND:
6095 op0 = XEXP (x, 0);
6096 /* If a value is written in SI mode, then zero extended to DI
6097 mode, the operation will in general be free as a write to
6098 a 'w' register implicitly zeroes the upper bits of an 'x'
6099 register. However, if this is
6101 (set (reg) (zero_extend (reg)))
6103 we must cost the explicit register move. */
6104 if (mode == DImode
6105 && GET_MODE (op0) == SImode
6106 && outer == SET)
6108 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6110 if (!op_cost && speed)
6111 /* MOV. */
6112 *cost += extra_cost->alu.extend;
6113 else
6114 /* Free, the cost is that of the SI mode operation. */
6115 *cost = op_cost;
6117 return true;
6119 else if (MEM_P (XEXP (x, 0)))
6121 /* All loads can zero extend to any size for free. */
6122 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6123 return true;
6126 /* UXTB/UXTH. */
6127 if (speed)
6128 *cost += extra_cost->alu.extend;
6130 return false;
6132 case SIGN_EXTEND:
6133 if (MEM_P (XEXP (x, 0)))
6135 /* LDRSH. */
6136 if (speed)
6138 rtx address = XEXP (XEXP (x, 0), 0);
6139 *cost += extra_cost->ldst.load_sign_extend;
6141 *cost +=
6142 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6143 0, speed));
6145 return true;
6148 if (speed)
6149 *cost += extra_cost->alu.extend;
6150 return false;
6152 case ASHIFT:
6153 op0 = XEXP (x, 0);
6154 op1 = XEXP (x, 1);
6156 if (CONST_INT_P (op1))
6158 /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
6159 aliases. */
6160 if (speed)
6161 *cost += extra_cost->alu.shift;
6163 /* We can incorporate zero/sign extend for free. */
6164 if (GET_CODE (op0) == ZERO_EXTEND
6165 || GET_CODE (op0) == SIGN_EXTEND)
6166 op0 = XEXP (op0, 0);
6168 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6169 return true;
6171 else
6173 /* LSLV. */
6174 if (speed)
6175 *cost += extra_cost->alu.shift_reg;
6177 return false; /* All arguments need to be in registers. */
6180 case ROTATE:
6181 case ROTATERT:
6182 case LSHIFTRT:
6183 case ASHIFTRT:
6184 op0 = XEXP (x, 0);
6185 op1 = XEXP (x, 1);
6187 if (CONST_INT_P (op1))
6189 /* ASR (immediate) and friends. */
6190 if (speed)
6191 *cost += extra_cost->alu.shift;
6193 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6194 return true;
6196 else
6199 /* ASR (register) and friends. */
6200 if (speed)
6201 *cost += extra_cost->alu.shift_reg;
6203 return false; /* All arguments need to be in registers. */
6206 case SYMBOL_REF:
6208 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6210 /* LDR. */
6211 if (speed)
6212 *cost += extra_cost->ldst.load;
6214 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6215 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6217 /* ADRP, followed by ADD. */
6218 *cost += COSTS_N_INSNS (1);
6219 if (speed)
6220 *cost += 2 * extra_cost->alu.arith;
6222 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6223 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6225 /* ADR. */
6226 if (speed)
6227 *cost += extra_cost->alu.arith;
6230 if (flag_pic)
6232 /* One extra load instruction, after accessing the GOT. */
6233 *cost += COSTS_N_INSNS (1);
6234 if (speed)
6235 *cost += extra_cost->ldst.load;
6237 return true;
6239 case HIGH:
6240 case LO_SUM:
6241 /* ADRP/ADD (immediate). */
6242 if (speed)
6243 *cost += extra_cost->alu.arith;
6244 return true;
6246 case ZERO_EXTRACT:
6247 case SIGN_EXTRACT:
6248 /* UBFX/SBFX. */
6249 if (speed)
6250 *cost += extra_cost->alu.bfx;
6252 /* We can trust that the immediates used will be correct (there
6253 are no by-register forms), so we need only cost op0. */
6254 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6255 return true;
6257 case MULT:
6258 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6259 /* aarch64_rtx_mult_cost always handles recursion into its
6260 operands. */
6261 return true;
6263 case MOD:
6264 case UMOD:
6265 if (speed)
6267 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6268 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6269 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6270 else if (GET_MODE (x) == DFmode)
6271 *cost += (extra_cost->fp[1].mult
6272 + extra_cost->fp[1].div);
6273 else if (GET_MODE (x) == SFmode)
6274 *cost += (extra_cost->fp[0].mult
6275 + extra_cost->fp[0].div);
6277 return false; /* All arguments need to be in registers. */
6279 case DIV:
6280 case UDIV:
6281 case SQRT:
6282 if (speed)
6284 if (GET_MODE_CLASS (mode) == MODE_INT)
6285 /* There is no integer SQRT, so only DIV and UDIV can get
6286 here. */
6287 *cost += extra_cost->mult[mode == DImode].idiv;
6288 else
6289 *cost += extra_cost->fp[mode == DFmode].div;
6291 return false; /* All arguments need to be in registers. */
6293 case IF_THEN_ELSE:
6294 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6295 XEXP (x, 2), cost, speed);
6297 case EQ:
6298 case NE:
6299 case GT:
6300 case GTU:
6301 case LT:
6302 case LTU:
6303 case GE:
6304 case GEU:
6305 case LE:
6306 case LEU:
6308 return false; /* All arguments must be in registers. */
6310 case FMA:
6311 op0 = XEXP (x, 0);
6312 op1 = XEXP (x, 1);
6313 op2 = XEXP (x, 2);
6315 if (speed)
6316 *cost += extra_cost->fp[mode == DFmode].fma;
6318 /* FMSUB, FNMADD, and FNMSUB are free. */
6319 if (GET_CODE (op0) == NEG)
6320 op0 = XEXP (op0, 0);
6322 if (GET_CODE (op2) == NEG)
6323 op2 = XEXP (op2, 0);
6325 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6326 and the by-element operand as operand 0. */
6327 if (GET_CODE (op1) == NEG)
6328 op1 = XEXP (op1, 0);
6330 /* Catch vector-by-element operations. The by-element operand can
6331 either be (vec_duplicate (vec_select (x))) or just
6332 (vec_select (x)), depending on whether we are multiplying by
6333 a vector or a scalar.
6335 Canonicalization is not very good in these cases: FMA4 will put the
6336 by-element operand as operand 0, while FNMA4 will have it as operand 1.  */
6337 if (GET_CODE (op0) == VEC_DUPLICATE)
6338 op0 = XEXP (op0, 0);
6339 else if (GET_CODE (op1) == VEC_DUPLICATE)
6340 op1 = XEXP (op1, 0);
6342 if (GET_CODE (op0) == VEC_SELECT)
6343 op0 = XEXP (op0, 0);
6344 else if (GET_CODE (op1) == VEC_SELECT)
6345 op1 = XEXP (op1, 0);
6347 /* If the remaining parameters are not registers,
6348 get the cost to put them into registers. */
6349 *cost += rtx_cost (op0, FMA, 0, speed);
6350 *cost += rtx_cost (op1, FMA, 1, speed);
6351 *cost += rtx_cost (op2, FMA, 2, speed);
6352 return true;
6354 case FLOAT_EXTEND:
6355 if (speed)
6356 *cost += extra_cost->fp[mode == DFmode].widen;
6357 return false;
6359 case FLOAT_TRUNCATE:
6360 if (speed)
6361 *cost += extra_cost->fp[mode == DFmode].narrow;
6362 return false;
6364 case FIX:
6365 case UNSIGNED_FIX:
6366 x = XEXP (x, 0);
6367 /* Strip the rounding part. They will all be implemented
6368 by the fcvt* family of instructions anyway. */
6369 if (GET_CODE (x) == UNSPEC)
6371 unsigned int uns_code = XINT (x, 1);
6373 if (uns_code == UNSPEC_FRINTA
6374 || uns_code == UNSPEC_FRINTM
6375 || uns_code == UNSPEC_FRINTN
6376 || uns_code == UNSPEC_FRINTP
6377 || uns_code == UNSPEC_FRINTZ)
6378 x = XVECEXP (x, 0, 0);
6381 if (speed)
6382 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6384 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6385 return true;
6387 case ABS:
6388 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6390 /* FABS and FNEG are analogous. */
6391 if (speed)
6392 *cost += extra_cost->fp[mode == DFmode].neg;
6394 else
6396 /* Integer ABS will either be split into
6397 two arithmetic instructions, or will be an ABS
6398 (scalar), which we don't model. */
6399 *cost = COSTS_N_INSNS (2);
6400 if (speed)
6401 *cost += 2 * extra_cost->alu.arith;
6403 return false;
6405 case SMAX:
6406 case SMIN:
6407 if (speed)
6409 /* FMAXNM/FMINNM/FMAX/FMIN.
6410 TODO: This may not be accurate for all implementations, but
6411 we do not model this in the cost tables. */
6412 *cost += extra_cost->fp[mode == DFmode].addsub;
6414 return false;
6416 case UNSPEC:
6417 /* The floating point round to integer frint* instructions. */
6418 if (aarch64_frint_unspec_p (XINT (x, 1)))
6420 if (speed)
6421 *cost += extra_cost->fp[mode == DFmode].roundint;
6423 return false;
6426 if (XINT (x, 1) == UNSPEC_RBIT)
6428 if (speed)
6429 *cost += extra_cost->alu.rev;
6431 return false;
6433 break;
6435 case TRUNCATE:
6437 /* Decompose <su>muldi3_highpart. */
6438 if (/* (truncate:DI */
6439 mode == DImode
6440 /* (lshiftrt:TI */
6441 && GET_MODE (XEXP (x, 0)) == TImode
6442 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6443 /* (mult:TI */
6444 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6445 /* (ANY_EXTEND:TI (reg:DI))
6446 (ANY_EXTEND:TI (reg:DI))) */
6447 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6448 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6449 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6450 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6451 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6452 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6453 /* (const_int 64) */
6454 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6455 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6457 /* UMULH/SMULH. */
6458 if (speed)
6459 *cost += extra_cost->mult[mode == DImode].extend;
6460 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6461 MULT, 0, speed);
6462 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6463 MULT, 1, speed);
6464 return true;
6467 /* Fall through. */
6468 default:
6469 break;
6472 if (dump_file && (dump_flags & TDF_DETAILS))
6473 fprintf (dump_file,
6474 "\nFailed to cost RTX. Assuming default cost.\n");
6476 return true;
6479 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6480 calculated for X. This cost is stored in *COST. Returns true
6481 if the total cost of X was calculated. */
6482 static bool
6483 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6484 int param, int *cost, bool speed)
6486 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6488 if (dump_file && (dump_flags & TDF_DETAILS))
6490 print_rtl_single (dump_file, x);
6491 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6492 speed ? "Hot" : "Cold",
6493 *cost, result ? "final" : "partial");
6496 return result;
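/* A minimal sketch of the dump output, assuming a detailed RTL dump is
   enabled (for example -fdump-rtl-combine-details): each costed
   expression is printed followed by a line such as

     Hot cost: 8 (final)

   where "Hot"/"Cold" reflects SPEED and "final"/"partial" reflects the
   return value of aarch64_rtx_costs.  */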
6499 static int
6500 aarch64_register_move_cost (machine_mode mode,
6501 reg_class_t from_i, reg_class_t to_i)
6503 enum reg_class from = (enum reg_class) from_i;
6504 enum reg_class to = (enum reg_class) to_i;
6505 const struct cpu_regmove_cost *regmove_cost
6506 = aarch64_tune_params->regmove_cost;
6508 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6509 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6510 to = GENERAL_REGS;
6512 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6513 from = GENERAL_REGS;
6515 /* The cost of moving between GPRs and the stack register is the same as GP2GP. */
6516 if ((from == GENERAL_REGS && to == STACK_REG)
6517 || (to == GENERAL_REGS && from == STACK_REG))
6518 return regmove_cost->GP2GP;
6520 /* To/From the stack register, we move via the gprs. */
6521 if (to == STACK_REG || from == STACK_REG)
6522 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6523 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6525 if (GET_MODE_SIZE (mode) == 16)
6527 /* 128-bit operations on general registers require 2 instructions. */
6528 if (from == GENERAL_REGS && to == GENERAL_REGS)
6529 return regmove_cost->GP2GP * 2;
6530 else if (from == GENERAL_REGS)
6531 return regmove_cost->GP2FP * 2;
6532 else if (to == GENERAL_REGS)
6533 return regmove_cost->FP2GP * 2;
6535 /* When AdvSIMD instructions are disabled it is not possible to move
6536 a 128-bit value directly between Q registers. This is handled in
6537 secondary reload. A general register is used as a scratch to move
6538 the upper DI value and the lower DI value is moved directly,
6539 hence the cost is the sum of three moves. */
6540 if (! TARGET_SIMD)
6541 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6543 return regmove_cost->FP2FP;
6546 if (from == GENERAL_REGS && to == GENERAL_REGS)
6547 return regmove_cost->GP2GP;
6548 else if (from == GENERAL_REGS)
6549 return regmove_cost->GP2FP;
6550 else if (to == GENERAL_REGS)
6551 return regmove_cost->FP2GP;
6553 return regmove_cost->FP2FP;
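/* Worked example: with !TARGET_SIMD, moving a 16-byte value between two
   FP registers is costed as GP2FP + FP2GP + FP2FP, matching the
   secondary-reload sequence described above, whereas a 16-byte
   GENERAL_REGS to GENERAL_REGS move costs 2 * GP2GP.  */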
6556 static int
6557 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6558 reg_class_t rclass ATTRIBUTE_UNUSED,
6559 bool in ATTRIBUTE_UNUSED)
6561 return aarch64_tune_params->memmov_cost;
6564 /* Return the number of instructions that can be issued per cycle. */
6565 static int
6566 aarch64_sched_issue_rate (void)
6568 return aarch64_tune_params->issue_rate;
6571 static int
6572 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6574 int issue_rate = aarch64_sched_issue_rate ();
6576 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6579 /* Vectorizer cost model target hooks. */
6581 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6582 static int
6583 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6584 tree vectype,
6585 int misalign ATTRIBUTE_UNUSED)
6587 unsigned elements;
6589 switch (type_of_cost)
6591 case scalar_stmt:
6592 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6594 case scalar_load:
6595 return aarch64_tune_params->vec_costs->scalar_load_cost;
6597 case scalar_store:
6598 return aarch64_tune_params->vec_costs->scalar_store_cost;
6600 case vector_stmt:
6601 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6603 case vector_load:
6604 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6606 case vector_store:
6607 return aarch64_tune_params->vec_costs->vec_store_cost;
6609 case vec_to_scalar:
6610 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6612 case scalar_to_vec:
6613 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6615 case unaligned_load:
6616 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6618 case unaligned_store:
6619 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6621 case cond_branch_taken:
6622 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6624 case cond_branch_not_taken:
6625 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6627 case vec_perm:
6628 case vec_promote_demote:
6629 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6631 case vec_construct:
6632 elements = TYPE_VECTOR_SUBPARTS (vectype);
6633 return elements / 2 + 1;
6635 default:
6636 gcc_unreachable ();
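/* Worked example for the vec_construct case above: constructing a V4SI
   vector (TYPE_VECTOR_SUBPARTS == 4) is costed as 4 / 2 + 1 = 3.  */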
6640 /* Implement targetm.vectorize.add_stmt_cost. */
6641 static unsigned
6642 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6643 struct _stmt_vec_info *stmt_info, int misalign,
6644 enum vect_cost_model_location where)
6646 unsigned *cost = (unsigned *) data;
6647 unsigned retval = 0;
6649 if (flag_vect_cost_model)
6651 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6652 int stmt_cost =
6653 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6655 /* Statements in an inner loop relative to the loop being
6656 vectorized are weighted more heavily. The value here is
6657 a function (linear for now) of the loop nest level. */
6658 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6660 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6661 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6662 unsigned nest_level = loop_depth (loop);
6664 count *= nest_level;
6667 retval = (unsigned) (count * stmt_cost);
6668 cost[where] += retval;
6671 return retval;
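/* Worked example: a vector statement whose per-statement cost is, say, 1
   from the tuning tables, appearing in the vectorized loop body inside an
   inner loop at nest depth 2 with COUNT == 1, has COUNT scaled to 2, so
   retval = 2 * 1 = 2 is added to cost[vect_body].  */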
6674 static void initialize_aarch64_code_model (void);
6676 /* Parse the architecture extension string. */
6678 static void
6679 aarch64_parse_extension (char *str)
6681 /* The extension string is parsed left to right. */
6682 const struct aarch64_option_extension *opt = NULL;
6684 /* Flag to say whether we are adding or removing an extension. */
6685 int adding_ext = -1;
6687 while (str != NULL && *str != 0)
6689 char *ext;
6690 size_t len;
6692 str++;
6693 ext = strchr (str, '+');
6695 if (ext != NULL)
6696 len = ext - str;
6697 else
6698 len = strlen (str);
6700 if (len >= 2 && strncmp (str, "no", 2) == 0)
6702 adding_ext = 0;
6703 len -= 2;
6704 str += 2;
6706 else if (len > 0)
6707 adding_ext = 1;
6709 if (len == 0)
6711 error ("missing feature modifier after %qs", adding_ext ? "+"
6712 : "+no");
6713 return;
6716 /* Scan over the extensions table trying to find an exact match. */
6717 for (opt = all_extensions; opt->name != NULL; opt++)
6719 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6721 /* Add or remove the extension. */
6722 if (adding_ext)
6723 aarch64_isa_flags |= opt->flags_on;
6724 else
6725 aarch64_isa_flags &= ~(opt->flags_off);
6726 break;
6730 if (opt->name == NULL)
6732 /* Extension not found in list. */
6733 error ("unknown feature modifier %qs", str);
6734 return;
6737 str = ext;
6740 return;
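/* For example, given the extension suffix "+crypto+nofp", the loop above
   first enables the flags_on bits of the "crypto" extension and then
   clears the flags_off bits of the "fp" extension.  */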
6743 /* Parse the ARCH string. */
6745 static void
6746 aarch64_parse_arch (void)
6748 char *ext;
6749 const struct processor *arch;
6750 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6751 size_t len;
6753 strcpy (str, aarch64_arch_string);
6755 ext = strchr (str, '+');
6757 if (ext != NULL)
6758 len = ext - str;
6759 else
6760 len = strlen (str);
6762 if (len == 0)
6764 error ("missing arch name in -march=%qs", str);
6765 return;
6768 /* Loop through the list of supported ARCHs to find a match. */
6769 for (arch = all_architectures; arch->name != NULL; arch++)
6771 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6773 selected_arch = arch;
6774 aarch64_isa_flags = selected_arch->flags;
6776 if (!selected_cpu)
6777 selected_cpu = &all_cores[selected_arch->core];
6779 if (ext != NULL)
6781 /* ARCH string contains at least one extension. */
6782 aarch64_parse_extension (ext);
6785 if (strcmp (selected_arch->arch, selected_cpu->arch))
6787 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6788 selected_cpu->name, selected_arch->name);
6791 return;
6795 /* ARCH name not found in list. */
6796 error ("unknown value %qs for -march", str);
6797 return;
6800 /* Parse the CPU string. */
6802 static void
6803 aarch64_parse_cpu (void)
6805 char *ext;
6806 const struct processor *cpu;
6807 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6808 size_t len;
6810 strcpy (str, aarch64_cpu_string);
6812 ext = strchr (str, '+');
6814 if (ext != NULL)
6815 len = ext - str;
6816 else
6817 len = strlen (str);
6819 if (len == 0)
6821 error ("missing cpu name in -mcpu=%qs", str);
6822 return;
6825 /* Loop through the list of supported CPUs to find a match. */
6826 for (cpu = all_cores; cpu->name != NULL; cpu++)
6828 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6830 selected_cpu = cpu;
6831 aarch64_isa_flags = selected_cpu->flags;
6833 if (ext != NULL)
6835 /* CPU string contains at least one extension. */
6836 aarch64_parse_extension (ext);
6839 return;
6843 /* CPU name not found in list. */
6844 error ("unknown value %qs for -mcpu", str);
6845 return;
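/* For example, "-mcpu=cortex-a57+crypto" selects the cortex-a57 entry
   from all_cores, takes its ISA flags, and then hands the remaining
   "+crypto" suffix to aarch64_parse_extension above.  */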
6848 /* Parse the TUNE string. */
6850 static void
6851 aarch64_parse_tune (void)
6853 const struct processor *cpu;
6854 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6855 strcpy (str, aarch64_tune_string);
6857 /* Loop through the list of supported CPUs to find a match. */
6858 for (cpu = all_cores; cpu->name != NULL; cpu++)
6860 if (strcmp (cpu->name, str) == 0)
6862 selected_tune = cpu;
6863 return;
6867 /* CPU name not found in list. */
6868 error ("unknown value %qs for -mtune", str);
6869 return;
6873 /* Implement TARGET_OPTION_OVERRIDE. */
6875 static void
6876 aarch64_override_options (void)
6878 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6879 If either of -march or -mtune is given, they override their
6880 respective component of -mcpu.
6882 So, first parse AARCH64_CPU_STRING, then the others. Be careful
6883 with -march: if -mcpu is not present on the command line, -march
6884 must set a sensible default CPU. */
6885 if (aarch64_cpu_string)
6887 aarch64_parse_cpu ();
6890 if (aarch64_arch_string)
6892 aarch64_parse_arch ();
6895 if (aarch64_tune_string)
6897 aarch64_parse_tune ();
6900 #ifndef HAVE_AS_MABI_OPTION
6901 /* The compiler may have been configured with 2.23.* binutils, which does
6902 not have support for ILP32. */
6903 if (TARGET_ILP32)
6904 error ("Assembler does not support -mabi=ilp32");
6905 #endif
6907 initialize_aarch64_code_model ();
6909 aarch64_build_bitmask_table ();
6911 /* This target defaults to strict volatile bitfields. */
6912 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6913 flag_strict_volatile_bitfields = 1;
6915 /* If the user did not specify a processor, choose the default
6916 one for them. This will be the CPU set during configuration using
6917 --with-cpu, otherwise it is "generic". */
6918 if (!selected_cpu)
6920 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6921 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6924 gcc_assert (selected_cpu);
6926 if (!selected_tune)
6927 selected_tune = selected_cpu;
6929 aarch64_tune_flags = selected_tune->flags;
6930 aarch64_tune = selected_tune->core;
6931 aarch64_tune_params = selected_tune->tune;
6932 aarch64_architecture_version = selected_cpu->architecture_version;
6934 if (aarch64_fix_a53_err835769 == 2)
6936 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
6937 aarch64_fix_a53_err835769 = 1;
6938 #else
6939 aarch64_fix_a53_err835769 = 0;
6940 #endif
6943 /* If not optimizing for size, set the default
6944 alignment to what the target wants. */
6945 if (!optimize_size)
6947 if (align_loops <= 0)
6948 align_loops = aarch64_tune_params->loop_align;
6949 if (align_jumps <= 0)
6950 align_jumps = aarch64_tune_params->jump_align;
6951 if (align_functions <= 0)
6952 align_functions = aarch64_tune_params->function_align;
6955 if (AARCH64_TUNE_FMA_STEERING)
6956 aarch64_register_fma_steering ();
6958 aarch64_override_options_after_change ();
6961 /* Implement targetm.override_options_after_change. */
6963 static void
6964 aarch64_override_options_after_change (void)
6966 if (flag_omit_frame_pointer)
6967 flag_omit_leaf_frame_pointer = false;
6968 else if (flag_omit_leaf_frame_pointer)
6969 flag_omit_frame_pointer = true;
6972 static struct machine_function *
6973 aarch64_init_machine_status (void)
6975 struct machine_function *machine;
6976 machine = ggc_cleared_alloc<machine_function> ();
6977 return machine;
6980 void
6981 aarch64_init_expanders (void)
6983 init_machine_status = aarch64_init_machine_status;
6986 /* A checking mechanism for the implementation of the various code models. */
6987 static void
6988 initialize_aarch64_code_model (void)
6990 if (flag_pic)
6992 switch (aarch64_cmodel_var)
6994 case AARCH64_CMODEL_TINY:
6995 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
6996 break;
6997 case AARCH64_CMODEL_SMALL:
6998 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
6999 break;
7000 case AARCH64_CMODEL_LARGE:
7001 sorry ("code model %qs with -f%s", "large",
7002 flag_pic > 1 ? "PIC" : "pic");
7003 default:
7004 gcc_unreachable ();
7007 else
7008 aarch64_cmodel = aarch64_cmodel_var;
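/* For example, the default small code model combined with -fpic or -fPIC
   becomes AARCH64_CMODEL_SMALL_PIC, while the large code model is
   rejected with a "sorry" when any form of PIC is requested.  */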
7011 /* Return true if SYMBOL_REF X binds locally. */
7013 static bool
7014 aarch64_symbol_binds_local_p (const_rtx x)
7016 return (SYMBOL_REF_DECL (x)
7017 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7018 : SYMBOL_REF_LOCAL_P (x));
7021 /* Return true if SYMBOL_REF X is thread-local. */
7022 static bool
7023 aarch64_tls_symbol_p (rtx x)
7025 if (! TARGET_HAVE_TLS)
7026 return false;
7028 if (GET_CODE (x) != SYMBOL_REF)
7029 return false;
7031 return SYMBOL_REF_TLS_MODEL (x) != 0;
7034 /* Classify a TLS symbol into one of the TLS kinds. */
7035 enum aarch64_symbol_type
7036 aarch64_classify_tls_symbol (rtx x)
7038 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7040 switch (tls_kind)
7042 case TLS_MODEL_GLOBAL_DYNAMIC:
7043 case TLS_MODEL_LOCAL_DYNAMIC:
7044 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7046 case TLS_MODEL_INITIAL_EXEC:
7047 return SYMBOL_SMALL_GOTTPREL;
7049 case TLS_MODEL_LOCAL_EXEC:
7050 return SYMBOL_SMALL_TPREL;
7052 case TLS_MODEL_EMULATED:
7053 case TLS_MODEL_NONE:
7054 return SYMBOL_FORCE_TO_MEM;
7056 default:
7057 gcc_unreachable ();
7061 /* Return the method that should be used to access SYMBOL_REF or
7062 LABEL_REF X in context CONTEXT. */
7064 enum aarch64_symbol_type
7065 aarch64_classify_symbol (rtx x, rtx offset,
7066 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7068 if (GET_CODE (x) == LABEL_REF)
7070 switch (aarch64_cmodel)
7072 case AARCH64_CMODEL_LARGE:
7073 return SYMBOL_FORCE_TO_MEM;
7075 case AARCH64_CMODEL_TINY_PIC:
7076 case AARCH64_CMODEL_TINY:
7077 return SYMBOL_TINY_ABSOLUTE;
7079 case AARCH64_CMODEL_SMALL_PIC:
7080 case AARCH64_CMODEL_SMALL:
7081 return SYMBOL_SMALL_ABSOLUTE;
7083 default:
7084 gcc_unreachable ();
7088 if (GET_CODE (x) == SYMBOL_REF)
7090 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7091 return SYMBOL_FORCE_TO_MEM;
7093 if (aarch64_tls_symbol_p (x))
7094 return aarch64_classify_tls_symbol (x);
7096 switch (aarch64_cmodel)
7098 case AARCH64_CMODEL_TINY:
7099 /* When we retrieve a symbol + offset address, we have to make sure
7100 the offset does not cause overflow of the final address. But we
7101 have no way of knowing the address of the symbol at compile time,
7102 so we can't accurately say if the distance between the PC and
7103 symbol + offset is outside the addressable range of +/-1M in the
7104 TINY code model. So we rely on images not being greater than
7105 1M, cap the offset at 1M, and require anything beyond 1M to
7106 be loaded using an alternative mechanism. */
7107 if (SYMBOL_REF_WEAK (x)
7108 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7109 return SYMBOL_FORCE_TO_MEM;
7110 return SYMBOL_TINY_ABSOLUTE;
7112 case AARCH64_CMODEL_SMALL:
7113 /* Same reasoning as the tiny code model, but the offset cap here is
7114 4G. */
7115 if (SYMBOL_REF_WEAK (x)
7116 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7117 HOST_WIDE_INT_C (4294967264)))
7118 return SYMBOL_FORCE_TO_MEM;
7119 return SYMBOL_SMALL_ABSOLUTE;
7121 case AARCH64_CMODEL_TINY_PIC:
7122 if (!aarch64_symbol_binds_local_p (x))
7123 return SYMBOL_TINY_GOT;
7124 return SYMBOL_TINY_ABSOLUTE;
7126 case AARCH64_CMODEL_SMALL_PIC:
7127 if (!aarch64_symbol_binds_local_p (x))
7128 return SYMBOL_SMALL_GOT;
7129 return SYMBOL_SMALL_ABSOLUTE;
7131 default:
7132 gcc_unreachable ();
7136 /* By default push everything into the constant pool. */
7137 return SYMBOL_FORCE_TO_MEM;
7140 bool
7141 aarch64_constant_address_p (rtx x)
7143 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7146 bool
7147 aarch64_legitimate_pic_operand_p (rtx x)
7149 if (GET_CODE (x) == SYMBOL_REF
7150 || (GET_CODE (x) == CONST
7151 && GET_CODE (XEXP (x, 0)) == PLUS
7152 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7153 return false;
7155 return true;
7158 /* Return true if X holds either a quarter-precision floating-point
7159 constant or the floating-point constant +0.0. */
7160 static bool
7161 aarch64_valid_floating_const (machine_mode mode, rtx x)
7163 if (!CONST_DOUBLE_P (x))
7164 return false;
7166 /* TODO: We could handle moving 0.0 to a TFmode register,
7167 but first we would like to refactor the movtf_aarch64 pattern
7168 to be more amenable to splitting moves properly and to
7169 gating correctly on TARGET_SIMD. For now, reject all
7170 constants that are not destined for SFmode or DFmode registers. */
7171 if (!(mode == SFmode || mode == DFmode))
7172 return false;
7174 if (aarch64_float_const_zero_rtx_p (x))
7175 return true;
7176 return aarch64_float_const_representable_p (x);
7179 static bool
7180 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7182 /* Do not allow vector struct mode constants. We could support
7183 0 and -1 easily, but they need support in aarch64-simd.md. */
7184 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7185 return false;
7187 /* This could probably go away because
7188 we now decompose CONST_INTs according to expand_mov_immediate. */
7189 if ((GET_CODE (x) == CONST_VECTOR
7190 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7191 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7192 return !targetm.cannot_force_const_mem (mode, x);
7194 if (GET_CODE (x) == HIGH
7195 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7196 return true;
7198 return aarch64_constant_address_p (x);
7202 aarch64_load_tp (rtx target)
7204 if (!target
7205 || GET_MODE (target) != Pmode
7206 || !register_operand (target, Pmode))
7207 target = gen_reg_rtx (Pmode);
7209 /* Can return in any reg. */
7210 emit_insn (gen_aarch64_load_tp_hard (target));
7211 return target;
7214 /* On AAPCS systems, this is the "struct __va_list". */
7215 static GTY(()) tree va_list_type;
7217 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7218 Return the type to use as __builtin_va_list.
7220 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7222 struct __va_list
7224 void *__stack;
7225 void *__gr_top;
7226 void *__vr_top;
7227 int __gr_offs;
7228 int __vr_offs;
7229 }; */
7231 static tree
7232 aarch64_build_builtin_va_list (void)
7234 tree va_list_name;
7235 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7237 /* Create the type. */
7238 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7239 /* Give it the required name. */
7240 va_list_name = build_decl (BUILTINS_LOCATION,
7241 TYPE_DECL,
7242 get_identifier ("__va_list"),
7243 va_list_type);
7244 DECL_ARTIFICIAL (va_list_name) = 1;
7245 TYPE_NAME (va_list_type) = va_list_name;
7246 TYPE_STUB_DECL (va_list_type) = va_list_name;
7248 /* Create the fields. */
7249 f_stack = build_decl (BUILTINS_LOCATION,
7250 FIELD_DECL, get_identifier ("__stack"),
7251 ptr_type_node);
7252 f_grtop = build_decl (BUILTINS_LOCATION,
7253 FIELD_DECL, get_identifier ("__gr_top"),
7254 ptr_type_node);
7255 f_vrtop = build_decl (BUILTINS_LOCATION,
7256 FIELD_DECL, get_identifier ("__vr_top"),
7257 ptr_type_node);
7258 f_groff = build_decl (BUILTINS_LOCATION,
7259 FIELD_DECL, get_identifier ("__gr_offs"),
7260 integer_type_node);
7261 f_vroff = build_decl (BUILTINS_LOCATION,
7262 FIELD_DECL, get_identifier ("__vr_offs"),
7263 integer_type_node);
7265 DECL_ARTIFICIAL (f_stack) = 1;
7266 DECL_ARTIFICIAL (f_grtop) = 1;
7267 DECL_ARTIFICIAL (f_vrtop) = 1;
7268 DECL_ARTIFICIAL (f_groff) = 1;
7269 DECL_ARTIFICIAL (f_vroff) = 1;
7271 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7272 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7273 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7274 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7275 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7277 TYPE_FIELDS (va_list_type) = f_stack;
7278 DECL_CHAIN (f_stack) = f_grtop;
7279 DECL_CHAIN (f_grtop) = f_vrtop;
7280 DECL_CHAIN (f_vrtop) = f_groff;
7281 DECL_CHAIN (f_groff) = f_vroff;
7283 /* Compute its layout. */
7284 layout_type (va_list_type);
7286 return va_list_type;
7289 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7290 static void
7291 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7293 const CUMULATIVE_ARGS *cum;
7294 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7295 tree stack, grtop, vrtop, groff, vroff;
7296 tree t;
7297 int gr_save_area_size;
7298 int vr_save_area_size;
7299 int vr_offset;
7301 cum = &crtl->args.info;
7302 gr_save_area_size
7303 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7304 vr_save_area_size
7305 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7307 if (TARGET_GENERAL_REGS_ONLY)
7309 if (cum->aapcs_nvrn > 0)
7310 sorry ("%qs and floating point or vector arguments",
7311 "-mgeneral-regs-only");
7312 vr_save_area_size = 0;
7315 f_stack = TYPE_FIELDS (va_list_type_node);
7316 f_grtop = DECL_CHAIN (f_stack);
7317 f_vrtop = DECL_CHAIN (f_grtop);
7318 f_groff = DECL_CHAIN (f_vrtop);
7319 f_vroff = DECL_CHAIN (f_groff);
7321 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7322 NULL_TREE);
7323 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7324 NULL_TREE);
7325 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7326 NULL_TREE);
7327 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7328 NULL_TREE);
7329 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7330 NULL_TREE);
7332 /* Emit code to initialize STACK, which points to the next varargs stack
7333 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7334 by named arguments. STACK is 8-byte aligned. */
7335 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7336 if (cum->aapcs_stack_size > 0)
7337 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7338 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7339 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7341 /* Emit code to initialize GRTOP, the top of the GR save area.
7342 virtual_incoming_args_rtx should have been 16 byte aligned. */
7343 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7344 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7345 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7347 /* Emit code to initialize VRTOP, the top of the VR save area.
7348 This address is gr_save_area_bytes below GRTOP, rounded
7349 down to the next 16-byte boundary. */
7350 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7351 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7352 STACK_BOUNDARY / BITS_PER_UNIT);
7354 if (vr_offset)
7355 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7356 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7357 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7359 /* Emit code to initialize GROFF, the offset from GRTOP of the
7360 next GPR argument. */
7361 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7362 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7363 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7365 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7366 of the next VR argument. */
7367 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7368 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7369 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
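/* Illustrative summary of the initialization above, written as C-like
   pseudo-code using the AAPCS64 __va_list field names:

     ap.__stack   = incoming_args + cum->aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;

   where incoming_args stands for virtual_incoming_args_rtx.  */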
7372 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7374 static tree
7375 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7376 gimple_seq *post_p ATTRIBUTE_UNUSED)
7378 tree addr;
7379 bool indirect_p;
7380 bool is_ha; /* is HFA or HVA. */
7381 bool dw_align; /* double-word align. */
7382 machine_mode ag_mode = VOIDmode;
7383 int nregs;
7384 machine_mode mode;
7386 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7387 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7388 HOST_WIDE_INT size, rsize, adjust, align;
7389 tree t, u, cond1, cond2;
7391 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7392 if (indirect_p)
7393 type = build_pointer_type (type);
7395 mode = TYPE_MODE (type);
7397 f_stack = TYPE_FIELDS (va_list_type_node);
7398 f_grtop = DECL_CHAIN (f_stack);
7399 f_vrtop = DECL_CHAIN (f_grtop);
7400 f_groff = DECL_CHAIN (f_vrtop);
7401 f_vroff = DECL_CHAIN (f_groff);
7403 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7404 f_stack, NULL_TREE);
7405 size = int_size_in_bytes (type);
7406 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7408 dw_align = false;
7409 adjust = 0;
7410 if (aarch64_vfp_is_call_or_return_candidate (mode,
7411 type,
7412 &ag_mode,
7413 &nregs,
7414 &is_ha))
7416 /* TYPE passed in fp/simd registers. */
7417 if (TARGET_GENERAL_REGS_ONLY)
7418 sorry ("%qs and floating point or vector arguments",
7419 "-mgeneral-regs-only");
7421 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7422 unshare_expr (valist), f_vrtop, NULL_TREE);
7423 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7424 unshare_expr (valist), f_vroff, NULL_TREE);
7426 rsize = nregs * UNITS_PER_VREG;
7428 if (is_ha)
7430 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7431 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7433 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7434 && size < UNITS_PER_VREG)
7436 adjust = UNITS_PER_VREG - size;
7439 else
7441 /* TYPE passed in general registers. */
7442 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7443 unshare_expr (valist), f_grtop, NULL_TREE);
7444 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7445 unshare_expr (valist), f_groff, NULL_TREE);
7446 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7447 nregs = rsize / UNITS_PER_WORD;
7449 if (align > 8)
7450 dw_align = true;
7452 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7453 && size < UNITS_PER_WORD)
7455 adjust = UNITS_PER_WORD - size;
7459 /* Get a local temporary for the field value. */
7460 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7462 /* Emit code to branch if off >= 0. */
7463 t = build2 (GE_EXPR, boolean_type_node, off,
7464 build_int_cst (TREE_TYPE (off), 0));
7465 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7467 if (dw_align)
7469 /* Emit: offs = (offs + 15) & -16. */
7470 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7471 build_int_cst (TREE_TYPE (off), 15));
7472 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7473 build_int_cst (TREE_TYPE (off), -16));
7474 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7476 else
7477 roundup = NULL;
7479 /* Update ap.__[g|v]r_offs */
7480 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7481 build_int_cst (TREE_TYPE (off), rsize));
7482 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7484 /* String up. */
7485 if (roundup)
7486 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7488 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7489 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7490 build_int_cst (TREE_TYPE (f_off), 0));
7491 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7493 /* String up: make sure the assignment happens before the use. */
7494 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7495 COND_EXPR_ELSE (cond1) = t;
7497 /* Prepare the trees handling the argument that is passed on the stack;
7498 the top level node will store in ON_STACK. */
7499 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7500 if (align > 8)
7502 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7503 t = fold_convert (intDI_type_node, arg);
7504 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7505 build_int_cst (TREE_TYPE (t), 15));
7506 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7507 build_int_cst (TREE_TYPE (t), -16));
7508 t = fold_convert (TREE_TYPE (arg), t);
7509 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7511 else
7512 roundup = NULL;
7513 /* Advance ap.__stack */
7514 t = fold_convert (intDI_type_node, arg);
7515 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7516 build_int_cst (TREE_TYPE (t), size + 7));
7517 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7518 build_int_cst (TREE_TYPE (t), -8));
7519 t = fold_convert (TREE_TYPE (arg), t);
7520 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7521 /* String up roundup and advance. */
7522 if (roundup)
7523 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7524 /* String up with arg */
7525 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7526 /* Big-endianness related address adjustment. */
7527 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7528 && size < UNITS_PER_WORD)
7530 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7531 size_int (UNITS_PER_WORD - size));
7532 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7535 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7536 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7538 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7539 t = off;
7540 if (adjust)
7541 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7542 build_int_cst (TREE_TYPE (off), adjust));
7544 t = fold_convert (sizetype, t);
7545 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7547 if (is_ha)
7549 /* type ha; // treat as "struct {ftype field[n];}"
7550 ... [computing offs]
7551 for (i = 0; i <nregs; ++i, offs += 16)
7552 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7553 return ha; */
7554 int i;
7555 tree tmp_ha, field_t, field_ptr_t;
7557 /* Declare a local variable. */
7558 tmp_ha = create_tmp_var_raw (type, "ha");
7559 gimple_add_tmp_var (tmp_ha);
7561 /* Establish the base type. */
7562 switch (ag_mode)
7564 case SFmode:
7565 field_t = float_type_node;
7566 field_ptr_t = float_ptr_type_node;
7567 break;
7568 case DFmode:
7569 field_t = double_type_node;
7570 field_ptr_t = double_ptr_type_node;
7571 break;
7572 case TFmode:
7573 field_t = long_double_type_node;
7574 field_ptr_t = long_double_ptr_type_node;
7575 break;
7576 /* Half-precision and quad-precision floats are not fully supported yet.
7577 Enable the following code once that support is complete; the correct
7578 type node for __fp16 * still needs to be found. */
7579 #if 0
7580 case HFmode:
7581 field_t = float_type_node;
7582 field_ptr_t = float_ptr_type_node;
7583 break;
7584 #endif
7585 case V2SImode:
7586 case V4SImode:
7588 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7589 field_t = build_vector_type_for_mode (innertype, ag_mode);
7590 field_ptr_t = build_pointer_type (field_t);
7592 break;
7593 default:
7594 gcc_assert (0);
7597 /* *(field_ptr_t) &ha = *((field_ptr_t) vr_saved_area). */
7598 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7599 addr = t;
7600 t = fold_convert (field_ptr_t, addr);
7601 t = build2 (MODIFY_EXPR, field_t,
7602 build1 (INDIRECT_REF, field_t, tmp_ha),
7603 build1 (INDIRECT_REF, field_t, t));
7605 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7606 for (i = 1; i < nregs; ++i)
7608 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7609 u = fold_convert (field_ptr_t, addr);
7610 u = build2 (MODIFY_EXPR, field_t,
7611 build2 (MEM_REF, field_t, tmp_ha,
7612 build_int_cst (field_ptr_t,
7613 (i *
7614 int_size_in_bytes (field_t)))),
7615 build1 (INDIRECT_REF, field_t, u));
7616 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7619 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7620 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7623 COND_EXPR_ELSE (cond2) = t;
7624 addr = fold_convert (build_pointer_type (type), cond1);
7625 addr = build_va_arg_indirect_ref (addr);
7627 if (indirect_p)
7628 addr = build_va_arg_indirect_ref (addr);
7630 return addr;
7633 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7635 static void
7636 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7637 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7638 int no_rtl)
7640 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7641 CUMULATIVE_ARGS local_cum;
7642 int gr_saved, vr_saved;
7644 /* The caller has advanced CUM up to, but not beyond, the last named
7645 argument. Advance a local copy of CUM past the last "real" named
7646 argument, to find out how many registers are left over. */
7647 local_cum = *cum;
7648 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7651 /* Find out how many registers we need to save. */
7651 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7652 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7654 if (TARGET_GENERAL_REGS_ONLY)
7656 if (local_cum.aapcs_nvrn > 0)
7657 sorry ("%qs and floating point or vector arguments",
7658 "-mgeneral-regs-only");
7659 vr_saved = 0;
7662 if (!no_rtl)
7664 if (gr_saved > 0)
7666 rtx ptr, mem;
7668 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7669 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7670 - gr_saved * UNITS_PER_WORD);
7671 mem = gen_frame_mem (BLKmode, ptr);
7672 set_mem_alias_set (mem, get_varargs_alias_set ());
7674 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7675 mem, gr_saved);
7677 if (vr_saved > 0)
7679 /* We can't use move_block_from_reg, because it will use
7680 the wrong mode, storing D regs only. */
7681 machine_mode mode = TImode;
7682 int off, i;
7684 /* Set OFF to the offset from virtual_incoming_args_rtx of
7685 the first vector register. The VR save area lies below
7686 the GR one, and is aligned to 16 bytes. */
7687 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7688 STACK_BOUNDARY / BITS_PER_UNIT);
7689 off -= vr_saved * UNITS_PER_VREG;
7691 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7693 rtx ptr, mem;
7695 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7696 mem = gen_frame_mem (mode, ptr);
7697 set_mem_alias_set (mem, get_varargs_alias_set ());
7698 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7699 off += UNITS_PER_VREG;
7704 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7705 any complication of having crtl->args.pretend_args_size changed. */
7706 cfun->machine->frame.saved_varargs_size
7707 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7708 STACK_BOUNDARY / BITS_PER_UNIT)
7709 + vr_saved * UNITS_PER_VREG);
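/* Worked example: a variadic function with two named integer arguments,
   no named FP/SIMD arguments and FP/SIMD enabled saves 6 general
   registers and 8 vector registers, so saved_varargs_size is
   ROUND_UP (6 * 8, 16) + 8 * 16 = 48 + 128 = 176 bytes.  */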
7712 static void
7713 aarch64_conditional_register_usage (void)
7715 int i;
7716 if (!TARGET_FLOAT)
7718 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7720 fixed_regs[i] = 1;
7721 call_used_regs[i] = 1;
7726 /* Walk down the type tree of TYPE counting consecutive base elements.
7727 If *MODEP is VOIDmode, then set it to the first valid floating point
7728 type. If a non-floating point type is found, or if a floating point
7729 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7730 otherwise return the count in the sub-tree. */
7731 static int
7732 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7734 machine_mode mode;
7735 HOST_WIDE_INT size;
7737 switch (TREE_CODE (type))
7739 case REAL_TYPE:
7740 mode = TYPE_MODE (type);
7741 if (mode != DFmode && mode != SFmode && mode != TFmode)
7742 return -1;
7744 if (*modep == VOIDmode)
7745 *modep = mode;
7747 if (*modep == mode)
7748 return 1;
7750 break;
7752 case COMPLEX_TYPE:
7753 mode = TYPE_MODE (TREE_TYPE (type));
7754 if (mode != DFmode && mode != SFmode && mode != TFmode)
7755 return -1;
7757 if (*modep == VOIDmode)
7758 *modep = mode;
7760 if (*modep == mode)
7761 return 2;
7763 break;
7765 case VECTOR_TYPE:
7766 /* Use V2SImode and V4SImode as representatives of all 64-bit
7767 and 128-bit vector types. */
7768 size = int_size_in_bytes (type);
7769 switch (size)
7771 case 8:
7772 mode = V2SImode;
7773 break;
7774 case 16:
7775 mode = V4SImode;
7776 break;
7777 default:
7778 return -1;
7781 if (*modep == VOIDmode)
7782 *modep = mode;
7784 /* Vector modes are considered to be opaque: two vectors are
7785 equivalent for the purposes of being homogeneous aggregates
7786 if they are the same size. */
7787 if (*modep == mode)
7788 return 1;
7790 break;
7792 case ARRAY_TYPE:
7794 int count;
7795 tree index = TYPE_DOMAIN (type);
7797 /* Can't handle incomplete types or sizes that are not
7798 fixed. */
7799 if (!COMPLETE_TYPE_P (type)
7800 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7801 return -1;
7803 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7804 if (count == -1
7805 || !index
7806 || !TYPE_MAX_VALUE (index)
7807 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7808 || !TYPE_MIN_VALUE (index)
7809 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7810 || count < 0)
7811 return -1;
7813 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7814 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7816 /* There must be no padding. */
7817 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7818 return -1;
7820 return count;
7823 case RECORD_TYPE:
7825 int count = 0;
7826 int sub_count;
7827 tree field;
7829 /* Can't handle incomplete types or sizes that are not
7830 fixed. */
7831 if (!COMPLETE_TYPE_P (type)
7832 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7833 return -1;
7835 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7837 if (TREE_CODE (field) != FIELD_DECL)
7838 continue;
7840 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7841 if (sub_count < 0)
7842 return -1;
7843 count += sub_count;
7846 /* There must be no padding. */
7847 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7848 return -1;
7850 return count;
7853 case UNION_TYPE:
7854 case QUAL_UNION_TYPE:
7856 /* These aren't very interesting except in a degenerate case. */
7857 int count = 0;
7858 int sub_count;
7859 tree field;
7861 /* Can't handle incomplete types or sizes that are not
7862 fixed. */
7863 if (!COMPLETE_TYPE_P (type)
7864 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7865 return -1;
7867 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7869 if (TREE_CODE (field) != FIELD_DECL)
7870 continue;
7872 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7873 if (sub_count < 0)
7874 return -1;
7875 count = count > sub_count ? count : sub_count;
7878 /* There must be no padding. */
7879 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7880 return -1;
7882 return count;
7885 default:
7886 break;
7889 return -1;
7892 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7893 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7894 array types. The C99 floating-point complex types are also considered
7895 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7896 types, which are GCC extensions and out of the scope of AAPCS64, are
7897 treated as composite types here as well.
7899 Note that MODE itself is not sufficient in determining whether a type
7900 is such a composite type or not. This is because
7901 stor-layout.c:compute_record_mode may have already changed the MODE
7902 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7903 structure with only one field may have its MODE set to the mode of the
7904 field. Also an integer mode whose size matches the size of the
7905 RECORD_TYPE type may be used to substitute the original mode
7906 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7907 solely relied on. */
7909 static bool
7910 aarch64_composite_type_p (const_tree type,
7911 machine_mode mode)
7913 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7914 return true;
7916 if (mode == BLKmode
7917 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7918 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7919 return true;
7921 return false;
7924 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7925 type as described in AAPCS64 \S 4.1.2.
7927 See the comment above aarch64_composite_type_p for the notes on MODE. */
7929 static bool
7930 aarch64_short_vector_p (const_tree type,
7931 machine_mode mode)
7933 HOST_WIDE_INT size = -1;
7935 if (type && TREE_CODE (type) == VECTOR_TYPE)
7936 size = int_size_in_bytes (type);
7937 else if (!aarch64_composite_type_p (type, mode)
7938 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
7939 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
7940 size = GET_MODE_SIZE (mode);
7942 return (size == 8 || size == 16);
7945 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
7946 shall be passed or returned in simd/fp register(s) (providing these
7947 parameter passing registers are available).
7949 Upon successful return, *COUNT returns the number of needed registers,
7950 *BASE_MODE returns the mode of the individual register and when IS_HA
7951 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
7952 floating-point aggregate or a homogeneous short-vector aggregate. */
7954 static bool
7955 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
7956 const_tree type,
7957 machine_mode *base_mode,
7958 int *count,
7959 bool *is_ha)
7961 machine_mode new_mode = VOIDmode;
7962 bool composite_p = aarch64_composite_type_p (type, mode);
7964 if (is_ha != NULL) *is_ha = false;
7966 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
7967 || aarch64_short_vector_p (type, mode))
7969 *count = 1;
7970 new_mode = mode;
7972 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
7974 if (is_ha != NULL) *is_ha = true;
7975 *count = 2;
7976 new_mode = GET_MODE_INNER (mode);
7978 else if (type && composite_p)
7980 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
7982 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
7984 if (is_ha != NULL) *is_ha = true;
7985 *count = ag_count;
7987 else
7988 return false;
7990 else
7991 return false;
7993 *base_mode = new_mode;
7994 return true;
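/* For example, "struct { double x, y, z; }" is a homogeneous
   floating-point aggregate: aapcs_vfp_sub_candidate returns 3 with
   *MODEP == DFmode, so *COUNT is 3, *BASE_MODE is DFmode and *IS_HA is
   set, meaning the argument is passed in three consecutive D registers
   when enough of them are available.  */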
7997 /* Implement TARGET_STRUCT_VALUE_RTX. */
7999 static rtx
8000 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8001 int incoming ATTRIBUTE_UNUSED)
8003 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8006 /* Implements target hook vector_mode_supported_p. */
8007 static bool
8008 aarch64_vector_mode_supported_p (machine_mode mode)
8010 if (TARGET_SIMD
8011 && (mode == V4SImode || mode == V8HImode
8012 || mode == V16QImode || mode == V2DImode
8013 || mode == V2SImode || mode == V4HImode
8014 || mode == V8QImode || mode == V2SFmode
8015 || mode == V4SFmode || mode == V2DFmode
8016 || mode == V1DFmode))
8017 return true;
8019 return false;
8022 /* Return appropriate SIMD container
8023 for MODE within a vector of WIDTH bits. */
8024 static machine_mode
8025 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8027 gcc_assert (width == 64 || width == 128);
8028 if (TARGET_SIMD)
8030 if (width == 128)
8031 switch (mode)
8033 case DFmode:
8034 return V2DFmode;
8035 case SFmode:
8036 return V4SFmode;
8037 case SImode:
8038 return V4SImode;
8039 case HImode:
8040 return V8HImode;
8041 case QImode:
8042 return V16QImode;
8043 case DImode:
8044 return V2DImode;
8045 default:
8046 break;
8048 else
8049 switch (mode)
8051 case SFmode:
8052 return V2SFmode;
8053 case SImode:
8054 return V2SImode;
8055 case HImode:
8056 return V4HImode;
8057 case QImode:
8058 return V8QImode;
8059 default:
8060 break;
8063 return word_mode;
8066 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8067 static machine_mode
8068 aarch64_preferred_simd_mode (machine_mode mode)
8070 return aarch64_simd_container_mode (mode, 128);
8073 /* Return the bitmask of possible vector sizes for the vectorizer
8074 to iterate over. */
8075 static unsigned int
8076 aarch64_autovectorize_vector_sizes (void)
8078 return (16 | 8);
8081 /* Implement TARGET_MANGLE_TYPE. */
8083 static const char *
8084 aarch64_mangle_type (const_tree type)
8086 /* The AArch64 ABI documents say that "__va_list" has to be
8087 mangled as if it is in the "std" namespace. */
8088 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8089 return "St9__va_list";
8091 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8092 builtin types. */
8093 if (TYPE_NAME (type) != NULL)
8094 return aarch64_mangle_builtin_type (type);
8096 /* Use the default mangling. */
8097 return NULL;
8101 /* Return true if the rtx_insn contains a MEM RTX somewhere
8102 in it. */
8104 static bool
8105 has_memory_op (rtx_insn *mem_insn)
8107 subrtx_iterator::array_type array;
8108 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8109 if (MEM_P (*iter))
8110 return true;
8112 return false;
8115 /* Find the first rtx_insn before insn that will generate an assembly
8116 instruction. */
8118 static rtx_insn *
8119 aarch64_prev_real_insn (rtx_insn *insn)
8121 if (!insn)
8122 return NULL;
8126 insn = prev_real_insn (insn);
8128 while (insn && recog_memoized (insn) < 0);
8130 return insn;
8133 static bool
8134 is_madd_op (enum attr_type t1)
8136 unsigned int i;
8137 /* A number of these may be AArch32 only. */
8138 enum attr_type mlatypes[] = {
8139 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8140 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8141 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8144 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8146 if (t1 == mlatypes[i])
8147 return true;
8150 return false;
8153 /* Check if there is a register dependency between a load and the insn
8154 for which we hold recog_data. */
8156 static bool
8157 dep_between_memop_and_curr (rtx memop)
8159 rtx load_reg;
8160 int opno;
8162 gcc_assert (GET_CODE (memop) == SET);
8164 if (!REG_P (SET_DEST (memop)))
8165 return false;
8167 load_reg = SET_DEST (memop);
8168 for (opno = 1; opno < recog_data.n_operands; opno++)
8170 rtx operand = recog_data.operand[opno];
8171 if (REG_P (operand)
8172 && reg_overlap_mentioned_p (load_reg, operand))
8173 return true;
8176 return false;
8180 /* When working around the Cortex-A53 erratum 835769,
8181 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8182 instruction and has a preceding memory instruction such that a NOP
8183 should be inserted between them. */
8185 bool
8186 aarch64_madd_needs_nop (rtx_insn* insn)
8188 enum attr_type attr_type;
8189 rtx_insn *prev;
8190 rtx body;
8192 if (!aarch64_fix_a53_err835769)
8193 return false;
8195 if (recog_memoized (insn) < 0)
8196 return false;
8198 attr_type = get_attr_type (insn);
8199 if (!is_madd_op (attr_type))
8200 return false;
8202 prev = aarch64_prev_real_insn (insn);
8203 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8204 Restore recog state to INSN to avoid state corruption. */
8205 extract_constrain_insn_cached (insn);
8207 if (!prev || !has_memory_op (prev))
8208 return false;
8210 body = single_set (prev);
8212 /* If the previous insn is a memory op and there is no dependency between
8213 it and the DImode madd, emit a NOP between them. If body is NULL then we
8214 have a complex memory operation, probably a load/store pair.
8215 Be conservative for now and emit a NOP. */
8216 if (GET_MODE (recog_data.operand[0]) == DImode
8217 && (!body || !dep_between_memop_and_curr (body)))
8218 return true;
8220 return false;
8225 /* Implement FINAL_PRESCAN_INSN. */
8227 void
8228 aarch64_final_prescan_insn (rtx_insn *insn)
8230 if (aarch64_madd_needs_nop (insn))
8231 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
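/* Illustrative effect of the erratum workaround when
   -mfix-cortex-a53-835769 is in effect: for an output sequence such as

     ldr  x2, [x0]
     madd x3, x4, x5, x6

   (no register dependency between the load and the 64-bit
   multiply-accumulate), the prescan hook emits

     nop // between mem op and mult-accumulate

   immediately before the MADD.  */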
8235 /* Return the equivalent letter for size. */
8236 static char
8237 sizetochar (int size)
8239 switch (size)
8241 case 64: return 'd';
8242 case 32: return 's';
8243 case 16: return 'h';
8244 case 8 : return 'b';
8245 default: gcc_unreachable ();
8249 /* Return true iff X is a uniform vector of floating-point
8250 constants, and the constant can be represented in
8251 quarter-precision form. Note that, as aarch64_float_const_representable_p
8252 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
8253 static bool
8254 aarch64_vect_float_const_representable_p (rtx x)
8256 int i = 0;
8257 REAL_VALUE_TYPE r0, ri;
8258 rtx x0, xi;
8260 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8261 return false;
8263 x0 = CONST_VECTOR_ELT (x, 0);
8264 if (!CONST_DOUBLE_P (x0))
8265 return false;
8267 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8269 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8271 xi = CONST_VECTOR_ELT (x, i);
8272 if (!CONST_DOUBLE_P (xi))
8273 return false;
8275 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8276 if (!REAL_VALUES_EQUAL (r0, ri))
8277 return false;
8280 return aarch64_float_const_representable_p (x0);
8283 /* Return true for valid and false for invalid. */
8284 bool
8285 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8286 struct simd_immediate_info *info)
8288 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8289 matches = 1; \
8290 for (i = 0; i < idx; i += (STRIDE)) \
8291 if (!(TEST)) \
8292 matches = 0; \
8293 if (matches) \
8295 immtype = (CLASS); \
8296 elsize = (ELSIZE); \
8297 eshift = (SHIFT); \
8298 emvn = (NEG); \
8299 break; \
8302 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8303 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8304 unsigned char bytes[16];
8305 int immtype = -1, matches;
8306 unsigned int invmask = inverse ? 0xff : 0;
8307 int eshift, emvn;
8309 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8311 if (! (aarch64_simd_imm_zero_p (op, mode)
8312 || aarch64_vect_float_const_representable_p (op)))
8313 return false;
8315 if (info)
8317 info->value = CONST_VECTOR_ELT (op, 0);
8318 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8319 info->mvn = false;
8320 info->shift = 0;
8323 return true;
8326 /* Splat vector constant out into a byte vector. */
8327 for (i = 0; i < n_elts; i++)
8329 /* The vector is provided in GCC's endian-neutral fashion. For aarch64_be,
8330 it must be laid out in the vector register in reverse order. */
8331 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8332 unsigned HOST_WIDE_INT elpart;
8333 unsigned int part, parts;
8335 if (CONST_INT_P (el))
8337 elpart = INTVAL (el);
8338 parts = 1;
8340 else if (GET_CODE (el) == CONST_DOUBLE)
8342 elpart = CONST_DOUBLE_LOW (el);
8343 parts = 2;
8345 else
8346 gcc_unreachable ();
8348 for (part = 0; part < parts; part++)
8350 unsigned int byte;
8351 for (byte = 0; byte < innersize; byte++)
8353 bytes[idx++] = (elpart & 0xff) ^ invmask;
8354 elpart >>= BITS_PER_UNIT;
8356 if (GET_CODE (el) == CONST_DOUBLE)
8357 elpart = CONST_DOUBLE_HIGH (el);
8361 /* Sanity check. */
8362 gcc_assert (idx == GET_MODE_SIZE (mode));
8366 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8367 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8369 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8370 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8372 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8373 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8375 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8376 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8378 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8380 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8382 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8383 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8385 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8386 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8388 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8389 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8391 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8392 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8394 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8396 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8398 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8399 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8401 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8402 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8404 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8405 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8407 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8408 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8410 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8412 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8413 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8415 while (0);
8417 if (immtype == -1)
8418 return false;
8420 if (info)
8422 info->element_width = elsize;
8423 info->mvn = emvn != 0;
8424 info->shift = eshift;
8426 unsigned HOST_WIDE_INT imm = 0;
8428 if (immtype >= 12 && immtype <= 15)
8429 info->msl = true;
8431 /* Un-invert bytes of recognized vector, if necessary. */
8432 if (invmask != 0)
8433 for (i = 0; i < idx; i++)
8434 bytes[i] ^= invmask;
8436 if (immtype == 17)
8438 /* FIXME: Broken on 32-bit H_W_I hosts. */
8439 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8441 for (i = 0; i < 8; i++)
8442 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8443 << (i * BITS_PER_UNIT);
8446 info->value = GEN_INT (imm);
8448 else
8450 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8451 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8453 /* Construct 'abcdefgh' because the assembler cannot handle
8454 generic constants. */
8455 if (info->mvn)
8456 imm = ~imm;
8457 imm = (imm >> info->shift) & 0xff;
8458 info->value = GEN_INT (imm);
8462 return true;
8463 #undef CHECK
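/* For example, a V4SImode constant with every element equal to
   0x0000ab00 matches the second CHECK above (32-bit elements, shift 8),
   so INFO ends up describing the value 0xab with a left shift of 8,
   i.e. the form that "movi vN.4s, #0xab, lsl #8" can materialize.  */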
8466 /* Check if immediate shift constants are within range. */
8467 bool
8468 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8470 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8471 if (left)
8472 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8473 else
8474 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
8477 /* Return true if X is a uniform vector where all elements
8478 are either the floating-point constant 0.0 or the
8479 integer constant 0. */
8480 bool
8481 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8483 return x == CONST0_RTX (mode);
8486 bool
8487 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8489 HOST_WIDE_INT imm = INTVAL (x);
8490 int i;
8492 for (i = 0; i < 8; i++)
8494 unsigned int byte = imm & 0xff;
8495 if (byte != 0xff && byte != 0)
8496 return false;
8497 imm >>= 8;
8500 return true;
8503 bool
8504 aarch64_mov_operand_p (rtx x,
8505 enum aarch64_symbol_context context,
8506 machine_mode mode)
8508 if (GET_CODE (x) == HIGH
8509 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8510 return true;
8512 if (CONST_INT_P (x))
8513 return true;
8515 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8516 return true;
8518 return aarch64_classify_symbolic_expression (x, context)
8519 == SYMBOL_TINY_ABSOLUTE;
8522 /* Return a CONST_VECTOR in which every element is the const_int VAL. */
8524 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8526 int nunits = GET_MODE_NUNITS (mode);
8527 rtvec v = rtvec_alloc (nunits);
8528 int i;
8530 for (i=0; i < nunits; i++)
8531 RTVEC_ELT (v, i) = GEN_INT (val);
8533 return gen_rtx_CONST_VECTOR (mode, v);
8536 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8538 bool
8539 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8541 machine_mode vmode;
8543 gcc_assert (!VECTOR_MODE_P (mode));
8544 vmode = aarch64_preferred_simd_mode (mode);
8545 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8546 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8549 /* Construct and return a PARALLEL RTX vector with elements numbering the
8550 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8551 the vector - from the perspective of the architecture. This does not
8552 line up with GCC's perspective on lane numbers, so we end up with
8553 different masks depending on our target endian-ness. The diagram
8554 below may help. We must draw the distinction when building masks
8555 which select one half of the vector. An instruction selecting
8556 architectural low-lanes for a big-endian target must be described using
8557 a mask selecting GCC high-lanes.
8559 Big-Endian Little-Endian
8561 GCC 0 1 2 3 3 2 1 0
8562 | x | x | x | x | | x | x | x | x |
8563 Architecture 3 2 1 0 3 2 1 0
8565 Low Mask: { 2, 3 } { 0, 1 }
8566 High Mask: { 0, 1 } { 2, 3 }
8570 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8572 int nunits = GET_MODE_NUNITS (mode);
8573 rtvec v = rtvec_alloc (nunits / 2);
8574 int high_base = nunits / 2;
8575 int low_base = 0;
8576 int base;
8577 rtx t1;
8578 int i;
8580 if (BYTES_BIG_ENDIAN)
8581 base = high ? low_base : high_base;
8582 else
8583 base = high ? high_base : low_base;
8585 for (i = 0; i < nunits / 2; i++)
8586 RTVEC_ELT (v, i) = GEN_INT (base + i);
8588 t1 = gen_rtx_PARALLEL (mode, v);
8589 return t1;
8592 /* Check OP for validity as a PARALLEL RTX vector with elements
8593 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
8594 from the perspective of the architecture. See the diagram above
8595 aarch64_simd_vect_par_cnst_half for more details. */
8597 bool
8598 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8599 bool high)
8601 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8602 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8603 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8604 int i = 0;
8606 if (!VECTOR_MODE_P (mode))
8607 return false;
8609 if (count_op != count_ideal)
8610 return false;
8612 for (i = 0; i < count_ideal; i++)
8614 rtx elt_op = XVECEXP (op, 0, i);
8615 rtx elt_ideal = XVECEXP (ideal, 0, i);
8617 if (!CONST_INT_P (elt_op)
8618 || INTVAL (elt_ideal) != INTVAL (elt_op))
8619 return false;
8621 return true;
8624 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8625 HIGH (exclusive). */
8626 void
8627 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8628 const_tree exp)
8630 HOST_WIDE_INT lane;
8631 gcc_assert (CONST_INT_P (operand));
8632 lane = INTVAL (operand);
8634 if (lane < low || lane >= high)
8636 if (exp)
8637 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8638 else
8639 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8643 /* Return TRUE if OP is a valid vector addressing mode. */
8644 bool
8645 aarch64_simd_mem_operand_p (rtx op)
8647 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8648 || REG_P (XEXP (op, 0)));
8651 /* Emit a register copy from operand to operand, taking care not to
8652 early-clobber source registers in the process.
8654 COUNT is the number of components into which the copy needs to be
8655 decomposed. */
8656 void
8657 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8658 unsigned int count)
8660 unsigned int i;
8661 int rdest = REGNO (operands[0]);
8662 int rsrc = REGNO (operands[1]);
8664 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8665 || rdest < rsrc)
8666 for (i = 0; i < count; i++)
8667 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8668 gen_rtx_REG (mode, rsrc + i));
8669 else
8670 for (i = 0; i < count; i++)
8671 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8672 gen_rtx_REG (mode, rsrc + count - i - 1));
8675 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8676 one of VSTRUCT modes: OI, CI or XI. */
8678 aarch64_simd_attr_length_move (rtx_insn *insn)
8680 machine_mode mode;
8682 extract_insn_cached (insn);
8684 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8686 mode = GET_MODE (recog_data.operand[0]);
8687 switch (mode)
8689 case OImode:
8690 return 8;
8691 case CImode:
8692 return 12;
8693 case XImode:
8694 return 16;
8695 default:
8696 gcc_unreachable ();
8699 return 4;
8702 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8703 one of VSTRUCT modes: OI, CI, EI, or XI. */
8705 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8707 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8710 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8711 alignment of a vector to 128 bits. */
8712 static HOST_WIDE_INT
8713 aarch64_simd_vector_alignment (const_tree type)
8715 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8716 return MIN (align, 128);
8719 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8720 static bool
8721 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8723 if (is_packed)
8724 return false;
8726 /* We guarantee alignment for vectors up to 128-bits. */
8727 if (tree_int_cst_compare (TYPE_SIZE (type),
8728 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8729 return false;
8731 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8732 return true;
8735 /* If VALS is a vector constant that can be loaded into a register
8736 using DUP, generate instructions to do so and return an RTX to
8737 assign to the register. Otherwise return NULL_RTX. */
8738 static rtx
8739 aarch64_simd_dup_constant (rtx vals)
8741 machine_mode mode = GET_MODE (vals);
8742 machine_mode inner_mode = GET_MODE_INNER (mode);
8743 int n_elts = GET_MODE_NUNITS (mode);
8744 bool all_same = true;
8745 rtx x;
8746 int i;
8748 if (GET_CODE (vals) != CONST_VECTOR)
8749 return NULL_RTX;
8751 for (i = 1; i < n_elts; ++i)
8753 x = CONST_VECTOR_ELT (vals, i);
8754 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8755 all_same = false;
8758 if (!all_same)
8759 return NULL_RTX;
8761 /* We can load this constant by using DUP and a constant in a
8762 single ARM register. This will be cheaper than a vector
8763 load. */
8764 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8765 return gen_rtx_VEC_DUPLICATE (mode, x);
8769 /* Generate code to load VALS, which is a PARALLEL containing only
8770 constants (for vec_init) or CONST_VECTOR, efficiently into a
8771 register. Returns an RTX to copy into the register, or NULL_RTX
8772 for a PARALLEL that can not be converted into a CONST_VECTOR. */
8773 static rtx
8774 aarch64_simd_make_constant (rtx vals)
8776 machine_mode mode = GET_MODE (vals);
8777 rtx const_dup;
8778 rtx const_vec = NULL_RTX;
8779 int n_elts = GET_MODE_NUNITS (mode);
8780 int n_const = 0;
8781 int i;
8783 if (GET_CODE (vals) == CONST_VECTOR)
8784 const_vec = vals;
8785 else if (GET_CODE (vals) == PARALLEL)
8787 /* A CONST_VECTOR must contain only CONST_INTs and
8788 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8789 Only store valid constants in a CONST_VECTOR. */
8790 for (i = 0; i < n_elts; ++i)
8792 rtx x = XVECEXP (vals, 0, i);
8793 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8794 n_const++;
8796 if (n_const == n_elts)
8797 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8799 else
8800 gcc_unreachable ();
8802 if (const_vec != NULL_RTX
8803 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8804 /* Load using MOVI/MVNI. */
8805 return const_vec;
8806 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8807 /* Loaded using DUP. */
8808 return const_dup;
8809 else if (const_vec != NULL_RTX)
8810 /* Load from constant pool. We can not take advantage of single-cycle
8811 LD1 because we need a PC-relative addressing mode. */
8812 return const_vec;
8813 else
8814 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8815 We can not construct an initializer. */
8816 return NULL_RTX;
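/* Expand the vector initializer VALS into TARGET: use a constant move when
all elements are constant, a DUP when all elements are identical, otherwise
insert the variable lanes into a constant vector, or fall back to building
the vector in a stack temporary and loading it. */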
8819 void
8820 aarch64_expand_vector_init (rtx target, rtx vals)
8822 machine_mode mode = GET_MODE (target);
8823 machine_mode inner_mode = GET_MODE_INNER (mode);
8824 int n_elts = GET_MODE_NUNITS (mode);
8825 int n_var = 0;
8826 rtx any_const = NULL_RTX;
8827 bool all_same = true;
8829 for (int i = 0; i < n_elts; ++i)
8831 rtx x = XVECEXP (vals, 0, i);
8832 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8833 ++n_var;
8834 else
8835 any_const = x;
8837 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8838 all_same = false;
8841 if (n_var == 0)
8843 rtx constant = aarch64_simd_make_constant (vals);
8844 if (constant != NULL_RTX)
8846 emit_move_insn (target, constant);
8847 return;
8851 /* Splat a single non-constant element if we can. */
8852 if (all_same)
8854 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8855 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8856 return;
8859 /* Half the fields (or less) are non-constant. Load constant then overwrite
8860 varying fields. Hope that this is more efficient than using the stack. */
8861 if (n_var <= n_elts/2)
8863 rtx copy = copy_rtx (vals);
8865 /* Load constant part of vector. We really don't care what goes into the
8866 parts we will overwrite, but we're more likely to be able to load the
8867 constant efficiently if it has fewer, larger, repeating parts
8868 (see aarch64_simd_valid_immediate). */
8869 for (int i = 0; i < n_elts; i++)
8871 rtx x = XVECEXP (vals, 0, i);
8872 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8873 continue;
8874 rtx subst = any_const;
8875 for (int bit = n_elts / 2; bit > 0; bit /= 2)
8877 /* Look in the copied vector, as more elements are const. */
8878 rtx test = XVECEXP (copy, 0, i ^ bit);
8879 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
8881 subst = test;
8882 break;
8885 XVECEXP (copy, 0, i) = subst;
8887 aarch64_expand_vector_init (target, copy);
8889 /* Insert variables. */
8890 enum insn_code icode = optab_handler (vec_set_optab, mode);
8891 gcc_assert (icode != CODE_FOR_nothing);
8893 for (int i = 0; i < n_elts; i++)
8895 rtx x = XVECEXP (vals, 0, i);
8896 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8897 continue;
8898 x = copy_to_mode_reg (inner_mode, x);
8899 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
8901 return;
8904 /* Construct the vector in memory one field at a time
8905 and load the whole vector. */
8906 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8907 for (int i = 0; i < n_elts; i++)
8908 emit_move_insn (adjust_address_nv (mem, inner_mode,
8909 i * GET_MODE_SIZE (inner_mode)),
8910 XVECEXP (vals, 0, i));
8911 emit_move_insn (target, mem);
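/* Implement TARGET_SHIFT_TRUNCATION_MASK: shift counts are not implicitly
truncated for vector (or vector structure) modes, so return 0 for those
and GET_MODE_BITSIZE (MODE) - 1 otherwise. */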
8915 static unsigned HOST_WIDE_INT
8916 aarch64_shift_truncation_mask (machine_mode mode)
8918 return
8919 (aarch64_vector_mode_supported_p (mode)
8920 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
8923 #ifndef TLS_SECTION_ASM_FLAG
8924 #define TLS_SECTION_ASM_FLAG 'T'
8925 #endif
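/* Output the assembly directive that switches to section NAME with the given
FLAGS, emitting the abbreviated form when the section has already been
declared and the full form (including any COMDAT group) otherwise. */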
8927 void
8928 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8929 tree decl ATTRIBUTE_UNUSED)
8931 char flagchars[10], *f = flagchars;
8933 /* If we have already declared this section, we can use an
8934 abbreviated form to switch back to it -- unless this section is
8935 part of a COMDAT group, in which case GAS requires the full
8936 declaration every time. */
8937 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8938 && (flags & SECTION_DECLARED))
8940 fprintf (asm_out_file, "\t.section\t%s\n", name);
8941 return;
8944 if (!(flags & SECTION_DEBUG))
8945 *f++ = 'a';
8946 if (flags & SECTION_WRITE)
8947 *f++ = 'w';
8948 if (flags & SECTION_CODE)
8949 *f++ = 'x';
8950 if (flags & SECTION_SMALL)
8951 *f++ = 's';
8952 if (flags & SECTION_MERGE)
8953 *f++ = 'M';
8954 if (flags & SECTION_STRINGS)
8955 *f++ = 'S';
8956 if (flags & SECTION_TLS)
8957 *f++ = TLS_SECTION_ASM_FLAG;
8958 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8959 *f++ = 'G';
8960 *f = '\0';
8962 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
8964 if (!(flags & SECTION_NOTYPE))
8966 const char *type;
8967 const char *format;
8969 if (flags & SECTION_BSS)
8970 type = "nobits";
8971 else
8972 type = "progbits";
8974 #ifdef TYPE_OPERAND_FMT
8975 format = "," TYPE_OPERAND_FMT;
8976 #else
8977 format = ",@%s";
8978 #endif
8980 fprintf (asm_out_file, format, type);
8982 if (flags & SECTION_ENTSIZE)
8983 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
8984 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
8986 if (TREE_CODE (decl) == IDENTIFIER_NODE)
8987 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
8988 else
8989 fprintf (asm_out_file, ",%s,comdat",
8990 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
8994 putc ('\n', asm_out_file);
8997 /* Select a format to encode pointers in exception handling data. */
8999 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9001 int type;
9002 switch (aarch64_cmodel)
9004 case AARCH64_CMODEL_TINY:
9005 case AARCH64_CMODEL_TINY_PIC:
9006 case AARCH64_CMODEL_SMALL:
9007 case AARCH64_CMODEL_SMALL_PIC:
9008 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9009 for everything. */
9010 type = DW_EH_PE_sdata4;
9011 break;
9012 default:
9013 /* No assumptions here. 8-byte relocs required. */
9014 type = DW_EH_PE_sdata8;
9015 break;
9017 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9020 /* Emit load exclusive. */
9022 static void
9023 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9024 rtx mem, rtx model_rtx)
9026 rtx (*gen) (rtx, rtx, rtx);
9028 switch (mode)
9030 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9031 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9032 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9033 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9034 default:
9035 gcc_unreachable ();
9038 emit_insn (gen (rval, mem, model_rtx));
9041 /* Emit store exclusive. */
9043 static void
9044 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9045 rtx rval, rtx mem, rtx model_rtx)
9047 rtx (*gen) (rtx, rtx, rtx, rtx);
9049 switch (mode)
9051 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9052 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9053 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9054 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9055 default:
9056 gcc_unreachable ();
9059 emit_insn (gen (bval, rval, mem, model_rtx));
9062 /* Mark the previous jump instruction as unlikely. */
9064 static void
9065 aarch64_emit_unlikely_jump (rtx insn)
9067 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9069 insn = emit_jump_insn (insn);
9070 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9073 /* Expand a compare and swap pattern. */
9075 void
9076 aarch64_expand_compare_and_swap (rtx operands[])
9078 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9079 machine_mode mode, cmp_mode;
9080 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9082 bval = operands[0];
9083 rval = operands[1];
9084 mem = operands[2];
9085 oldval = operands[3];
9086 newval = operands[4];
9087 is_weak = operands[5];
9088 mod_s = operands[6];
9089 mod_f = operands[7];
9090 mode = GET_MODE (mem);
9091 cmp_mode = mode;
9093 /* Normally the succ memory model must be stronger than fail, but in the
9094 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9095 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9097 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9098 && INTVAL (mod_s) == MEMMODEL_RELEASE)
9099 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9101 switch (mode)
9103 case QImode:
9104 case HImode:
9105 /* For short modes, we're going to perform the comparison in SImode,
9106 so do the zero-extension now. */
9107 cmp_mode = SImode;
9108 rval = gen_reg_rtx (SImode);
9109 oldval = convert_modes (SImode, mode, oldval, true);
9110 /* Fall through. */
9112 case SImode:
9113 case DImode:
9114 /* Force the value into a register if needed. */
9115 if (!aarch64_plus_operand (oldval, mode))
9116 oldval = force_reg (cmp_mode, oldval);
9117 break;
9119 default:
9120 gcc_unreachable ();
9123 switch (mode)
9125 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9126 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9127 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9128 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9129 default:
9130 gcc_unreachable ();
9133 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9135 if (mode == QImode || mode == HImode)
9136 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9138 x = gen_rtx_REG (CCmode, CC_REGNUM);
9139 x = gen_rtx_EQ (SImode, x, const0_rtx);
9140 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
9143 /* Split a compare and swap pattern. */
9145 void
9146 aarch64_split_compare_and_swap (rtx operands[])
9148 rtx rval, mem, oldval, newval, scratch;
9149 machine_mode mode;
9150 bool is_weak;
9151 rtx_code_label *label1, *label2;
9152 rtx x, cond;
9154 rval = operands[0];
9155 mem = operands[1];
9156 oldval = operands[2];
9157 newval = operands[3];
9158 is_weak = (operands[4] != const0_rtx);
9159 scratch = operands[7];
9160 mode = GET_MODE (mem);
9162 label1 = NULL;
9163 if (!is_weak)
9165 label1 = gen_label_rtx ();
9166 emit_label (label1);
9168 label2 = gen_label_rtx ();
9170 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9172 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9173 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9174 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9175 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9176 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9178 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9180 if (!is_weak)
9182 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9183 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9184 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9185 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9187 else
9189 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9190 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9191 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9194 emit_label (label2);
9197 /* Split an atomic operation. */
9199 void
9200 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9201 rtx value, rtx model_rtx, rtx cond)
9203 machine_mode mode = GET_MODE (mem);
9204 machine_mode wmode = (mode == DImode ? DImode : SImode);
9205 rtx_code_label *label;
9206 rtx x;
9208 label = gen_label_rtx ();
9209 emit_label (label);
9211 if (new_out)
9212 new_out = gen_lowpart (wmode, new_out);
9213 if (old_out)
9214 old_out = gen_lowpart (wmode, old_out);
9215 else
9216 old_out = new_out;
9217 value = simplify_gen_subreg (wmode, value, mode, 0);
9219 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9221 switch (code)
9223 case SET:
9224 new_out = value;
9225 break;
9227 case NOT:
9228 x = gen_rtx_AND (wmode, old_out, value);
9229 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9230 x = gen_rtx_NOT (wmode, new_out);
9231 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9232 break;
9234 case MINUS:
9235 if (CONST_INT_P (value))
9237 value = GEN_INT (-INTVAL (value));
9238 code = PLUS;
9240 /* Fall through. */
9242 default:
9243 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9244 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9245 break;
9248 aarch64_emit_store_exclusive (mode, cond, mem,
9249 gen_lowpart (mode, new_out), model_rtx);
9251 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9252 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9253 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9254 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
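/* Print a "+<extension>" suffix for every architecture extension enabled in
aarch64_isa_flags, followed by a newline. */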
9257 static void
9258 aarch64_print_extension (void)
9260 const struct aarch64_option_extension *opt = NULL;
9262 for (opt = all_extensions; opt->name != NULL; opt++)
9263 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9264 asm_fprintf (asm_out_file, "+%s", opt->name);
9266 asm_fprintf (asm_out_file, "\n");
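/* Emit a .arch or .cpu directive, with any enabled extensions, at the start
of the assembly file, then do the default file-start processing. */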
9269 static void
9270 aarch64_start_file (void)
9272 if (selected_arch)
9274 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9275 aarch64_print_extension ();
9277 else if (selected_cpu)
9279 const char *truncated_name
9280 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9281 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9282 aarch64_print_extension ();
9284 default_file_start();
9287 /* Target hook for c_mode_for_suffix. */
9288 static machine_mode
9289 aarch64_c_mode_for_suffix (char suffix)
9291 if (suffix == 'q')
9292 return TFmode;
9294 return VOIDmode;
9297 /* We can only represent floating point constants which will fit in
9298 "quarter-precision" values. These values are characterised by
9299 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9302 (-1)^s * (n/16) * 2^r
9304 Where:
9305 's' is the sign bit.
9306 'n' is an integer in the range 16 <= n <= 31.
9307 'r' is an integer in the range -3 <= r <= 4. */
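/* For example, 1.5 = (-1)^0 * (24/16) * 2^0 is representable, with
s = 0, n = 24, r = 0. */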
9309 /* Return true iff X can be represented as a quarter-precision
9310 floating point immediate operand. Note, we cannot represent 0.0. */
9311 bool
9312 aarch64_float_const_representable_p (rtx x)
9314 /* This represents our current view of how many bits
9315 make up the mantissa. */
9316 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9317 int exponent;
9318 unsigned HOST_WIDE_INT mantissa, mask;
9319 REAL_VALUE_TYPE r, m;
9320 bool fail;
9322 if (!CONST_DOUBLE_P (x))
9323 return false;
9325 if (GET_MODE (x) == VOIDmode)
9326 return false;
9328 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9330 /* We cannot represent infinities, NaNs or +/-zero. We won't
9331 know if we have +zero until we analyse the mantissa, but we
9332 can reject the other invalid values. */
9333 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9334 || REAL_VALUE_MINUS_ZERO (r))
9335 return false;
9337 /* Extract exponent. */
9338 r = real_value_abs (&r);
9339 exponent = REAL_EXP (&r);
9341 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9342 highest (sign) bit, with a fixed binary point at bit point_pos.
9343 The low element of the wide_int W below holds the low part of the mantissa, the high element the high part.
9344 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9345 bits for the mantissa, this can fail (low bits will be lost). */
9346 real_ldexp (&m, &r, point_pos - exponent);
9347 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9349 /* If the low part of the mantissa has bits set we cannot represent
9350 the value. */
9351 if (w.elt (0) != 0)
9352 return false;
9353 /* We have rejected the lower HOST_WIDE_INT, so update our
9354 understanding of how many bits lie in the mantissa and
9355 look only at the high HOST_WIDE_INT. */
9356 mantissa = w.elt (1);
9357 point_pos -= HOST_BITS_PER_WIDE_INT;
9359 /* We can only represent values with a mantissa of the form 1.xxxx. */
9360 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9361 if ((mantissa & mask) != 0)
9362 return false;
9364 /* Having filtered unrepresentable values, we may now remove all
9365 but the highest 5 bits. */
9366 mantissa >>= point_pos - 5;
9368 /* We cannot represent the value 0.0, so reject it. This is handled
9369 elsewhere. */
9370 if (mantissa == 0)
9371 return false;
9373 /* Then, as bit 4 is always set, we can mask it off, leaving
9374 the mantissa in the range [0, 15]. */
9375 mantissa &= ~(1 << 4);
9376 gcc_assert (mantissa <= 15);
9378 /* GCC internally does not use IEEE754-like encoding (where normalized
9379 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
9380 Our mantissa values are shifted 4 places to the left relative to
9381 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9382 by 5 places to correct for GCC's representation. */
9383 exponent = 5 - exponent;
9385 return (exponent >= 0 && exponent <= 7);
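/* Return the asm template for moving the immediate CONST_VECTOR into an
AdvSIMD register of WIDTH bits: FMOV for non-zero floating-point constants,
otherwise MOVI or MVNI with an optional LSL or MSL shift. */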
9388 char*
9389 aarch64_output_simd_mov_immediate (rtx const_vector,
9390 machine_mode mode,
9391 unsigned width)
9393 bool is_valid;
9394 static char templ[40];
9395 const char *mnemonic;
9396 const char *shift_op;
9397 unsigned int lane_count = 0;
9398 char element_char;
9400 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9402 /* This will return true to show const_vector is legal for use as either
9403 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9404 also update INFO to show how the immediate should be generated. */
9405 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9406 gcc_assert (is_valid);
9408 element_char = sizetochar (info.element_width);
9409 lane_count = width / info.element_width;
9411 mode = GET_MODE_INNER (mode);
9412 if (mode == SFmode || mode == DFmode)
9414 gcc_assert (info.shift == 0 && ! info.mvn);
9415 if (aarch64_float_const_zero_rtx_p (info.value))
9416 info.value = GEN_INT (0);
9417 else
9419 #define buf_size 20
9420 REAL_VALUE_TYPE r;
9421 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9422 char float_buf[buf_size] = {'\0'};
9423 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9424 #undef buf_size
9426 if (lane_count == 1)
9427 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9428 else
9429 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9430 lane_count, element_char, float_buf);
9431 return templ;
9435 mnemonic = info.mvn ? "mvni" : "movi";
9436 shift_op = info.msl ? "msl" : "lsl";
9438 if (lane_count == 1)
9439 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9440 mnemonic, UINTVAL (info.value));
9441 else if (info.shift)
9442 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9443 ", %s %d", mnemonic, lane_count, element_char,
9444 UINTVAL (info.value), shift_op, info.shift);
9445 else
9446 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9447 mnemonic, lane_count, element_char, UINTVAL (info.value));
9448 return templ;
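/* As above, but for a scalar IMMEDIATE of MODE: duplicate it across a 64-bit
vector constant and output the vector form of the move. */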
9451 char*
9452 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9453 machine_mode mode)
9455 machine_mode vmode;
9457 gcc_assert (!VECTOR_MODE_P (mode));
9458 vmode = aarch64_simd_container_mode (mode, 64);
9459 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9460 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9463 /* Split operands into moves from op[1] + op[2] into op[0]. */
9465 void
9466 aarch64_split_combinev16qi (rtx operands[3])
9468 unsigned int dest = REGNO (operands[0]);
9469 unsigned int src1 = REGNO (operands[1]);
9470 unsigned int src2 = REGNO (operands[2]);
9471 machine_mode halfmode = GET_MODE (operands[1]);
9472 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9473 rtx destlo, desthi;
9475 gcc_assert (halfmode == V16QImode);
9477 if (src1 == dest && src2 == dest + halfregs)
9479 /* No-op move. Can't split to nothing; emit something. */
9480 emit_note (NOTE_INSN_DELETED);
9481 return;
9484 /* Preserve register attributes for variable tracking. */
9485 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9486 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9487 GET_MODE_SIZE (halfmode));
9489 /* Special case of reversed high/low parts. */
9490 if (reg_overlap_mentioned_p (operands[2], destlo)
9491 && reg_overlap_mentioned_p (operands[1], desthi))
9493 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9494 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9495 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9497 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9499 /* Try to avoid unnecessary moves if part of the result
9500 is in the right place already. */
9501 if (src1 != dest)
9502 emit_move_insn (destlo, operands[1]);
9503 if (src2 != dest + halfregs)
9504 emit_move_insn (desthi, operands[2]);
9506 else
9508 if (src2 != dest + halfregs)
9509 emit_move_insn (desthi, operands[2]);
9510 if (src1 != dest)
9511 emit_move_insn (destlo, operands[1]);
9515 /* vec_perm support. */
9517 #define MAX_VECT_LEN 16
9519 struct expand_vec_perm_d
9521 rtx target, op0, op1;
9522 unsigned char perm[MAX_VECT_LEN];
9523 machine_mode vmode;
9524 unsigned char nelt;
9525 bool one_vector_p;
9526 bool testing_p;
9529 /* Generate a variable permutation. */
9531 static void
9532 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9534 machine_mode vmode = GET_MODE (target);
9535 bool one_vector_p = rtx_equal_p (op0, op1);
9537 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9538 gcc_checking_assert (GET_MODE (op0) == vmode);
9539 gcc_checking_assert (GET_MODE (op1) == vmode);
9540 gcc_checking_assert (GET_MODE (sel) == vmode);
9541 gcc_checking_assert (TARGET_SIMD);
9543 if (one_vector_p)
9545 if (vmode == V8QImode)
9547 /* Expand the argument to a V16QI mode by duplicating it. */
9548 rtx pair = gen_reg_rtx (V16QImode);
9549 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9550 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9552 else
9554 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9557 else
9559 rtx pair;
9561 if (vmode == V8QImode)
9563 pair = gen_reg_rtx (V16QImode);
9564 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9565 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9567 else
9569 pair = gen_reg_rtx (OImode);
9570 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9571 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9576 void
9577 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9579 machine_mode vmode = GET_MODE (target);
9580 unsigned int nelt = GET_MODE_NUNITS (vmode);
9581 bool one_vector_p = rtx_equal_p (op0, op1);
9582 rtx mask;
9584 /* The TBL instruction does not use a modulo index, so we must take care
9585 of that ourselves. */
9586 mask = aarch64_simd_gen_const_vector_dup (vmode,
9587 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9588 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9590 /* For big-endian, we also need to reverse the index within the vector
9591 (but not which vector). */
9592 if (BYTES_BIG_ENDIAN)
9594 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9595 if (!one_vector_p)
9596 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9597 sel = expand_simple_binop (vmode, XOR, sel, mask,
9598 NULL, 0, OPTAB_LIB_WIDEN);
9600 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
9603 /* Recognize patterns suitable for the TRN instructions. */
9604 static bool
9605 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9607 unsigned int i, odd, mask, nelt = d->nelt;
9608 rtx out, in0, in1, x;
9609 rtx (*gen) (rtx, rtx, rtx);
9610 machine_mode vmode = d->vmode;
9612 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9613 return false;
9615 /* Note that these are little-endian tests.
9616 We correct for big-endian later. */
9617 if (d->perm[0] == 0)
9618 odd = 0;
9619 else if (d->perm[0] == 1)
9620 odd = 1;
9621 else
9622 return false;
9623 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9625 for (i = 0; i < nelt; i += 2)
9627 if (d->perm[i] != i + odd)
9628 return false;
9629 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9630 return false;
9633 /* Success! */
9634 if (d->testing_p)
9635 return true;
9637 in0 = d->op0;
9638 in1 = d->op1;
9639 if (BYTES_BIG_ENDIAN)
9641 x = in0, in0 = in1, in1 = x;
9642 odd = !odd;
9644 out = d->target;
9646 if (odd)
9648 switch (vmode)
9650 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9651 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9652 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9653 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9654 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9655 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9656 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9657 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9658 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9659 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9660 default:
9661 return false;
9664 else
9666 switch (vmode)
9668 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9669 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9670 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9671 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9672 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9673 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9674 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9675 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9676 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9677 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9678 default:
9679 return false;
9683 emit_insn (gen (out, in0, in1));
9684 return true;
9687 /* Recognize patterns suitable for the UZP instructions. */
9688 static bool
9689 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9691 unsigned int i, odd, mask, nelt = d->nelt;
9692 rtx out, in0, in1, x;
9693 rtx (*gen) (rtx, rtx, rtx);
9694 machine_mode vmode = d->vmode;
9696 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9697 return false;
9699 /* Note that these are little-endian tests.
9700 We correct for big-endian later. */
9701 if (d->perm[0] == 0)
9702 odd = 0;
9703 else if (d->perm[0] == 1)
9704 odd = 1;
9705 else
9706 return false;
9707 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9709 for (i = 0; i < nelt; i++)
9711 unsigned elt = (i * 2 + odd) & mask;
9712 if (d->perm[i] != elt)
9713 return false;
9716 /* Success! */
9717 if (d->testing_p)
9718 return true;
9720 in0 = d->op0;
9721 in1 = d->op1;
9722 if (BYTES_BIG_ENDIAN)
9724 x = in0, in0 = in1, in1 = x;
9725 odd = !odd;
9727 out = d->target;
9729 if (odd)
9731 switch (vmode)
9733 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9734 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9735 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9736 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9737 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9738 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9739 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9740 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9741 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9742 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9743 default:
9744 return false;
9747 else
9749 switch (vmode)
9751 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9752 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9753 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9754 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9755 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9756 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9757 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9758 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9759 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9760 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9761 default:
9762 return false;
9766 emit_insn (gen (out, in0, in1));
9767 return true;
9770 /* Recognize patterns suitable for the ZIP instructions. */
9771 static bool
9772 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9774 unsigned int i, high, mask, nelt = d->nelt;
9775 rtx out, in0, in1, x;
9776 rtx (*gen) (rtx, rtx, rtx);
9777 machine_mode vmode = d->vmode;
9779 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9780 return false;
9782 /* Note that these are little-endian tests.
9783 We correct for big-endian later. */
9784 high = nelt / 2;
9785 if (d->perm[0] == high)
9786 /* Do Nothing. */
9788 else if (d->perm[0] == 0)
9789 high = 0;
9790 else
9791 return false;
9792 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9794 for (i = 0; i < nelt / 2; i++)
9796 unsigned elt = (i + high) & mask;
9797 if (d->perm[i * 2] != elt)
9798 return false;
9799 elt = (elt + nelt) & mask;
9800 if (d->perm[i * 2 + 1] != elt)
9801 return false;
9804 /* Success! */
9805 if (d->testing_p)
9806 return true;
9808 in0 = d->op0;
9809 in1 = d->op1;
9810 if (BYTES_BIG_ENDIAN)
9812 x = in0, in0 = in1, in1 = x;
9813 high = !high;
9815 out = d->target;
9817 if (high)
9819 switch (vmode)
9821 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9822 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9823 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9824 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9825 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9826 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9827 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9828 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9829 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9830 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9831 default:
9832 return false;
9835 else
9837 switch (vmode)
9839 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9840 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9841 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9842 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9843 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9844 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9845 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9846 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9847 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9848 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9849 default:
9850 return false;
9854 emit_insn (gen (out, in0, in1));
9855 return true;
9858 /* Recognize patterns for the EXT insn. */
9860 static bool
9861 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9863 unsigned int i, nelt = d->nelt;
9864 rtx (*gen) (rtx, rtx, rtx, rtx);
9865 rtx offset;
9867 unsigned int location = d->perm[0]; /* Always < nelt. */
9869 /* Check if the extracted indices are increasing by one. */
9870 for (i = 1; i < nelt; i++)
9872 unsigned int required = location + i;
9873 if (d->one_vector_p)
9875 /* We'll pass the same vector in twice, so allow indices to wrap. */
9876 required &= (nelt - 1);
9878 if (d->perm[i] != required)
9879 return false;
9882 switch (d->vmode)
9884 case V16QImode: gen = gen_aarch64_extv16qi; break;
9885 case V8QImode: gen = gen_aarch64_extv8qi; break;
9886 case V4HImode: gen = gen_aarch64_extv4hi; break;
9887 case V8HImode: gen = gen_aarch64_extv8hi; break;
9888 case V2SImode: gen = gen_aarch64_extv2si; break;
9889 case V4SImode: gen = gen_aarch64_extv4si; break;
9890 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9891 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9892 case V2DImode: gen = gen_aarch64_extv2di; break;
9893 case V2DFmode: gen = gen_aarch64_extv2df; break;
9894 default:
9895 return false;
9898 /* Success! */
9899 if (d->testing_p)
9900 return true;
9902 /* The case where (location == 0) is a no-op for both big- and little-endian,
9903 and is removed by the mid-end at optimization levels -O1 and higher. */
9905 if (BYTES_BIG_ENDIAN && (location != 0))
9907 /* After setup, we want the high elements of the first vector (stored
9908 at the LSB end of the register), and the low elements of the second
9909 vector (stored at the MSB end of the register). So swap. */
9910 std::swap (d->op0, d->op1);
9911 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9912 location = nelt - location;
9915 offset = GEN_INT (location);
9916 emit_insn (gen (d->target, d->op0, d->op1, offset));
9917 return true;
9920 /* Recognize patterns for the REV insns. */
9922 static bool
9923 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9925 unsigned int i, j, diff, nelt = d->nelt;
9926 rtx (*gen) (rtx, rtx);
9928 if (!d->one_vector_p)
9929 return false;
9931 diff = d->perm[0];
9932 switch (diff)
9934 case 7:
9935 switch (d->vmode)
9937 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
9938 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
9939 default:
9940 return false;
9942 break;
9943 case 3:
9944 switch (d->vmode)
9946 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
9947 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
9948 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
9949 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
9950 default:
9951 return false;
9953 break;
9954 case 1:
9955 switch (d->vmode)
9957 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
9958 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
9959 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
9960 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
9961 case V4SImode: gen = gen_aarch64_rev64v4si; break;
9962 case V2SImode: gen = gen_aarch64_rev64v2si; break;
9963 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
9964 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
9965 default:
9966 return false;
9968 break;
9969 default:
9970 return false;
9973 for (i = 0; i < nelt ; i += diff + 1)
9974 for (j = 0; j <= diff; j += 1)
9976 /* This is guaranteed to be true as the value of diff
9977 is 7, 3, 1 and we should have enough elements in the
9978 queue to generate this. Getting a vector mask with a
9979 value of diff other than these values implies that
9980 something is wrong by the time we get here. */
9981 gcc_assert (i + j < nelt);
9982 if (d->perm[i + j] != i + diff - j)
9983 return false;
9986 /* Success! */
9987 if (d->testing_p)
9988 return true;
9990 emit_insn (gen (d->target, d->op0));
9991 return true;
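/* Recognize patterns suitable for the DUP (element) instruction. */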
9994 static bool
9995 aarch64_evpc_dup (struct expand_vec_perm_d *d)
9997 rtx (*gen) (rtx, rtx, rtx);
9998 rtx out = d->target;
9999 rtx in0;
10000 machine_mode vmode = d->vmode;
10001 unsigned int i, elt, nelt = d->nelt;
10002 rtx lane;
10004 elt = d->perm[0];
10005 for (i = 1; i < nelt; i++)
10007 if (elt != d->perm[i])
10008 return false;
10011 /* The generic preparation in aarch64_expand_vec_perm_const_1
10012 swaps the operand order and the permute indices if it finds
10013 d->perm[0] to be in the second operand. Thus, we can always
10014 use d->op0 and need not do any extra arithmetic to get the
10015 correct lane number. */
10016 in0 = d->op0;
10017 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10019 switch (vmode)
10021 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10022 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10023 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10024 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10025 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10026 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10027 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10028 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10029 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10030 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10031 default:
10032 return false;
10035 emit_insn (gen (out, in0, lane));
10036 return true;
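/* Fall back to a TBL-based permutation: materialize the selector as a
constant vector and let aarch64_expand_vec_perm_1 emit the TBL. */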
10039 static bool
10040 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10042 rtx rperm[MAX_VECT_LEN], sel;
10043 machine_mode vmode = d->vmode;
10044 unsigned int i, nelt = d->nelt;
10046 if (d->testing_p)
10047 return true;
10049 /* Generic code will try constant permutation twice. Once with the
10050 original mode and again with the elements lowered to QImode.
10051 So wait and don't do the selector expansion ourselves. */
10052 if (vmode != V8QImode && vmode != V16QImode)
10053 return false;
10055 for (i = 0; i < nelt; ++i)
10057 int nunits = GET_MODE_NUNITS (vmode);
10059 /* If big-endian and two vectors we end up with a weird mixed-endian
10060 mode on NEON. Reverse the index within each word but not the word
10061 itself. */
10062 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10063 : d->perm[i]);
10065 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10066 sel = force_reg (vmode, sel);
10068 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10069 return true;
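/* Canonicalize the permutation so that it selects from the first operand
where possible, then try each of the expanders above in turn, finishing
with the generic TBL fallback. */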
10072 static bool
10073 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10075 /* The pattern matching functions above are written to look for a small
10076 number to begin the sequence (0, 1, N/2). If we begin with an index
10077 from the second operand, we can swap the operands. */
10078 if (d->perm[0] >= d->nelt)
10080 unsigned i, nelt = d->nelt;
10082 gcc_assert (nelt == (nelt & -nelt));
10083 for (i = 0; i < nelt; ++i)
10084 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10086 std::swap (d->op0, d->op1);
10089 if (TARGET_SIMD)
10091 if (aarch64_evpc_rev (d))
10092 return true;
10093 else if (aarch64_evpc_ext (d))
10094 return true;
10095 else if (aarch64_evpc_dup (d))
10096 return true;
10097 else if (aarch64_evpc_zip (d))
10098 return true;
10099 else if (aarch64_evpc_uzp (d))
10100 return true;
10101 else if (aarch64_evpc_trn (d))
10102 return true;
10103 return aarch64_evpc_tbl (d);
10105 return false;
10108 /* Expand a vec_perm_const pattern. */
10110 bool
10111 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10113 struct expand_vec_perm_d d;
10114 int i, nelt, which;
10116 d.target = target;
10117 d.op0 = op0;
10118 d.op1 = op1;
10120 d.vmode = GET_MODE (target);
10121 gcc_assert (VECTOR_MODE_P (d.vmode));
10122 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10123 d.testing_p = false;
10125 for (i = which = 0; i < nelt; ++i)
10127 rtx e = XVECEXP (sel, 0, i);
10128 int ei = INTVAL (e) & (2 * nelt - 1);
10129 which |= (ei < nelt ? 1 : 2);
10130 d.perm[i] = ei;
10133 switch (which)
10135 default:
10136 gcc_unreachable ();
10138 case 3:
10139 d.one_vector_p = false;
10140 if (!rtx_equal_p (op0, op1))
10141 break;
10143 /* The elements of PERM do not suggest that only the first operand
10144 is used, but both operands are identical. Allow easier matching
10145 of the permutation by folding the permutation into the single
10146 input vector. */
10147 /* Fall Through. */
10148 case 2:
10149 for (i = 0; i < nelt; ++i)
10150 d.perm[i] &= nelt - 1;
10151 d.op0 = op1;
10152 d.one_vector_p = true;
10153 break;
10155 case 1:
10156 d.op1 = op0;
10157 d.one_vector_p = true;
10158 break;
10161 return aarch64_expand_vec_perm_const_1 (&d);
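/* Return true if the constant permutation SEL can be expanded for VMODE,
by running the expanders above on a throw-away instruction sequence. */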
10164 static bool
10165 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10166 const unsigned char *sel)
10168 struct expand_vec_perm_d d;
10169 unsigned int i, nelt, which;
10170 bool ret;
10172 d.vmode = vmode;
10173 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10174 d.testing_p = true;
10175 memcpy (d.perm, sel, nelt);
10177 /* Calculate whether all elements are in one vector. */
10178 for (i = which = 0; i < nelt; ++i)
10180 unsigned char e = d.perm[i];
10181 gcc_assert (e < 2 * nelt);
10182 which |= (e < nelt ? 1 : 2);
10185 /* If all elements are from the second vector, reindex as if from the
10186 first vector. */
10187 if (which == 2)
10188 for (i = 0; i < nelt; ++i)
10189 d.perm[i] -= nelt;
10191 /* Check whether the mask can be applied to a single vector. */
10192 d.one_vector_p = (which != 3);
10194 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10195 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10196 if (!d.one_vector_p)
10197 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10199 start_sequence ();
10200 ret = aarch64_expand_vec_perm_const_1 (&d);
10201 end_sequence ();
10203 return ret;
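/* Return a V16QImode constant selector that reverses the bytes within each
element of a 128-bit vector of MODE (big-endian only). */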
10207 aarch64_reverse_mask (enum machine_mode mode)
10209 /* We have to reverse each vector because we don't have
10210 a permuted load that can reverse-load according to ABI rules. */
10211 rtx mask;
10212 rtvec v = rtvec_alloc (16);
10213 int i, j;
10214 int nunits = GET_MODE_NUNITS (mode);
10215 int usize = GET_MODE_UNIT_SIZE (mode);
10217 gcc_assert (BYTES_BIG_ENDIAN);
10218 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10220 for (i = 0; i < nunits; i++)
10221 for (j = 0; j < usize; j++)
10222 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10223 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10224 return force_reg (V16QImode, mask);
10227 /* Implement MODES_TIEABLE_P. */
10229 bool
10230 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10232 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10233 return true;
10235 /* We specifically want to allow elements of "structure" modes to
10236 be tieable to the structure. This more general condition allows
10237 other rarer situations too. */
10238 if (TARGET_SIMD
10239 && aarch64_vector_mode_p (mode1)
10240 && aarch64_vector_mode_p (mode2))
10241 return true;
10243 return false;
10246 /* Return a new RTX holding the result of moving POINTER forward by
10247 AMOUNT bytes. */
10249 static rtx
10250 aarch64_move_pointer (rtx pointer, int amount)
10252 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10254 return adjust_automodify_address (pointer, GET_MODE (pointer),
10255 next, amount);
10258 /* Return a new RTX holding the result of moving POINTER forward by the
10259 size of the mode it points to. */
10261 static rtx
10262 aarch64_progress_pointer (rtx pointer)
10264 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10266 return aarch64_move_pointer (pointer, amount);
10269 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10270 MODE bytes. */
10272 static void
10273 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10274 machine_mode mode)
10276 rtx reg = gen_reg_rtx (mode);
10278 /* "Cast" the pointers to the correct mode. */
10279 *src = adjust_address (*src, mode, 0);
10280 *dst = adjust_address (*dst, mode, 0);
10281 /* Emit the memcpy. */
10282 emit_move_insn (reg, *src);
10283 emit_move_insn (*dst, reg);
10284 /* Move the pointers forward. */
10285 *src = aarch64_progress_pointer (*src);
10286 *dst = aarch64_progress_pointer (*dst);
10289 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10290 we succeed, otherwise return false. */
10292 bool
10293 aarch64_expand_movmem (rtx *operands)
10295 unsigned int n;
10296 rtx dst = operands[0];
10297 rtx src = operands[1];
10298 rtx base;
10299 bool speed_p = !optimize_function_for_size_p (cfun);
10301 /* When optimizing for size, give a better estimate of the length of a
10302 memcpy call, but use the default otherwise. */
10303 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10305 /* We can't do anything smart if the amount to copy is not constant. */
10306 if (!CONST_INT_P (operands[2]))
10307 return false;
10309 n = UINTVAL (operands[2]);
10311 /* Try to keep the number of instructions low. For cases below 16 bytes we
10312 need to make at most two moves. For cases above 16 bytes it will be one
10313 move for each 16 byte chunk, then at most two additional moves. */
10314 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10315 return false;
10317 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10318 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10320 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10321 src = adjust_automodify_address (src, VOIDmode, base, 0);
10323 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10324 1-byte chunk. */
10325 if (n < 4)
10327 if (n >= 2)
10329 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10330 n -= 2;
10333 if (n == 1)
10334 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10336 return true;
10339 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10340 4-byte chunk, partially overlapping with the previously copied chunk. */
10341 if (n < 8)
10343 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10344 n -= 4;
10345 if (n > 0)
10347 int move = n - 4;
10349 src = aarch64_move_pointer (src, move);
10350 dst = aarch64_move_pointer (dst, move);
10351 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10353 return true;
10356 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10357 them, then (if applicable) an 8-byte chunk. */
10358 while (n >= 8)
10360 if (n / 16)
10362 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10363 n -= 16;
10365 else
10367 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10368 n -= 8;
10372 /* Finish the final bytes of the copy. We can always do this in one
10373 instruction. We either copy the exact amount we need, or partially
10374 overlap with the previous chunk we copied and copy 8-bytes. */
10375 if (n == 0)
10376 return true;
10377 else if (n == 1)
10378 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10379 else if (n == 2)
10380 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10381 else if (n == 4)
10382 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10383 else
10385 if (n == 3)
10387 src = aarch64_move_pointer (src, -1);
10388 dst = aarch64_move_pointer (dst, -1);
10389 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10391 else
10393 int move = n - 8;
10395 src = aarch64_move_pointer (src, move);
10396 dst = aarch64_move_pointer (dst, move);
10397 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10401 return true;
10404 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10406 static unsigned HOST_WIDE_INT
10407 aarch64_asan_shadow_offset (void)
10409 return (HOST_WIDE_INT_1 << 36);
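/* Target hook for use_by_pieces_infrastructure_p. */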
10412 static bool
10413 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10414 unsigned int align,
10415 enum by_pieces_operation op,
10416 bool speed_p)
10418 /* STORE_BY_PIECES can be used when copying a constant string, but
10419 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10420 For now we always fail this and let the move_by_pieces code copy
10421 the string from read-only memory. */
10422 if (op == STORE_BY_PIECES)
10423 return false;
10425 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
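/* Map the rtx comparison CODE to the CC_D* mode used by the conditional
compare patterns, or return CCmode if CODE is not handled. */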
10428 static enum machine_mode
10429 aarch64_code_to_ccmode (enum rtx_code code)
10431 switch (code)
10433 case NE:
10434 return CC_DNEmode;
10436 case EQ:
10437 return CC_DEQmode;
10439 case LE:
10440 return CC_DLEmode;
10442 case LT:
10443 return CC_DLTmode;
10445 case GE:
10446 return CC_DGEmode;
10448 case GT:
10449 return CC_DGTmode;
10451 case LEU:
10452 return CC_DLEUmode;
10454 case LTU:
10455 return CC_DLTUmode;
10457 case GEU:
10458 return CC_DGEUmode;
10460 case GTU:
10461 return CC_DGTUmode;
10463 default:
10464 return CCmode;
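/* Implement TARGET_GEN_CCMP_FIRST. Expand the first comparison of a
conditional-compare chain; return the CC register holding the result,
or NULL_RTX if the comparison cannot be handled. */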
10468 static rtx
10469 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10470 int code, tree treeop0, tree treeop1)
10472 enum machine_mode op_mode, cmp_mode, cc_mode;
10473 rtx op0, op1, cmp, target;
10474 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10475 enum insn_code icode;
10476 struct expand_operand ops[4];
10478 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10479 if (cc_mode == CCmode)
10480 return NULL_RTX;
10482 start_sequence ();
10483 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10485 op_mode = GET_MODE (op0);
10486 if (op_mode == VOIDmode)
10487 op_mode = GET_MODE (op1);
10489 switch (op_mode)
10491 case QImode:
10492 case HImode:
10493 case SImode:
10494 cmp_mode = SImode;
10495 icode = CODE_FOR_cmpsi;
10496 break;
10498 case DImode:
10499 cmp_mode = DImode;
10500 icode = CODE_FOR_cmpdi;
10501 break;
10503 default:
10504 end_sequence ();
10505 return NULL_RTX;
10508 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10509 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10510 if (!op0 || !op1)
10512 end_sequence ();
10513 return NULL_RTX;
10515 *prep_seq = get_insns ();
10516 end_sequence ();
10518 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10519 target = gen_rtx_REG (CCmode, CC_REGNUM);
10521 create_output_operand (&ops[0], target, CCmode);
10522 create_fixed_operand (&ops[1], cmp);
10523 create_fixed_operand (&ops[2], op0);
10524 create_fixed_operand (&ops[3], op1);
10526 start_sequence ();
10527 if (!maybe_expand_insn (icode, 4, ops))
10529 end_sequence ();
10530 return NULL_RTX;
10532 *gen_seq = get_insns ();
10533 end_sequence ();
10535 return gen_rtx_REG (cc_mode, CC_REGNUM);
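/* Implement TARGET_GEN_CCMP_NEXT. Expand a subsequent conditional compare
that combines PREV (the result so far) with the comparison of TREEOP0 and
TREEOP1, joined by BIT_CODE (AND or IOR). */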
10538 static rtx
10539 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10540 tree treeop0, tree treeop1, int bit_code)
10542 rtx op0, op1, cmp0, cmp1, target;
10543 enum machine_mode op_mode, cmp_mode, cc_mode;
10544 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10545 enum insn_code icode = CODE_FOR_ccmp_andsi;
10546 struct expand_operand ops[6];
10548 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10549 if (cc_mode == CCmode)
10550 return NULL_RTX;
10552 push_to_sequence ((rtx_insn*) *prep_seq);
10553 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10555 op_mode = GET_MODE (op0);
10556 if (op_mode == VOIDmode)
10557 op_mode = GET_MODE (op1);
10559 switch (op_mode)
10561 case QImode:
10562 case HImode:
10563 case SImode:
10564 cmp_mode = SImode;
10565 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10566 : CODE_FOR_ccmp_iorsi;
10567 break;
10569 case DImode:
10570 cmp_mode = DImode;
10571 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10572 : CODE_FOR_ccmp_iordi;
10573 break;
10575 default:
10576 end_sequence ();
10577 return NULL_RTX;
10580 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10581 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10582 if (!op0 || !op1)
10584 end_sequence ();
10585 return NULL_RTX;
10587 *prep_seq = get_insns ();
10588 end_sequence ();
10590 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10591 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10592 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10594 create_fixed_operand (&ops[0], prev);
10595 create_fixed_operand (&ops[1], target);
10596 create_fixed_operand (&ops[2], op0);
10597 create_fixed_operand (&ops[3], op1);
10598 create_fixed_operand (&ops[4], cmp0);
10599 create_fixed_operand (&ops[5], cmp1);
10601 push_to_sequence ((rtx_insn*) *gen_seq);
10602 if (!maybe_expand_insn (icode, 6, ops))
10604 end_sequence ();
10605 return NULL_RTX;
10608 *gen_seq = get_insns ();
10609 end_sequence ();
10611 return target;
10614 #undef TARGET_GEN_CCMP_FIRST
10615 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10617 #undef TARGET_GEN_CCMP_NEXT
10618 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
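/* With the two hooks above, a chained condition such as
   "if (a == 0 && b > 0)" can, for example, be expanded as a CMP followed by
   a CCMP and a single conditional branch, rather than as two separate
   compare-and-branch sequences.  */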
10620 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
10621 instruction fusion of some sort. */
10623 static bool
10624 aarch64_macro_fusion_p (void)
10626 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10630 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10631 should be kept together during scheduling. */
10633 static bool
10634 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10636 rtx set_dest;
10637 rtx prev_set = single_set (prev);
10638 rtx curr_set = single_set (curr);
10639 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10640 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10642 if (!aarch64_macro_fusion_p ())
10643 return false;
10645 if (simple_sets_p
10646 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10648 /* We are trying to match:
10649 prev (mov) == (set (reg r0) (const_int imm16))
10650 curr (movk) == (set (zero_extract (reg r0)
10651 (const_int 16)
10652 (const_int 16))
10653 (const_int imm16_1)) */
10655 set_dest = SET_DEST (curr_set);
10657 if (GET_CODE (set_dest) == ZERO_EXTRACT
10658 && CONST_INT_P (SET_SRC (curr_set))
10659 && CONST_INT_P (SET_SRC (prev_set))
10660 && CONST_INT_P (XEXP (set_dest, 2))
10661 && INTVAL (XEXP (set_dest, 2)) == 16
10662 && REG_P (XEXP (set_dest, 0))
10663 && REG_P (SET_DEST (prev_set))
10664 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10666 return true;
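/* In assembly terms this is a pair along the lines of, e.g.,
     mov  w0, #0x1234
     movk w0, #0x5678, lsl 16
   which benefits from staying adjacent on cores that fuse MOV/MOVK.  */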
10670 if (simple_sets_p
10671 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10674 /* We're trying to match:
10675 prev (adrp) == (set (reg r1)
10676 (high (symbol_ref ("SYM"))))
10677 curr (add) == (set (reg r0)
10678 (lo_sum (reg r1)
10679 (symbol_ref ("SYM"))))
10680 Note that r0 need not necessarily be the same as r1, especially
10681 during pre-regalloc scheduling. */
10683 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10684 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10686 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10687 && REG_P (XEXP (SET_SRC (curr_set), 0))
10688 && REGNO (XEXP (SET_SRC (curr_set), 0))
10689 == REGNO (SET_DEST (prev_set))
10690 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10691 XEXP (SET_SRC (curr_set), 1)))
10692 return true;
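/* E.g. the classic address-formation pair:
     adrp x1, sym
     add  x0, x1, :lo12:sym  */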
10696 if (simple_sets_p
10697 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10700 /* We're trying to match:
10701 prev (movk) == (set (zero_extract (reg r0)
10702 (const_int 16)
10703 (const_int 32))
10704 (const_int imm16_1))
10705 curr (movk) == (set (zero_extract (reg r0)
10706 (const_int 16)
10707 (const_int 48))
10708 (const_int imm16_2)) */
10710 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10711 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10712 && REG_P (XEXP (SET_DEST (prev_set), 0))
10713 && REG_P (XEXP (SET_DEST (curr_set), 0))
10714 && REGNO (XEXP (SET_DEST (prev_set), 0))
10715 == REGNO (XEXP (SET_DEST (curr_set), 0))
10716 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10717 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10718 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10719 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10720 && CONST_INT_P (SET_SRC (prev_set))
10721 && CONST_INT_P (SET_SRC (curr_set)))
10722 return true;
10725 if (simple_sets_p
10726 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10728 /* We're trying to match:
10729 prev (adrp) == (set (reg r0)
10730 (high (symbol_ref ("SYM"))))
10731 curr (ldr) == (set (reg r1)
10732 (mem (lo_sum (reg r0)
10733 (symbol_ref ("SYM")))))
10735 curr (ldr) == (set (reg r1)
10736 (zero_extend (mem
10737 (lo_sum (reg r0)
10738 (symbol_ref ("SYM")))))) */
10739 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10740 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10742 rtx curr_src = SET_SRC (curr_set);
10744 if (GET_CODE (curr_src) == ZERO_EXTEND)
10745 curr_src = XEXP (curr_src, 0);
10747 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10748 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10749 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10750 == REGNO (SET_DEST (prev_set))
10751 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10752 XEXP (SET_SRC (prev_set), 0)))
10753 return true;
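/* E.g. a symbol-address load such as:
     adrp x0, sym
     ldr  w1, [x0, #:lo12:sym]  */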
10757 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10758 && any_condjump_p (curr))
10760 enum attr_type prev_type = get_attr_type (prev);
10762 /* FIXME: this misses some instructions that are considered simple
10763 arithmetic for ThunderX. Simple shifts are missed here. */
10764 if (prev_type == TYPE_ALUS_SREG
10765 || prev_type == TYPE_ALUS_IMM
10766 || prev_type == TYPE_LOGICS_REG
10767 || prev_type == TYPE_LOGICS_IMM)
10768 return true;
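/* I.e. a flag-setting ALU or logic instruction (e.g. SUBS, ADDS, ANDS)
   immediately followed by the conditional branch that consumes the
   flags.  */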
10771 return false;
10774 /* If MEM is in the form of [base+offset], extract the two parts
10775 of the address and store them in BASE and OFFSET; otherwise clear
10776 BASE and OFFSET and return false. */
10778 bool
10779 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10781 rtx addr;
10783 gcc_assert (MEM_P (mem));
10785 addr = XEXP (mem, 0);
10787 if (REG_P (addr))
10789 *base = addr;
10790 *offset = const0_rtx;
10791 return true;
10794 if (GET_CODE (addr) == PLUS
10795 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10797 *base = XEXP (addr, 0);
10798 *offset = XEXP (addr, 1);
10799 return true;
10802 *base = NULL_RTX;
10803 *offset = NULL_RTX;
10805 return false;
10808 /* Types for scheduling fusion. */
10809 enum sched_fusion_type
10811 SCHED_FUSION_NONE = 0,
10812 SCHED_FUSION_LD_SIGN_EXTEND,
10813 SCHED_FUSION_LD_ZERO_EXTEND,
10814 SCHED_FUSION_LD,
10815 SCHED_FUSION_ST,
10816 SCHED_FUSION_NUM
10819 /* If INSN is a load or store with an address in the form of [base+offset],
10820 extract the two parts and store them in BASE and OFFSET. Return the
10821 scheduling fusion type of this INSN. */
10823 static enum sched_fusion_type
10824 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10826 rtx x, dest, src;
10827 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10829 gcc_assert (INSN_P (insn));
10830 x = PATTERN (insn);
10831 if (GET_CODE (x) != SET)
10832 return SCHED_FUSION_NONE;
10834 src = SET_SRC (x);
10835 dest = SET_DEST (x);
10837 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10838 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10839 return SCHED_FUSION_NONE;
10841 if (GET_CODE (src) == SIGN_EXTEND)
10843 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10844 src = XEXP (src, 0);
10845 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10846 return SCHED_FUSION_NONE;
10848 else if (GET_CODE (src) == ZERO_EXTEND)
10850 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10851 src = XEXP (src, 0);
10852 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10853 return SCHED_FUSION_NONE;
10856 if (GET_CODE (src) == MEM && REG_P (dest))
10857 extract_base_offset_in_addr (src, base, offset);
10858 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10860 fusion = SCHED_FUSION_ST;
10861 extract_base_offset_in_addr (dest, base, offset);
10863 else
10864 return SCHED_FUSION_NONE;
10866 if (*base == NULL_RTX || *offset == NULL_RTX)
10867 fusion = SCHED_FUSION_NONE;
10869 return fusion;
10872 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10874 Currently we only support fusing ldr and str instructions, so FUSION_PRI
10875 and PRI are only calculated for these instructions. For other instructions,
10876 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
10877 types of instruction fusion can be added by returning different priorities.
10879 It's important that irrelevant instructions get the largest FUSION_PRI. */
10881 static void
10882 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10883 int *fusion_pri, int *pri)
10885 int tmp, off_val;
10886 rtx base, offset;
10887 enum sched_fusion_type fusion;
10889 gcc_assert (INSN_P (insn));
10891 tmp = max_pri - 1;
10892 fusion = fusion_load_store (insn, &base, &offset);
10893 if (fusion == SCHED_FUSION_NONE)
10895 *pri = tmp;
10896 *fusion_pri = tmp;
10897 return;
10900 /* Set FUSION_PRI according to fusion type and base register. */
10901 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
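/* Loads/stores of the same fusion type and with the same base register
   therefore share one FUSION_PRI value; PRI below then orders accesses
   within such a group by offset, so that candidates for ldp/stp end up
   adjacent in the ready list.  */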
10903 /* Calculate PRI. */
10904 tmp /= 2;
10906 /* INSN with smaller offset goes first. */
10907 off_val = (int)(INTVAL (offset));
10908 if (off_val >= 0)
10909 tmp -= (off_val & 0xfffff);
10910 else
10911 tmp += ((- off_val) & 0xfffff);
10913 *pri = tmp;
10914 return;
10917 /* Given OPERANDS of consecutive load/store, check if we can merge
10918 them into ldp/stp. LOAD is true if they are load instructions.
10919 MODE is the mode of memory operands. */
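/* For example, a pair of loads such as
     ldr w0, [x2]
     ldr w1, [x2, 4]
   can be merged into
     ldp w0, w1, [x2]
   provided the checks below succeed.  */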
10921 bool
10922 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10923 enum machine_mode mode)
10925 HOST_WIDE_INT offval_1, offval_2, msize;
10926 enum reg_class rclass_1, rclass_2;
10927 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10929 if (load)
10931 mem_1 = operands[1];
10932 mem_2 = operands[3];
10933 reg_1 = operands[0];
10934 reg_2 = operands[2];
10935 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
10936 if (REGNO (reg_1) == REGNO (reg_2))
10937 return false;
10939 else
10941 mem_1 = operands[0];
10942 mem_2 = operands[2];
10943 reg_1 = operands[1];
10944 reg_2 = operands[3];
10947 /* The mems cannot be volatile. */
10948 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
10949 return false;
10951 /* Check if the addresses are in the form of [base+offset]. */
10952 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
10953 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
10954 return false;
10955 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
10956 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
10957 return false;
10959 /* Check if the bases are the same. */
10960 if (!rtx_equal_p (base_1, base_2))
10961 return false;
10963 offval_1 = INTVAL (offset_1);
10964 offval_2 = INTVAL (offset_2);
10965 msize = GET_MODE_SIZE (mode);
10966 /* Check if the offsets are consecutive. */
10967 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
10968 return false;
10970 /* Check if the addresses are clobbered by load. */
10971 if (load)
10973 if (reg_mentioned_p (reg_1, mem_1))
10974 return false;
10976 /* In increasing order, the last load can clobber the address. */
10977 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
10978 return false;
10981 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
10982 rclass_1 = FP_REGS;
10983 else
10984 rclass_1 = GENERAL_REGS;
10986 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
10987 rclass_2 = FP_REGS;
10988 else
10989 rclass_2 = GENERAL_REGS;
10991 /* Check if the registers are of the same class. */
10992 if (rclass_1 != rclass_2)
10993 return false;
10995 return true;
10998 /* Given OPERANDS of consecutive load/store, check if we can merge
10999 them into ldp/stp by adjusting the offset. LOAD is true if they
11000 are load instructions. MODE is the mode of memory operands.
11002 Given the consecutive stores below:
11004 str w1, [xb, 0x100]
11005 str w1, [xb, 0x104]
11006 str w1, [xb, 0x108]
11007 str w1, [xb, 0x10c]
11009 Though the offsets are out of the range supported by stp, we can
11010 still pair them after adjusting the offset, like:
11012 add scratch, xb, 0x100
11013 stp w1, w1, [scratch]
11014 stp w1, w1, [scratch, 0x8]
11016 The peephole patterns detecting this opportunity should guarantee
11017 that the scratch register is available. */
11019 bool
11020 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11021 enum machine_mode mode)
11023 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11024 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11025 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11026 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11028 if (load)
11030 reg_1 = operands[0];
11031 mem_1 = operands[1];
11032 reg_2 = operands[2];
11033 mem_2 = operands[3];
11034 reg_3 = operands[4];
11035 mem_3 = operands[5];
11036 reg_4 = operands[6];
11037 mem_4 = operands[7];
11038 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11039 && REG_P (reg_3) && REG_P (reg_4));
11040 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11041 return false;
11043 else
11045 mem_1 = operands[0];
11046 reg_1 = operands[1];
11047 mem_2 = operands[2];
11048 reg_2 = operands[3];
11049 mem_3 = operands[4];
11050 reg_3 = operands[5];
11051 mem_4 = operands[6];
11052 reg_4 = operands[7];
11054 /* Skip if the memory operand is by itself valid for ldp/stp. */
11055 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11056 return false;
11058 /* The mems cannot be volatile. */
11059 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11060 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11061 return false;
11063 /* Check if the addresses are in the form of [base+offset]. */
11064 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11065 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11066 return false;
11067 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11068 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11069 return false;
11070 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11071 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11072 return false;
11073 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11074 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11075 return false;
11077 /* Check if the bases are the same. */
11078 if (!rtx_equal_p (base_1, base_2)
11079 || !rtx_equal_p (base_2, base_3)
11080 || !rtx_equal_p (base_3, base_4))
11081 return false;
11083 offval_1 = INTVAL (offset_1);
11084 offval_2 = INTVAL (offset_2);
11085 offval_3 = INTVAL (offset_3);
11086 offval_4 = INTVAL (offset_4);
11087 msize = GET_MODE_SIZE (mode);
11088 /* Check if the offsets are consecutive. */
11089 if ((offval_1 != (offval_2 + msize)
11090 || offval_1 != (offval_3 + msize * 2)
11091 || offval_1 != (offval_4 + msize * 3))
11092 && (offval_4 != (offval_3 + msize)
11093 || offval_4 != (offval_2 + msize * 2)
11094 || offval_4 != (offval_1 + msize * 3)))
11095 return false;
11097 /* Check if the addresses are clobbered by load. */
11098 if (load)
11100 if (reg_mentioned_p (reg_1, mem_1)
11101 || reg_mentioned_p (reg_2, mem_2)
11102 || reg_mentioned_p (reg_3, mem_3))
11103 return false;
11105 /* In increasing order, the last load can clobber the address. */
11106 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11107 return false;
11110 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11111 rclass_1 = FP_REGS;
11112 else
11113 rclass_1 = GENERAL_REGS;
11115 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11116 rclass_2 = FP_REGS;
11117 else
11118 rclass_2 = GENERAL_REGS;
11120 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11121 rclass_3 = FP_REGS;
11122 else
11123 rclass_3 = GENERAL_REGS;
11125 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11126 rclass_4 = FP_REGS;
11127 else
11128 rclass_4 = GENERAL_REGS;
11130 /* Check if the registers are of the same class. */
11131 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11132 return false;
11134 return true;
11137 /* Given OPERANDS of consecutive load/store, this function pairs them
11138 into ldp/stp after adjusting the offset. It depends on the fact
11139 that addresses of load/store instructions are in increasing order.
11140 MODE is the mode of memory operands. CODE is the rtl operator
11141 which should be applied to all memory operands; it is SIGN_EXTEND,
11142 ZERO_EXTEND or UNKNOWN. */
11144 bool
11145 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11146 enum machine_mode mode, RTX_CODE code)
11148 rtx base, offset, t1, t2;
11149 rtx mem_1, mem_2, mem_3, mem_4;
11150 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11152 if (load)
11154 mem_1 = operands[1];
11155 mem_2 = operands[3];
11156 mem_3 = operands[5];
11157 mem_4 = operands[7];
11159 else
11161 mem_1 = operands[0];
11162 mem_2 = operands[2];
11163 mem_3 = operands[4];
11164 mem_4 = operands[6];
11165 gcc_assert (code == UNKNOWN);
11168 extract_base_offset_in_addr (mem_1, &base, &offset);
11169 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11171 /* Adjust the offset so that it fits in an ldp/stp instruction. */
11172 msize = GET_MODE_SIZE (mode);
11173 stp_off_limit = msize * 0x40;
11174 off_val = INTVAL (offset);
11175 abs_off = (off_val < 0) ? -off_val : off_val;
11176 new_off = abs_off % stp_off_limit;
11177 adj_off = abs_off - new_off;
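/* For the SImode example in the comment above (msize == 4, so
   stp_off_limit == 0x100), a first offset of 0x100 gives new_off == 0 and
   adj_off == 0x100, i.e. the "add scratch, xb, 0x100" plus stp sequence
   shown there.  */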
11179 /* Further adjust to make sure all offsets are OK. */
11180 if ((new_off + msize * 2) >= stp_off_limit)
11182 adj_off += stp_off_limit;
11183 new_off -= stp_off_limit;
11186 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11187 if (adj_off >= 0x1000)
11188 return false;
11190 if (off_val < 0)
11192 adj_off = -adj_off;
11193 new_off = -new_off;
11196 /* Create new memory references. */
11197 mem_1 = change_address (mem_1, VOIDmode,
11198 plus_constant (DImode, operands[8], new_off));
11200 /* Check if the adjusted address is OK for ldp/stp. */
11201 if (!aarch64_mem_pair_operand (mem_1, mode))
11202 return false;
11204 msize = GET_MODE_SIZE (mode);
11205 mem_2 = change_address (mem_2, VOIDmode,
11206 plus_constant (DImode,
11207 operands[8],
11208 new_off + msize));
11209 mem_3 = change_address (mem_3, VOIDmode,
11210 plus_constant (DImode,
11211 operands[8],
11212 new_off + msize * 2));
11213 mem_4 = change_address (mem_4, VOIDmode,
11214 plus_constant (DImode,
11215 operands[8],
11216 new_off + msize * 3));
11218 if (code == ZERO_EXTEND)
11220 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11221 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11222 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11223 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11225 else if (code == SIGN_EXTEND)
11227 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11228 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11229 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11230 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11233 if (load)
11235 operands[1] = mem_1;
11236 operands[3] = mem_2;
11237 operands[5] = mem_3;
11238 operands[7] = mem_4;
11240 else
11242 operands[0] = mem_1;
11243 operands[2] = mem_2;
11244 operands[4] = mem_3;
11245 operands[6] = mem_4;
11248 /* Emit adjusting instruction. */
11249 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11250 plus_constant (DImode, base, adj_off)));
11251 /* Emit ldp/stp instructions. */
11252 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11253 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11254 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11255 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11256 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11257 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11258 return true;
11261 #undef TARGET_ADDRESS_COST
11262 #define TARGET_ADDRESS_COST aarch64_address_cost
11264 /* This hook determines whether unnamed bitfields affect the alignment
11265 of the containing structure. The hook returns true if the structure
11266 should inherit the alignment requirements of an unnamed bitfield's
11267 type. */
11268 #undef TARGET_ALIGN_ANON_BITFIELD
11269 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11271 #undef TARGET_ASM_ALIGNED_DI_OP
11272 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11274 #undef TARGET_ASM_ALIGNED_HI_OP
11275 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11277 #undef TARGET_ASM_ALIGNED_SI_OP
11278 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11280 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11281 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11282 hook_bool_const_tree_hwi_hwi_const_tree_true
11284 #undef TARGET_ASM_FILE_START
11285 #define TARGET_ASM_FILE_START aarch64_start_file
11287 #undef TARGET_ASM_OUTPUT_MI_THUNK
11288 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11290 #undef TARGET_ASM_SELECT_RTX_SECTION
11291 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11293 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11294 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11296 #undef TARGET_BUILD_BUILTIN_VA_LIST
11297 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11299 #undef TARGET_CALLEE_COPIES
11300 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11302 #undef TARGET_CAN_ELIMINATE
11303 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11305 #undef TARGET_CANNOT_FORCE_CONST_MEM
11306 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11308 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11309 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11311 /* Only the least significant bit is used for initialization guard
11312 variables. */
11313 #undef TARGET_CXX_GUARD_MASK_BIT
11314 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11316 #undef TARGET_C_MODE_FOR_SUFFIX
11317 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11319 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11320 #undef TARGET_DEFAULT_TARGET_FLAGS
11321 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11322 #endif
11324 #undef TARGET_CLASS_MAX_NREGS
11325 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11327 #undef TARGET_BUILTIN_DECL
11328 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11330 #undef TARGET_EXPAND_BUILTIN
11331 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11333 #undef TARGET_EXPAND_BUILTIN_VA_START
11334 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11336 #undef TARGET_FOLD_BUILTIN
11337 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11339 #undef TARGET_FUNCTION_ARG
11340 #define TARGET_FUNCTION_ARG aarch64_function_arg
11342 #undef TARGET_FUNCTION_ARG_ADVANCE
11343 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11345 #undef TARGET_FUNCTION_ARG_BOUNDARY
11346 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11348 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11349 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11351 #undef TARGET_FUNCTION_VALUE
11352 #define TARGET_FUNCTION_VALUE aarch64_function_value
11354 #undef TARGET_FUNCTION_VALUE_REGNO_P
11355 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11357 #undef TARGET_FRAME_POINTER_REQUIRED
11358 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11360 #undef TARGET_GIMPLE_FOLD_BUILTIN
11361 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11363 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11364 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11366 #undef TARGET_INIT_BUILTINS
11367 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11369 #undef TARGET_LEGITIMATE_ADDRESS_P
11370 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11372 #undef TARGET_LEGITIMATE_CONSTANT_P
11373 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11375 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11376 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11378 #undef TARGET_LRA_P
11379 #define TARGET_LRA_P hook_bool_void_true
11381 #undef TARGET_MANGLE_TYPE
11382 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11384 #undef TARGET_MEMORY_MOVE_COST
11385 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11387 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11388 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11390 #undef TARGET_MUST_PASS_IN_STACK
11391 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11393 /* This target hook should return true if accesses to volatile bitfields
11394 should use the narrowest mode possible. It should return false if these
11395 accesses should use the bitfield container type. */
11396 #undef TARGET_NARROW_VOLATILE_BITFIELD
11397 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11399 #undef TARGET_OPTION_OVERRIDE
11400 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11402 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11403 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11404 aarch64_override_options_after_change
11406 #undef TARGET_PASS_BY_REFERENCE
11407 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11409 #undef TARGET_PREFERRED_RELOAD_CLASS
11410 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11412 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11413 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11415 #undef TARGET_SECONDARY_RELOAD
11416 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11418 #undef TARGET_SHIFT_TRUNCATION_MASK
11419 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11421 #undef TARGET_SETUP_INCOMING_VARARGS
11422 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11424 #undef TARGET_STRUCT_VALUE_RTX
11425 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11427 #undef TARGET_REGISTER_MOVE_COST
11428 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11430 #undef TARGET_RETURN_IN_MEMORY
11431 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11433 #undef TARGET_RETURN_IN_MSB
11434 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11436 #undef TARGET_RTX_COSTS
11437 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11439 #undef TARGET_SCHED_ISSUE_RATE
11440 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11442 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11443 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11444 aarch64_sched_first_cycle_multipass_dfa_lookahead
11446 #undef TARGET_TRAMPOLINE_INIT
11447 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11449 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11450 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11452 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11453 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11455 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11456 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11458 #undef TARGET_VECTORIZE_ADD_STMT_COST
11459 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11461 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11462 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11463 aarch64_builtin_vectorization_cost
11465 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11466 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11468 #undef TARGET_VECTORIZE_BUILTINS
11469 #define TARGET_VECTORIZE_BUILTINS
11471 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11472 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11473 aarch64_builtin_vectorized_function
11475 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11476 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11477 aarch64_autovectorize_vector_sizes
11479 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11480 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11481 aarch64_atomic_assign_expand_fenv
11483 /* Section anchor support. */
11485 #undef TARGET_MIN_ANCHOR_OFFSET
11486 #define TARGET_MIN_ANCHOR_OFFSET -256
11488 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11489 byte offset; we can do much more for larger data types, but have no way
11490 to determine the size of the access. We assume accesses are aligned. */
11491 #undef TARGET_MAX_ANCHOR_OFFSET
11492 #define TARGET_MAX_ANCHOR_OFFSET 4095
11494 #undef TARGET_VECTOR_ALIGNMENT
11495 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11497 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11498 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11499 aarch64_simd_vector_alignment_reachable
11501 /* vec_perm support. */
11503 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11504 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11505 aarch64_vectorize_vec_perm_const_ok
11508 #undef TARGET_FIXED_CONDITION_CODE_REGS
11509 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11511 #undef TARGET_FLAGS_REGNUM
11512 #define TARGET_FLAGS_REGNUM CC_REGNUM
11514 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11515 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11517 #undef TARGET_ASAN_SHADOW_OFFSET
11518 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11520 #undef TARGET_LEGITIMIZE_ADDRESS
11521 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11523 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11524 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11525 aarch64_use_by_pieces_infrastructure_p
11527 #undef TARGET_CAN_USE_DOLOOP_P
11528 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11530 #undef TARGET_SCHED_MACRO_FUSION_P
11531 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11533 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11534 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11536 #undef TARGET_SCHED_FUSION_PRIORITY
11537 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11539 struct gcc_target targetm = TARGET_INITIALIZER;
11541 #include "gt-aarch64.h"