[AArch64] Remember to cost operand 0 in FP compare-with-0.0 case
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "insn-codes.h"
26 #include "rtl.h"
27 #include "insn-attr.h"
28 #include "hash-set.h"
29 #include "machmode.h"
30 #include "vec.h"
31 #include "double-int.h"
32 #include "input.h"
33 #include "alias.h"
34 #include "symtab.h"
35 #include "wide-int.h"
36 #include "inchash.h"
37 #include "tree.h"
38 #include "fold-const.h"
39 #include "stringpool.h"
40 #include "stor-layout.h"
41 #include "calls.h"
42 #include "varasm.h"
43 #include "regs.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "cfgrtl.h"
47 #include "cfganal.h"
48 #include "lcm.h"
49 #include "cfgbuild.h"
50 #include "cfgcleanup.h"
51 #include "predict.h"
52 #include "basic-block.h"
53 #include "df.h"
54 #include "hard-reg-set.h"
55 #include "output.h"
56 #include "hashtab.h"
57 #include "function.h"
58 #include "flags.h"
59 #include "statistics.h"
60 #include "real.h"
61 #include "fixed-value.h"
62 #include "insn-config.h"
63 #include "expmed.h"
64 #include "dojump.h"
65 #include "explow.h"
66 #include "emit-rtl.h"
67 #include "stmt.h"
68 #include "expr.h"
69 #include "reload.h"
70 #include "toplev.h"
71 #include "target.h"
72 #include "target-def.h"
73 #include "targhooks.h"
74 #include "ggc.h"
75 #include "tm_p.h"
76 #include "recog.h"
77 #include "langhooks.h"
78 #include "diagnostic-core.h"
79 #include "hash-table.h"
80 #include "tree-ssa-alias.h"
81 #include "internal-fn.h"
82 #include "gimple-fold.h"
83 #include "tree-eh.h"
84 #include "gimple-expr.h"
85 #include "is-a.h"
86 #include "gimple.h"
87 #include "gimplify.h"
88 #include "optabs.h"
89 #include "dwarf2.h"
90 #include "cfgloop.h"
91 #include "tree-vectorizer.h"
92 #include "aarch64-cost-tables.h"
93 #include "dumpfile.h"
94 #include "builtins.h"
95 #include "rtl-iter.h"
96 #include "tm-constrs.h"
97 #include "sched-int.h"
98 #include "cortex-a57-fma-steering.h"
100 /* Defined for convenience. */
101 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
103 /* Classifies an address.
105 ADDRESS_REG_IMM
106 A simple base register plus immediate offset.
108 ADDRESS_REG_WB
109 A base register indexed by immediate offset with writeback.
111 ADDRESS_REG_REG
112 A base register indexed by (optionally scaled) register.
114 ADDRESS_REG_UXTW
115 A base register indexed by (optionally scaled) zero-extended register.
117 ADDRESS_REG_SXTW
118 A base register indexed by (optionally scaled) sign-extended register.
120 ADDRESS_LO_SUM
121 A LO_SUM rtx with a base register and "LO12" symbol relocation.
123 ADDRESS_SYMBOLIC:
124 A constant symbolic address, in pc-relative literal pool. */
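/* For illustration, these classes roughly correspond to addressing
   forms such as:

     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!  or  ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   a pc-relative literal load, e.g. ldr x0, .Lc0

   The exact instructions chosen depend on the mode and context. */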
126 enum aarch64_address_type {
127 ADDRESS_REG_IMM,
128 ADDRESS_REG_WB,
129 ADDRESS_REG_REG,
130 ADDRESS_REG_UXTW,
131 ADDRESS_REG_SXTW,
132 ADDRESS_LO_SUM,
133 ADDRESS_SYMBOLIC
136 struct aarch64_address_info {
137 enum aarch64_address_type type;
138 rtx base;
139 rtx offset;
140 int shift;
141 enum aarch64_symbol_type symbol_type;
144 struct simd_immediate_info
146 rtx value;
147 int shift;
148 int element_width;
149 bool mvn;
150 bool msl;
153 /* The current code model. */
154 enum aarch64_code_model aarch64_cmodel;
156 #ifdef HAVE_AS_TLS
157 #undef TARGET_HAVE_TLS
158 #define TARGET_HAVE_TLS 1
159 #endif
161 static bool aarch64_composite_type_p (const_tree, machine_mode);
162 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
163 const_tree,
164 machine_mode *, int *,
165 bool *);
166 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
167 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
168 static void aarch64_override_options_after_change (void);
169 static bool aarch64_vector_mode_supported_p (machine_mode);
170 static unsigned bit_count (unsigned HOST_WIDE_INT);
171 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
172 const unsigned char *sel);
173 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
175 /* Major revision number of the ARM Architecture implemented by the target. */
176 unsigned aarch64_architecture_version;
178 /* The processor for which instructions should be scheduled. */
179 enum aarch64_processor aarch64_tune = cortexa53;
181 /* The current tuning set. */
182 const struct tune_params *aarch64_tune_params;
184 /* Mask to specify which instructions we are allowed to generate. */
185 unsigned long aarch64_isa_flags = 0;
187 /* Mask to specify which instruction scheduling options should be used. */
188 unsigned long aarch64_tune_flags = 0;
190 /* Tuning parameters. */
192 static const struct cpu_addrcost_table generic_addrcost_table =
195 0, /* hi */
196 0, /* si */
197 0, /* di */
198 0, /* ti */
200 0, /* pre_modify */
201 0, /* post_modify */
202 0, /* register_offset */
203 0, /* register_extend */
204 0 /* imm_offset */
207 static const struct cpu_addrcost_table cortexa57_addrcost_table =
210 1, /* hi */
211 0, /* si */
212 0, /* di */
213 1, /* ti */
215 0, /* pre_modify */
216 0, /* post_modify */
217 0, /* register_offset */
218 0, /* register_extend */
219 0, /* imm_offset */
222 static const struct cpu_addrcost_table xgene1_addrcost_table =
225 1, /* hi */
226 0, /* si */
227 0, /* di */
228 1, /* ti */
230 1, /* pre_modify */
231 0, /* post_modify */
232 0, /* register_offset */
233 1, /* register_extend */
234 0, /* imm_offset */
237 static const struct cpu_regmove_cost generic_regmove_cost =
239 1, /* GP2GP */
240 /* Avoid the use of slow int<->fp moves for spilling by setting
241 their cost higher than memmov_cost. */
242 5, /* GP2FP */
243 5, /* FP2GP */
244 2 /* FP2FP */
247 static const struct cpu_regmove_cost cortexa57_regmove_cost =
249 1, /* GP2GP */
250 /* Avoid the use of slow int<->fp moves for spilling by setting
251 their cost higher than memmov_cost. */
252 5, /* GP2FP */
253 5, /* FP2GP */
254 2 /* FP2FP */
257 static const struct cpu_regmove_cost cortexa53_regmove_cost =
259 1, /* GP2GP */
260 /* Avoid the use of slow int<->fp moves for spilling by setting
261 their cost higher than memmov_cost. */
262 5, /* GP2FP */
263 5, /* FP2GP */
264 2 /* FP2FP */
267 static const struct cpu_regmove_cost thunderx_regmove_cost =
269 2, /* GP2GP */
270 2, /* GP2FP */
271 6, /* FP2GP */
272 4 /* FP2FP */
275 static const struct cpu_regmove_cost xgene1_regmove_cost =
277 1, /* GP2GP */
278 /* Avoid the use of slow int<->fp moves for spilling by setting
279 their cost higher than memmov_cost. */
280 8, /* GP2FP */
281 8, /* FP2GP */
282 2 /* FP2FP */
285 /* Generic costs for vector insn classes. */
286 static const struct cpu_vector_cost generic_vector_cost =
288 1, /* scalar_stmt_cost */
289 1, /* scalar_load_cost */
290 1, /* scalar_store_cost */
291 1, /* vec_stmt_cost */
292 1, /* vec_to_scalar_cost */
293 1, /* scalar_to_vec_cost */
294 1, /* vec_align_load_cost */
295 1, /* vec_unalign_load_cost */
296 1, /* vec_unalign_store_cost */
297 1, /* vec_store_cost */
298 3, /* cond_taken_branch_cost */
299 1 /* cond_not_taken_branch_cost */
302 /* Generic costs for vector insn classes. */
303 static const struct cpu_vector_cost cortexa57_vector_cost =
305 1, /* scalar_stmt_cost */
306 4, /* scalar_load_cost */
307 1, /* scalar_store_cost */
308 3, /* vec_stmt_cost */
309 8, /* vec_to_scalar_cost */
310 8, /* scalar_to_vec_cost */
311 5, /* vec_align_load_cost */
312 5, /* vec_unalign_load_cost */
313 1, /* vec_unalign_store_cost */
314 1, /* vec_store_cost */
315 1, /* cond_taken_branch_cost */
316 1 /* cond_not_taken_branch_cost */
319 /* Generic costs for vector insn classes. */
320 static const struct cpu_vector_cost xgene1_vector_cost =
322 1, /* scalar_stmt_cost */
323 5, /* scalar_load_cost */
324 1, /* scalar_store_cost */
325 2, /* vec_stmt_cost */
326 4, /* vec_to_scalar_cost */
327 4, /* scalar_to_vec_cost */
328 10, /* vec_align_load_cost */
329 10, /* vec_unalign_load_cost */
330 2, /* vec_unalign_store_cost */
331 2, /* vec_store_cost */
332 2, /* cond_taken_branch_cost */
333 1 /* cond_not_taken_branch_cost */
336 #define AARCH64_FUSE_NOTHING (0)
337 #define AARCH64_FUSE_MOV_MOVK (1 << 0)
338 #define AARCH64_FUSE_ADRP_ADD (1 << 1)
339 #define AARCH64_FUSE_MOVK_MOVK (1 << 2)
340 #define AARCH64_FUSE_ADRP_LDR (1 << 3)
341 #define AARCH64_FUSE_CMP_BRANCH (1 << 4)
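/* For illustration, these flags name instruction pairs that a core
   can macro-fuse when they are issued back to back, e.g.

     AARCH64_FUSE_MOV_MOVK    mov  x0, #0x1234
                              movk x0, #0x5678, lsl #16
     AARCH64_FUSE_ADRP_ADD    adrp x0, sym
                              add  x0, x0, :lo12:sym
     AARCH64_FUSE_MOVK_MOVK   two consecutive movk instructions
     AARCH64_FUSE_ADRP_LDR    adrp x0, sym
                              ldr  x1, [x0, #:lo12:sym]
     AARCH64_FUSE_CMP_BRANCH  a compare followed by a conditional branch

   The per-core fuseable_ops field below selects which pairs the
   scheduler tries to keep adjacent. */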
343 static const struct tune_params generic_tunings =
345 &cortexa57_extra_costs,
346 &generic_addrcost_table,
347 &generic_regmove_cost,
348 &generic_vector_cost,
349 4, /* memmov_cost */
350 2, /* issue_rate */
351 AARCH64_FUSE_NOTHING, /* fuseable_ops */
352 8, /* function_align. */
353 8, /* jump_align. */
354 4, /* loop_align. */
355 2, /* int_reassoc_width. */
356 4, /* fp_reassoc_width. */
357 1 /* vec_reassoc_width. */
360 static const struct tune_params cortexa53_tunings =
362 &cortexa53_extra_costs,
363 &generic_addrcost_table,
364 &cortexa53_regmove_cost,
365 &generic_vector_cost,
366 4, /* memmov_cost */
367 2, /* issue_rate */
368 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
369 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fuseable_ops */
370 8, /* function_align. */
371 8, /* jump_align. */
372 4, /* loop_align. */
373 2, /* int_reassoc_width. */
374 4, /* fp_reassoc_width. */
375 1 /* vec_reassoc_width. */
378 static const struct tune_params cortexa57_tunings =
380 &cortexa57_extra_costs,
381 &cortexa57_addrcost_table,
382 &cortexa57_regmove_cost,
383 &cortexa57_vector_cost,
384 4, /* memmov_cost */
385 3, /* issue_rate */
386 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
387 | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops */
388 16, /* function_align. */
389 8, /* jump_align. */
390 4, /* loop_align. */
391 2, /* int_reassoc_width. */
392 4, /* fp_reassoc_width. */
393 1 /* vec_reassoc_width. */
396 static const struct tune_params thunderx_tunings =
398 &thunderx_extra_costs,
399 &generic_addrcost_table,
400 &thunderx_regmove_cost,
401 &generic_vector_cost,
402 6, /* memmov_cost */
403 2, /* issue_rate */
404 AARCH64_FUSE_CMP_BRANCH, /* fuseable_ops */
405 8, /* function_align. */
406 8, /* jump_align. */
407 8, /* loop_align. */
408 2, /* int_reassoc_width. */
409 4, /* fp_reassoc_width. */
410 1 /* vec_reassoc_width. */
413 static const struct tune_params xgene1_tunings =
415 &xgene1_extra_costs,
416 &xgene1_addrcost_table,
417 &xgene1_regmove_cost,
418 &xgene1_vector_cost,
419 6, /* memmov_cost */
420 4, /* issue_rate */
421 AARCH64_FUSE_NOTHING, /* fuseable_ops */
422 16, /* function_align. */
423 8, /* jump_align. */
424 16, /* loop_align. */
425 2, /* int_reassoc_width. */
426 4, /* fp_reassoc_width. */
427 1 /* vec_reassoc_width. */
430 /* A processor implementing AArch64. */
431 struct processor
433 const char *const name;
434 enum aarch64_processor core;
435 const char *arch;
436 unsigned architecture_version;
437 const unsigned long flags;
438 const struct tune_params *const tune;
441 /* Processor cores implementing AArch64. */
442 static const struct processor all_cores[] =
444 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
445 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
446 #include "aarch64-cores.def"
447 #undef AARCH64_CORE
448 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
449 {NULL, aarch64_none, NULL, 0, 0, NULL}
452 /* Architectures implementing AArch64. */
453 static const struct processor all_architectures[] =
455 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
456 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
457 #include "aarch64-arches.def"
458 #undef AARCH64_ARCH
459 {NULL, aarch64_none, NULL, 0, 0, NULL}
462 /* Target specification. These are populated as command-line arguments
463 are processed, or NULL if not specified. */
464 static const struct processor *selected_arch;
465 static const struct processor *selected_cpu;
466 static const struct processor *selected_tune;
468 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
470 /* An ISA extension in the co-processor and main instruction set space. */
471 struct aarch64_option_extension
473 const char *const name;
474 const unsigned long flags_on;
475 const unsigned long flags_off;
478 /* ISA extensions in AArch64. */
479 static const struct aarch64_option_extension all_extensions[] =
481 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
482 {NAME, FLAGS_ON, FLAGS_OFF},
483 #include "aarch64-option-extensions.def"
484 #undef AARCH64_OPT_EXTENSION
485 {NULL, 0, 0}
488 /* Used to track the size of an address when generating a pre/post
489 increment address. */
490 static machine_mode aarch64_memory_reference_mode;
492 /* A table of valid AArch64 "bitmask immediate" values for
493 logical instructions. */
495 #define AARCH64_NUM_BITMASKS 5334
496 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
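/* For illustration, a "bitmask immediate" is built from a repeating
   2/4/8/16/32/64-bit element that is a rotated run of contiguous
   ones, e.g.

     0x00000000000000ff   a run of 8 ones
     0x0000ffff0000ffff   16 ones repeating with period 32
     0x5555555555555555   alternating bits (period 2)

   whereas a value such as 0x0000000000012345 is not encodable this
   way and has to be synthesized with MOVZ/MOVK sequences instead. */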
498 typedef enum aarch64_cond_code
500 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
501 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
502 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
504 aarch64_cc;
506 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
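/* For illustration, the enumeration above pairs each condition with
   its inverse so that the two differ only in bit 0, hence the XOR:

     AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE
     AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT  */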
508 /* The condition codes of the processor, and the inverse function. */
509 static const char * const aarch64_condition_codes[] =
511 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
512 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
515 static unsigned int
516 aarch64_min_divisions_for_recip_mul (enum machine_mode mode ATTRIBUTE_UNUSED)
518 return 2;
521 static int
522 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
523 enum machine_mode mode)
525 if (VECTOR_MODE_P (mode))
526 return aarch64_tune_params->vec_reassoc_width;
527 if (INTEGRAL_MODE_P (mode))
528 return aarch64_tune_params->int_reassoc_width;
529 if (FLOAT_MODE_P (mode))
530 return aarch64_tune_params->fp_reassoc_width;
531 return 1;
534 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
535 unsigned
536 aarch64_dbx_register_number (unsigned regno)
538 if (GP_REGNUM_P (regno))
539 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
540 else if (regno == SP_REGNUM)
541 return AARCH64_DWARF_SP;
542 else if (FP_REGNUM_P (regno))
543 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
545 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
546 equivalent DWARF register. */
547 return DWARF_FRAME_REGISTERS;
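/* For illustration, assuming the usual AArch64 DWARF numbering from
   aarch64.h (x0-x30 -> 0-30, sp -> 31, v0-v31 -> 64-95), the call
   aarch64_dbx_register_number (V3_REGNUM) yields 67, while a regno
   with no DWARF equivalent (e.g. CC_REGNUM) falls through to
   DWARF_FRAME_REGISTERS. */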
550 /* Return TRUE if MODE is any of the large INT modes. */
551 static bool
552 aarch64_vect_struct_mode_p (machine_mode mode)
554 return mode == OImode || mode == CImode || mode == XImode;
557 /* Return TRUE if MODE is any of the vector modes. */
558 static bool
559 aarch64_vector_mode_p (machine_mode mode)
561 return aarch64_vector_mode_supported_p (mode)
562 || aarch64_vect_struct_mode_p (mode);
565 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
566 static bool
567 aarch64_array_mode_supported_p (machine_mode mode,
568 unsigned HOST_WIDE_INT nelems)
570 if (TARGET_SIMD
571 && AARCH64_VALID_SIMD_QREG_MODE (mode)
572 && (nelems >= 2 && nelems <= 4))
573 return true;
575 return false;
578 /* Implement HARD_REGNO_NREGS. */
581 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
583 switch (aarch64_regno_regclass (regno))
585 case FP_REGS:
586 case FP_LO_REGS:
587 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
588 default:
589 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
591 gcc_unreachable ();
594 /* Implement HARD_REGNO_MODE_OK. */
597 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
599 if (GET_MODE_CLASS (mode) == MODE_CC)
600 return regno == CC_REGNUM;
602 if (regno == SP_REGNUM)
603 /* The purpose of comparing with ptr_mode is to support the
604 global register variable associated with the stack pointer
605 register via the syntax of asm ("wsp") in ILP32. */
606 return mode == Pmode || mode == ptr_mode;
608 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
609 return mode == Pmode;
611 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
612 return 1;
614 if (FP_REGNUM_P (regno))
616 if (aarch64_vect_struct_mode_p (mode))
617 return
618 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
619 else
620 return 1;
623 return 0;
626 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
627 machine_mode
628 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
629 machine_mode mode)
631 /* Handle modes that fit within single registers. */
632 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
634 if (GET_MODE_SIZE (mode) >= 4)
635 return mode;
636 else
637 return SImode;
639 /* Fall back to generic for multi-reg and very large modes. */
640 else
641 return choose_hard_reg_mode (regno, nregs, false);
644 /* Return true if calls to DECL should be treated as
645 long-calls (i.e. called via a register). */
646 static bool
647 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
649 return false;
652 /* Return true if calls to symbol-ref SYM should be treated as
653 long-calls (i.e. called via a register). */
654 bool
655 aarch64_is_long_call_p (rtx sym)
657 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
660 /* Return true if the offsets to a zero/sign-extract operation
661 represent an expression that matches an extend operation. The
662 operands represent the parameters from
664 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
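/* For illustration, with EXTRACT_IMM == 34 and MULT_IMM == 4 the
   pattern extracts the low 34 bits of (reg * 4); the multiply
   shifts reg left by 2, so this is the same as extending the low
   32 bits of reg and shifting the result left by 2. The test below
   checks exactly that shape: the extract width with its low three
   bits cleared must be a power of two (the extend size), and the
   multiplier must equal 1 << (the remaining shift). */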
665 bool
666 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
667 rtx extract_imm)
669 HOST_WIDE_INT mult_val, extract_val;
671 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
672 return false;
674 mult_val = INTVAL (mult_imm);
675 extract_val = INTVAL (extract_imm);
677 if (extract_val > 8
678 && extract_val < GET_MODE_BITSIZE (mode)
679 && exact_log2 (extract_val & ~7) > 0
680 && (extract_val & 7) <= 4
681 && mult_val == (1 << (extract_val & 7)))
682 return true;
684 return false;
687 /* Emit an insn that's a simple single-set. Both the operands must be
688 known to be valid. */
689 inline static rtx
690 emit_set_insn (rtx x, rtx y)
692 return emit_insn (gen_rtx_SET (VOIDmode, x, y));
695 /* X and Y are two things to compare using CODE. Emit the compare insn and
696 return the rtx for the CC register in the proper mode. */
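/* A typical usage sketch (illustrative):

     rtx cc_reg = aarch64_gen_compare_reg (code, op0, op1);
     rtx cond = gen_rtx_fmt_ee (code, VOIDmode, cc_reg, const0_rtx);

   i.e. callers build a comparison of the returned CC register
   against zero using the same comparison code. */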
698 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
700 machine_mode mode = SELECT_CC_MODE (code, x, y);
701 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
703 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
704 return cc_reg;
707 /* Build the SYMBOL_REF for __tls_get_addr. */
709 static GTY(()) rtx tls_get_addr_libfunc;
712 aarch64_tls_get_addr (void)
714 if (!tls_get_addr_libfunc)
715 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
716 return tls_get_addr_libfunc;
719 /* Return the TLS model to use for ADDR. */
721 static enum tls_model
722 tls_symbolic_operand_type (rtx addr)
724 enum tls_model tls_kind = TLS_MODEL_NONE;
725 rtx sym, addend;
727 if (GET_CODE (addr) == CONST)
729 split_const (addr, &sym, &addend);
730 if (GET_CODE (sym) == SYMBOL_REF)
731 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
733 else if (GET_CODE (addr) == SYMBOL_REF)
734 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
736 return tls_kind;
739 /* We'll allow LO_SUMs in our legitimate addresses
740 so that combine can take care of combining addresses where
741 necessary, but for generation purposes, we'll generate the address
742 as follows:
743 RTL Absolute
744 tmp = hi (symbol_ref); adrp x1, foo
745 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
748 PIC TLS
749 adrp x1, :got:foo adrp tmp, :tlsgd:foo
750 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
751 bl __tls_get_addr
754 Load TLS symbol, depending on TLS mechanism and TLS access model.
756 Global Dynamic - Traditional TLS:
757 adrp tmp, :tlsgd:imm
758 add dest, tmp, #:tlsgd_lo12:imm
759 bl __tls_get_addr
761 Global Dynamic - TLS Descriptors:
762 adrp dest, :tlsdesc:imm
763 ldr tmp, [dest, #:tlsdesc_lo12:imm]
764 add dest, dest, #:tlsdesc_lo12:imm
765 blr tmp
766 mrs tp, tpidr_el0
767 add dest, dest, tp
769 Initial Exec:
770 mrs tp, tpidr_el0
771 adrp tmp, :gottprel:imm
772 ldr dest, [tmp, #:gottprel_lo12:imm]
773 add dest, dest, tp
775 Local Exec:
776 mrs tp, tpidr_el0
777 add t0, tp, #:tprel_hi12:imm, lsl #12
778 add t0, t0, #:tprel_lo12_nc:imm
781 static void
782 aarch64_load_symref_appropriately (rtx dest, rtx imm,
783 enum aarch64_symbol_type type)
785 switch (type)
787 case SYMBOL_SMALL_ABSOLUTE:
789 /* In ILP32, the mode of dest can be either SImode or DImode. */
790 rtx tmp_reg = dest;
791 machine_mode mode = GET_MODE (dest);
793 gcc_assert (mode == Pmode || mode == ptr_mode);
795 if (can_create_pseudo_p ())
796 tmp_reg = gen_reg_rtx (mode);
798 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
799 emit_insn (gen_add_losym (dest, tmp_reg, imm));
800 return;
803 case SYMBOL_TINY_ABSOLUTE:
804 emit_insn (gen_rtx_SET (Pmode, dest, imm));
805 return;
807 case SYMBOL_SMALL_GOT:
809 /* In ILP32, the mode of dest can be either SImode or DImode,
810 while the got entry is always of SImode size. The mode of
811 dest depends on how dest is used: if dest is assigned to a
812 pointer (e.g. stored in memory), it has SImode; it may have
813 DImode if dest is dereferenced to access the memory.
814 This is why we have to handle three different ldr_got_small
815 patterns here (two patterns for ILP32). */
816 rtx tmp_reg = dest;
817 machine_mode mode = GET_MODE (dest);
819 if (can_create_pseudo_p ())
820 tmp_reg = gen_reg_rtx (mode);
822 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
823 if (mode == ptr_mode)
825 if (mode == DImode)
826 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
827 else
828 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
830 else
832 gcc_assert (mode == Pmode);
833 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
836 return;
839 case SYMBOL_SMALL_TLSGD:
841 rtx_insn *insns;
842 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
844 start_sequence ();
845 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
846 insns = get_insns ();
847 end_sequence ();
849 RTL_CONST_CALL_P (insns) = 1;
850 emit_libcall_block (insns, dest, result, imm);
851 return;
854 case SYMBOL_SMALL_TLSDESC:
856 machine_mode mode = GET_MODE (dest);
857 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
858 rtx tp;
860 gcc_assert (mode == Pmode || mode == ptr_mode);
862 /* In ILP32, the got entry is always of SImode size. Unlike
863 small GOT, the dest is fixed at reg 0. */
864 if (TARGET_ILP32)
865 emit_insn (gen_tlsdesc_small_si (imm));
866 else
867 emit_insn (gen_tlsdesc_small_di (imm));
868 tp = aarch64_load_tp (NULL);
870 if (mode != Pmode)
871 tp = gen_lowpart (mode, tp);
873 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, x0)));
874 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
875 return;
878 case SYMBOL_SMALL_GOTTPREL:
880 /* In ILP32, the mode of dest can be either SImode or DImode,
881 while the got entry is always of SImode size. The mode of
882 dest depends on how dest is used: if dest is assigned to a
883 pointer (e.g. stored in memory), it has SImode; it may have
884 DImode if dest is dereferenced to access the memory.
885 This is why we have to handle three different tlsie_small
886 patterns here (two patterns for ILP32). */
887 machine_mode mode = GET_MODE (dest);
888 rtx tmp_reg = gen_reg_rtx (mode);
889 rtx tp = aarch64_load_tp (NULL);
891 if (mode == ptr_mode)
893 if (mode == DImode)
894 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
895 else
897 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
898 tp = gen_lowpart (mode, tp);
901 else
903 gcc_assert (mode == Pmode);
904 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
907 emit_insn (gen_rtx_SET (mode, dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
908 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
909 return;
912 case SYMBOL_SMALL_TPREL:
914 rtx tp = aarch64_load_tp (NULL);
916 if (GET_MODE (dest) != Pmode)
917 tp = gen_lowpart (GET_MODE (dest), tp);
919 emit_insn (gen_tlsle_small (dest, tp, imm));
920 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
921 return;
924 case SYMBOL_TINY_GOT:
925 emit_insn (gen_ldr_got_tiny (dest, imm));
926 return;
928 default:
929 gcc_unreachable ();
933 /* Emit a move from SRC to DEST. Assume that the move expanders can
934 handle all moves if !can_create_pseudo_p (). The distinction is
935 important because, unlike emit_move_insn, the move expanders know
936 how to force Pmode objects into the constant pool even when the
937 constant pool address is not itself legitimate. */
938 static rtx
939 aarch64_emit_move (rtx dest, rtx src)
941 return (can_create_pseudo_p ()
942 ? emit_move_insn (dest, src)
943 : emit_move_insn_1 (dest, src));
946 /* Split a 128-bit move operation into two 64-bit move operations,
947 taking care to handle partial overlap of register to register
948 copies. Special cases are needed when moving between GP regs and
949 FP regs. SRC can be a register, constant or memory; DST a register
950 or memory. If either operand is memory it must not have any side
951 effects. */
952 void
953 aarch64_split_128bit_move (rtx dst, rtx src)
955 rtx dst_lo, dst_hi;
956 rtx src_lo, src_hi;
958 machine_mode mode = GET_MODE (dst);
960 gcc_assert (mode == TImode || mode == TFmode);
961 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
962 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
964 if (REG_P (dst) && REG_P (src))
966 int src_regno = REGNO (src);
967 int dst_regno = REGNO (dst);
969 /* Handle FP <-> GP regs. */
970 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
972 src_lo = gen_lowpart (word_mode, src);
973 src_hi = gen_highpart (word_mode, src);
975 if (mode == TImode)
977 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
978 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
980 else
982 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
983 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
985 return;
987 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
989 dst_lo = gen_lowpart (word_mode, dst);
990 dst_hi = gen_highpart (word_mode, dst);
992 if (mode == TImode)
994 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
995 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
997 else
999 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1000 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1002 return;
1006 dst_lo = gen_lowpart (word_mode, dst);
1007 dst_hi = gen_highpart (word_mode, dst);
1008 src_lo = gen_lowpart (word_mode, src);
1009 src_hi = gen_highpart_mode (word_mode, mode, src);
1011 /* At most one pairing may overlap. */
1012 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1014 aarch64_emit_move (dst_hi, src_hi);
1015 aarch64_emit_move (dst_lo, src_lo);
1017 else
1019 aarch64_emit_move (dst_lo, src_lo);
1020 aarch64_emit_move (dst_hi, src_hi);
1024 bool
1025 aarch64_split_128bit_move_p (rtx dst, rtx src)
1027 return (! REG_P (src)
1028 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1031 /* Split a complex SIMD combine. */
1033 void
1034 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1036 machine_mode src_mode = GET_MODE (src1);
1037 machine_mode dst_mode = GET_MODE (dst);
1039 gcc_assert (VECTOR_MODE_P (dst_mode));
1041 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1043 rtx (*gen) (rtx, rtx, rtx);
1045 switch (src_mode)
1047 case V8QImode:
1048 gen = gen_aarch64_simd_combinev8qi;
1049 break;
1050 case V4HImode:
1051 gen = gen_aarch64_simd_combinev4hi;
1052 break;
1053 case V2SImode:
1054 gen = gen_aarch64_simd_combinev2si;
1055 break;
1056 case V2SFmode:
1057 gen = gen_aarch64_simd_combinev2sf;
1058 break;
1059 case DImode:
1060 gen = gen_aarch64_simd_combinedi;
1061 break;
1062 case DFmode:
1063 gen = gen_aarch64_simd_combinedf;
1064 break;
1065 default:
1066 gcc_unreachable ();
1069 emit_insn (gen (dst, src1, src2));
1070 return;
1074 /* Split a complex SIMD move. */
1076 void
1077 aarch64_split_simd_move (rtx dst, rtx src)
1079 machine_mode src_mode = GET_MODE (src);
1080 machine_mode dst_mode = GET_MODE (dst);
1082 gcc_assert (VECTOR_MODE_P (dst_mode));
1084 if (REG_P (dst) && REG_P (src))
1086 rtx (*gen) (rtx, rtx);
1088 gcc_assert (VECTOR_MODE_P (src_mode));
1090 switch (src_mode)
1092 case V16QImode:
1093 gen = gen_aarch64_split_simd_movv16qi;
1094 break;
1095 case V8HImode:
1096 gen = gen_aarch64_split_simd_movv8hi;
1097 break;
1098 case V4SImode:
1099 gen = gen_aarch64_split_simd_movv4si;
1100 break;
1101 case V2DImode:
1102 gen = gen_aarch64_split_simd_movv2di;
1103 break;
1104 case V4SFmode:
1105 gen = gen_aarch64_split_simd_movv4sf;
1106 break;
1107 case V2DFmode:
1108 gen = gen_aarch64_split_simd_movv2df;
1109 break;
1110 default:
1111 gcc_unreachable ();
1114 emit_insn (gen (dst, src));
1115 return;
1119 static rtx
1120 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1122 if (can_create_pseudo_p ())
1123 return force_reg (mode, value);
1124 else
1126 x = aarch64_emit_move (x, value);
1127 return x;
1132 static rtx
1133 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1135 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1137 rtx high;
1138 /* Load the full offset into a register. This
1139 might be improvable in the future. */
1140 high = GEN_INT (offset);
1141 offset = 0;
1142 high = aarch64_force_temporary (mode, temp, high);
1143 reg = aarch64_force_temporary (mode, temp,
1144 gen_rtx_PLUS (mode, high, reg));
1146 return plus_constant (mode, reg, offset);
1149 static int
1150 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1151 machine_mode mode)
1153 unsigned HOST_WIDE_INT mask;
1154 int i;
1155 bool first;
1156 unsigned HOST_WIDE_INT val;
1157 bool subtargets;
1158 rtx subtarget;
1159 int one_match, zero_match, first_not_ffff_match;
1160 int num_insns = 0;
1162 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1164 if (generate)
1165 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1166 num_insns++;
1167 return num_insns;
1170 if (mode == SImode)
1172 /* We know we can't do this in 1 insn, and we must be able to do it
1173 in two; so don't mess around looking for sequences that don't buy
1174 us anything. */
1175 if (generate)
1177 emit_insn (gen_rtx_SET (VOIDmode, dest,
1178 GEN_INT (INTVAL (imm) & 0xffff)));
1179 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1180 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1182 num_insns += 2;
1183 return num_insns;
1186 /* Remaining cases are all for DImode. */
1188 val = INTVAL (imm);
1189 subtargets = optimize && can_create_pseudo_p ();
1191 one_match = 0;
1192 zero_match = 0;
1193 mask = 0xffff;
1194 first_not_ffff_match = -1;
1196 for (i = 0; i < 64; i += 16, mask <<= 16)
1198 if ((val & mask) == mask)
1199 one_match++;
1200 else
1202 if (first_not_ffff_match < 0)
1203 first_not_ffff_match = i;
1204 if ((val & mask) == 0)
1205 zero_match++;
1209 if (one_match == 2)
1211 /* Set one of the quarters and then insert back into result. */
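/* Worked example (illustrative): for val == 0x1234ffff5678ffff,
   quarters 0 and 2 are 0xffff, so one_match == 2 and
   first_not_ffff_match == 16. We first move 0x1234ffffffffffff
   (val with that quarter forced to 0xffff, which is MOVN-encodable)
   and then re-insert 0x5678 at bit 16 with a MOVK, for a total of
   two instructions. */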
1212 mask = 0xffffll << first_not_ffff_match;
1213 if (generate)
1215 emit_insn (gen_rtx_SET (VOIDmode, dest, GEN_INT (val | mask)));
1216 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1217 GEN_INT ((val >> first_not_ffff_match)
1218 & 0xffff)));
1220 num_insns += 2;
1221 return num_insns;
1224 if (zero_match == 2)
1225 goto simple_sequence;
1227 mask = 0x0ffff0000UL;
1228 for (i = 16; i < 64; i += 16, mask <<= 16)
1230 HOST_WIDE_INT comp = mask & ~(mask - 1);
1232 if (aarch64_uimm12_shift (val - (val & mask)))
1234 if (generate)
1236 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1237 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1238 GEN_INT (val & mask)));
1239 emit_insn (gen_adddi3 (dest, subtarget,
1240 GEN_INT (val - (val & mask))));
1242 num_insns += 2;
1243 return num_insns;
1245 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1247 if (generate)
1249 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1250 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1251 GEN_INT ((val + comp) & mask)));
1252 emit_insn (gen_adddi3 (dest, subtarget,
1253 GEN_INT (val - ((val + comp) & mask))));
1255 num_insns += 2;
1256 return num_insns;
1258 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1260 if (generate)
1262 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1263 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1264 GEN_INT ((val - comp) | ~mask)));
1265 emit_insn (gen_adddi3 (dest, subtarget,
1266 GEN_INT (val - ((val - comp) | ~mask))));
1268 num_insns += 2;
1269 return num_insns;
1271 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1273 if (generate)
1275 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1276 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1277 GEN_INT (val | ~mask)));
1278 emit_insn (gen_adddi3 (dest, subtarget,
1279 GEN_INT (val - (val | ~mask))));
1281 num_insns += 2;
1282 return num_insns;
1286 /* See if we can do it by arithmetically combining two
1287 immediates. */
1288 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1290 int j;
1291 mask = 0xffff;
1293 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1294 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1296 if (generate)
1298 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1299 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1300 GEN_INT (aarch64_bitmasks[i])));
1301 emit_insn (gen_adddi3 (dest, subtarget,
1302 GEN_INT (val - aarch64_bitmasks[i])));
1304 num_insns += 2;
1305 return num_insns;
1308 for (j = 0; j < 64; j += 16, mask <<= 16)
1310 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1312 if (generate)
1314 emit_insn (gen_rtx_SET (VOIDmode, dest,
1315 GEN_INT (aarch64_bitmasks[i])));
1316 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1317 GEN_INT ((val >> j) & 0xffff)));
1319 num_insns += 2;
1320 return num_insns;
1325 /* See if we can do it by logically combining two immediates. */
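/* Worked example (illustrative): 0x55ff55ff55ff55ff is the OR of
   two valid bitmask immediates, 0x5555555555555555 and
   0x00ff00ff00ff00ff, so (assuming no earlier, cheaper case has
   already matched) it can be built as a MOV of the first followed
   by an ORR with the second, instead of four MOVZ/MOVK insns. */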
1326 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1328 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1330 int j;
1332 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1333 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1335 if (generate)
1337 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1338 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1339 GEN_INT (aarch64_bitmasks[i])));
1340 emit_insn (gen_iordi3 (dest, subtarget,
1341 GEN_INT (aarch64_bitmasks[j])));
1343 num_insns += 2;
1344 return num_insns;
1347 else if ((val & aarch64_bitmasks[i]) == val)
1349 int j;
1351 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1352 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1354 if (generate)
1356 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1357 emit_insn (gen_rtx_SET (VOIDmode, subtarget,
1358 GEN_INT (aarch64_bitmasks[j])));
1359 emit_insn (gen_anddi3 (dest, subtarget,
1360 GEN_INT (aarch64_bitmasks[i])));
1362 num_insns += 2;
1363 return num_insns;
1368 if (one_match > zero_match)
1370 /* Set either first three quarters or all but the third. */
1371 mask = 0xffffll << (16 - first_not_ffff_match);
1372 if (generate)
1373 emit_insn (gen_rtx_SET (VOIDmode, dest,
1374 GEN_INT (val | mask | 0xffffffff00000000ull)));
1375 num_insns ++;
1377 /* Now insert other two quarters. */
1378 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1379 i < 64; i += 16, mask <<= 16)
1381 if ((val & mask) != mask)
1383 if (generate)
1384 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1385 GEN_INT ((val >> i) & 0xffff)));
1386 num_insns ++;
1389 return num_insns;
1392 simple_sequence:
1393 first = true;
1394 mask = 0xffff;
1395 for (i = 0; i < 64; i += 16, mask <<= 16)
1397 if ((val & mask) != 0)
1399 if (first)
1401 if (generate)
1402 emit_insn (gen_rtx_SET (VOIDmode, dest,
1403 GEN_INT (val & mask)));
1404 num_insns ++;
1405 first = false;
1407 else
1409 if (generate)
1410 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1411 GEN_INT ((val >> i) & 0xffff)));
1412 num_insns ++;
1417 return num_insns;
1421 void
1422 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1424 machine_mode mode = GET_MODE (dest);
1426 gcc_assert (mode == SImode || mode == DImode);
1428 /* Check on what type of symbol it is. */
1429 if (GET_CODE (imm) == SYMBOL_REF
1430 || GET_CODE (imm) == LABEL_REF
1431 || GET_CODE (imm) == CONST)
1433 rtx mem, base, offset;
1434 enum aarch64_symbol_type sty;
1436 /* If we have (const (plus symbol offset)), separate out the offset
1437 before we start classifying the symbol. */
1438 split_const (imm, &base, &offset);
1440 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1441 switch (sty)
1443 case SYMBOL_FORCE_TO_MEM:
1444 if (offset != const0_rtx
1445 && targetm.cannot_force_const_mem (mode, imm))
1447 gcc_assert (can_create_pseudo_p ());
1448 base = aarch64_force_temporary (mode, dest, base);
1449 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1450 aarch64_emit_move (dest, base);
1451 return;
1453 mem = force_const_mem (ptr_mode, imm);
1454 gcc_assert (mem);
1455 if (mode != ptr_mode)
1456 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1457 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1458 return;
1460 case SYMBOL_SMALL_TLSGD:
1461 case SYMBOL_SMALL_TLSDESC:
1462 case SYMBOL_SMALL_GOTTPREL:
1463 case SYMBOL_SMALL_GOT:
1464 case SYMBOL_TINY_GOT:
1465 if (offset != const0_rtx)
1467 gcc_assert (can_create_pseudo_p ());
1468 base = aarch64_force_temporary (mode, dest, base);
1469 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1470 aarch64_emit_move (dest, base);
1471 return;
1473 /* FALLTHRU */
1475 case SYMBOL_SMALL_TPREL:
1476 case SYMBOL_SMALL_ABSOLUTE:
1477 case SYMBOL_TINY_ABSOLUTE:
1478 aarch64_load_symref_appropriately (dest, imm, sty);
1479 return;
1481 default:
1482 gcc_unreachable ();
1486 if (!CONST_INT_P (imm))
1488 if (GET_CODE (imm) == HIGH)
1489 emit_insn (gen_rtx_SET (VOIDmode, dest, imm));
1490 else
1492 rtx mem = force_const_mem (mode, imm);
1493 gcc_assert (mem);
1494 emit_insn (gen_rtx_SET (VOIDmode, dest, mem));
1497 return;
1500 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1503 static bool
1504 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1505 tree exp ATTRIBUTE_UNUSED)
1507 /* Currently, always true. */
1508 return true;
1511 /* Implement TARGET_PASS_BY_REFERENCE. */
1513 static bool
1514 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1515 machine_mode mode,
1516 const_tree type,
1517 bool named ATTRIBUTE_UNUSED)
1519 HOST_WIDE_INT size;
1520 machine_mode dummymode;
1521 int nregs;
1523 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1524 size = (mode == BLKmode && type)
1525 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1527 /* Aggregates are passed by reference based on their size. */
1528 if (type && AGGREGATE_TYPE_P (type))
1530 size = int_size_in_bytes (type);
1533 /* Variable sized arguments are always passed by reference. */
1534 if (size < 0)
1535 return true;
1537 /* Can this be a candidate to be passed in fp/simd register(s)? */
1538 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1539 &dummymode, &nregs,
1540 NULL))
1541 return false;
1543 /* Arguments which are variable sized or larger than 2 registers are
1544 passed by reference unless they are a homogeneous floating point
1545 aggregate. */
1546 return size > 2 * UNITS_PER_WORD;
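/* For illustration: under these rules a plain 24-byte struct is
   passed by reference, a 16-byte struct is passed by value in two
   registers, and a 16-byte HFA such as struct { float a, b, c, d; }
   is passed by value in four FP/SIMD registers because the
   candidate check above fires first. */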
1549 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1550 static bool
1551 aarch64_return_in_msb (const_tree valtype)
1553 machine_mode dummy_mode;
1554 int dummy_int;
1556 /* Never happens in little-endian mode. */
1557 if (!BYTES_BIG_ENDIAN)
1558 return false;
1560 /* Only composite types smaller than or equal to 16 bytes can
1561 be potentially returned in registers. */
1562 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1563 || int_size_in_bytes (valtype) <= 0
1564 || int_size_in_bytes (valtype) > 16)
1565 return false;
1567 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1568 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1569 is always passed/returned in the least significant bits of fp/simd
1570 register(s). */
1571 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1572 &dummy_mode, &dummy_int, NULL))
1573 return false;
1575 return true;
1578 /* Implement TARGET_FUNCTION_VALUE.
1579 Define how to find the value returned by a function. */
1581 static rtx
1582 aarch64_function_value (const_tree type, const_tree func,
1583 bool outgoing ATTRIBUTE_UNUSED)
1585 machine_mode mode;
1586 int unsignedp;
1587 int count;
1588 machine_mode ag_mode;
1590 mode = TYPE_MODE (type);
1591 if (INTEGRAL_TYPE_P (type))
1592 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1594 if (aarch64_return_in_msb (type))
1596 HOST_WIDE_INT size = int_size_in_bytes (type);
1598 if (size % UNITS_PER_WORD != 0)
1600 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1601 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1605 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1606 &ag_mode, &count, NULL))
1608 if (!aarch64_composite_type_p (type, mode))
1610 gcc_assert (count == 1 && mode == ag_mode);
1611 return gen_rtx_REG (mode, V0_REGNUM);
1613 else
1615 int i;
1616 rtx par;
1618 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1619 for (i = 0; i < count; i++)
1621 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1622 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1623 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1624 XVECEXP (par, 0, i) = tmp;
1626 return par;
1629 else
1630 return gen_rtx_REG (mode, R0_REGNUM);
1633 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1634 Return true if REGNO is the number of a hard register in which the values
1635 of called function may come back. */
1637 static bool
1638 aarch64_function_value_regno_p (const unsigned int regno)
1640 /* Maximum of 16 bytes can be returned in the general registers. Examples
1641 of 16-byte return values are: 128-bit integers and 16-byte small
1642 structures (excluding homogeneous floating-point aggregates). */
1643 if (regno == R0_REGNUM || regno == R1_REGNUM)
1644 return true;
1646 /* Up to four fp/simd registers can return a function value, e.g. a
1647 homogeneous floating-point aggregate having four members. */
1648 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1649 return !TARGET_GENERAL_REGS_ONLY;
1651 return false;
1654 /* Implement TARGET_RETURN_IN_MEMORY.
1656 If the type T of the result of a function is such that
1657 void func (T arg)
1658 would require that arg be passed as a value in a register (or set of
1659 registers) according to the parameter passing rules, then the result
1660 is returned in the same registers as would be used for such an
1661 argument. */
1663 static bool
1664 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1666 HOST_WIDE_INT size;
1667 machine_mode ag_mode;
1668 int count;
1670 if (!AGGREGATE_TYPE_P (type)
1671 && TREE_CODE (type) != COMPLEX_TYPE
1672 && TREE_CODE (type) != VECTOR_TYPE)
1673 /* Simple scalar types always returned in registers. */
1674 return false;
1676 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1677 type,
1678 &ag_mode,
1679 &count,
1680 NULL))
1681 return false;
1683 /* Types larger than 2 registers returned in memory. */
1684 size = int_size_in_bytes (type);
1685 return (size < 0 || size > 2 * UNITS_PER_WORD);
1688 static bool
1689 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1690 const_tree type, int *nregs)
1692 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1693 return aarch64_vfp_is_call_or_return_candidate (mode,
1694 type,
1695 &pcum->aapcs_vfp_rmode,
1696 nregs,
1697 NULL);
1700 /* Given MODE and TYPE of a function argument, return the alignment in
1701 bits. The idea is to suppress any stronger alignment requested by
1702 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1703 This is a helper function for local use only. */
1705 static unsigned int
1706 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1708 unsigned int alignment;
1710 if (type)
1712 if (!integer_zerop (TYPE_SIZE (type)))
1714 if (TYPE_MODE (type) == mode)
1715 alignment = TYPE_ALIGN (type);
1716 else
1717 alignment = GET_MODE_ALIGNMENT (mode);
1719 else
1720 alignment = 0;
1722 else
1723 alignment = GET_MODE_ALIGNMENT (mode);
1725 return alignment;
1728 /* Layout a function argument according to the AAPCS64 rules. The rule
1729 numbers refer to the rule numbers in the AAPCS64. */
1731 static void
1732 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1733 const_tree type,
1734 bool named ATTRIBUTE_UNUSED)
1736 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1737 int ncrn, nvrn, nregs;
1738 bool allocate_ncrn, allocate_nvrn;
1739 HOST_WIDE_INT size;
1741 /* We need to do this once per argument. */
1742 if (pcum->aapcs_arg_processed)
1743 return;
1745 pcum->aapcs_arg_processed = true;
1747 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1748 size
1749 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1750 UNITS_PER_WORD);
1752 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1753 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1754 mode,
1755 type,
1756 &nregs);
1758 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1759 The following code thus handles passing by SIMD/FP registers first. */
1761 nvrn = pcum->aapcs_nvrn;
1763 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
1764 and homogeneous short-vector aggregates (HVA). */
1765 if (allocate_nvrn)
1767 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1769 pcum->aapcs_nextnvrn = nvrn + nregs;
1770 if (!aarch64_composite_type_p (type, mode))
1772 gcc_assert (nregs == 1);
1773 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1775 else
1777 rtx par;
1778 int i;
1779 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1780 for (i = 0; i < nregs; i++)
1782 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1783 V0_REGNUM + nvrn + i);
1784 tmp = gen_rtx_EXPR_LIST
1785 (VOIDmode, tmp,
1786 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1787 XVECEXP (par, 0, i) = tmp;
1789 pcum->aapcs_reg = par;
1791 return;
1793 else
1795 /* C.3 NSRN is set to 8. */
1796 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1797 goto on_stack;
1801 ncrn = pcum->aapcs_ncrn;
1802 nregs = size / UNITS_PER_WORD;
1804 /* C.6 - C.9, though the sign and zero extension semantics are
1805 handled elsewhere. This is the case where the argument fits
1806 entirely in general registers.
1807 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1809 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1811 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1813 /* C.8 if the argument has an alignment of 16 then the NGRN is
1814 rounded up to the next even number. */
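/* For illustration: in a call f (int x, __int128 y), x takes w0 and
   leaves ncrn == 1; y needs two registers and has 16-byte alignment,
   so ncrn is bumped to 2 and y is passed in x2/x3, leaving x1
   unused. */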
1815 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1817 ++ncrn;
1818 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1820 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1821 A reg is still generated for it, but the caller should be smart
1822 enough not to use it. */
1823 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1825 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1827 else
1829 rtx par;
1830 int i;
1832 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1833 for (i = 0; i < nregs; i++)
1835 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1836 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1837 GEN_INT (i * UNITS_PER_WORD));
1838 XVECEXP (par, 0, i) = tmp;
1840 pcum->aapcs_reg = par;
1843 pcum->aapcs_nextncrn = ncrn + nregs;
1844 return;
1847 /* C.11 */
1848 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1850 /* The argument is passed on stack; record the needed number of words for
1851 this argument and align the total size if necessary. */
1852 on_stack:
1853 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1854 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1855 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1856 16 / UNITS_PER_WORD);
1857 return;
1860 /* Implement TARGET_FUNCTION_ARG. */
1862 static rtx
1863 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1864 const_tree type, bool named)
1866 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1867 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1869 if (mode == VOIDmode)
1870 return NULL_RTX;
1872 aarch64_layout_arg (pcum_v, mode, type, named);
1873 return pcum->aapcs_reg;
1876 void
1877 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1878 const_tree fntype ATTRIBUTE_UNUSED,
1879 rtx libname ATTRIBUTE_UNUSED,
1880 const_tree fndecl ATTRIBUTE_UNUSED,
1881 unsigned n_named ATTRIBUTE_UNUSED)
1883 pcum->aapcs_ncrn = 0;
1884 pcum->aapcs_nvrn = 0;
1885 pcum->aapcs_nextncrn = 0;
1886 pcum->aapcs_nextnvrn = 0;
1887 pcum->pcs_variant = ARM_PCS_AAPCS64;
1888 pcum->aapcs_reg = NULL_RTX;
1889 pcum->aapcs_arg_processed = false;
1890 pcum->aapcs_stack_words = 0;
1891 pcum->aapcs_stack_size = 0;
1893 return;
1896 static void
1897 aarch64_function_arg_advance (cumulative_args_t pcum_v,
1898 machine_mode mode,
1899 const_tree type,
1900 bool named)
1902 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1903 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
1905 aarch64_layout_arg (pcum_v, mode, type, named);
1906 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
1907 != (pcum->aapcs_stack_words != 0));
1908 pcum->aapcs_arg_processed = false;
1909 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
1910 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
1911 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
1912 pcum->aapcs_stack_words = 0;
1913 pcum->aapcs_reg = NULL_RTX;
1917 bool
1918 aarch64_function_arg_regno_p (unsigned regno)
1920 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
1921 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
1924 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
1925 PARM_BOUNDARY bits of alignment, but will be given anything up
1926 to STACK_BOUNDARY bits if the type requires it. This makes sure
1927 that both before and after the layout of each argument, the Next
1928 Stacked Argument Address (NSAA) will have a minimum alignment of
1929 8 bytes. */
1931 static unsigned int
1932 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
1934 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1936 if (alignment < PARM_BOUNDARY)
1937 alignment = PARM_BOUNDARY;
1938 if (alignment > STACK_BOUNDARY)
1939 alignment = STACK_BOUNDARY;
1940 return alignment;
1943 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
1945 Return true if an argument passed on the stack should be padded upwards,
1946 i.e. if the least-significant byte of the stack slot has useful data.
1948 Small aggregate types are placed in the lowest memory address.
1950 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
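/* For illustration: on a big-endian target a stack-passed char is
   padded downward and so occupies the highest-addressed byte of its
   slot, while a 3-byte struct is padded upward and occupies the
   lowest-addressed bytes. On little-endian targets everything is
   padded upward. */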
1952 bool
1953 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
1955 /* On little-endian targets, the least significant byte of every stack
1956 argument is passed at the lowest byte address of the stack slot. */
1957 if (!BYTES_BIG_ENDIAN)
1958 return true;
1960 /* Otherwise, integral, floating-point and pointer types are padded downward:
1961 the least significant byte of a stack argument is passed at the highest
1962 byte address of the stack slot. */
1963 if (type
1964 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
1965 || POINTER_TYPE_P (type))
1966 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
1967 return false;
1969 /* Everything else padded upward, i.e. data in first byte of stack slot. */
1970 return true;
1973 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
1975 It specifies padding for the last (may also be the only)
1976 element of a block move between registers and memory. If
1977 assuming the block is in the memory, padding upward means that
1978 the last element is padded after its most significant byte,
1979 while in downward padding, the last element is padded at
1980 its least significant byte side.
1982 Small aggregates and small complex types are always padded
1983 upwards.
1985 We don't need to worry about homogeneous floating-point or
1986 short-vector aggregates; their move is not affected by the
1987 padding direction determined here. Regardless of endianness,
1988 each element of such an aggregate is put in the least
1989 significant bits of a fp/simd register.
1991 Return !BYTES_BIG_ENDIAN if the least significant byte of the
1992 register has useful data, and return the opposite if the most
1993 significant byte does. */
1995 bool
1996 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
1997 bool first ATTRIBUTE_UNUSED)
2000 /* Small composite types are always padded upward. */
2001 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2003 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2004 : GET_MODE_SIZE (mode));
2005 if (size < 2 * UNITS_PER_WORD)
2006 return true;
2009 /* Otherwise, use the default padding. */
2010 return !BYTES_BIG_ENDIAN;
2013 static machine_mode
2014 aarch64_libgcc_cmp_return_mode (void)
2016 return SImode;
2019 static bool
2020 aarch64_frame_pointer_required (void)
2022 /* In aarch64_override_options_after_change
2023 flag_omit_leaf_frame_pointer turns off the frame pointer by
2024 default. Turn it back on now if we've not got a leaf
2025 function. */
2026 if (flag_omit_leaf_frame_pointer
2027 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2028 return true;
2030 return false;
2033 /* Mark the registers that need to be saved by the callee and calculate
2034 the size of the callee-saved registers area and frame record (both FP
2035 and LR may be omitted). */
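/* For illustration: with a frame pointer, a function that also
   saves x19 and d8 gets reg_offset[R29] == 0, reg_offset[R30] == 8,
   reg_offset[R19] == 16 and reg_offset[V8] == 24, so
   saved_regs_size == 32; hard_fp_offset and frame_size then add the
   local frame and the outgoing argument area, each rounded up to
   STACK_BOUNDARY. */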
2036 static void
2037 aarch64_layout_frame (void)
2039 HOST_WIDE_INT offset = 0;
2040 int regno;
2042 if (reload_completed && cfun->machine->frame.laid_out)
2043 return;
2045 #define SLOT_NOT_REQUIRED (-2)
2046 #define SLOT_REQUIRED (-1)
2048 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2049 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2051 /* First mark all the registers that really need to be saved... */
2052 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2053 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2055 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2056 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2058 /* ... that includes the eh data registers (if needed)... */
2059 if (crtl->calls_eh_return)
2060 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2061 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2062 = SLOT_REQUIRED;
2064 /* ... and any callee saved register that dataflow says is live. */
2065 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2066 if (df_regs_ever_live_p (regno)
2067 && (regno == R30_REGNUM
2068 || !call_used_regs[regno]))
2069 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2071 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2072 if (df_regs_ever_live_p (regno)
2073 && !call_used_regs[regno])
2074 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2076 if (frame_pointer_needed)
2078 /* FP and LR are placed in the linkage record. */
2079 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2080 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2081 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2082 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2083 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2084 offset += 2 * UNITS_PER_WORD;
2087 /* Now assign stack slots for them. */
2088 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2089 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2091 cfun->machine->frame.reg_offset[regno] = offset;
2092 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2093 cfun->machine->frame.wb_candidate1 = regno;
2094 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2095 cfun->machine->frame.wb_candidate2 = regno;
2096 offset += UNITS_PER_WORD;
2099 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2100 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2102 cfun->machine->frame.reg_offset[regno] = offset;
2103 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2104 cfun->machine->frame.wb_candidate1 = regno;
2105 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2106 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2107 cfun->machine->frame.wb_candidate2 = regno;
2108 offset += UNITS_PER_WORD;
2111 cfun->machine->frame.padding0 =
2112 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2113 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2115 cfun->machine->frame.saved_regs_size = offset;
2117 cfun->machine->frame.hard_fp_offset
2118 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2119 + get_frame_size ()
2120 + cfun->machine->frame.saved_regs_size,
2121 STACK_BOUNDARY / BITS_PER_UNIT);
2123 cfun->machine->frame.frame_size
2124 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2125 + crtl->outgoing_args_size,
2126 STACK_BOUNDARY / BITS_PER_UNIT);
2128 cfun->machine->frame.laid_out = true;
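/* Illustrative sketch (not part of the original source): the size
   computations above, redone as a standalone function with hypothetical
   inputs.  Assumes 8-byte registers and a 16-byte STACK_BOUNDARY, as on
   AArch64; names prefixed "sketch_" are invented for the example.  */

#define SKETCH_ROUND_UP(x, align) (((x) + (align) - 1) & -(long long) (align))

struct sketch_frame_sizes
{
  long long saved_regs_size;	/* Callee-saves plus alignment padding.  */
  long long hard_fp_offset;	/* Frame record up to the incoming SP.  */
  long long frame_size;		/* Total stack decrement for the function.  */
};

static struct sketch_frame_sizes
sketch_layout_frame (int n_callee_saves, long long varargs_save_size,
		     long long locals_size, long long outgoing_args_size,
		     int need_frame_record)
{
  struct sketch_frame_sizes f;
  long long offset = 0;

  /* FP and LR take the first two slots when a frame record is needed.  */
  if (need_frame_record)
    offset += 2 * 8;

  /* One 8-byte slot for every other callee-saved register.  */
  offset += 8LL * n_callee_saves;

  f.saved_regs_size = SKETCH_ROUND_UP (offset, 16);
  f.hard_fp_offset = SKETCH_ROUND_UP (varargs_save_size + locals_size
				      + f.saved_regs_size, 16);
  f.frame_size = SKETCH_ROUND_UP (f.hard_fp_offset + outgoing_args_size, 16);
  return f;
}

/* For example, a frame record, three further callee-saves, 40 bytes of
   locals and 16 bytes of outgoing arguments give saved_regs_size == 48,
   hard_fp_offset == 96 and frame_size == 112.  */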
2131 static bool
2132 aarch64_register_saved_on_entry (int regno)
2134 return cfun->machine->frame.reg_offset[regno] >= 0;
2137 static unsigned
2138 aarch64_next_callee_save (unsigned regno, unsigned limit)
2140 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2141 regno ++;
2142 return regno;
2145 static void
2146 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2147 HOST_WIDE_INT adjustment)
2149 rtx base_rtx = stack_pointer_rtx;
2150 rtx insn, reg, mem;
2152 reg = gen_rtx_REG (mode, regno);
2153 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2154 plus_constant (Pmode, base_rtx, -adjustment));
2155 mem = gen_rtx_MEM (mode, mem);
2157 insn = emit_move_insn (mem, reg);
2158 RTX_FRAME_RELATED_P (insn) = 1;
2161 static rtx
2162 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2163 HOST_WIDE_INT adjustment)
2165 switch (mode)
2167 case DImode:
2168 return gen_storewb_pairdi_di (base, base, reg, reg2,
2169 GEN_INT (-adjustment),
2170 GEN_INT (UNITS_PER_WORD - adjustment));
2171 case DFmode:
2172 return gen_storewb_pairdf_di (base, base, reg, reg2,
2173 GEN_INT (-adjustment),
2174 GEN_INT (UNITS_PER_WORD - adjustment));
2175 default:
2176 gcc_unreachable ();
2180 static void
2181 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2182 unsigned regno2, HOST_WIDE_INT adjustment)
2184 rtx_insn *insn;
2185 rtx reg1 = gen_rtx_REG (mode, regno1);
2186 rtx reg2 = gen_rtx_REG (mode, regno2);
2188 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2189 reg2, adjustment));
2190 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2191 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2192 RTX_FRAME_RELATED_P (insn) = 1;
2195 static rtx
2196 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2197 HOST_WIDE_INT adjustment)
2199 switch (mode)
2201 case DImode:
2202 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2203 GEN_INT (UNITS_PER_WORD));
2204 case DFmode:
2205 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2206 GEN_INT (UNITS_PER_WORD));
2207 default:
2208 gcc_unreachable ();
2212 static rtx
2213 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2214 rtx reg2)
2216 switch (mode)
2218 case DImode:
2219 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2221 case DFmode:
2222 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2224 default:
2225 gcc_unreachable ();
2229 static rtx
2230 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2231 rtx mem2)
2233 switch (mode)
2235 case DImode:
2236 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2238 case DFmode:
2239 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2241 default:
2242 gcc_unreachable ();
2247 static void
2248 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2249 unsigned start, unsigned limit, bool skip_wb)
2251 rtx_insn *insn;
2252 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2253 ? gen_frame_mem : gen_rtx_MEM);
2254 unsigned regno;
2255 unsigned regno2;
2257 for (regno = aarch64_next_callee_save (start, limit);
2258 regno <= limit;
2259 regno = aarch64_next_callee_save (regno + 1, limit))
2261 rtx reg, mem;
2262 HOST_WIDE_INT offset;
2264 if (skip_wb
2265 && (regno == cfun->machine->frame.wb_candidate1
2266 || regno == cfun->machine->frame.wb_candidate2))
2267 continue;
2269 reg = gen_rtx_REG (mode, regno);
2270 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2271 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2272 offset));
2274 regno2 = aarch64_next_callee_save (regno + 1, limit);
2276 if (regno2 <= limit
2277 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2278 == cfun->machine->frame.reg_offset[regno2]))
2281 rtx reg2 = gen_rtx_REG (mode, regno2);
2282 rtx mem2;
2284 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2285 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2286 offset));
2287 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2288 reg2));
2290 /* The first part of a frame-related parallel insn is
2291 always assumed to be relevant to the frame
2292 calculations; subsequent parts are only
2293 frame-related if explicitly marked. */
2294 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2295 regno = regno2;
2297 else
2298 insn = emit_move_insn (mem, reg);
2300 RTX_FRAME_RELATED_P (insn) = 1;
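/* Illustrative sketch (not part of the original source): the pairing rule
   used in the save/restore loops, in isolation.  OFFSETS is assumed to be
   the increasing list of slot offsets assigned by aarch64_layout_frame;
   two saves are merged into one stp/ldp only when their slots are exactly
   one word (8 bytes) apart.  */

static int
sketch_count_save_insns (const long long *offsets, int n)
{
  int insns = 0, i = 0;

  while (i < n)
    {
      /* Adjacent slots can be covered by a single store pair ...  */
      if (i + 1 < n && offsets[i] + 8 == offsets[i + 1])
	i += 2;
      /* ... anything else falls back to a single store.  */
      else
	i += 1;
      insns++;
    }
  return insns;
}

/* E.g. slots at offsets 16, 24 and 40 take two instructions: one stp for
   the first pair and one str for the stray slot.  */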
2304 static void
2305 aarch64_restore_callee_saves (machine_mode mode,
2306 HOST_WIDE_INT start_offset, unsigned start,
2307 unsigned limit, bool skip_wb, rtx *cfi_ops)
2309 rtx base_rtx = stack_pointer_rtx;
2310 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2311 ? gen_frame_mem : gen_rtx_MEM);
2312 unsigned regno;
2313 unsigned regno2;
2314 HOST_WIDE_INT offset;
2316 for (regno = aarch64_next_callee_save (start, limit);
2317 regno <= limit;
2318 regno = aarch64_next_callee_save (regno + 1, limit))
2320 rtx reg, mem;
2322 if (skip_wb
2323 && (regno == cfun->machine->frame.wb_candidate1
2324 || regno == cfun->machine->frame.wb_candidate2))
2325 continue;
2327 reg = gen_rtx_REG (mode, regno);
2328 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2329 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2331 regno2 = aarch64_next_callee_save (regno + 1, limit);
2333 if (regno2 <= limit
2334 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2335 == cfun->machine->frame.reg_offset[regno2]))
2337 rtx reg2 = gen_rtx_REG (mode, regno2);
2338 rtx mem2;
2340 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2341 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2342 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2344 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2345 regno = regno2;
2347 else
2348 emit_move_insn (reg, mem);
2349 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2353 /* AArch64 stack frames generated by this compiler look like:
2355 +-------------------------------+
2357 | incoming stack arguments |
2359 +-------------------------------+
2360 | | <-- incoming stack pointer (aligned)
2361 | callee-allocated save area |
2362 | for register varargs |
2364 +-------------------------------+
2365 | local variables | <-- frame_pointer_rtx
2367 +-------------------------------+
2368 | padding0 | \
2369 +-------------------------------+ |
2370 | callee-saved registers | | frame.saved_regs_size
2371 +-------------------------------+ |
2372 | LR' | |
2373 +-------------------------------+ |
2374 | FP' | / <- hard_frame_pointer_rtx (aligned)
2375 +-------------------------------+
2376 | dynamic allocation |
2377 +-------------------------------+
2378 | padding |
2379 +-------------------------------+
2380 | outgoing stack arguments | <-- arg_pointer
2382 +-------------------------------+
2383 | | <-- stack_pointer_rtx (aligned)
2385 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2386 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2387 unchanged. */
2389 /* Generate the prologue instructions for entry into a function.
2390 Establish the stack frame by decreasing the stack pointer with a
2391 properly calculated size and, if necessary, create a frame record
2392 filled with the values of LR and previous frame pointer. The
2393 current FP is also set up if it is in use. */
2395 void
2396 aarch64_expand_prologue (void)
2398 /* sub sp, sp, #<frame_size>
2399 stp {fp, lr}, [sp, #<frame_size> - 16]
2400 add fp, sp, #<frame_size> - hardfp_offset
2401 stp {cs_reg}, [fp, #-16] etc.
2403 sub sp, sp, <final_adjustment_if_any>
2405 HOST_WIDE_INT frame_size, offset;
2406 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2407 HOST_WIDE_INT hard_fp_offset;
2408 rtx_insn *insn;
2410 aarch64_layout_frame ();
2412 offset = frame_size = cfun->machine->frame.frame_size;
2413 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2414 fp_offset = frame_size - hard_fp_offset;
2416 if (flag_stack_usage_info)
2417 current_function_static_stack_size = frame_size;
2419 /* Store pairs and load pairs have a range of only -512 to 504. */
2420 if (offset >= 512)
2422 /* When the frame has a large size, an initial decrease is done on
2423 the stack pointer to jump over the callee-allocated save area for
2424 register varargs, the local variable area and/or the callee-saved
2425 register area. This will allow the pre-index write-back
2426 store pair instructions to be used for setting up the stack frame
2427 efficiently. */
2428 offset = hard_fp_offset;
2429 if (offset >= 512)
2430 offset = cfun->machine->frame.saved_regs_size;
2432 frame_size -= (offset + crtl->outgoing_args_size);
2433 fp_offset = 0;
2435 if (frame_size >= 0x1000000)
2437 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2438 emit_move_insn (op0, GEN_INT (-frame_size));
2439 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2441 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2442 gen_rtx_SET (VOIDmode, stack_pointer_rtx,
2443 plus_constant (Pmode, stack_pointer_rtx,
2444 -frame_size)));
2445 RTX_FRAME_RELATED_P (insn) = 1;
2447 else if (frame_size > 0)
2449 int hi_ofs = frame_size & 0xfff000;
2450 int lo_ofs = frame_size & 0x000fff;
2452 if (hi_ofs)
2454 insn = emit_insn (gen_add2_insn
2455 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2456 RTX_FRAME_RELATED_P (insn) = 1;
2458 if (lo_ofs)
2460 insn = emit_insn (gen_add2_insn
2461 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2462 RTX_FRAME_RELATED_P (insn) = 1;
2466 else
2467 frame_size = -1;
2469 if (offset > 0)
2471 bool skip_wb = false;
2473 if (frame_pointer_needed)
2475 skip_wb = true;
2477 if (fp_offset)
2479 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2480 GEN_INT (-offset)));
2481 RTX_FRAME_RELATED_P (insn) = 1;
2483 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2484 R30_REGNUM, false);
2486 else
2487 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2489 /* Set up frame pointer to point to the location of the
2490 previous frame pointer on the stack. */
2491 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2492 stack_pointer_rtx,
2493 GEN_INT (fp_offset)));
2494 RTX_FRAME_RELATED_P (insn) = 1;
2495 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2497 else
2499 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2500 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2502 if (fp_offset
2503 || reg1 == FIRST_PSEUDO_REGISTER
2504 || (reg2 == FIRST_PSEUDO_REGISTER
2505 && offset >= 256))
2507 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2508 GEN_INT (-offset)));
2509 RTX_FRAME_RELATED_P (insn) = 1;
2511 else
2513 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2515 skip_wb = true;
2517 if (reg2 == FIRST_PSEUDO_REGISTER)
2518 aarch64_pushwb_single_reg (mode1, reg1, offset);
2519 else
2520 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2524 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2525 skip_wb);
2526 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2527 skip_wb);
2530 /* when offset >= 512,
2531 sub sp, sp, #<outgoing_args_size> */
2532 if (frame_size > -1)
2534 if (crtl->outgoing_args_size > 0)
2536 insn = emit_insn (gen_add2_insn
2537 (stack_pointer_rtx,
2538 GEN_INT (- crtl->outgoing_args_size)));
2539 RTX_FRAME_RELATED_P (insn) = 1;
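/* Illustrative sketch (not part of the original source): the frame_size
   splitting used above.  An AArch64 add/sub immediate is 12 bits wide,
   optionally shifted left by 12, so any adjustment below 0x1000000 fits
   in at most two sub instructions; larger frames are built in a scratch
   register (IP0) and added in one go.  */

static int
sketch_split_sp_adjust (long long frame_size,
			long long *hi_ofs, long long *lo_ofs)
{
  if (frame_size >= 0x1000000)
    return 0;			/* Needs the scratch-register path.  */

  *hi_ofs = frame_size & 0xfff000;	/* sub sp, sp, #hi_ofs (LSL #12 form).  */
  *lo_ofs = frame_size & 0x000fff;	/* sub sp, sp, #lo_ofs.  */
  return 1;
}

/* A hypothetical frame_size of 0x12345 splits into hi_ofs == 0x12000 and
   lo_ofs == 0x345, i.e. two subtractions on entry and two additions in
   the matching epilogue.  */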
2544 /* Return TRUE if we can use a simple_return insn.
2546 This function checks whether the callee-saved stack area is empty, which
2547 means no restore actions are needed. The pro_and_epilogue pass uses
2548 this to check whether the shrink-wrapping optimization is feasible. */
2550 bool
2551 aarch64_use_return_insn_p (void)
2553 if (!reload_completed)
2554 return false;
2556 if (crtl->profile)
2557 return false;
2559 aarch64_layout_frame ();
2561 return cfun->machine->frame.frame_size == 0;
2564 /* Generate the epilogue instructions for returning from a function. */
2565 void
2566 aarch64_expand_epilogue (bool for_sibcall)
2568 HOST_WIDE_INT frame_size, offset;
2569 HOST_WIDE_INT fp_offset;
2570 HOST_WIDE_INT hard_fp_offset;
2571 rtx_insn *insn;
2572 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2573 bool need_barrier_p = (get_frame_size () != 0
2574 || cfun->machine->frame.saved_varargs_size);
2576 aarch64_layout_frame ();
2578 offset = frame_size = cfun->machine->frame.frame_size;
2579 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2580 fp_offset = frame_size - hard_fp_offset;
2582 /* Store pairs and load pairs have a range of only -512 to 504. */
2583 if (offset >= 512)
2585 offset = hard_fp_offset;
2586 if (offset >= 512)
2587 offset = cfun->machine->frame.saved_regs_size;
2589 frame_size -= (offset + crtl->outgoing_args_size);
2590 fp_offset = 0;
2591 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2593 insn = emit_insn (gen_add2_insn
2594 (stack_pointer_rtx,
2595 GEN_INT (crtl->outgoing_args_size)));
2596 RTX_FRAME_RELATED_P (insn) = 1;
2599 else
2600 frame_size = -1;
2602 /* If there were outgoing arguments or we've done dynamic stack
2603 allocation, then restore the stack pointer from the frame
2604 pointer. This is at most one insn and more efficient than using
2605 GCC's internal mechanism. */
2606 if (frame_pointer_needed
2607 && (crtl->outgoing_args_size || cfun->calls_alloca))
2609 if (cfun->calls_alloca)
2610 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2612 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2613 hard_frame_pointer_rtx,
2614 GEN_INT (0)));
2615 offset = offset - fp_offset;
2618 if (offset > 0)
2620 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2621 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2622 bool skip_wb = true;
2623 rtx cfi_ops = NULL;
2625 if (frame_pointer_needed)
2626 fp_offset = 0;
2627 else if (fp_offset
2628 || reg1 == FIRST_PSEUDO_REGISTER
2629 || (reg2 == FIRST_PSEUDO_REGISTER
2630 && offset >= 256))
2631 skip_wb = false;
2633 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2634 skip_wb, &cfi_ops);
2635 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2636 skip_wb, &cfi_ops);
2638 if (need_barrier_p)
2639 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2641 if (skip_wb)
2643 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2644 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2646 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2647 if (reg2 == FIRST_PSEUDO_REGISTER)
2649 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2650 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2651 mem = gen_rtx_MEM (mode1, mem);
2652 insn = emit_move_insn (rreg1, mem);
2654 else
2656 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2658 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2659 insn = emit_insn (aarch64_gen_loadwb_pair
2660 (mode1, stack_pointer_rtx, rreg1,
2661 rreg2, offset));
2664 else
2666 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2667 GEN_INT (offset)));
2670 /* Reset the CFA to be SP + FRAME_SIZE. */
2671 rtx new_cfa = stack_pointer_rtx;
2672 if (frame_size > 0)
2673 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2674 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2675 REG_NOTES (insn) = cfi_ops;
2676 RTX_FRAME_RELATED_P (insn) = 1;
2679 if (frame_size > 0)
2681 if (need_barrier_p)
2682 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2684 if (frame_size >= 0x1000000)
2686 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2687 emit_move_insn (op0, GEN_INT (frame_size));
2688 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2690 else
2692 int hi_ofs = frame_size & 0xfff000;
2693 int lo_ofs = frame_size & 0x000fff;
2695 if (hi_ofs && lo_ofs)
2697 insn = emit_insn (gen_add2_insn
2698 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2699 RTX_FRAME_RELATED_P (insn) = 1;
2700 frame_size = lo_ofs;
2702 insn = emit_insn (gen_add2_insn
2703 (stack_pointer_rtx, GEN_INT (frame_size)));
2706 /* Reset the CFA to be SP + 0. */
2707 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2708 RTX_FRAME_RELATED_P (insn) = 1;
2711 /* Stack adjustment for exception handler. */
2712 if (crtl->calls_eh_return)
2714 /* We need to unwind the stack by the offset computed by
2715 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2716 to be SP; letting the CFA move during this adjustment
2717 is just as correct as retaining the CFA from the body
2718 of the function. Therefore, do nothing special. */
2719 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2722 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2723 if (!for_sibcall)
2724 emit_jump_insn (ret_rtx);
2727 /* Return the place to copy the exception unwinding return address to.
2728 This will probably be a stack slot, but could (in theory) be the
2729 return register. */
2730 rtx
2731 aarch64_final_eh_return_addr (void)
2733 HOST_WIDE_INT fp_offset;
2735 aarch64_layout_frame ();
2737 fp_offset = cfun->machine->frame.frame_size
2738 - cfun->machine->frame.hard_fp_offset;
2740 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2741 return gen_rtx_REG (DImode, LR_REGNUM);
2743 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2744 result in a store to save LR introduced by builtin_eh_return () being
2745 incorrectly deleted because the alias is not detected.
2746 So in the calculation of the address to copy the exception unwinding
2747 return address to, we note 2 cases.
2748 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2749 we return a SP-relative location since all the addresses are SP-relative
2750 in this case. This prevents the store from being optimized away.
2751 If the fp_offset is not 0, then the addresses will be FP-relative and
2752 therefore we return a FP-relative location. */
2754 if (frame_pointer_needed)
2756 if (fp_offset)
2757 return gen_frame_mem (DImode,
2758 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2759 else
2760 return gen_frame_mem (DImode,
2761 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2764 /* If FP is not needed, we calculate the location of LR, which would be
2765 at the top of the saved registers block. */
2767 return gen_frame_mem (DImode,
2768 plus_constant (Pmode,
2769 stack_pointer_rtx,
2770 fp_offset
2771 + cfun->machine->frame.saved_regs_size
2772 - 2 * UNITS_PER_WORD));
2775 /* Possibly output code to build up a constant in a register. For
2776 the benefit of the costs infrastructure, returns the number of
2777 instructions which would be emitted. GENERATE inhibits or
2778 enables code generation. */
2780 static int
2781 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2783 int insns = 0;
2785 if (aarch64_bitmask_imm (val, DImode))
2787 if (generate)
2788 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2789 insns = 1;
2791 else
2793 int i;
2794 int ncount = 0;
2795 int zcount = 0;
2796 HOST_WIDE_INT valp = val >> 16;
2797 HOST_WIDE_INT valm;
2798 HOST_WIDE_INT tval;
2800 for (i = 16; i < 64; i += 16)
2802 valm = (valp & 0xffff);
2804 if (valm != 0)
2805 ++ zcount;
2807 if (valm != 0xffff)
2808 ++ ncount;
2810 valp >>= 16;
2813 /* zcount contains the number of additional MOVK instructions
2814 required if the constant is built up with an initial MOVZ instruction,
2815 while ncount is the number of MOVK instructions required if starting
2816 with a MOVN instruction. Choose the sequence that yields the fewest
2817 number of instructions, preferring MOVZ instructions when they are both
2818 the same. */
2819 if (ncount < zcount)
2821 if (generate)
2822 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2823 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2824 tval = 0xffff;
2825 insns++;
2827 else
2829 if (generate)
2830 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2831 GEN_INT (val & 0xffff));
2832 tval = 0;
2833 insns++;
2836 val >>= 16;
2838 for (i = 16; i < 64; i += 16)
2840 if ((val & 0xffff) != tval)
2842 if (generate)
2843 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2844 GEN_INT (i),
2845 GEN_INT (val & 0xffff)));
2846 insns++;
2848 val >>= 16;
2851 return insns;
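/* Illustrative sketch (not part of the original source): the zcount/ncount
   heuristic above as a standalone instruction-count estimate, ignoring the
   earlier bitmask-immediate fast path.  Starting from a MOVZ of the low 16
   bits, every non-zero chunk above it needs a MOVK; starting from a MOVN,
   every chunk that is not 0xffff needs one.  */

static int
sketch_build_constant_cost (unsigned long long val)
{
  int zcount = 0, ncount = 0, i;

  for (i = 16; i < 64; i += 16)
    {
      unsigned long long chunk = (val >> i) & 0xffff;
      if (chunk != 0)
	zcount++;
      if (chunk != 0xffff)
	ncount++;
    }

  /* One MOVZ or MOVN plus the cheaper number of MOVKs; ties go to MOVZ.  */
  return 1 + (ncount < zcount ? ncount : zcount);
}

/* E.g. 0x1234567890abcdef costs 4 (MOVZ plus three MOVKs), while
   0xffffffffffff1234 costs 1 (a single MOVN).  */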
2854 static void
2855 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2857 HOST_WIDE_INT mdelta = delta;
2858 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2859 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2861 if (mdelta < 0)
2862 mdelta = -mdelta;
2864 if (mdelta >= 4096 * 4096)
2866 (void) aarch64_build_constant (scratchreg, delta, true);
2867 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2869 else if (mdelta > 0)
2871 if (mdelta >= 4096)
2873 emit_insn (gen_rtx_SET (Pmode, scratch_rtx, GEN_INT (mdelta / 4096)));
2874 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2875 if (delta < 0)
2876 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2877 gen_rtx_MINUS (Pmode, this_rtx, shift)));
2878 else
2879 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2880 gen_rtx_PLUS (Pmode, this_rtx, shift)));
2882 if (mdelta % 4096 != 0)
2884 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
2885 emit_insn (gen_rtx_SET (Pmode, this_rtx,
2886 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
2891 /* Output code to add DELTA to the first argument, and then jump
2892 to FUNCTION. Used for C++ multiple inheritance. */
2893 static void
2894 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
2895 HOST_WIDE_INT delta,
2896 HOST_WIDE_INT vcall_offset,
2897 tree function)
2899 /* The this pointer is always in x0. Note that this differs from
2900 Arm where the this pointer may be bumped to r1 if r0 is required
2901 to return a pointer to an aggregate. On AArch64 a result value
2902 pointer will be in x8. */
2903 int this_regno = R0_REGNUM;
2904 rtx this_rtx, temp0, temp1, addr, funexp;
2905 rtx_insn *insn;
2907 reload_completed = 1;
2908 emit_note (NOTE_INSN_PROLOGUE_END);
2910 if (vcall_offset == 0)
2911 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2912 else
2914 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
2916 this_rtx = gen_rtx_REG (Pmode, this_regno);
2917 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2918 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
2920 addr = this_rtx;
2921 if (delta != 0)
2923 if (delta >= -256 && delta < 256)
2924 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
2925 plus_constant (Pmode, this_rtx, delta));
2926 else
2927 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
2930 if (Pmode == ptr_mode)
2931 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
2932 else
2933 aarch64_emit_move (temp0,
2934 gen_rtx_ZERO_EXTEND (Pmode,
2935 gen_rtx_MEM (ptr_mode, addr)));
2937 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
2938 addr = plus_constant (Pmode, temp0, vcall_offset);
2939 else
2941 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
2942 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
2945 if (Pmode == ptr_mode)
2946 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
2947 else
2948 aarch64_emit_move (temp1,
2949 gen_rtx_SIGN_EXTEND (Pmode,
2950 gen_rtx_MEM (ptr_mode, addr)));
2952 emit_insn (gen_add2_insn (this_rtx, temp1));
2955 /* Generate a tail call to the target function. */
2956 if (!TREE_USED (function))
2958 assemble_external (function);
2959 TREE_USED (function) = 1;
2961 funexp = XEXP (DECL_RTL (function), 0);
2962 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
2963 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
2964 SIBLING_CALL_P (insn) = 1;
2966 insn = get_insns ();
2967 shorten_branches (insn);
2968 final_start_function (insn, file, 1);
2969 final (insn, file, 1);
2970 final_end_function ();
2972 /* Stop pretending to be a post-reload pass. */
2973 reload_completed = 0;
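/* Illustrative sketch (not part of the original source): a rough C model
   of the adjustment the emitted thunk performs on its first argument
   (arriving in x0) before tail-calling FUNCTION.  For a virtual base the
   extra adjustment is read from the object's vtable at VCALL_OFFSET.  */

static void *
sketch_thunk_adjust_this (void *this_ptr, long long delta,
			  long long vcall_offset)
{
  char *p = (char *) this_ptr + delta;

  if (vcall_offset != 0)
    {
      /* Load the vtable pointer from the delta-adjusted object, then add
	 the offset stored at VCALL_OFFSET inside the vtable.  */
      char *vtable = *(char **) p;
      p += *(long long *) (vtable + vcall_offset);
    }

  return p;			/* Becomes the new x0 for FUNCTION.  */
}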
2976 static bool
2977 aarch64_tls_referenced_p (rtx x)
2979 if (!TARGET_HAVE_TLS)
2980 return false;
2981 subrtx_iterator::array_type array;
2982 FOR_EACH_SUBRTX (iter, array, x, ALL)
2984 const_rtx x = *iter;
2985 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
2986 return true;
2987 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
2988 TLS offsets, not real symbol references. */
2989 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
2990 iter.skip_subrtxes ();
2992 return false;
2996 static int
2997 aarch64_bitmasks_cmp (const void *i1, const void *i2)
2999 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3000 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3002 if (*imm1 < *imm2)
3003 return -1;
3004 if (*imm1 > *imm2)
3005 return +1;
3006 return 0;
3010 static void
3011 aarch64_build_bitmask_table (void)
3013 unsigned HOST_WIDE_INT mask, imm;
3014 unsigned int log_e, e, s, r;
3015 unsigned int nimms = 0;
3017 for (log_e = 1; log_e <= 6; log_e++)
3019 e = 1 << log_e;
3020 if (e == 64)
3021 mask = ~(HOST_WIDE_INT) 0;
3022 else
3023 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3024 for (s = 1; s < e; s++)
3026 for (r = 0; r < e; r++)
3028 /* Set s consecutive bits to 1 (s < 64). */
3029 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3030 /* Rotate right by r. */
3031 if (r != 0)
3032 imm = ((imm >> r) | (imm << (e - r))) & mask;
3033 /* Replicate the constant depending on the element size. */
3034 switch (log_e) {
3035 case 1: imm |= (imm << 2);
3036 case 2: imm |= (imm << 4);
3037 case 3: imm |= (imm << 8);
3038 case 4: imm |= (imm << 16);
3039 case 5: imm |= (imm << 32);
3040 case 6:
3041 break;
3042 default:
3043 gcc_unreachable ();
3045 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3046 aarch64_bitmasks[nimms++] = imm;
3051 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3052 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3053 aarch64_bitmasks_cmp);
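/* Illustrative sketch (not part of the original source): the same
   enumeration in isolation.  A "bitmask" (logical) immediate is a run of
   S consecutive ones inside an element of E bits (E a power of two from
   2 to 64, 0 < S < E), rotated right by R and replicated across all 64
   bits; the loops above generate exactly the 5334 values such triples
   produce.  */

static unsigned long long
sketch_bitmask_imm (unsigned e, unsigned s, unsigned r)
{
  unsigned long long mask = (e == 64) ? ~0ULL : (1ULL << e) - 1;
  unsigned long long imm = (1ULL << s) - 1;	/* S consecutive ones.  */
  unsigned width;

  if (r != 0)					/* Rotate right within E bits.  */
    imm = ((imm >> r) | (imm << (e - r))) & mask;

  for (width = e; width < 64; width *= 2)	/* Replicate to 64 bits.  */
    imm |= imm << width;

  return imm;
}

/* E.g. E == 8, S == 4, R == 1 gives the byte 0x87 and hence the
   immediate 0x8787878787878787, which instructions like AND and ORR can
   encode directly.  */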
3057 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3058 a left shift of 0 or 12 bits. */
3059 bool
3060 aarch64_uimm12_shift (HOST_WIDE_INT val)
3062 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3063 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3068 /* Return true if val is an immediate that can be loaded into a
3069 register by a MOVZ instruction. */
3070 static bool
3071 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3073 if (GET_MODE_SIZE (mode) > 4)
3075 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3076 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3077 return 1;
3079 else
3081 /* Ignore sign extension. */
3082 val &= (HOST_WIDE_INT) 0xffffffff;
3084 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3085 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3089 /* Return true if val is a valid bitmask immediate. */
3090 bool
3091 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3093 if (GET_MODE_SIZE (mode) < 8)
3095 /* Replicate bit pattern. */
3096 val &= (HOST_WIDE_INT) 0xffffffff;
3097 val |= val << 32;
3099 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3100 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3104 /* Return true if val is an immediate that can be loaded into a
3105 register in a single instruction. */
3106 bool
3107 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3109 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3110 return 1;
3111 return aarch64_bitmask_imm (val, mode);
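/* Illustrative sketch (not part of the original source): the MOVZ/MOVN
   half of the test above, for 64-bit values.  A constant is a single MOVZ
   when at most one 16-bit chunk is non-zero, and a single MOVN when at
   most one chunk differs from 0xffff; anything else must be a bitmask
   immediate or a longer sequence.  */

static int
sketch_single_mov_imm_p (unsigned long long val)
{
  int nonzero = 0, not_ones = 0, i;

  for (i = 0; i < 64; i += 16)
    {
      unsigned long long chunk = (val >> i) & 0xffff;
      nonzero += (chunk != 0);
      not_ones += (chunk != 0xffff);
    }

  return nonzero <= 1 || not_ones <= 1;
}

/* E.g. 0x00000000abcd0000 is one MOVZ and 0xffffffffffff0123 is one MOVN,
   but 0x0000000000010001 passes neither test (and is not a bitmask
   immediate either), so it needs MOVZ plus MOVK.  */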
3114 static bool
3115 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3117 rtx base, offset;
3119 if (GET_CODE (x) == HIGH)
3120 return true;
3122 split_const (x, &base, &offset);
3123 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3125 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3126 != SYMBOL_FORCE_TO_MEM)
3127 return true;
3128 else
3129 /* Avoid generating a 64-bit relocation in ILP32; leave
3130 to aarch64_expand_mov_immediate to handle it properly. */
3131 return mode != ptr_mode;
3134 return aarch64_tls_referenced_p (x);
3137 /* Return true if register REGNO is a valid index register.
3138 STRICT_P is true if REG_OK_STRICT is in effect. */
3140 bool
3141 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3143 if (!HARD_REGISTER_NUM_P (regno))
3145 if (!strict_p)
3146 return true;
3148 if (!reg_renumber)
3149 return false;
3151 regno = reg_renumber[regno];
3153 return GP_REGNUM_P (regno);
3156 /* Return true if register REGNO is a valid base register for mode MODE.
3157 STRICT_P is true if REG_OK_STRICT is in effect. */
3159 bool
3160 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3162 if (!HARD_REGISTER_NUM_P (regno))
3164 if (!strict_p)
3165 return true;
3167 if (!reg_renumber)
3168 return false;
3170 regno = reg_renumber[regno];
3173 /* The fake registers will be eliminated to either the stack or
3174 hard frame pointer, both of which are usually valid base registers.
3175 Reload deals with the cases where the eliminated form isn't valid. */
3176 return (GP_REGNUM_P (regno)
3177 || regno == SP_REGNUM
3178 || regno == FRAME_POINTER_REGNUM
3179 || regno == ARG_POINTER_REGNUM);
3182 /* Return true if X is a valid base register for mode MODE.
3183 STRICT_P is true if REG_OK_STRICT is in effect. */
3185 static bool
3186 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3188 if (!strict_p && GET_CODE (x) == SUBREG)
3189 x = SUBREG_REG (x);
3191 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3194 /* Return true if address offset is a valid index. If it is, fill in INFO
3195 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3197 static bool
3198 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3199 machine_mode mode, bool strict_p)
3201 enum aarch64_address_type type;
3202 rtx index;
3203 int shift;
3205 /* (reg:P) */
3206 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3207 && GET_MODE (x) == Pmode)
3209 type = ADDRESS_REG_REG;
3210 index = x;
3211 shift = 0;
3213 /* (sign_extend:DI (reg:SI)) */
3214 else if ((GET_CODE (x) == SIGN_EXTEND
3215 || GET_CODE (x) == ZERO_EXTEND)
3216 && GET_MODE (x) == DImode
3217 && GET_MODE (XEXP (x, 0)) == SImode)
3219 type = (GET_CODE (x) == SIGN_EXTEND)
3220 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3221 index = XEXP (x, 0);
3222 shift = 0;
3224 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3225 else if (GET_CODE (x) == MULT
3226 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3227 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3228 && GET_MODE (XEXP (x, 0)) == DImode
3229 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3230 && CONST_INT_P (XEXP (x, 1)))
3232 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3233 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3234 index = XEXP (XEXP (x, 0), 0);
3235 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3237 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3238 else if (GET_CODE (x) == ASHIFT
3239 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3240 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3241 && GET_MODE (XEXP (x, 0)) == DImode
3242 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3243 && CONST_INT_P (XEXP (x, 1)))
3245 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3246 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3247 index = XEXP (XEXP (x, 0), 0);
3248 shift = INTVAL (XEXP (x, 1));
3250 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3251 else if ((GET_CODE (x) == SIGN_EXTRACT
3252 || GET_CODE (x) == ZERO_EXTRACT)
3253 && GET_MODE (x) == DImode
3254 && GET_CODE (XEXP (x, 0)) == MULT
3255 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3256 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3258 type = (GET_CODE (x) == SIGN_EXTRACT)
3259 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3260 index = XEXP (XEXP (x, 0), 0);
3261 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3262 if (INTVAL (XEXP (x, 1)) != 32 + shift
3263 || INTVAL (XEXP (x, 2)) != 0)
3264 shift = -1;
3266 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3267 (const_int 0xffffffff<<shift)) */
3268 else if (GET_CODE (x) == AND
3269 && GET_MODE (x) == DImode
3270 && GET_CODE (XEXP (x, 0)) == MULT
3271 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3272 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3273 && CONST_INT_P (XEXP (x, 1)))
3275 type = ADDRESS_REG_UXTW;
3276 index = XEXP (XEXP (x, 0), 0);
3277 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3278 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3279 shift = -1;
3281 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3282 else if ((GET_CODE (x) == SIGN_EXTRACT
3283 || GET_CODE (x) == ZERO_EXTRACT)
3284 && GET_MODE (x) == DImode
3285 && GET_CODE (XEXP (x, 0)) == ASHIFT
3286 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3287 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3289 type = (GET_CODE (x) == SIGN_EXTRACT)
3290 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3291 index = XEXP (XEXP (x, 0), 0);
3292 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3293 if (INTVAL (XEXP (x, 1)) != 32 + shift
3294 || INTVAL (XEXP (x, 2)) != 0)
3295 shift = -1;
3297 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3298 (const_int 0xffffffff<<shift)) */
3299 else if (GET_CODE (x) == AND
3300 && GET_MODE (x) == DImode
3301 && GET_CODE (XEXP (x, 0)) == ASHIFT
3302 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3303 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3304 && CONST_INT_P (XEXP (x, 1)))
3306 type = ADDRESS_REG_UXTW;
3307 index = XEXP (XEXP (x, 0), 0);
3308 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3309 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3310 shift = -1;
3312 /* (mult:P (reg:P) (const_int scale)) */
3313 else if (GET_CODE (x) == MULT
3314 && GET_MODE (x) == Pmode
3315 && GET_MODE (XEXP (x, 0)) == Pmode
3316 && CONST_INT_P (XEXP (x, 1)))
3318 type = ADDRESS_REG_REG;
3319 index = XEXP (x, 0);
3320 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3322 /* (ashift:P (reg:P) (const_int shift)) */
3323 else if (GET_CODE (x) == ASHIFT
3324 && GET_MODE (x) == Pmode
3325 && GET_MODE (XEXP (x, 0)) == Pmode
3326 && CONST_INT_P (XEXP (x, 1)))
3328 type = ADDRESS_REG_REG;
3329 index = XEXP (x, 0);
3330 shift = INTVAL (XEXP (x, 1));
3332 else
3333 return false;
3335 if (GET_CODE (index) == SUBREG)
3336 index = SUBREG_REG (index);
3338 if ((shift == 0 ||
3339 (shift > 0 && shift <= 3
3340 && (1 << shift) == GET_MODE_SIZE (mode)))
3341 && REG_P (index)
3342 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3344 info->type = type;
3345 info->offset = index;
3346 info->shift = shift;
3347 return true;
3350 return false;
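/* Illustrative sketch (not part of the original source): the final shift
   check above.  A register index is accepted either unscaled or scaled by
   the access size, i.e. shifted left by log2(size) for sizes of 1, 2, 4
   or 8 bytes.  */

static int
sketch_valid_index_shift_p (int shift, int mode_size)
{
  return shift == 0
	 || (shift > 0 && shift <= 3 && (1 << shift) == mode_size);
}

/* So a 4-byte access allows "[x0, x1]" or "[x0, x1, lsl #2]" (and the
   corresponding sxtw/uxtw forms), but not "lsl #1" or "lsl #3".  */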
3353 bool
3354 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3356 return (offset >= -64 * GET_MODE_SIZE (mode)
3357 && offset < 64 * GET_MODE_SIZE (mode)
3358 && offset % GET_MODE_SIZE (mode) == 0);
3361 static inline bool
3362 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3363 HOST_WIDE_INT offset)
3365 return offset >= -256 && offset < 256;
3368 static inline bool
3369 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3371 return (offset >= 0
3372 && offset < 4096 * GET_MODE_SIZE (mode)
3373 && offset % GET_MODE_SIZE (mode) == 0);
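/* Illustrative sketch (not part of the original source): the three offset
   ranges above, parameterised by the access size in bytes.  For an 8-byte
   access the 7-bit signed scaled form is what limits ldp/stp offsets to
   the -512..504 range mentioned in the prologue and epilogue code.  */

static int
sketch_offset_7bit_signed_scaled_p (long long size, long long offset)
{
  return offset >= -64 * size && offset < 64 * size && offset % size == 0;
}

static int
sketch_offset_9bit_signed_unscaled_p (long long offset)
{
  return offset >= -256 && offset < 256;
}

static int
sketch_offset_12bit_unsigned_scaled_p (long long size, long long offset)
{
  return offset >= 0 && offset < 4096 * size && offset % size == 0;
}

/* E.g. with size == 8, an offset of 504 passes the 7-bit scaled check but
   512 does not, and 32760 is the largest offset the 12-bit unsigned
   scaled form accepts.  */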
3376 /* Return true if X is a valid address for machine mode MODE. If it is,
3377 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3378 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3380 static bool
3381 aarch64_classify_address (struct aarch64_address_info *info,
3382 rtx x, machine_mode mode,
3383 RTX_CODE outer_code, bool strict_p)
3385 enum rtx_code code = GET_CODE (x);
3386 rtx op0, op1;
3388 /* On BE, we use load/store pair for all large int mode load/stores. */
3389 bool load_store_pair_p = (outer_code == PARALLEL
3390 || (BYTES_BIG_ENDIAN
3391 && aarch64_vect_struct_mode_p (mode)));
3393 bool allow_reg_index_p =
3394 !load_store_pair_p
3395 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3396 && !aarch64_vect_struct_mode_p (mode);
3398 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3399 REG addressing. */
3400 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3401 && (code != POST_INC && code != REG))
3402 return false;
3404 switch (code)
3406 case REG:
3407 case SUBREG:
3408 info->type = ADDRESS_REG_IMM;
3409 info->base = x;
3410 info->offset = const0_rtx;
3411 return aarch64_base_register_rtx_p (x, strict_p);
3413 case PLUS:
3414 op0 = XEXP (x, 0);
3415 op1 = XEXP (x, 1);
3417 if (! strict_p
3418 && REG_P (op0)
3419 && (op0 == virtual_stack_vars_rtx
3420 || op0 == frame_pointer_rtx
3421 || op0 == arg_pointer_rtx)
3422 && CONST_INT_P (op1))
3424 info->type = ADDRESS_REG_IMM;
3425 info->base = op0;
3426 info->offset = op1;
3428 return true;
3431 if (GET_MODE_SIZE (mode) != 0
3432 && CONST_INT_P (op1)
3433 && aarch64_base_register_rtx_p (op0, strict_p))
3435 HOST_WIDE_INT offset = INTVAL (op1);
3437 info->type = ADDRESS_REG_IMM;
3438 info->base = op0;
3439 info->offset = op1;
3441 /* TImode and TFmode values are allowed in both pairs of X
3442 registers and individual Q registers. The available
3443 address modes are:
3444 X,X: 7-bit signed scaled offset
3445 Q: 9-bit signed offset
3446 We conservatively require an offset representable in either mode.
3448 if (mode == TImode || mode == TFmode)
3449 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3450 && offset_9bit_signed_unscaled_p (mode, offset));
3452 /* A 7-bit offset check because OImode will emit a ldp/stp
3453 instruction (only big endian will get here).
3454 For ldp/stp instructions, the offset is scaled for the size of a
3455 single element of the pair. */
3456 if (mode == OImode)
3457 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3459 /* Three 9/12-bit offset checks because CImode will emit three
3460 ldr/str instructions (only big endian will get here). */
3461 if (mode == CImode)
3462 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3463 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3464 || offset_12bit_unsigned_scaled_p (V16QImode,
3465 offset + 32)));
3467 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3468 instructions (only big endian will get here). */
3469 if (mode == XImode)
3470 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3471 && aarch64_offset_7bit_signed_scaled_p (TImode,
3472 offset + 32));
3474 if (load_store_pair_p)
3475 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3476 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3477 else
3478 return (offset_9bit_signed_unscaled_p (mode, offset)
3479 || offset_12bit_unsigned_scaled_p (mode, offset));
3482 if (allow_reg_index_p)
3484 /* Look for base + (scaled/extended) index register. */
3485 if (aarch64_base_register_rtx_p (op0, strict_p)
3486 && aarch64_classify_index (info, op1, mode, strict_p))
3488 info->base = op0;
3489 return true;
3491 if (aarch64_base_register_rtx_p (op1, strict_p)
3492 && aarch64_classify_index (info, op0, mode, strict_p))
3494 info->base = op1;
3495 return true;
3499 return false;
3501 case POST_INC:
3502 case POST_DEC:
3503 case PRE_INC:
3504 case PRE_DEC:
3505 info->type = ADDRESS_REG_WB;
3506 info->base = XEXP (x, 0);
3507 info->offset = NULL_RTX;
3508 return aarch64_base_register_rtx_p (info->base, strict_p);
3510 case POST_MODIFY:
3511 case PRE_MODIFY:
3512 info->type = ADDRESS_REG_WB;
3513 info->base = XEXP (x, 0);
3514 if (GET_CODE (XEXP (x, 1)) == PLUS
3515 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3516 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3517 && aarch64_base_register_rtx_p (info->base, strict_p))
3519 HOST_WIDE_INT offset;
3520 info->offset = XEXP (XEXP (x, 1), 1);
3521 offset = INTVAL (info->offset);
3523 /* TImode and TFmode values are allowed in both pairs of X
3524 registers and individual Q registers. The available
3525 address modes are:
3526 X,X: 7-bit signed scaled offset
3527 Q: 9-bit signed offset
3528 We conservatively require an offset representable in either mode.
3530 if (mode == TImode || mode == TFmode)
3531 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3532 && offset_9bit_signed_unscaled_p (mode, offset));
3534 if (load_store_pair_p)
3535 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3536 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3537 else
3538 return offset_9bit_signed_unscaled_p (mode, offset);
3540 return false;
3542 case CONST:
3543 case SYMBOL_REF:
3544 case LABEL_REF:
3545 /* load literal: pc-relative constant pool entry. Only supported
3546 for SI mode or larger. */
3547 info->type = ADDRESS_SYMBOLIC;
3549 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3551 rtx sym, addend;
3553 split_const (x, &sym, &addend);
3554 return (GET_CODE (sym) == LABEL_REF
3555 || (GET_CODE (sym) == SYMBOL_REF
3556 && CONSTANT_POOL_ADDRESS_P (sym)));
3558 return false;
3560 case LO_SUM:
3561 info->type = ADDRESS_LO_SUM;
3562 info->base = XEXP (x, 0);
3563 info->offset = XEXP (x, 1);
3564 if (allow_reg_index_p
3565 && aarch64_base_register_rtx_p (info->base, strict_p))
3567 rtx sym, offs;
3568 split_const (info->offset, &sym, &offs);
3569 if (GET_CODE (sym) == SYMBOL_REF
3570 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3571 == SYMBOL_SMALL_ABSOLUTE))
3573 /* The symbol and offset must be aligned to the access size. */
3574 unsigned int align;
3575 unsigned int ref_size;
3577 if (CONSTANT_POOL_ADDRESS_P (sym))
3578 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3579 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3581 tree exp = SYMBOL_REF_DECL (sym);
3582 align = TYPE_ALIGN (TREE_TYPE (exp));
3583 align = CONSTANT_ALIGNMENT (exp, align);
3585 else if (SYMBOL_REF_DECL (sym))
3586 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3587 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3588 && SYMBOL_REF_BLOCK (sym) != NULL)
3589 align = SYMBOL_REF_BLOCK (sym)->alignment;
3590 else
3591 align = BITS_PER_UNIT;
3593 ref_size = GET_MODE_SIZE (mode);
3594 if (ref_size == 0)
3595 ref_size = GET_MODE_SIZE (DImode);
3597 return ((INTVAL (offs) & (ref_size - 1)) == 0
3598 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3601 return false;
3603 default:
3604 return false;
3608 bool
3609 aarch64_symbolic_address_p (rtx x)
3611 rtx offset;
3613 split_const (x, &x, &offset);
3614 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3617 /* Classify the base of symbolic expression X, given that X appears in
3618 context CONTEXT. */
3620 enum aarch64_symbol_type
3621 aarch64_classify_symbolic_expression (rtx x,
3622 enum aarch64_symbol_context context)
3624 rtx offset;
3626 split_const (x, &x, &offset);
3627 return aarch64_classify_symbol (x, offset, context);
3631 /* Return TRUE if X is a legitimate address for accessing memory in
3632 mode MODE. */
3633 static bool
3634 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3636 struct aarch64_address_info addr;
3638 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3641 /* Return TRUE if X is a legitimate address for accessing memory in
3642 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3643 pair operation. */
3644 bool
3645 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3646 RTX_CODE outer_code, bool strict_p)
3648 struct aarch64_address_info addr;
3650 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3653 /* Return TRUE if rtx X is immediate constant 0.0 */
3654 bool
3655 aarch64_float_const_zero_rtx_p (rtx x)
3657 REAL_VALUE_TYPE r;
3659 if (GET_MODE (x) == VOIDmode)
3660 return false;
3662 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3663 if (REAL_VALUE_MINUS_ZERO (r))
3664 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3665 return REAL_VALUES_EQUAL (r, dconst0);
3668 /* Return the fixed registers used for condition codes. */
3670 static bool
3671 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3673 *p1 = CC_REGNUM;
3674 *p2 = INVALID_REGNUM;
3675 return true;
3678 /* Emit call insn with PAT and do aarch64-specific handling. */
3680 void
3681 aarch64_emit_call_insn (rtx pat)
3683 rtx insn = emit_call_insn (pat);
3685 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3686 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3687 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3690 machine_mode
3691 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3693 /* All floating point compares return CCFP if it is an equality
3694 comparison, and CCFPE otherwise. */
3695 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3697 switch (code)
3699 case EQ:
3700 case NE:
3701 case UNORDERED:
3702 case ORDERED:
3703 case UNLT:
3704 case UNLE:
3705 case UNGT:
3706 case UNGE:
3707 case UNEQ:
3708 case LTGT:
3709 return CCFPmode;
3711 case LT:
3712 case LE:
3713 case GT:
3714 case GE:
3715 return CCFPEmode;
3717 default:
3718 gcc_unreachable ();
3722 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3723 && y == const0_rtx
3724 && (code == EQ || code == NE || code == LT || code == GE)
3725 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3726 || GET_CODE (x) == NEG))
3727 return CC_NZmode;
3729 /* A compare with a shifted operand. Because of canonicalization,
3730 the comparison will have to be swapped when we emit the assembly
3731 code. */
3732 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3733 && (REG_P (y) || GET_CODE (y) == SUBREG)
3734 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3735 || GET_CODE (x) == LSHIFTRT
3736 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3737 return CC_SWPmode;
3739 /* Similarly for a negated operand, but we can only do this for
3740 equalities. */
3741 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3742 && (REG_P (y) || GET_CODE (y) == SUBREG)
3743 && (code == EQ || code == NE)
3744 && GET_CODE (x) == NEG)
3745 return CC_Zmode;
3747 /* A compare of a mode narrower than SI mode against zero can be done
3748 by extending the value in the comparison. */
3749 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3750 && y == const0_rtx)
3751 /* Only use sign-extension if we really need it. */
3752 return ((code == GT || code == GE || code == LE || code == LT)
3753 ? CC_SESWPmode : CC_ZESWPmode);
3755 /* For everything else, return CCmode. */
3756 return CCmode;
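/* Illustrative sketch (not part of the original source): why CC_SWPmode
   exists.  Canonicalization puts the shifted or extended operand first,
   so the compare instruction is emitted with its operands the other way
   round and the condition tested afterwards must be swapped, exactly as
   the CC_SWPmode entries in aarch64_get_condition_code_1 do.  */

enum sketch_cmp { SK_EQ, SK_NE, SK_LT, SK_LE, SK_GT, SK_GE };

static enum sketch_cmp
sketch_swap_comparison (enum sketch_cmp code)
{
  switch (code)
    {
    case SK_LT: return SK_GT;	/* x < y   becomes   y > x.  */
    case SK_LE: return SK_GE;	/* x <= y  becomes   y >= x.  */
    case SK_GT: return SK_LT;
    case SK_GE: return SK_LE;
    default:    return code;	/* EQ and NE are symmetric.  */
    }
}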
3759 static int
3760 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3762 int
3763 aarch64_get_condition_code (rtx x)
3765 machine_mode mode = GET_MODE (XEXP (x, 0));
3766 enum rtx_code comp_code = GET_CODE (x);
3768 if (GET_MODE_CLASS (mode) != MODE_CC)
3769 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3770 return aarch64_get_condition_code_1 (mode, comp_code);
3773 static int
3774 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3776 int ne = -1, eq = -1;
3777 switch (mode)
3779 case CCFPmode:
3780 case CCFPEmode:
3781 switch (comp_code)
3783 case GE: return AARCH64_GE;
3784 case GT: return AARCH64_GT;
3785 case LE: return AARCH64_LS;
3786 case LT: return AARCH64_MI;
3787 case NE: return AARCH64_NE;
3788 case EQ: return AARCH64_EQ;
3789 case ORDERED: return AARCH64_VC;
3790 case UNORDERED: return AARCH64_VS;
3791 case UNLT: return AARCH64_LT;
3792 case UNLE: return AARCH64_LE;
3793 case UNGT: return AARCH64_HI;
3794 case UNGE: return AARCH64_PL;
3795 default: return -1;
3797 break;
3799 case CC_DNEmode:
3800 ne = AARCH64_NE;
3801 eq = AARCH64_EQ;
3802 break;
3804 case CC_DEQmode:
3805 ne = AARCH64_EQ;
3806 eq = AARCH64_NE;
3807 break;
3809 case CC_DGEmode:
3810 ne = AARCH64_GE;
3811 eq = AARCH64_LT;
3812 break;
3814 case CC_DLTmode:
3815 ne = AARCH64_LT;
3816 eq = AARCH64_GE;
3817 break;
3819 case CC_DGTmode:
3820 ne = AARCH64_GT;
3821 eq = AARCH64_LE;
3822 break;
3824 case CC_DLEmode:
3825 ne = AARCH64_LE;
3826 eq = AARCH64_GT;
3827 break;
3829 case CC_DGEUmode:
3830 ne = AARCH64_CS;
3831 eq = AARCH64_CC;
3832 break;
3834 case CC_DLTUmode:
3835 ne = AARCH64_CC;
3836 eq = AARCH64_CS;
3837 break;
3839 case CC_DGTUmode:
3840 ne = AARCH64_HI;
3841 eq = AARCH64_LS;
3842 break;
3844 case CC_DLEUmode:
3845 ne = AARCH64_LS;
3846 eq = AARCH64_HI;
3847 break;
3849 case CCmode:
3850 switch (comp_code)
3852 case NE: return AARCH64_NE;
3853 case EQ: return AARCH64_EQ;
3854 case GE: return AARCH64_GE;
3855 case GT: return AARCH64_GT;
3856 case LE: return AARCH64_LE;
3857 case LT: return AARCH64_LT;
3858 case GEU: return AARCH64_CS;
3859 case GTU: return AARCH64_HI;
3860 case LEU: return AARCH64_LS;
3861 case LTU: return AARCH64_CC;
3862 default: return -1;
3864 break;
3866 case CC_SWPmode:
3867 case CC_ZESWPmode:
3868 case CC_SESWPmode:
3869 switch (comp_code)
3871 case NE: return AARCH64_NE;
3872 case EQ: return AARCH64_EQ;
3873 case GE: return AARCH64_LE;
3874 case GT: return AARCH64_LT;
3875 case LE: return AARCH64_GE;
3876 case LT: return AARCH64_GT;
3877 case GEU: return AARCH64_LS;
3878 case GTU: return AARCH64_CC;
3879 case LEU: return AARCH64_CS;
3880 case LTU: return AARCH64_HI;
3881 default: return -1;
3883 break;
3885 case CC_NZmode:
3886 switch (comp_code)
3888 case NE: return AARCH64_NE;
3889 case EQ: return AARCH64_EQ;
3890 case GE: return AARCH64_PL;
3891 case LT: return AARCH64_MI;
3892 default: return -1;
3894 break;
3896 case CC_Zmode:
3897 switch (comp_code)
3899 case NE: return AARCH64_NE;
3900 case EQ: return AARCH64_EQ;
3901 default: return -1;
3903 break;
3905 default:
3906 return -1;
3907 break;
3910 if (comp_code == NE)
3911 return ne;
3913 if (comp_code == EQ)
3914 return eq;
3916 return -1;
3919 bool
3920 aarch64_const_vec_all_same_in_range_p (rtx x,
3921 HOST_WIDE_INT minval,
3922 HOST_WIDE_INT maxval)
3924 HOST_WIDE_INT firstval;
3925 int count, i;
3927 if (GET_CODE (x) != CONST_VECTOR
3928 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
3929 return false;
3931 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
3932 if (firstval < minval || firstval > maxval)
3933 return false;
3935 count = CONST_VECTOR_NUNITS (x);
3936 for (i = 1; i < count; i++)
3937 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
3938 return false;
3940 return true;
3943 bool
3944 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
3946 return aarch64_const_vec_all_same_in_range_p (x, val, val);
3949 static unsigned
3950 bit_count (unsigned HOST_WIDE_INT value)
3952 unsigned count = 0;
3954 while (value)
3956 count++;
3957 value &= value - 1;  /* Clear the least significant set bit. */
3960 return count;
3963 /* N Z C V. */
3964 #define AARCH64_CC_V 1
3965 #define AARCH64_CC_C (1 << 1)
3966 #define AARCH64_CC_Z (1 << 2)
3967 #define AARCH64_CC_N (1 << 3)
3969 /* N Z C V flags for ccmp. The first code is for AND op and the other
3970 is for IOR op. Indexed by AARCH64_COND_CODE. */
3971 static const int aarch64_nzcv_codes[][2] =
3973 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
3974 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
3975 {AARCH64_CC_C, 0}, /* CS, C == 1. */
3976 {0, AARCH64_CC_C}, /* CC, C == 0. */
3977 {AARCH64_CC_N, 0}, /* MI, N == 1. */
3978 {0, AARCH64_CC_N}, /* PL, N == 0. */
3979 {AARCH64_CC_V, 0}, /* VS, V == 1. */
3980 {0, AARCH64_CC_V}, /* VC, V == 0. */
3981 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
3982 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
3983 {0, AARCH64_CC_V}, /* GE, N == V. */
3984 {AARCH64_CC_V, 0}, /* LT, N != V. */
3985 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
3986 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
3987 {0, 0}, /* AL, Any. */
3988 {0, 0}, /* NV, Any. */
3992 aarch64_ccmp_mode_to_code (enum machine_mode mode)
3994 switch (mode)
3996 case CC_DNEmode:
3997 return NE;
3999 case CC_DEQmode:
4000 return EQ;
4002 case CC_DLEmode:
4003 return LE;
4005 case CC_DGTmode:
4006 return GT;
4008 case CC_DLTmode:
4009 return LT;
4011 case CC_DGEmode:
4012 return GE;
4014 case CC_DLEUmode:
4015 return LEU;
4017 case CC_DGTUmode:
4018 return GTU;
4020 case CC_DLTUmode:
4021 return LTU;
4023 case CC_DGEUmode:
4024 return GEU;
4026 default:
4027 gcc_unreachable ();
4032 void
4033 aarch64_print_operand (FILE *f, rtx x, char code)
4035 switch (code)
4037 /* An integer or symbol address without a preceding # sign. */
4038 case 'c':
4039 switch (GET_CODE (x))
4041 case CONST_INT:
4042 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4043 break;
4045 case SYMBOL_REF:
4046 output_addr_const (f, x);
4047 break;
4049 case CONST:
4050 if (GET_CODE (XEXP (x, 0)) == PLUS
4051 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4053 output_addr_const (f, x);
4054 break;
4056 /* Fall through. */
4058 default:
4059 output_operand_lossage ("Unsupported operand for code '%c'", code);
4061 break;
4063 case 'e':
4064 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4066 int n;
4068 if (!CONST_INT_P (x)
4069 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4071 output_operand_lossage ("invalid operand for '%%%c'", code);
4072 return;
4075 switch (n)
4077 case 3:
4078 fputc ('b', f);
4079 break;
4080 case 4:
4081 fputc ('h', f);
4082 break;
4083 case 5:
4084 fputc ('w', f);
4085 break;
4086 default:
4087 output_operand_lossage ("invalid operand for '%%%c'", code);
4088 return;
4091 break;
4093 case 'p':
4095 int n;
4097 /* Print N such that 2^N == X. */
4098 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4100 output_operand_lossage ("invalid operand for '%%%c'", code);
4101 return;
4104 asm_fprintf (f, "%d", n);
4106 break;
4108 case 'P':
4109 /* Print the number of non-zero bits in X (a const_int). */
4110 if (!CONST_INT_P (x))
4112 output_operand_lossage ("invalid operand for '%%%c'", code);
4113 return;
4116 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4117 break;
4119 case 'H':
4120 /* Print the higher numbered register of a pair (TImode) of regs. */
4121 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4123 output_operand_lossage ("invalid operand for '%%%c'", code);
4124 return;
4127 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4128 break;
4130 case 'm':
4132 int cond_code;
4133 /* Print a condition (eq, ne, etc). */
4135 /* CONST_TRUE_RTX means always -- that's the default. */
4136 if (x == const_true_rtx)
4137 return;
4139 if (!COMPARISON_P (x))
4141 output_operand_lossage ("invalid operand for '%%%c'", code);
4142 return;
4145 cond_code = aarch64_get_condition_code (x);
4146 gcc_assert (cond_code >= 0);
4147 fputs (aarch64_condition_codes[cond_code], f);
4149 break;
4151 case 'M':
4153 int cond_code;
4154 /* Print the inverse of a condition (eq <-> ne, etc). */
4156 /* CONST_TRUE_RTX means never -- that's the default. */
4157 if (x == const_true_rtx)
4159 fputs ("nv", f);
4160 return;
4163 if (!COMPARISON_P (x))
4165 output_operand_lossage ("invalid operand for '%%%c'", code);
4166 return;
4168 cond_code = aarch64_get_condition_code (x);
4169 gcc_assert (cond_code >= 0);
4170 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4171 (cond_code)], f);
4173 break;
4175 case 'b':
4176 case 'h':
4177 case 's':
4178 case 'd':
4179 case 'q':
4180 /* Print a scalar FP/SIMD register name. */
4181 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4183 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4184 return;
4186 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4187 break;
4189 case 'S':
4190 case 'T':
4191 case 'U':
4192 case 'V':
4193 /* Print the first FP/SIMD register name in a list. */
4194 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4196 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4197 return;
4199 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4200 break;
4202 case 'R':
4203 /* Print a scalar FP/SIMD register name + 1. */
4204 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4206 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4207 return;
4209 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4210 break;
4212 case 'X':
4213 /* Print bottom 16 bits of integer constant in hex. */
4214 if (!CONST_INT_P (x))
4216 output_operand_lossage ("invalid operand for '%%%c'", code);
4217 return;
4219 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4220 break;
4222 case 'w':
4223 case 'x':
4224 /* Print a general register name or the zero register (32-bit or
4225 64-bit). */
4226 if (x == const0_rtx
4227 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4229 asm_fprintf (f, "%czr", code);
4230 break;
4233 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4235 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4236 break;
4239 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4241 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4242 break;
4245 /* Fall through */
4247 case 0:
4248 /* Print a normal operand, if it's a general register, then we
4249 assume DImode. */
4250 if (x == NULL)
4252 output_operand_lossage ("missing operand");
4253 return;
4256 switch (GET_CODE (x))
4258 case REG:
4259 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4260 break;
4262 case MEM:
4263 aarch64_memory_reference_mode = GET_MODE (x);
4264 output_address (XEXP (x, 0));
4265 break;
4267 case LABEL_REF:
4268 case SYMBOL_REF:
4269 output_addr_const (asm_out_file, x);
4270 break;
4272 case CONST_INT:
4273 asm_fprintf (f, "%wd", INTVAL (x));
4274 break;
4276 case CONST_VECTOR:
4277 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4279 gcc_assert (
4280 aarch64_const_vec_all_same_in_range_p (x,
4281 HOST_WIDE_INT_MIN,
4282 HOST_WIDE_INT_MAX));
4283 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4285 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4287 fputc ('0', f);
4289 else
4290 gcc_unreachable ();
4291 break;
4293 case CONST_DOUBLE:
4294 /* CONST_DOUBLE can represent a double-width integer.
4295 In this case, the mode of x is VOIDmode. */
4296 if (GET_MODE (x) == VOIDmode)
4297 ; /* Do Nothing. */
4298 else if (aarch64_float_const_zero_rtx_p (x))
4300 fputc ('0', f);
4301 break;
4303 else if (aarch64_float_const_representable_p (x))
4305 #define buf_size 20
4306 char float_buf[buf_size] = {'\0'};
4307 REAL_VALUE_TYPE r;
4308 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4309 real_to_decimal_for_mode (float_buf, &r,
4310 buf_size, buf_size,
4311 1, GET_MODE (x));
4312 asm_fprintf (asm_out_file, "%s", float_buf);
4313 break;
4314 #undef buf_size
4316 output_operand_lossage ("invalid constant");
4317 return;
4318 default:
4319 output_operand_lossage ("invalid operand");
4320 return;
4322 break;
4324 case 'A':
4325 if (GET_CODE (x) == HIGH)
4326 x = XEXP (x, 0);
4328 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4330 case SYMBOL_SMALL_GOT:
4331 asm_fprintf (asm_out_file, ":got:");
4332 break;
4334 case SYMBOL_SMALL_TLSGD:
4335 asm_fprintf (asm_out_file, ":tlsgd:");
4336 break;
4338 case SYMBOL_SMALL_TLSDESC:
4339 asm_fprintf (asm_out_file, ":tlsdesc:");
4340 break;
4342 case SYMBOL_SMALL_GOTTPREL:
4343 asm_fprintf (asm_out_file, ":gottprel:");
4344 break;
4346 case SYMBOL_SMALL_TPREL:
4347 asm_fprintf (asm_out_file, ":tprel:");
4348 break;
4350 case SYMBOL_TINY_GOT:
4351 gcc_unreachable ();
4352 break;
4354 default:
4355 break;
4357 output_addr_const (asm_out_file, x);
4358 break;
4360 case 'L':
4361 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4363 case SYMBOL_SMALL_GOT:
4364 asm_fprintf (asm_out_file, ":lo12:");
4365 break;
4367 case SYMBOL_SMALL_TLSGD:
4368 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4369 break;
4371 case SYMBOL_SMALL_TLSDESC:
4372 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4373 break;
4375 case SYMBOL_SMALL_GOTTPREL:
4376 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4377 break;
4379 case SYMBOL_SMALL_TPREL:
4380 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4381 break;
4383 case SYMBOL_TINY_GOT:
4384 asm_fprintf (asm_out_file, ":got:");
4385 break;
4387 default:
4388 break;
4390 output_addr_const (asm_out_file, x);
4391 break;
4393 case 'G':
4395 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4397 case SYMBOL_SMALL_TPREL:
4398 asm_fprintf (asm_out_file, ":tprel_hi12:");
4399 break;
4400 default:
4401 break;
4403 output_addr_const (asm_out_file, x);
4404 break;
4406 case 'K':
4408 int cond_code;
4409 /* Print nzcv. */
4411 if (!COMPARISON_P (x))
4413 output_operand_lossage ("invalid operand for '%%%c'", code);
4414 return;
4417 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4418 gcc_assert (cond_code >= 0);
4419 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4421 break;
4423 case 'k':
4425 int cond_code;
4426 /* Print nzcv. */
4428 if (!COMPARISON_P (x))
4430 output_operand_lossage ("invalid operand for '%%%c'", code);
4431 return;
4434 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4435 gcc_assert (cond_code >= 0);
4436 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4438 break;
4440 default:
4441 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4442 return;
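/* Print address expression X, taken from a MEM whose machine mode was
   recorded in aarch64_memory_reference_mode, as an AArch64 addressing
   mode on stream F. */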
4446 void
4447 aarch64_print_operand_address (FILE *f, rtx x)
4449 struct aarch64_address_info addr;
4451 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4452 MEM, true))
4453 switch (addr.type)
4455 case ADDRESS_REG_IMM:
4456 if (addr.offset == const0_rtx)
4457 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4458 else
4459 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4460 INTVAL (addr.offset));
4461 return;
4463 case ADDRESS_REG_REG:
4464 if (addr.shift == 0)
4465 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4466 reg_names [REGNO (addr.offset)]);
4467 else
4468 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4469 reg_names [REGNO (addr.offset)], addr.shift);
4470 return;
4472 case ADDRESS_REG_UXTW:
4473 if (addr.shift == 0)
4474 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4475 REGNO (addr.offset) - R0_REGNUM);
4476 else
4477 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4478 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4479 return;
4481 case ADDRESS_REG_SXTW:
4482 if (addr.shift == 0)
4483 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4484 REGNO (addr.offset) - R0_REGNUM);
4485 else
4486 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4487 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4488 return;
4490 case ADDRESS_REG_WB:
4491 switch (GET_CODE (x))
4493 case PRE_INC:
4494 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4495 GET_MODE_SIZE (aarch64_memory_reference_mode));
4496 return;
4497 case POST_INC:
4498 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4499 GET_MODE_SIZE (aarch64_memory_reference_mode));
4500 return;
4501 case PRE_DEC:
4502 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4503 GET_MODE_SIZE (aarch64_memory_reference_mode));
4504 return;
4505 case POST_DEC:
4506 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4507 GET_MODE_SIZE (aarch64_memory_reference_mode));
4508 return;
4509 case PRE_MODIFY:
4510 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4511 INTVAL (addr.offset));
4512 return;
4513 case POST_MODIFY:
4514 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4515 INTVAL (addr.offset));
4516 return;
4517 default:
4518 break;
4520 break;
4522 case ADDRESS_LO_SUM:
4523 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4524 output_addr_const (f, addr.offset);
4525 asm_fprintf (f, "]");
4526 return;
4528 case ADDRESS_SYMBOLIC:
4529 break;
4532 output_addr_const (f, x);
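/* Return true if X or any of its sub-rtxes contains a LABEL_REF.
   LABEL_REFs wrapped in UNSPEC_TLS are constant offsets rather than
   symbols and do not count. */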
4535 bool
4536 aarch64_label_mentioned_p (rtx x)
4538 const char *fmt;
4539 int i;
4541 if (GET_CODE (x) == LABEL_REF)
4542 return true;
4544 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4545 referencing instruction, but they are constant offsets, not
4546 symbols. */
4547 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4548 return false;
4550 fmt = GET_RTX_FORMAT (GET_CODE (x));
4551 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4553 if (fmt[i] == 'E')
4555 int j;
4557 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4558 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4559 return true;
4561 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4562 return true;
4565 return false;
4568 /* Implement REGNO_REG_CLASS. */
4570 enum reg_class
4571 aarch64_regno_regclass (unsigned regno)
4573 if (GP_REGNUM_P (regno))
4574 return GENERAL_REGS;
4576 if (regno == SP_REGNUM)
4577 return STACK_REG;
4579 if (regno == FRAME_POINTER_REGNUM
4580 || regno == ARG_POINTER_REGNUM)
4581 return POINTER_REGS;
4583 if (FP_REGNUM_P (regno))
4584 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4586 return NO_REGS;
4589 static rtx
4590 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4592 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4593 where mask is selected by alignment and size of the offset.
4594 We try to pick as large a range for the offset as possible to
4595 maximize the chance of a CSE. However, for aligned addresses
4596 we limit the range to 4k so that structures with different sized
4597 elements are likely to use the same base. */
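/* For example, an aligned SImode access at X + 0x3450 is rewritten as
   (X + 0x3000) + 0x450, so neighbouring accesses with different offsets
   can CSE the X + 0x3000 base. */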
4599 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4601 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4602 HOST_WIDE_INT base_offset;
4604 /* Does it look like we'll need a load/store-pair operation? */
4605 if (GET_MODE_SIZE (mode) > 16
4606 || mode == TImode)
4607 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4608 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4609 /* For offsets that aren't a multiple of the access size, the limit is
4610 -256...255. */
4611 else if (offset & (GET_MODE_SIZE (mode) - 1))
4612 base_offset = (offset + 0x100) & ~0x1ff;
4613 else
4614 base_offset = offset & ~0xfff;
4616 if (base_offset == 0)
4617 return x;
4619 offset -= base_offset;
4620 rtx base_reg = gen_reg_rtx (Pmode);
4621 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4622 NULL_RTX);
4623 emit_move_insn (base_reg, val);
4624 x = plus_constant (Pmode, base_reg, offset);
4627 return x;
4630 /* Try a machine-dependent way of reloading an illegitimate address
4631 operand. If we find one, push the reload and return the new rtx. */
rtx
4634 aarch64_legitimize_reload_address (rtx *x_p,
4635 machine_mode mode,
4636 int opnum, int type,
4637 int ind_levels ATTRIBUTE_UNUSED)
4639 rtx x = *x_p;
4641 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4642 if (aarch64_vect_struct_mode_p (mode)
4643 && GET_CODE (x) == PLUS
4644 && REG_P (XEXP (x, 0))
4645 && CONST_INT_P (XEXP (x, 1)))
4647 rtx orig_rtx = x;
4648 x = copy_rtx (x);
4649 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4650 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4651 opnum, (enum reload_type) type);
4652 return x;
4655 /* We must recognize output that we have already generated ourselves. */
4656 if (GET_CODE (x) == PLUS
4657 && GET_CODE (XEXP (x, 0)) == PLUS
4658 && REG_P (XEXP (XEXP (x, 0), 0))
4659 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4660 && CONST_INT_P (XEXP (x, 1)))
4662 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4663 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4664 opnum, (enum reload_type) type);
4665 return x;
4668 /* We wish to handle large displacements off a base register by splitting
4669 the addend across an add and the mem insn. This can cut the number of
4670 extra insns needed from 3 to 1. It is only useful for load/store of a
4671 single register with 12 bit offset field. */
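/* For example, an SImode access at reg + 0x12344 is rewritten as
   (reg + 0x12000) + 0x344: the high part fits an ADD immediate and the
   low part stays within the 12-bit offset field of the load/store. */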
4672 if (GET_CODE (x) == PLUS
4673 && REG_P (XEXP (x, 0))
4674 && CONST_INT_P (XEXP (x, 1))
4675 && HARD_REGISTER_P (XEXP (x, 0))
4676 && mode != TImode
4677 && mode != TFmode
4678 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4680 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4681 HOST_WIDE_INT low = val & 0xfff;
4682 HOST_WIDE_INT high = val - low;
4683 HOST_WIDE_INT offs;
4684 rtx cst;
4685 machine_mode xmode = GET_MODE (x);
4687 /* In ILP32, xmode can be either DImode or SImode. */
4688 gcc_assert (xmode == DImode || xmode == SImode);
4690 /* Punt on BLKmode (zero-size) accesses: we cannot ascertain BLKmode
4691 alignment, so leave any non-zero offset to the generic reload code. */
4692 if (GET_MODE_SIZE (mode) == 0)
4693 return NULL_RTX;
4695 offs = low % GET_MODE_SIZE (mode);
4697 /* Align misaligned offset by adjusting high part to compensate. */
4698 if (offs != 0)
4700 if (aarch64_uimm12_shift (high + offs))
4702 /* Align down. */
4703 low = low - offs;
4704 high = high + offs;
4706 else
4708 /* Align up. */
4709 offs = GET_MODE_SIZE (mode) - offs;
4710 low = low + offs;
4711 high = high + (low & 0x1000) - offs;
4712 low &= 0xfff;
4716 /* Check for overflow. */
4717 if (high + low != val)
4718 return NULL_RTX;
4720 cst = GEN_INT (high);
4721 if (!aarch64_uimm12_shift (high))
4722 cst = force_const_mem (xmode, cst);
4724 /* Reload high part into base reg, leaving the low part
4725 in the mem instruction.
4726 Note that replacing this gen_rtx_PLUS with plus_constant is
4727 wrong in this case because we rely on the
4728 (plus (plus reg c1) c2) structure being preserved so that
4729 XEXP (*p, 0) in push_reload below uses the correct term. */
4730 x = gen_rtx_PLUS (xmode,
4731 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4732 GEN_INT (low));
4734 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4735 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4736 opnum, (enum reload_type) type);
4737 return x;
4740 return NULL_RTX;
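/* Return the class of register required as an intermediate when moving X
   of mode MODE into a register of class RCLASS. NO_REGS means either that
   no intermediate is needed or that SRI->icode has been set to a reload
   pattern that performs the move. */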
4744 static reg_class_t
4745 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4746 reg_class_t rclass,
4747 machine_mode mode,
4748 secondary_reload_info *sri)
4750 /* Without the TARGET_SIMD instructions we cannot move a Q register
4751 to a Q register directly. We need a scratch. */
4752 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4753 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4754 && reg_class_subset_p (rclass, FP_REGS))
4756 if (mode == TFmode)
4757 sri->icode = CODE_FOR_aarch64_reload_movtf;
4758 else if (mode == TImode)
4759 sri->icode = CODE_FOR_aarch64_reload_movti;
4760 return NO_REGS;
4763 /* A TFmode or TImode memory access should be handled via FP_REGS,
4764 because AArch64 has richer addressing modes for LDR/STR instructions
4765 than LDP/STP instructions. */
4766 if (!TARGET_GENERAL_REGS_ONLY && rclass == GENERAL_REGS
4767 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4768 return FP_REGS;
4770 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4771 return GENERAL_REGS;
4773 return NO_REGS;
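/* Return true if register FROM may be eliminated in favour of register
   TO, given the current frame-pointer requirements. */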
4776 static bool
4777 aarch64_can_eliminate (const int from, const int to)
4779 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4780 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4782 if (frame_pointer_needed)
4784 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4785 return true;
4786 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4787 return false;
4788 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4789 && !cfun->calls_alloca)
4790 return true;
4791 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4792 return true;
4794 return false;
4796 else
4798 /* If we decided that we didn't need a leaf frame pointer but then used
4799 LR in the function, then we'll want a frame pointer after all, so
4800 prevent this elimination to ensure a frame pointer is used. */
4801 if (to == STACK_POINTER_REGNUM
4802 && flag_omit_leaf_frame_pointer
4803 && df_regs_ever_live_p (LR_REGNUM))
4804 return false;
4807 return true;
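/* Return the offset to add when replacing eliminable register FROM with
   register TO, based on the frame laid out by aarch64_layout_frame. */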
4810 HOST_WIDE_INT
4811 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4813 aarch64_layout_frame ();
4815 if (to == HARD_FRAME_POINTER_REGNUM)
4817 if (from == ARG_POINTER_REGNUM)
4818 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4820 if (from == FRAME_POINTER_REGNUM)
4821 return (cfun->machine->frame.hard_fp_offset
4822 - cfun->machine->frame.saved_varargs_size);
4825 if (to == STACK_POINTER_REGNUM)
4827 if (from == FRAME_POINTER_REGNUM)
4828 return (cfun->machine->frame.frame_size
4829 - cfun->machine->frame.saved_varargs_size);
4832 return cfun->machine->frame.frame_size;
4835 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4836 previous frame. */
rtx
4839 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4841 if (count != 0)
4842 return const0_rtx;
4843 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
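/* Output the code template for a trampoline: load IP1 and the static
   chain register from the literal words that follow the code, then
   branch through IP1. */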
4847 static void
4848 aarch64_asm_trampoline_template (FILE *f)
4850 if (TARGET_ILP32)
4852 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
4853 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
4855 else
4857 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
4858 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
4860 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
4861 assemble_aligned_integer (4, const0_rtx);
4862 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
4863 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
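/* Initialise the trampoline at M_TRAMP: copy the code template, store
   the address of FNDECL and CHAIN_VALUE in the trailing pointer slots,
   then call __clear_cache on the range. */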
4866 static void
4867 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
4869 rtx fnaddr, mem, a_tramp;
4870 const int tramp_code_sz = 16;
4872 /* Don't need to copy the trailing D-words; we fill those in below. */
4873 emit_block_move (m_tramp, assemble_trampoline_template (),
4874 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
4875 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
4876 fnaddr = XEXP (DECL_RTL (fndecl), 0);
4877 if (GET_MODE (fnaddr) != ptr_mode)
4878 fnaddr = convert_memory_address (ptr_mode, fnaddr);
4879 emit_move_insn (mem, fnaddr);
4881 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
4882 emit_move_insn (mem, chain_value);
4884 /* XXX We should really define a "clear_cache" pattern and use
4885 gen_clear_cache(). */
4886 a_tramp = XEXP (m_tramp, 0);
4887 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
4888 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
4889 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
4890 ptr_mode);
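/* Return the number of consecutive hard registers of class REGCLASS
   needed to hold a value of mode MODE: one register per 16 bytes for
   vector modes, one per 8 bytes otherwise. */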
4893 static unsigned char
4894 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
4896 switch (regclass)
4898 case CALLER_SAVE_REGS:
4899 case POINTER_REGS:
4900 case GENERAL_REGS:
4901 case ALL_REGS:
4902 case FP_REGS:
4903 case FP_LO_REGS:
4904 return
4905 aarch64_vector_mode_p (mode) ? (GET_MODE_SIZE (mode) + 15) / 16 :
4906 (GET_MODE_SIZE (mode) + 7) / 8;
4907 case STACK_REG:
4908 return 1;
4910 case NO_REGS:
4911 return 0;
4913 default:
4914 break;
4916 gcc_unreachable ();
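/* Return the register class that X should preferably be reloaded into,
   given an initial choice of REGCLASS, or NO_REGS if REGCLASS cannot
   accept X at all. */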
4919 static reg_class_t
4920 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
4922 if (regclass == POINTER_REGS)
4923 return GENERAL_REGS;
4925 if (regclass == STACK_REG)
4927 if (REG_P(x)
4928 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
4929 return regclass;
4931 return NO_REGS;
4934 /* If it's an integer immediate that MOVI can't handle, then
4935 FP_REGS is not an option, so we return NO_REGS instead. */
4936 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
4937 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
4938 return NO_REGS;
4940 /* Register elimination can result in a request for
4941 SP+constant->FP_REGS. We cannot support such operations, which
4942 use SP as the source and an FP_REG as the destination, so reject
4943 them outright here. */
4944 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
4946 rtx lhs = XEXP (x, 0);
4948 /* Look through a possible SUBREG introduced by ILP32. */
4949 if (GET_CODE (lhs) == SUBREG)
4950 lhs = SUBREG_REG (lhs);
4952 gcc_assert (REG_P (lhs));
4953 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
4954 POINTER_REGS));
4955 return NO_REGS;
4958 return regclass;
4961 void
4962 aarch64_asm_output_labelref (FILE* f, const char *name)
4964 asm_fprintf (f, "%U%s", name);
4967 static void
4968 aarch64_elf_asm_constructor (rtx symbol, int priority)
4970 if (priority == DEFAULT_INIT_PRIORITY)
4971 default_ctor_section_asm_out_constructor (symbol, priority);
4972 else
4974 section *s;
4975 char buf[18];
4976 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
4977 s = get_section (buf, SECTION_WRITE, NULL);
4978 switch_to_section (s);
4979 assemble_align (POINTER_SIZE);
4980 assemble_aligned_integer (POINTER_BYTES, symbol);
4984 static void
4985 aarch64_elf_asm_destructor (rtx symbol, int priority)
4987 if (priority == DEFAULT_INIT_PRIORITY)
4988 default_dtor_section_asm_out_destructor (symbol, priority);
4989 else
4991 section *s;
4992 char buf[18];
4993 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
4994 s = get_section (buf, SECTION_WRITE, NULL);
4995 switch_to_section (s);
4996 assemble_align (POINTER_SIZE);
4997 assemble_aligned_integer (POINTER_BYTES, symbol);
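/* Output the dispatch sequence for a casesi jump table: load the entry
   selected by operand 1 from the table at operand 0, add the scaled,
   sign-extended entry to the table address in operand 4, then branch
   through operand 3. */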
5001 const char*
5002 aarch64_output_casesi (rtx *operands)
5004 char buf[100];
5005 char label[100];
5006 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5007 int index;
5008 static const char *const patterns[4][2] =
5011 "ldrb\t%w3, [%0,%w1,uxtw]",
5012 "add\t%3, %4, %w3, sxtb #2"
5015 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5016 "add\t%3, %4, %w3, sxth #2"
5019 "ldr\t%w3, [%0,%w1,uxtw #2]",
5020 "add\t%3, %4, %w3, sxtw #2"
5022 /* We assume that DImode is only generated when not optimizing and
5023 that we don't really need 64-bit address offsets. That would
5024 imply an object file with 8GB of code in a single function! */
5026 "ldr\t%w3, [%0,%w1,uxtw #2]",
5027 "add\t%3, %4, %w3, sxtw #2"
5031 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5033 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5035 gcc_assert (index >= 0 && index <= 3);
5037 /* Need to implement table size reduction, by changing the code below. */
5038 output_asm_insn (patterns[index][0], operands);
5039 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5040 snprintf (buf, sizeof (buf),
5041 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5042 output_asm_insn (buf, operands);
5043 output_asm_insn (patterns[index][1], operands);
5044 output_asm_insn ("br\t%3", operands);
5045 assemble_label (asm_out_file, label);
5046 return "";
5050 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5051 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5052 operator. */
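/* For example, aarch64_uxt_size (1, 0x1fe) is 8, since the mask is 0xff
   shifted left by one, i.e. a zero-extended byte. */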
int
5055 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5057 if (shift >= 0 && shift <= 3)
5059 int size;
5060 for (size = 8; size <= 32; size *= 2)
5062 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5063 if (mask == bits << shift)
5064 return size;
5067 return 0;
5070 static bool
5071 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5072 const_rtx x ATTRIBUTE_UNUSED)
5074 /* We can't use blocks for constants when we're using a per-function
5075 constant pool. */
5076 return false;
5079 static section *
5080 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5081 rtx x ATTRIBUTE_UNUSED,
5082 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5084 /* Force all constant pool entries into the current function section. */
5085 return function_section (current_function_decl);
5089 /* Costs. */
5091 /* Helper function for rtx cost calculation. Strip a shift expression
5092 from X. Returns the inner operand if successful, or the original
5093 expression on failure. */
5094 static rtx
5095 aarch64_strip_shift (rtx x)
5097 rtx op = x;
5099 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5100 we can convert both to ROR during final output. */
5101 if ((GET_CODE (op) == ASHIFT
5102 || GET_CODE (op) == ASHIFTRT
5103 || GET_CODE (op) == LSHIFTRT
5104 || GET_CODE (op) == ROTATERT
5105 || GET_CODE (op) == ROTATE)
5106 && CONST_INT_P (XEXP (op, 1)))
5107 return XEXP (op, 0);
5109 if (GET_CODE (op) == MULT
5110 && CONST_INT_P (XEXP (op, 1))
5111 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5112 return XEXP (op, 0);
5114 return x;
5117 /* Helper function for rtx cost calculation. Strip an extend
5118 expression from X. Returns the inner operand if successful, or the
5119 original expression on failure. We deal with a number of possible
5120 canonicalization variations here. */
5121 static rtx
5122 aarch64_strip_extend (rtx x)
5124 rtx op = x;
5126 /* Zero and sign extraction of a widened value. */
5127 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5128 && XEXP (op, 2) == const0_rtx
5129 && GET_CODE (XEXP (op, 0)) == MULT
5130 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5131 XEXP (op, 1)))
5132 return XEXP (XEXP (op, 0), 0);
5134 /* It can also be represented (for zero-extend) as an AND with an
5135 immediate. */
5136 if (GET_CODE (op) == AND
5137 && GET_CODE (XEXP (op, 0)) == MULT
5138 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5139 && CONST_INT_P (XEXP (op, 1))
5140 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5141 INTVAL (XEXP (op, 1))) != 0)
5142 return XEXP (XEXP (op, 0), 0);
5144 /* Now handle extended register, as this may also have an optional
5145 left shift by 1..4. */
5146 if (GET_CODE (op) == ASHIFT
5147 && CONST_INT_P (XEXP (op, 1))
5148 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5149 op = XEXP (op, 0);
5151 if (GET_CODE (op) == ZERO_EXTEND
5152 || GET_CODE (op) == SIGN_EXTEND)
5153 op = XEXP (op, 0);
5155 if (op != x)
5156 return op;
5158 return x;
5161 /* Return true iff CODE is a shift supported in combination
5162 with arithmetic instructions. */
5164 static bool
5165 aarch64_shift_p (enum rtx_code code)
5167 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5170 /* Helper function for rtx cost calculation. Calculate the cost of
5171 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5172 Return the calculated cost of the expression, recursing manually in to
5173 operands where needed. */
5175 static int
5176 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5178 rtx op0, op1;
5179 const struct cpu_cost_table *extra_cost
5180 = aarch64_tune_params->insn_extra_cost;
5181 int cost = 0;
5182 bool compound_p = (outer == PLUS || outer == MINUS);
5183 machine_mode mode = GET_MODE (x);
5185 gcc_checking_assert (code == MULT);
5187 op0 = XEXP (x, 0);
5188 op1 = XEXP (x, 1);
5190 if (VECTOR_MODE_P (mode))
5191 mode = GET_MODE_INNER (mode);
5193 /* Integer multiply/fma. */
5194 if (GET_MODE_CLASS (mode) == MODE_INT)
5196 /* The multiply will be canonicalized as a shift, cost it as such. */
5197 if (aarch64_shift_p (GET_CODE (x))
5198 || (CONST_INT_P (op1)
5199 && exact_log2 (INTVAL (op1)) > 0))
5201 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5202 || GET_CODE (op0) == SIGN_EXTEND;
5203 if (speed)
5205 if (compound_p)
5207 if (REG_P (op1))
5208 /* ARITH + shift-by-register. */
5209 cost += extra_cost->alu.arith_shift_reg;
5210 else if (is_extend)
5211 /* ARITH + extended register. We don't have a cost field
5212 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5213 cost += extra_cost->alu.extend_arith;
5214 else
5215 /* ARITH + shift-by-immediate. */
5216 cost += extra_cost->alu.arith_shift;
5218 else
5219 /* LSL (immediate). */
5220 cost += extra_cost->alu.shift;
5223 /* Strip extends as we will have costed them in the case above. */
5224 if (is_extend)
5225 op0 = aarch64_strip_extend (op0);
5227 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5229 return cost;
5232 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5233 compound and let the below cases handle it. After all, MNEG is a
5234 special-case alias of MSUB. */
5235 if (GET_CODE (op0) == NEG)
5237 op0 = XEXP (op0, 0);
5238 compound_p = true;
5241 /* Integer multiplies or FMAs have zero/sign extending variants. */
5242 if ((GET_CODE (op0) == ZERO_EXTEND
5243 && GET_CODE (op1) == ZERO_EXTEND)
5244 || (GET_CODE (op0) == SIGN_EXTEND
5245 && GET_CODE (op1) == SIGN_EXTEND))
5247 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5248 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5250 if (speed)
5252 if (compound_p)
5253 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5254 cost += extra_cost->mult[0].extend_add;
5255 else
5256 /* MUL/SMULL/UMULL. */
5257 cost += extra_cost->mult[0].extend;
5260 return cost;
5263 /* This is either an integer multiply or a MADD. In both cases
5264 we want to recurse and cost the operands. */
5265 cost += rtx_cost (op0, MULT, 0, speed)
5266 + rtx_cost (op1, MULT, 1, speed);
5268 if (speed)
5270 if (compound_p)
5271 /* MADD/MSUB. */
5272 cost += extra_cost->mult[mode == DImode].add;
5273 else
5274 /* MUL. */
5275 cost += extra_cost->mult[mode == DImode].simple;
5278 return cost;
5280 else
5282 if (speed)
5284 /* Floating-point FMA/FMUL can also support negations of the
5285 operands. */
5286 if (GET_CODE (op0) == NEG)
5287 op0 = XEXP (op0, 0);
5288 if (GET_CODE (op1) == NEG)
5289 op1 = XEXP (op1, 0);
5291 if (compound_p)
5292 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5293 cost += extra_cost->fp[mode == DFmode].fma;
5294 else
5295 /* FMUL/FNMUL. */
5296 cost += extra_cost->fp[mode == DFmode].mult;
5299 cost += rtx_cost (op0, MULT, 0, speed)
5300 + rtx_cost (op1, MULT, 1, speed);
5301 return cost;
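/* Compute the cost of using X as an address for an access of mode MODE,
   based on the tuning-specific address cost tables. */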
5305 static int
5306 aarch64_address_cost (rtx x,
5307 machine_mode mode,
5308 addr_space_t as ATTRIBUTE_UNUSED,
5309 bool speed)
5311 enum rtx_code c = GET_CODE (x);
5312 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params->addr_cost;
5313 struct aarch64_address_info info;
5314 int cost = 0;
5315 info.shift = 0;
5317 if (!aarch64_classify_address (&info, x, mode, c, false))
5319 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5321 /* This is a CONST or SYMBOL ref which will be split
5322 in a different way depending on the code model in use.
5323 Cost it through the generic infrastructure. */
5324 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5325 /* Divide through by the cost of one instruction to
5326 bring it to the same units as the address costs. */
5327 cost_symbol_ref /= COSTS_N_INSNS (1);
5328 /* The cost is then the cost of preparing the address,
5329 followed by an immediate (possibly 0) offset. */
5330 return cost_symbol_ref + addr_cost->imm_offset;
5332 else
5334 /* This is most likely a jump table from a case
5335 statement. */
5336 return addr_cost->register_offset;
5340 switch (info.type)
5342 case ADDRESS_LO_SUM:
5343 case ADDRESS_SYMBOLIC:
5344 case ADDRESS_REG_IMM:
5345 cost += addr_cost->imm_offset;
5346 break;
5348 case ADDRESS_REG_WB:
5349 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5350 cost += addr_cost->pre_modify;
5351 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5352 cost += addr_cost->post_modify;
5353 else
5354 gcc_unreachable ();
5356 break;
5358 case ADDRESS_REG_REG:
5359 cost += addr_cost->register_offset;
5360 break;
5362 case ADDRESS_REG_UXTW:
5363 case ADDRESS_REG_SXTW:
5364 cost += addr_cost->register_extend;
5365 break;
5367 default:
5368 gcc_unreachable ();
5372 if (info.shift > 0)
5374 /* For the sake of calculating the cost of the shifted register
5375 component, we can treat same sized modes in the same way. */
5376 switch (GET_MODE_BITSIZE (mode))
5378 case 16:
5379 cost += addr_cost->addr_scale_costs.hi;
5380 break;
5382 case 32:
5383 cost += addr_cost->addr_scale_costs.si;
5384 break;
5386 case 64:
5387 cost += addr_cost->addr_scale_costs.di;
5388 break;
5390 /* We can't tell, or this is a 128-bit vector. */
5391 default:
5392 cost += addr_cost->addr_scale_costs.ti;
5393 break;
5397 return cost;
5400 /* Return true if the RTX X in mode MODE is a zero or sign extract
5401 usable in an ADD or SUB (extended register) instruction. */
5402 static bool
5403 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5405 /* Catch add with a sign extract.
5406 This is add_<optab><mode>_multp2. */
5407 if (GET_CODE (x) == SIGN_EXTRACT
5408 || GET_CODE (x) == ZERO_EXTRACT)
5410 rtx op0 = XEXP (x, 0);
5411 rtx op1 = XEXP (x, 1);
5412 rtx op2 = XEXP (x, 2);
5414 if (GET_CODE (op0) == MULT
5415 && CONST_INT_P (op1)
5416 && op2 == const0_rtx
5417 && CONST_INT_P (XEXP (op0, 1))
5418 && aarch64_is_extend_from_extract (mode,
5419 XEXP (op0, 1),
5420 op1))
5422 return true;
5426 return false;
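/* Return true if U is the UNSPEC code for one of the FRINT* rounding
   instructions. */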
5429 static bool
5430 aarch64_frint_unspec_p (unsigned int u)
5432 switch (u)
5434 case UNSPEC_FRINTZ:
5435 case UNSPEC_FRINTP:
5436 case UNSPEC_FRINTM:
5437 case UNSPEC_FRINTA:
5438 case UNSPEC_FRINTN:
5439 case UNSPEC_FRINTX:
5440 case UNSPEC_FRINTI:
5441 return true;
5443 default:
5444 return false;
5448 /* Return true iff X is an rtx that will match an extr instruction,
5449 i.e. one described by the *extr<mode>5_insn family of patterns.
5450 OP0 and OP1 will be set to the operands of the shifts involved
5451 on success and will be NULL_RTX otherwise. */
5453 static bool
5454 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5456 rtx op0, op1;
5457 machine_mode mode = GET_MODE (x);
5459 *res_op0 = NULL_RTX;
5460 *res_op1 = NULL_RTX;
5462 if (GET_CODE (x) != IOR)
5463 return false;
5465 op0 = XEXP (x, 0);
5466 op1 = XEXP (x, 1);
5468 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5469 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5471 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5472 if (GET_CODE (op1) == ASHIFT)
5473 std::swap (op0, op1);
5475 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5476 return false;
5478 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5479 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5481 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5482 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5484 *res_op0 = XEXP (op0, 0);
5485 *res_op1 = XEXP (op1, 0);
5486 return true;
5490 return false;
5493 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5494 storing it in *COST. Result is true if the total cost of the operation
5495 has now been calculated. */
5496 static bool
5497 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5499 rtx inner;
5500 rtx comparator;
5501 enum rtx_code cmpcode;
5503 if (COMPARISON_P (op0))
5505 inner = XEXP (op0, 0);
5506 comparator = XEXP (op0, 1);
5507 cmpcode = GET_CODE (op0);
5509 else
5511 inner = op0;
5512 comparator = const0_rtx;
5513 cmpcode = NE;
5516 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5518 /* Conditional branch. */
5519 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5520 return true;
5521 else
5523 if (cmpcode == NE || cmpcode == EQ)
5525 if (comparator == const0_rtx)
5527 /* TBZ/TBNZ/CBZ/CBNZ. */
5528 if (GET_CODE (inner) == ZERO_EXTRACT)
5529 /* TBZ/TBNZ. */
5530 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5531 0, speed);
5532 else
5533 /* CBZ/CBNZ. */
5534 *cost += rtx_cost (inner, cmpcode, 0, speed);
5536 return true;
5539 else if (cmpcode == LT || cmpcode == GE)
5541 /* TBZ/TBNZ. */
5542 if (comparator == const0_rtx)
5543 return true;
5547 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5549 /* It's a conditional operation based on the status flags,
5550 so it must be some flavor of CSEL. */
5552 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5553 if (GET_CODE (op1) == NEG
5554 || GET_CODE (op1) == NOT
5555 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5556 op1 = XEXP (op1, 0);
5558 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5559 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5560 return true;
5563 /* We don't know what this is, cost all operands. */
5564 return false;
5567 /* Calculate the cost of calculating X, storing it in *COST. Result
5568 is true if the total cost of the operation has now been calculated. */
5569 static bool
5570 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5571 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5573 rtx op0, op1, op2;
5574 const struct cpu_cost_table *extra_cost
5575 = aarch64_tune_params->insn_extra_cost;
5576 machine_mode mode = GET_MODE (x);
5578 /* By default, assume that everything has equivalent cost to the
5579 cheapest instruction. Any additional costs are applied as a delta
5580 above this default. */
5581 *cost = COSTS_N_INSNS (1);
5583 /* TODO: The cost infrastructure currently does not handle
5584 vector operations. Assume that all vector operations
5585 are equally expensive. */
5586 if (VECTOR_MODE_P (mode))
5588 if (speed)
5589 *cost += extra_cost->vect.alu;
5590 return true;
5593 switch (code)
5595 case SET:
5596 /* The cost depends entirely on the operands to SET. */
5597 *cost = 0;
5598 op0 = SET_DEST (x);
5599 op1 = SET_SRC (x);
5601 switch (GET_CODE (op0))
5603 case MEM:
5604 if (speed)
5606 rtx address = XEXP (op0, 0);
5607 if (GET_MODE_CLASS (mode) == MODE_INT)
5608 *cost += extra_cost->ldst.store;
5609 else if (mode == SFmode)
5610 *cost += extra_cost->ldst.storef;
5611 else if (mode == DFmode)
5612 *cost += extra_cost->ldst.stored;
5614 *cost +=
5615 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5616 0, speed));
5619 *cost += rtx_cost (op1, SET, 1, speed);
5620 return true;
5622 case SUBREG:
5623 if (! REG_P (SUBREG_REG (op0)))
5624 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5626 /* Fall through. */
5627 case REG:
5628 /* const0_rtx is in general free, but we will use an
5629 instruction to set a register to 0. */
5630 if (REG_P (op1) || op1 == const0_rtx)
5632 /* The cost is 1 per register copied. */
5633 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5634 / UNITS_PER_WORD;
5635 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5637 else
5638 /* Cost is just the cost of the RHS of the set. */
5639 *cost += rtx_cost (op1, SET, 1, speed);
5640 return true;
5642 case ZERO_EXTRACT:
5643 case SIGN_EXTRACT:
5644 /* Bit-field insertion. Strip any redundant widening of
5645 the RHS to meet the width of the target. */
5646 if (GET_CODE (op1) == SUBREG)
5647 op1 = SUBREG_REG (op1);
5648 if ((GET_CODE (op1) == ZERO_EXTEND
5649 || GET_CODE (op1) == SIGN_EXTEND)
5650 && CONST_INT_P (XEXP (op0, 1))
5651 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5652 >= INTVAL (XEXP (op0, 1))))
5653 op1 = XEXP (op1, 0);
5655 if (CONST_INT_P (op1))
5657 /* MOV immediate is assumed to always be cheap. */
5658 *cost = COSTS_N_INSNS (1);
5660 else
5662 /* BFM. */
5663 if (speed)
5664 *cost += extra_cost->alu.bfi;
5665 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5668 return true;
5670 default:
5671 /* We can't make sense of this, assume default cost. */
5672 *cost = COSTS_N_INSNS (1);
5673 return false;
5675 return false;
5677 case CONST_INT:
5678 /* If an instruction can incorporate a constant within the
5679 instruction, the instruction's expression avoids calling
5680 rtx_cost() on the constant. If rtx_cost() is called on a
5681 constant, then it is usually because the constant must be
5682 moved into a register by one or more instructions.
5684 The exception is constant 0, which can be expressed
5685 as XZR/WZR and is therefore free. The one case where that does
5686 not hold is (set (reg) (const0_rtx)), where the move itself must
5687 be costed. However, we can catch that when we cost the SET, so
5688 we don't need to consider that here. */
5689 if (x == const0_rtx)
5690 *cost = 0;
5691 else
5693 /* To an approximation, the cost of building any other constant is
5694 proportional to the number of instructions
5695 required to build it. This is true whether we
5696 are compiling for SPEED or otherwise. */
5697 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5698 (NULL_RTX, x, false, mode));
5700 return true;
5702 case CONST_DOUBLE:
5703 if (speed)
5705 /* mov[df,sf]_aarch64. */
5706 if (aarch64_float_const_representable_p (x))
5707 /* FMOV (scalar immediate). */
5708 *cost += extra_cost->fp[mode == DFmode].fpconst;
5709 else if (!aarch64_float_const_zero_rtx_p (x))
5711 /* This will be a load from memory. */
5712 if (mode == DFmode)
5713 *cost += extra_cost->ldst.loadd;
5714 else
5715 *cost += extra_cost->ldst.loadf;
5717 else
5718 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5719 or MOV v0.s[0], wzr - neither of which is modeled by the
5720 cost tables. Just use the default cost. */
5725 return true;
5727 case MEM:
5728 if (speed)
5730 /* For loads we want the base cost of a load, plus an
5731 approximation for the additional cost of the addressing
5732 mode. */
5733 rtx address = XEXP (x, 0);
5734 if (GET_MODE_CLASS (mode) == MODE_INT)
5735 *cost += extra_cost->ldst.load;
5736 else if (mode == SFmode)
5737 *cost += extra_cost->ldst.loadf;
5738 else if (mode == DFmode)
5739 *cost += extra_cost->ldst.loadd;
5741 *cost +=
5742 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5743 0, speed));
5746 return true;
5748 case NEG:
5749 op0 = XEXP (x, 0);
5751 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5753 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5754 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5756 /* CSETM. */
5757 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5758 return true;
5761 /* Cost this as SUB wzr, X. */
5762 op0 = CONST0_RTX (GET_MODE (x));
5763 op1 = XEXP (x, 0);
5764 goto cost_minus;
5767 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5769 /* Support (neg(fma...)) as a single instruction only if
5770 sign of zeros is unimportant. This matches the decision
5771 making in aarch64.md. */
5772 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5774 /* FNMADD. */
5775 *cost = rtx_cost (op0, NEG, 0, speed);
5776 return true;
5778 if (speed)
5779 /* FNEG. */
5780 *cost += extra_cost->fp[mode == DFmode].neg;
5781 return false;
5784 return false;
5786 case CLRSB:
5787 case CLZ:
5788 if (speed)
5789 *cost += extra_cost->alu.clz;
5791 return false;
5793 case COMPARE:
5794 op0 = XEXP (x, 0);
5795 op1 = XEXP (x, 1);
5797 if (op1 == const0_rtx
5798 && GET_CODE (op0) == AND)
5800 x = op0;
5801 goto cost_logic;
5804 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5806 /* TODO: A write to the CC flags possibly costs extra; this
5807 needs encoding in the cost tables. */
5809 /* CC_ZESWPmode supports zero extend for free. */
5810 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5811 op0 = XEXP (op0, 0);
5813 /* ANDS. */
5814 if (GET_CODE (op0) == AND)
5816 x = op0;
5817 goto cost_logic;
5820 if (GET_CODE (op0) == PLUS)
5822 /* ADDS (and CMN alias). */
5823 x = op0;
5824 goto cost_plus;
5827 if (GET_CODE (op0) == MINUS)
5829 /* SUBS. */
5830 x = op0;
5831 goto cost_minus;
5834 if (GET_CODE (op1) == NEG)
5836 /* CMN. */
5837 if (speed)
5838 *cost += extra_cost->alu.arith;
5840 *cost += rtx_cost (op0, COMPARE, 0, speed);
5841 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
5842 return true;
5845 /* CMP.
5847 Compare can freely swap the order of operands, and
5848 canonicalization puts the more complex operation first.
5849 But the integer MINUS logic expects the shift/extend
5850 operation in op1. */
5851 if (! (REG_P (op0)
5852 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
5854 op0 = XEXP (x, 1);
5855 op1 = XEXP (x, 0);
5857 goto cost_minus;
5860 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
5862 /* FCMP. */
5863 if (speed)
5864 *cost += extra_cost->fp[mode == DFmode].compare;
5866 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
5868 *cost += rtx_cost (op0, COMPARE, 0, speed);
5869 /* FCMP supports constant 0.0 for no extra cost. */
5870 return true;
5872 return false;
5875 return false;
5877 case MINUS:
5879 op0 = XEXP (x, 0);
5880 op1 = XEXP (x, 1);
5882 cost_minus:
5883 /* Detect valid immediates. */
5884 if ((GET_MODE_CLASS (mode) == MODE_INT
5885 || (GET_MODE_CLASS (mode) == MODE_CC
5886 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
5887 && CONST_INT_P (op1)
5888 && aarch64_uimm12_shift (INTVAL (op1)))
5890 *cost += rtx_cost (op0, MINUS, 0, speed);
5892 if (speed)
5893 /* SUB(S) (immediate). */
5894 *cost += extra_cost->alu.arith;
5895 return true;
5899 /* Look for SUB (extended register). */
5900 if (aarch64_rtx_arith_op_extract_p (op1, mode))
5902 if (speed)
5903 *cost += extra_cost->alu.extend_arith;
5905 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
5906 (enum rtx_code) GET_CODE (op1),
5907 0, speed);
5908 return true;
5911 rtx new_op1 = aarch64_strip_extend (op1);
5913 /* Cost this as an FMA-alike operation. */
5914 if ((GET_CODE (new_op1) == MULT
5915 || aarch64_shift_p (GET_CODE (new_op1)))
5916 && code != COMPARE)
5918 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
5919 (enum rtx_code) code,
5920 speed);
5921 *cost += rtx_cost (op0, MINUS, 0, speed);
5922 return true;
5925 *cost += rtx_cost (new_op1, MINUS, 1, speed);
5927 if (speed)
5929 if (GET_MODE_CLASS (mode) == MODE_INT)
5930 /* SUB(S). */
5931 *cost += extra_cost->alu.arith;
5932 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5933 /* FSUB. */
5934 *cost += extra_cost->fp[mode == DFmode].addsub;
5936 return true;
5939 case PLUS:
5941 rtx new_op0;
5943 op0 = XEXP (x, 0);
5944 op1 = XEXP (x, 1);
5946 cost_plus:
5947 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5948 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5950 /* CSINC. */
5951 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
5952 *cost += rtx_cost (op1, PLUS, 1, speed);
5953 return true;
5956 if (GET_MODE_CLASS (mode) == MODE_INT
5957 && CONST_INT_P (op1)
5958 && aarch64_uimm12_shift (INTVAL (op1)))
5960 *cost += rtx_cost (op0, PLUS, 0, speed);
5962 if (speed)
5963 /* ADD (immediate). */
5964 *cost += extra_cost->alu.arith;
5965 return true;
5968 /* Look for ADD (extended register). */
5969 if (aarch64_rtx_arith_op_extract_p (op0, mode))
5971 if (speed)
5972 *cost += extra_cost->alu.extend_arith;
5974 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
5975 (enum rtx_code) GET_CODE (op0),
5976 0, speed);
5977 return true;
5980 /* Strip any extend, leave shifts behind as we will
5981 cost them through mult_cost. */
5982 new_op0 = aarch64_strip_extend (op0);
5984 if (GET_CODE (new_op0) == MULT
5985 || aarch64_shift_p (GET_CODE (new_op0)))
5987 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
5988 speed);
5989 *cost += rtx_cost (op1, PLUS, 1, speed);
5990 return true;
5993 *cost += (rtx_cost (new_op0, PLUS, 0, speed)
5994 + rtx_cost (op1, PLUS, 1, speed));
5996 if (speed)
5998 if (GET_MODE_CLASS (mode) == MODE_INT)
5999 /* ADD. */
6000 *cost += extra_cost->alu.arith;
6001 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6002 /* FADD. */
6003 *cost += extra_cost->fp[mode == DFmode].addsub;
6005 return true;
6008 case BSWAP:
6009 *cost = COSTS_N_INSNS (1);
6011 if (speed)
6012 *cost += extra_cost->alu.rev;
6014 return false;
6016 case IOR:
6017 if (aarch_rev16_p (x))
6019 *cost = COSTS_N_INSNS (1);
6021 if (speed)
6022 *cost += extra_cost->alu.rev;
6024 return true;
6027 if (aarch64_extr_rtx_p (x, &op0, &op1))
6029 *cost += rtx_cost (op0, IOR, 0, speed)
6030 + rtx_cost (op1, IOR, 1, speed);
6031 if (speed)
6032 *cost += extra_cost->alu.shift;
6034 return true;
6036 /* Fall through. */
6037 case XOR:
6038 case AND:
6039 cost_logic:
6040 op0 = XEXP (x, 0);
6041 op1 = XEXP (x, 1);
6043 if (code == AND
6044 && GET_CODE (op0) == MULT
6045 && CONST_INT_P (XEXP (op0, 1))
6046 && CONST_INT_P (op1)
6047 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6048 INTVAL (op1)) != 0)
6050 /* This is a UBFM/SBFM. */
6051 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6052 if (speed)
6053 *cost += extra_cost->alu.bfx;
6054 return true;
6057 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6059 /* We possibly get the immediate for free, this is not
6060 modelled. */
6061 if (CONST_INT_P (op1)
6062 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6064 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6066 if (speed)
6067 *cost += extra_cost->alu.logical;
6069 return true;
6071 else
6073 rtx new_op0 = op0;
6075 /* Handle ORN, EON, or BIC. */
6076 if (GET_CODE (op0) == NOT)
6077 op0 = XEXP (op0, 0);
6079 new_op0 = aarch64_strip_shift (op0);
6081 /* If we had a shift on op0 then this is a logical-shift-
6082 by-register/immediate operation. Otherwise, this is just
6083 a logical operation. */
6084 if (speed)
6086 if (new_op0 != op0)
6088 /* Shift by immediate. */
6089 if (CONST_INT_P (XEXP (op0, 1)))
6090 *cost += extra_cost->alu.log_shift;
6091 else
6092 *cost += extra_cost->alu.log_shift_reg;
6094 else
6095 *cost += extra_cost->alu.logical;
6098 /* In both cases we want to cost both operands. */
6099 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6100 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6102 return true;
6105 return false;
6107 case NOT:
6108 x = XEXP (x, 0);
6109 op0 = aarch64_strip_shift (x);
6111 /* MVN-shifted-reg. */
6112 if (op0 != x)
6114 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6116 if (speed)
6117 *cost += extra_cost->alu.log_shift;
6119 return true;
6121 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6122 Handle the second form here taking care that 'a' in the above can
6123 be a shift. */
6124 else if (GET_CODE (op0) == XOR)
6126 rtx newop0 = XEXP (op0, 0);
6127 rtx newop1 = XEXP (op0, 1);
6128 rtx op0_stripped = aarch64_strip_shift (newop0);
6130 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6131 + rtx_cost (op0_stripped, XOR, 0, speed);
6133 if (speed)
6135 if (op0_stripped != newop0)
6136 *cost += extra_cost->alu.log_shift;
6137 else
6138 *cost += extra_cost->alu.logical;
6141 return true;
6143 /* MVN. */
6144 if (speed)
6145 *cost += extra_cost->alu.logical;
6147 return false;
6149 case ZERO_EXTEND:
6151 op0 = XEXP (x, 0);
6152 /* If a value is written in SI mode, then zero extended to DI
6153 mode, the operation will in general be free as a write to
6154 a 'w' register implicitly zeroes the upper bits of an 'x'
6155 register. However, if this is
6157 (set (reg) (zero_extend (reg)))
6159 we must cost the explicit register move. */
6160 if (mode == DImode
6161 && GET_MODE (op0) == SImode
6162 && outer == SET)
6164 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6166 if (!op_cost && speed)
6167 /* MOV. */
6168 *cost += extra_cost->alu.extend;
6169 else
6170 /* Free, the cost is that of the SI mode operation. */
6171 *cost = op_cost;
6173 return true;
6175 else if (MEM_P (XEXP (x, 0)))
6177 /* All loads can zero extend to any size for free. */
6178 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6179 return true;
6182 /* UXTB/UXTH. */
6183 if (speed)
6184 *cost += extra_cost->alu.extend;
6186 return false;
6188 case SIGN_EXTEND:
6189 if (MEM_P (XEXP (x, 0)))
6191 /* LDRSH. */
6192 if (speed)
6194 rtx address = XEXP (XEXP (x, 0), 0);
6195 *cost += extra_cost->ldst.load_sign_extend;
6197 *cost +=
6198 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6199 0, speed));
6201 return true;
6204 if (speed)
6205 *cost += extra_cost->alu.extend;
6206 return false;
6208 case ASHIFT:
6209 op0 = XEXP (x, 0);
6210 op1 = XEXP (x, 1);
6212 if (CONST_INT_P (op1))
6214 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6215 aliases. */
6216 if (speed)
6217 *cost += extra_cost->alu.shift;
6219 /* We can incorporate zero/sign extend for free. */
6220 if (GET_CODE (op0) == ZERO_EXTEND
6221 || GET_CODE (op0) == SIGN_EXTEND)
6222 op0 = XEXP (op0, 0);
6224 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6225 return true;
6227 else
6229 /* LSLV. */
6230 if (speed)
6231 *cost += extra_cost->alu.shift_reg;
6233 return false; /* All arguments need to be in registers. */
6236 case ROTATE:
6237 case ROTATERT:
6238 case LSHIFTRT:
6239 case ASHIFTRT:
6240 op0 = XEXP (x, 0);
6241 op1 = XEXP (x, 1);
6243 if (CONST_INT_P (op1))
6245 /* ASR (immediate) and friends. */
6246 if (speed)
6247 *cost += extra_cost->alu.shift;
6249 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6250 return true;
6252 else
6255 /* ASR (register) and friends. */
6256 if (speed)
6257 *cost += extra_cost->alu.shift_reg;
6259 return false; /* All arguments need to be in registers. */
6262 case SYMBOL_REF:
6264 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
6266 /* LDR. */
6267 if (speed)
6268 *cost += extra_cost->ldst.load;
6270 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6271 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6273 /* ADRP, followed by ADD. */
6274 *cost += COSTS_N_INSNS (1);
6275 if (speed)
6276 *cost += 2 * extra_cost->alu.arith;
6278 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6279 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6281 /* ADR. */
6282 if (speed)
6283 *cost += extra_cost->alu.arith;
6286 if (flag_pic)
6288 /* One extra load instruction, after accessing the GOT. */
6289 *cost += COSTS_N_INSNS (1);
6290 if (speed)
6291 *cost += extra_cost->ldst.load;
6293 return true;
6295 case HIGH:
6296 case LO_SUM:
6297 /* ADRP/ADD (immediate). */
6298 if (speed)
6299 *cost += extra_cost->alu.arith;
6300 return true;
6302 case ZERO_EXTRACT:
6303 case SIGN_EXTRACT:
6304 /* UBFX/SBFX. */
6305 if (speed)
6306 *cost += extra_cost->alu.bfx;
6308 /* We can trust that the immediates used will be correct (there
6309 are no by-register forms), so we need only cost op0. */
6310 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6311 return true;
6313 case MULT:
6314 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6315 /* aarch64_rtx_mult_cost always handles recursion to its
6316 operands. */
6317 return true;
6319 case MOD:
6320 case UMOD:
6321 if (speed)
6323 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6324 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6325 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6326 else if (GET_MODE (x) == DFmode)
6327 *cost += (extra_cost->fp[1].mult
6328 + extra_cost->fp[1].div);
6329 else if (GET_MODE (x) == SFmode)
6330 *cost += (extra_cost->fp[0].mult
6331 + extra_cost->fp[0].div);
6333 return false; /* All arguments need to be in registers. */
6335 case DIV:
6336 case UDIV:
6337 case SQRT:
6338 if (speed)
6340 if (GET_MODE_CLASS (mode) == MODE_INT)
6341 /* There is no integer SQRT, so only DIV and UDIV can get
6342 here. */
6343 *cost += extra_cost->mult[mode == DImode].idiv;
6344 else
6345 *cost += extra_cost->fp[mode == DFmode].div;
6347 return false; /* All arguments need to be in registers. */
6349 case IF_THEN_ELSE:
6350 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6351 XEXP (x, 2), cost, speed);
6353 case EQ:
6354 case NE:
6355 case GT:
6356 case GTU:
6357 case LT:
6358 case LTU:
6359 case GE:
6360 case GEU:
6361 case LE:
6362 case LEU:
6364 return false; /* All arguments must be in registers. */
6366 case FMA:
6367 op0 = XEXP (x, 0);
6368 op1 = XEXP (x, 1);
6369 op2 = XEXP (x, 2);
6371 if (speed)
6372 *cost += extra_cost->fp[mode == DFmode].fma;
6374 /* FMSUB, FNMADD, and FNMSUB are free. */
6375 if (GET_CODE (op0) == NEG)
6376 op0 = XEXP (op0, 0);
6378 if (GET_CODE (op2) == NEG)
6379 op2 = XEXP (op2, 0);
6381 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6382 and the by-element operand as operand 0. */
6383 if (GET_CODE (op1) == NEG)
6384 op1 = XEXP (op1, 0);
6386 /* Catch vector-by-element operations. The by-element operand can
6387 either be (vec_duplicate (vec_select (x))) or just
6388 (vec_select (x)), depending on whether we are multiplying by
6389 a vector or a scalar.
6391 Canonicalization is not very good in these cases, FMA4 will put the
6392 by-element operand as operand 0, FNMA4 will have it as operand 1. */
6393 if (GET_CODE (op0) == VEC_DUPLICATE)
6394 op0 = XEXP (op0, 0);
6395 else if (GET_CODE (op1) == VEC_DUPLICATE)
6396 op1 = XEXP (op1, 0);
6398 if (GET_CODE (op0) == VEC_SELECT)
6399 op0 = XEXP (op0, 0);
6400 else if (GET_CODE (op1) == VEC_SELECT)
6401 op1 = XEXP (op1, 0);
6403 /* If the remaining parameters are not registers,
6404 get the cost to put them into registers. */
6405 *cost += rtx_cost (op0, FMA, 0, speed);
6406 *cost += rtx_cost (op1, FMA, 1, speed);
6407 *cost += rtx_cost (op2, FMA, 2, speed);
6408 return true;
6410 case FLOAT_EXTEND:
6411 if (speed)
6412 *cost += extra_cost->fp[mode == DFmode].widen;
6413 return false;
6415 case FLOAT_TRUNCATE:
6416 if (speed)
6417 *cost += extra_cost->fp[mode == DFmode].narrow;
6418 return false;
6420 case FIX:
6421 case UNSIGNED_FIX:
6422 x = XEXP (x, 0);
6423 /* Strip the rounding part. They will all be implemented
6424 by the fcvt* family of instructions anyway. */
6425 if (GET_CODE (x) == UNSPEC)
6427 unsigned int uns_code = XINT (x, 1);
6429 if (uns_code == UNSPEC_FRINTA
6430 || uns_code == UNSPEC_FRINTM
6431 || uns_code == UNSPEC_FRINTN
6432 || uns_code == UNSPEC_FRINTP
6433 || uns_code == UNSPEC_FRINTZ)
6434 x = XVECEXP (x, 0, 0);
6437 if (speed)
6438 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6440 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6441 return true;
6443 case ABS:
6444 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6446 op0 = XEXP (x, 0);
6448 /* FABD, which is analogous to FADD. */
6449 if (GET_CODE (op0) == MINUS)
6451 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
6452 + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6453 if (speed)
6454 *cost += extra_cost->fp[mode == DFmode].addsub;
6456 return true;
6458 /* Simple FABS is analogous to FNEG. */
6459 if (speed)
6460 *cost += extra_cost->fp[mode == DFmode].neg;
6462 else
6464 /* Integer ABS will either be split into
6465 two arithmetic instructions, or will be an ABS
6466 (scalar), which we don't model. */
6467 *cost = COSTS_N_INSNS (2);
6468 if (speed)
6469 *cost += 2 * extra_cost->alu.arith;
6471 return false;
6473 case SMAX:
6474 case SMIN:
6475 if (speed)
6477 /* FMAXNM/FMINNM/FMAX/FMIN.
6478 TODO: This may not be accurate for all implementations, but
6479 we do not model this in the cost tables. */
6480 *cost += extra_cost->fp[mode == DFmode].addsub;
6482 return false;
6484 case UNSPEC:
6485 /* The floating point round to integer frint* instructions. */
6486 if (aarch64_frint_unspec_p (XINT (x, 1)))
6488 if (speed)
6489 *cost += extra_cost->fp[mode == DFmode].roundint;
6491 return false;
6494 if (XINT (x, 1) == UNSPEC_RBIT)
6496 if (speed)
6497 *cost += extra_cost->alu.rev;
6499 return false;
6501 break;
6503 case TRUNCATE:
6505 /* Decompose <su>muldi3_highpart. */
6506 if (/* (truncate:DI */
6507 mode == DImode
6508 /* (lshiftrt:TI */
6509 && GET_MODE (XEXP (x, 0)) == TImode
6510 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6511 /* (mult:TI */
6512 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6513 /* (ANY_EXTEND:TI (reg:DI))
6514 (ANY_EXTEND:TI (reg:DI))) */
6515 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6516 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6517 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6518 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6519 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6520 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6521 /* (const_int 64) */
6522 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6523 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6525 /* UMULH/SMULH. */
6526 if (speed)
6527 *cost += extra_cost->mult[mode == DImode].extend;
6528 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6529 MULT, 0, speed);
6530 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6531 MULT, 1, speed);
6532 return true;
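/* Illustrative shape of the RTL matched above for an unsigned highpart
multiply:
  (truncate:DI
    (lshiftrt:TI
      (mult:TI (zero_extend:TI (reg:DI x)) (zero_extend:TI (reg:DI y)))
      (const_int 64)))
which is costed as a single UMULH of the two DImode source registers. */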
6535 /* Fall through. */
6536 default:
6537 break;
6540 if (dump_file && (dump_flags & TDF_DETAILS))
6541 fprintf (dump_file,
6542 "\nFailed to cost RTX. Assuming default cost.\n");
6544 return true;
6547 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
6548 calculated for X. This cost is stored in *COST. Returns true
6549 if the total cost of X was calculated. */
6550 static bool
6551 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6552 int param, int *cost, bool speed)
6554 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6556 if (dump_file && (dump_flags & TDF_DETAILS))
6558 print_rtl_single (dump_file, x);
6559 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6560 speed ? "Hot" : "Cold",
6561 *cost, result ? "final" : "partial");
6564 return result;
6567 static int
6568 aarch64_register_move_cost (machine_mode mode,
6569 reg_class_t from_i, reg_class_t to_i)
6571 enum reg_class from = (enum reg_class) from_i;
6572 enum reg_class to = (enum reg_class) to_i;
6573 const struct cpu_regmove_cost *regmove_cost
6574 = aarch64_tune_params->regmove_cost;
6576 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6577 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6578 to = GENERAL_REGS;
6580 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6581 from = GENERAL_REGS;
6583 /* Moving between a GPR and the stack costs the same as GP2GP. */
6584 if ((from == GENERAL_REGS && to == STACK_REG)
6585 || (to == GENERAL_REGS && from == STACK_REG))
6586 return regmove_cost->GP2GP;
6588 /* To/From the stack register, we move via the gprs. */
6589 if (to == STACK_REG || from == STACK_REG)
6590 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6591 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6593 if (GET_MODE_SIZE (mode) == 16)
6595 /* 128-bit operations on general registers require 2 instructions. */
6596 if (from == GENERAL_REGS && to == GENERAL_REGS)
6597 return regmove_cost->GP2GP * 2;
6598 else if (from == GENERAL_REGS)
6599 return regmove_cost->GP2FP * 2;
6600 else if (to == GENERAL_REGS)
6601 return regmove_cost->FP2GP * 2;
6603 /* When AdvSIMD instructions are disabled it is not possible to move
6604 a 128-bit value directly between Q registers. This is handled in
6605 secondary reload. A general register is used as a scratch to move
6606 the upper DI value and the lower DI value is moved directly,
6607 hence the cost is the sum of three moves. */
6608 if (! TARGET_SIMD)
6609 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6611 return regmove_cost->FP2FP;
6614 if (from == GENERAL_REGS && to == GENERAL_REGS)
6615 return regmove_cost->GP2GP;
6616 else if (from == GENERAL_REGS)
6617 return regmove_cost->GP2FP;
6618 else if (to == GENERAL_REGS)
6619 return regmove_cost->FP2GP;
6621 return regmove_cost->FP2FP;
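/* For example, moving a 16-byte (TImode or 128-bit vector) value between
two general registers is costed as 2 * GP2GP and moving it from a general
register to an FP/SIMD register as 2 * GP2FP, since two 64-bit transfers
are needed; smaller modes use the plain GP2GP/GP2FP/FP2GP/FP2FP costs. */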
6624 static int
6625 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6626 reg_class_t rclass ATTRIBUTE_UNUSED,
6627 bool in ATTRIBUTE_UNUSED)
6629 return aarch64_tune_params->memmov_cost;
6632 /* Return the number of instructions that can be issued per cycle. */
6633 static int
6634 aarch64_sched_issue_rate (void)
6636 return aarch64_tune_params->issue_rate;
6639 static int
6640 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6642 int issue_rate = aarch64_sched_issue_rate ();
6644 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6647 /* Vectorizer cost model target hooks. */
6649 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6650 static int
6651 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6652 tree vectype,
6653 int misalign ATTRIBUTE_UNUSED)
6655 unsigned elements;
6657 switch (type_of_cost)
6659 case scalar_stmt:
6660 return aarch64_tune_params->vec_costs->scalar_stmt_cost;
6662 case scalar_load:
6663 return aarch64_tune_params->vec_costs->scalar_load_cost;
6665 case scalar_store:
6666 return aarch64_tune_params->vec_costs->scalar_store_cost;
6668 case vector_stmt:
6669 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6671 case vector_load:
6672 return aarch64_tune_params->vec_costs->vec_align_load_cost;
6674 case vector_store:
6675 return aarch64_tune_params->vec_costs->vec_store_cost;
6677 case vec_to_scalar:
6678 return aarch64_tune_params->vec_costs->vec_to_scalar_cost;
6680 case scalar_to_vec:
6681 return aarch64_tune_params->vec_costs->scalar_to_vec_cost;
6683 case unaligned_load:
6684 return aarch64_tune_params->vec_costs->vec_unalign_load_cost;
6686 case unaligned_store:
6687 return aarch64_tune_params->vec_costs->vec_unalign_store_cost;
6689 case cond_branch_taken:
6690 return aarch64_tune_params->vec_costs->cond_taken_branch_cost;
6692 case cond_branch_not_taken:
6693 return aarch64_tune_params->vec_costs->cond_not_taken_branch_cost;
6695 case vec_perm:
6696 case vec_promote_demote:
6697 return aarch64_tune_params->vec_costs->vec_stmt_cost;
6699 case vec_construct:
6700 elements = TYPE_VECTOR_SUBPARTS (vectype);
6701 return elements / 2 + 1;
6703 default:
6704 gcc_unreachable ();
6708 /* Implement targetm.vectorize.add_stmt_cost. */
6709 static unsigned
6710 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
6711 struct _stmt_vec_info *stmt_info, int misalign,
6712 enum vect_cost_model_location where)
6714 unsigned *cost = (unsigned *) data;
6715 unsigned retval = 0;
6717 if (flag_vect_cost_model)
6719 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
6720 int stmt_cost =
6721 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
6723 /* Statements in an inner loop relative to the loop being
6724 vectorized are weighted more heavily. The value here is
6725 a function (linear for now) of the loop nest level. */
6726 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
6728 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
6729 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
6730 unsigned nest_level = loop_depth (loop);
6732 count *= nest_level;
6735 retval = (unsigned) (count * stmt_cost);
6736 cost[where] += retval;
6739 return retval;
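/* For example, COUNT occurrences of a vector_stmt with per-statement cost C
contribute COUNT * C to the vect_body bucket of the cost array; if the
statement sits in a loop nested inside the loop being vectorized, that
contribution is additionally scaled by the loop depth computed above. */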
6742 static void initialize_aarch64_code_model (void);
6744 /* Parse the architecture extension string. */
6746 static void
6747 aarch64_parse_extension (char *str)
6749 /* The extension string is parsed left to right. */
6750 const struct aarch64_option_extension *opt = NULL;
6752 /* Flag to say whether we are adding or removing an extension. */
6753 int adding_ext = -1;
6755 while (str != NULL && *str != 0)
6757 char *ext;
6758 size_t len;
6760 str++;
6761 ext = strchr (str, '+');
6763 if (ext != NULL)
6764 len = ext - str;
6765 else
6766 len = strlen (str);
6768 if (len >= 2 && strncmp (str, "no", 2) == 0)
6770 adding_ext = 0;
6771 len -= 2;
6772 str += 2;
6774 else if (len > 0)
6775 adding_ext = 1;
6777 if (len == 0)
6779 error ("missing feature modifier after %qs", adding_ext ? "+"
6780 : "+no");
6781 return;
6784 /* Scan over the extensions table trying to find an exact match. */
6785 for (opt = all_extensions; opt->name != NULL; opt++)
6787 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
6789 /* Add or remove the extension. */
6790 if (adding_ext)
6791 aarch64_isa_flags |= opt->flags_on;
6792 else
6793 aarch64_isa_flags &= ~(opt->flags_off);
6794 break;
6798 if (opt->name == NULL)
6800 /* Extension not found in list. */
6801 error ("unknown feature modifier %qs", str);
6802 return;
6805 str = ext;
6808 return;
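/* As an illustration, for -march=armv8-a+crc+nocrypto the string handed to
this function is "+crc+nocrypto": the first iteration sets the CRC feature
flags and the second clears the crypto ones. */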
6811 /* Parse the ARCH string. */
6813 static void
6814 aarch64_parse_arch (void)
6816 char *ext;
6817 const struct processor *arch;
6818 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
6819 size_t len;
6821 strcpy (str, aarch64_arch_string);
6823 ext = strchr (str, '+');
6825 if (ext != NULL)
6826 len = ext - str;
6827 else
6828 len = strlen (str);
6830 if (len == 0)
6832 error ("missing arch name in -march=%qs", str);
6833 return;
6836 /* Loop through the list of supported ARCHs to find a match. */
6837 for (arch = all_architectures; arch->name != NULL; arch++)
6839 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
6841 selected_arch = arch;
6842 aarch64_isa_flags = selected_arch->flags;
6844 if (!selected_cpu)
6845 selected_cpu = &all_cores[selected_arch->core];
6847 if (ext != NULL)
6849 /* ARCH string contains at least one extension. */
6850 aarch64_parse_extension (ext);
6853 if (strcmp (selected_arch->arch, selected_cpu->arch))
6855 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
6856 selected_cpu->name, selected_arch->name);
6859 return;
6863 /* ARCH name not found in list. */
6864 error ("unknown value %qs for -march", str);
6865 return;
6868 /* Parse the CPU string. */
6870 static void
6871 aarch64_parse_cpu (void)
6873 char *ext;
6874 const struct processor *cpu;
6875 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
6876 size_t len;
6878 strcpy (str, aarch64_cpu_string);
6880 ext = strchr (str, '+');
6882 if (ext != NULL)
6883 len = ext - str;
6884 else
6885 len = strlen (str);
6887 if (len == 0)
6889 error ("missing cpu name in -mcpu=%qs", str);
6890 return;
6893 /* Loop through the list of supported CPUs to find a match. */
6894 for (cpu = all_cores; cpu->name != NULL; cpu++)
6896 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
6898 selected_cpu = cpu;
6899 aarch64_isa_flags = selected_cpu->flags;
6901 if (ext != NULL)
6903 /* CPU string contains at least one extension. */
6904 aarch64_parse_extension (ext);
6907 return;
6911 /* CPU name not found in list. */
6912 error ("unknown value %qs for -mcpu", str);
6913 return;
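/* For example, -mcpu=cortex-a57+crypto selects the cortex-a57 entry from
all_cores, takes its ISA flags, and then lets aarch64_parse_extension
enable the crypto extension on top of them. */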
6916 /* Parse the TUNE string. */
6918 static void
6919 aarch64_parse_tune (void)
6921 const struct processor *cpu;
6922 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
6923 strcpy (str, aarch64_tune_string);
6925 /* Loop through the list of supported CPUs to find a match. */
6926 for (cpu = all_cores; cpu->name != NULL; cpu++)
6928 if (strcmp (cpu->name, str) == 0)
6930 selected_tune = cpu;
6931 return;
6935 /* CPU name not found in list. */
6936 error ("unknown value %qs for -mtune", str);
6937 return;
6941 /* Implement TARGET_OPTION_OVERRIDE. */
6943 static void
6944 aarch64_override_options (void)
6946 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
6947 If either of -march or -mtune is given, they override their
6948 respective component of -mcpu.
6950 So, first parse AARCH64_CPU_STRING, then the others. Be careful
6951 with -march: if -mcpu is not present on the command line, -march
6952 must set a sensible default CPU. */
6953 if (aarch64_cpu_string)
6955 aarch64_parse_cpu ();
6958 if (aarch64_arch_string)
6960 aarch64_parse_arch ();
6963 if (aarch64_tune_string)
6965 aarch64_parse_tune ();
6968 #ifndef HAVE_AS_MABI_OPTION
6969 /* The compiler may have been configured with 2.23.* binutils, which does
6970 not have support for ILP32. */
6971 if (TARGET_ILP32)
6972 error ("Assembler does not support -mabi=ilp32");
6973 #endif
6975 initialize_aarch64_code_model ();
6977 aarch64_build_bitmask_table ();
6979 /* This target defaults to strict volatile bitfields. */
6980 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
6981 flag_strict_volatile_bitfields = 1;
6983 /* If the user did not specify a processor, choose the default
6984 one for them. This will be the CPU set during configuration using
6985 --with-cpu, otherwise it is "generic". */
6986 if (!selected_cpu)
6988 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
6989 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
6992 gcc_assert (selected_cpu);
6994 if (!selected_tune)
6995 selected_tune = selected_cpu;
6997 aarch64_tune_flags = selected_tune->flags;
6998 aarch64_tune = selected_tune->core;
6999 aarch64_tune_params = selected_tune->tune;
7000 aarch64_architecture_version = selected_cpu->architecture_version;
7002 if (aarch64_fix_a53_err835769 == 2)
7004 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7005 aarch64_fix_a53_err835769 = 1;
7006 #else
7007 aarch64_fix_a53_err835769 = 0;
7008 #endif
7011 /* If not optimizing for size, set the default
7012 alignment to what the target wants. */
7013 if (!optimize_size)
7015 if (align_loops <= 0)
7016 align_loops = aarch64_tune_params->loop_align;
7017 if (align_jumps <= 0)
7018 align_jumps = aarch64_tune_params->jump_align;
7019 if (align_functions <= 0)
7020 align_functions = aarch64_tune_params->function_align;
7023 if (AARCH64_TUNE_FMA_STEERING)
7024 aarch64_register_fma_steering ();
7026 aarch64_override_options_after_change ();
7029 /* Implement targetm.override_options_after_change. */
7031 static void
7032 aarch64_override_options_after_change (void)
7034 if (flag_omit_frame_pointer)
7035 flag_omit_leaf_frame_pointer = false;
7036 else if (flag_omit_leaf_frame_pointer)
7037 flag_omit_frame_pointer = true;
7040 static struct machine_function *
7041 aarch64_init_machine_status (void)
7043 struct machine_function *machine;
7044 machine = ggc_cleared_alloc<machine_function> ();
7045 return machine;
7048 void
7049 aarch64_init_expanders (void)
7051 init_machine_status = aarch64_init_machine_status;
7054 /* A checking mechanism for the implementation of the various code models. */
7055 static void
7056 initialize_aarch64_code_model (void)
7058 if (flag_pic)
7060 switch (aarch64_cmodel_var)
7062 case AARCH64_CMODEL_TINY:
7063 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7064 break;
7065 case AARCH64_CMODEL_SMALL:
7066 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7067 break;
7068 case AARCH64_CMODEL_LARGE:
7069 sorry ("code model %qs with -f%s", "large",
7070 flag_pic > 1 ? "PIC" : "pic");
7071 default:
7072 gcc_unreachable ();
7075 else
7076 aarch64_cmodel = aarch64_cmodel_var;
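/* For example, -mcmodel=small together with -fpic selects
AARCH64_CMODEL_SMALL_PIC, whereas -mcmodel=large with -fpic is rejected
above with a sorry () diagnostic. */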
7079 /* Return true if SYMBOL_REF X binds locally. */
7081 static bool
7082 aarch64_symbol_binds_local_p (const_rtx x)
7084 return (SYMBOL_REF_DECL (x)
7085 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7086 : SYMBOL_REF_LOCAL_P (x));
7089 /* Return true if SYMBOL_REF X is thread local */
7090 static bool
7091 aarch64_tls_symbol_p (rtx x)
7093 if (! TARGET_HAVE_TLS)
7094 return false;
7096 if (GET_CODE (x) != SYMBOL_REF)
7097 return false;
7099 return SYMBOL_REF_TLS_MODEL (x) != 0;
7102 /* Classify a TLS symbol into one of the TLS kinds. */
7103 enum aarch64_symbol_type
7104 aarch64_classify_tls_symbol (rtx x)
7106 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7108 switch (tls_kind)
7110 case TLS_MODEL_GLOBAL_DYNAMIC:
7111 case TLS_MODEL_LOCAL_DYNAMIC:
7112 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7114 case TLS_MODEL_INITIAL_EXEC:
7115 return SYMBOL_SMALL_GOTTPREL;
7117 case TLS_MODEL_LOCAL_EXEC:
7118 return SYMBOL_SMALL_TPREL;
7120 case TLS_MODEL_EMULATED:
7121 case TLS_MODEL_NONE:
7122 return SYMBOL_FORCE_TO_MEM;
7124 default:
7125 gcc_unreachable ();
7129 /* Return the method that should be used to access SYMBOL_REF or
7130 LABEL_REF X in context CONTEXT. */
7132 enum aarch64_symbol_type
7133 aarch64_classify_symbol (rtx x, rtx offset,
7134 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7136 if (GET_CODE (x) == LABEL_REF)
7138 switch (aarch64_cmodel)
7140 case AARCH64_CMODEL_LARGE:
7141 return SYMBOL_FORCE_TO_MEM;
7143 case AARCH64_CMODEL_TINY_PIC:
7144 case AARCH64_CMODEL_TINY:
7145 return SYMBOL_TINY_ABSOLUTE;
7147 case AARCH64_CMODEL_SMALL_PIC:
7148 case AARCH64_CMODEL_SMALL:
7149 return SYMBOL_SMALL_ABSOLUTE;
7151 default:
7152 gcc_unreachable ();
7156 if (GET_CODE (x) == SYMBOL_REF)
7158 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7159 return SYMBOL_FORCE_TO_MEM;
7161 if (aarch64_tls_symbol_p (x))
7162 return aarch64_classify_tls_symbol (x);
7164 switch (aarch64_cmodel)
7166 case AARCH64_CMODEL_TINY:
7167 /* When we retrieve a symbol + offset address, we have to make sure
7168 the offset does not cause overflow of the final address. But
7169 we have no way of knowing the address of the symbol at compile time,
7170 so we can't accurately say whether the distance between the PC and
7171 symbol + offset is outside the addressable range of +/-1M in the
7172 TINY code model. So we rely on images not being greater than
7173 1M, cap the offset at 1M, and require anything beyond 1M to
7174 be loaded using an alternative mechanism. */
7175 if (SYMBOL_REF_WEAK (x)
7176 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7177 return SYMBOL_FORCE_TO_MEM;
7178 return SYMBOL_TINY_ABSOLUTE;
7180 case AARCH64_CMODEL_SMALL:
7181 /* Same reasoning as the tiny code model, but the offset cap here is
7182 4G. */
7183 if (SYMBOL_REF_WEAK (x)
7184 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7185 HOST_WIDE_INT_C (4294967264)))
7186 return SYMBOL_FORCE_TO_MEM;
7187 return SYMBOL_SMALL_ABSOLUTE;
7189 case AARCH64_CMODEL_TINY_PIC:
7190 if (!aarch64_symbol_binds_local_p (x))
7191 return SYMBOL_TINY_GOT;
7192 return SYMBOL_TINY_ABSOLUTE;
7194 case AARCH64_CMODEL_SMALL_PIC:
7195 if (!aarch64_symbol_binds_local_p (x))
7196 return SYMBOL_SMALL_GOT;
7197 return SYMBOL_SMALL_ABSOLUTE;
7199 default:
7200 gcc_unreachable ();
7204 /* By default push everything into the constant pool. */
7205 return SYMBOL_FORCE_TO_MEM;
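/* For example, under the small code model a weak symbol, or a symbol whose
constant offset falls outside the roughly +/-4GB range checked above, is
classified as SYMBOL_FORCE_TO_MEM and is therefore loaded from the constant
pool rather than addressed directly. */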
7208 bool
7209 aarch64_constant_address_p (rtx x)
7211 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7214 bool
7215 aarch64_legitimate_pic_operand_p (rtx x)
7217 if (GET_CODE (x) == SYMBOL_REF
7218 || (GET_CODE (x) == CONST
7219 && GET_CODE (XEXP (x, 0)) == PLUS
7220 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7221 return false;
7223 return true;
7226 /* Return true if X holds either a quarter-precision constant or
7227 a floating-point +0.0 constant. */
7228 static bool
7229 aarch64_valid_floating_const (machine_mode mode, rtx x)
7231 if (!CONST_DOUBLE_P (x))
7232 return false;
7234 /* TODO: We could handle moving 0.0 to a TFmode register,
7235 but first we would like to refactor movtf_aarch64
7236 to be more amenable to splitting moves properly and
7237 to gating correctly on TARGET_SIMD. For now, reject all
7238 constants that are not destined for SFmode or DFmode registers. */
7239 if (!(mode == SFmode || mode == DFmode))
7240 return false;
7242 if (aarch64_float_const_zero_rtx_p (x))
7243 return true;
7244 return aarch64_float_const_representable_p (x);
7247 static bool
7248 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7250 /* Do not allow vector struct mode constants. We could support
7251 0 and -1 easily, but they need support in aarch64-simd.md. */
7252 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7253 return false;
7255 /* This could probably go away because
7256 we now decompose CONST_INTs according to expand_mov_immediate. */
7257 if ((GET_CODE (x) == CONST_VECTOR
7258 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7259 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7260 return !targetm.cannot_force_const_mem (mode, x);
7262 if (GET_CODE (x) == HIGH
7263 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7264 return true;
7266 return aarch64_constant_address_p (x);
7270 aarch64_load_tp (rtx target)
7272 if (!target
7273 || GET_MODE (target) != Pmode
7274 || !register_operand (target, Pmode))
7275 target = gen_reg_rtx (Pmode);
7277 /* Can return in any reg. */
7278 emit_insn (gen_aarch64_load_tp_hard (target));
7279 return target;
7282 /* On AAPCS systems, this is the "struct __va_list". */
7283 static GTY(()) tree va_list_type;
7285 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7286 Return the type to use as __builtin_va_list.
7288 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7290 struct __va_list
7292 void *__stack;
7293 void *__gr_top;
7294 void *__vr_top;
7295 int __gr_offs;
7296 int __vr_offs;
7297 }; */
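/* In this scheme __gr_offs and __vr_offs start out negative (minus the size
of the corresponding register save area, see aarch64_expand_builtin_va_start
below) and are incremented towards zero as register-passed arguments are
consumed; once an offset is non-negative, further arguments of that class
are fetched through __stack instead. */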
7299 static tree
7300 aarch64_build_builtin_va_list (void)
7302 tree va_list_name;
7303 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7305 /* Create the type. */
7306 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7307 /* Give it the required name. */
7308 va_list_name = build_decl (BUILTINS_LOCATION,
7309 TYPE_DECL,
7310 get_identifier ("__va_list"),
7311 va_list_type);
7312 DECL_ARTIFICIAL (va_list_name) = 1;
7313 TYPE_NAME (va_list_type) = va_list_name;
7314 TYPE_STUB_DECL (va_list_type) = va_list_name;
7316 /* Create the fields. */
7317 f_stack = build_decl (BUILTINS_LOCATION,
7318 FIELD_DECL, get_identifier ("__stack"),
7319 ptr_type_node);
7320 f_grtop = build_decl (BUILTINS_LOCATION,
7321 FIELD_DECL, get_identifier ("__gr_top"),
7322 ptr_type_node);
7323 f_vrtop = build_decl (BUILTINS_LOCATION,
7324 FIELD_DECL, get_identifier ("__vr_top"),
7325 ptr_type_node);
7326 f_groff = build_decl (BUILTINS_LOCATION,
7327 FIELD_DECL, get_identifier ("__gr_offs"),
7328 integer_type_node);
7329 f_vroff = build_decl (BUILTINS_LOCATION,
7330 FIELD_DECL, get_identifier ("__vr_offs"),
7331 integer_type_node);
7333 DECL_ARTIFICIAL (f_stack) = 1;
7334 DECL_ARTIFICIAL (f_grtop) = 1;
7335 DECL_ARTIFICIAL (f_vrtop) = 1;
7336 DECL_ARTIFICIAL (f_groff) = 1;
7337 DECL_ARTIFICIAL (f_vroff) = 1;
7339 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7340 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7341 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7342 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7343 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7345 TYPE_FIELDS (va_list_type) = f_stack;
7346 DECL_CHAIN (f_stack) = f_grtop;
7347 DECL_CHAIN (f_grtop) = f_vrtop;
7348 DECL_CHAIN (f_vrtop) = f_groff;
7349 DECL_CHAIN (f_groff) = f_vroff;
7351 /* Compute its layout. */
7352 layout_type (va_list_type);
7354 return va_list_type;
7357 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7358 static void
7359 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7361 const CUMULATIVE_ARGS *cum;
7362 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7363 tree stack, grtop, vrtop, groff, vroff;
7364 tree t;
7365 int gr_save_area_size;
7366 int vr_save_area_size;
7367 int vr_offset;
7369 cum = &crtl->args.info;
7370 gr_save_area_size
7371 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7372 vr_save_area_size
7373 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7375 if (TARGET_GENERAL_REGS_ONLY)
7377 if (cum->aapcs_nvrn > 0)
7378 sorry ("%qs and floating point or vector arguments",
7379 "-mgeneral-regs-only");
7380 vr_save_area_size = 0;
7383 f_stack = TYPE_FIELDS (va_list_type_node);
7384 f_grtop = DECL_CHAIN (f_stack);
7385 f_vrtop = DECL_CHAIN (f_grtop);
7386 f_groff = DECL_CHAIN (f_vrtop);
7387 f_vroff = DECL_CHAIN (f_groff);
7389 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7390 NULL_TREE);
7391 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7392 NULL_TREE);
7393 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7394 NULL_TREE);
7395 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7396 NULL_TREE);
7397 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7398 NULL_TREE);
7400 /* Emit code to initialize STACK, which points to the next varargs stack
7401 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7402 by named arguments. STACK is 8-byte aligned. */
7403 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7404 if (cum->aapcs_stack_size > 0)
7405 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7406 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7407 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7409 /* Emit code to initialize GRTOP, the top of the GR save area.
7410 virtual_incoming_args_rtx should have been 16-byte aligned. */
7411 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7412 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7413 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7415 /* Emit code to initialize VRTOP, the top of the VR save area.
7416 This address is gr_save_area_bytes below GRTOP, rounded
7417 down to the next 16-byte boundary. */
7418 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7419 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7420 STACK_BOUNDARY / BITS_PER_UNIT);
7422 if (vr_offset)
7423 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7424 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7425 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7427 /* Emit code to initialize GROFF, the offset from GRTOP of the
7428 next GPR argument. */
7429 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7430 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7431 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7433 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7434 of the next VR argument. */
7435 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7436 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7437 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
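/* As a worked example: for a variadic function whose only named argument is
a single integer, cum->aapcs_ncrn is 1, so the code above initializes
__gr_offs to -(NUM_ARG_REGS - 1) * UNITS_PER_WORD = -56 and __gr_top to the
address just past the save area holding x1-x7, while __vr_offs is set to
minus the size of the saved V register area (or 0 with
-mgeneral-regs-only). */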
7440 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7442 static tree
7443 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7444 gimple_seq *post_p ATTRIBUTE_UNUSED)
7446 tree addr;
7447 bool indirect_p;
7448 bool is_ha; /* is HFA or HVA. */
7449 bool dw_align; /* double-word align. */
7450 machine_mode ag_mode = VOIDmode;
7451 int nregs;
7452 machine_mode mode;
7454 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7455 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7456 HOST_WIDE_INT size, rsize, adjust, align;
7457 tree t, u, cond1, cond2;
7459 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7460 if (indirect_p)
7461 type = build_pointer_type (type);
7463 mode = TYPE_MODE (type);
7465 f_stack = TYPE_FIELDS (va_list_type_node);
7466 f_grtop = DECL_CHAIN (f_stack);
7467 f_vrtop = DECL_CHAIN (f_grtop);
7468 f_groff = DECL_CHAIN (f_vrtop);
7469 f_vroff = DECL_CHAIN (f_groff);
7471 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7472 f_stack, NULL_TREE);
7473 size = int_size_in_bytes (type);
7474 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7476 dw_align = false;
7477 adjust = 0;
7478 if (aarch64_vfp_is_call_or_return_candidate (mode,
7479 type,
7480 &ag_mode,
7481 &nregs,
7482 &is_ha))
7484 /* TYPE passed in fp/simd registers. */
7485 if (TARGET_GENERAL_REGS_ONLY)
7486 sorry ("%qs and floating point or vector arguments",
7487 "-mgeneral-regs-only");
7489 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
7490 unshare_expr (valist), f_vrtop, NULL_TREE);
7491 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
7492 unshare_expr (valist), f_vroff, NULL_TREE);
7494 rsize = nregs * UNITS_PER_VREG;
7496 if (is_ha)
7498 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
7499 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
7501 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
7502 && size < UNITS_PER_VREG)
7504 adjust = UNITS_PER_VREG - size;
7507 else
7509 /* TYPE passed in general registers. */
7510 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
7511 unshare_expr (valist), f_grtop, NULL_TREE);
7512 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
7513 unshare_expr (valist), f_groff, NULL_TREE);
7514 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
7515 nregs = rsize / UNITS_PER_WORD;
7517 if (align > 8)
7518 dw_align = true;
7520 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7521 && size < UNITS_PER_WORD)
7523 adjust = UNITS_PER_WORD - size;
7527 /* Get a local temporary for the field value. */
7528 off = get_initialized_tmp_var (f_off, pre_p, NULL);
7530 /* Emit code to branch if off >= 0. */
7531 t = build2 (GE_EXPR, boolean_type_node, off,
7532 build_int_cst (TREE_TYPE (off), 0));
7533 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
7535 if (dw_align)
7537 /* Emit: offs = (offs + 15) & -16. */
7538 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7539 build_int_cst (TREE_TYPE (off), 15));
7540 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
7541 build_int_cst (TREE_TYPE (off), -16));
7542 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
7544 else
7545 roundup = NULL;
7547 /* Update ap.__[g|v]r_offs */
7548 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
7549 build_int_cst (TREE_TYPE (off), rsize));
7550 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
7552 /* String up. */
7553 if (roundup)
7554 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7556 /* [cond2] if (ap.__[g|v]r_offs > 0) */
7557 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
7558 build_int_cst (TREE_TYPE (f_off), 0));
7559 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
7561 /* String up: make sure the assignment happens before the use. */
7562 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
7563 COND_EXPR_ELSE (cond1) = t;
7565 /* Prepare the trees handling the argument that is passed on the stack;
7566 the top-level node will be stored in ON_STACK. */
7567 arg = get_initialized_tmp_var (stack, pre_p, NULL);
7568 if (align > 8)
7570 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
7571 t = fold_convert (intDI_type_node, arg);
7572 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7573 build_int_cst (TREE_TYPE (t), 15));
7574 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7575 build_int_cst (TREE_TYPE (t), -16));
7576 t = fold_convert (TREE_TYPE (arg), t);
7577 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
7579 else
7580 roundup = NULL;
7581 /* Advance ap.__stack */
7582 t = fold_convert (intDI_type_node, arg);
7583 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
7584 build_int_cst (TREE_TYPE (t), size + 7));
7585 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
7586 build_int_cst (TREE_TYPE (t), -8));
7587 t = fold_convert (TREE_TYPE (arg), t);
7588 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
7589 /* String up roundup and advance. */
7590 if (roundup)
7591 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
7592 /* String up with arg */
7593 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
7594 /* Big-endianness related address adjustment. */
7595 if (BLOCK_REG_PADDING (mode, type, 1) == downward
7596 && size < UNITS_PER_WORD)
7598 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
7599 size_int (UNITS_PER_WORD - size));
7600 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
7603 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
7604 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
7606 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
7607 t = off;
7608 if (adjust)
7609 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
7610 build_int_cst (TREE_TYPE (off), adjust));
7612 t = fold_convert (sizetype, t);
7613 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
7615 if (is_ha)
7617 /* type ha; // treat as "struct {ftype field[n];}"
7618 ... [computing offs]
7619 for (i = 0; i < nregs; ++i, offs += 16)
7620 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
7621 return ha; */
7622 int i;
7623 tree tmp_ha, field_t, field_ptr_t;
7625 /* Declare a local variable. */
7626 tmp_ha = create_tmp_var_raw (type, "ha");
7627 gimple_add_tmp_var (tmp_ha);
7629 /* Establish the base type. */
7630 switch (ag_mode)
7632 case SFmode:
7633 field_t = float_type_node;
7634 field_ptr_t = float_ptr_type_node;
7635 break;
7636 case DFmode:
7637 field_t = double_type_node;
7638 field_ptr_t = double_ptr_type_node;
7639 break;
7640 case TFmode:
7641 field_t = long_double_type_node;
7642 field_ptr_t = long_double_ptr_type_node;
7643 break;
7644 /* Half precision and quad precision are not fully supported yet. Enable
7645 the following code once that support is complete; we still need to find
7646 the correct type node for __fp16 *. */
7647 #if 0
7648 case HFmode:
7649 field_t = float_type_node;
7650 field_ptr_t = float_ptr_type_node;
7651 break;
7652 #endif
7653 case V2SImode:
7654 case V4SImode:
7656 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
7657 field_t = build_vector_type_for_mode (innertype, ag_mode);
7658 field_ptr_t = build_pointer_type (field_t);
7660 break;
7661 default:
7662 gcc_assert (0);
7665 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
7666 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
7667 addr = t;
7668 t = fold_convert (field_ptr_t, addr);
7669 t = build2 (MODIFY_EXPR, field_t,
7670 build1 (INDIRECT_REF, field_t, tmp_ha),
7671 build1 (INDIRECT_REF, field_t, t));
7673 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
7674 for (i = 1; i < nregs; ++i)
7676 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
7677 u = fold_convert (field_ptr_t, addr);
7678 u = build2 (MODIFY_EXPR, field_t,
7679 build2 (MEM_REF, field_t, tmp_ha,
7680 build_int_cst (field_ptr_t,
7681 (i *
7682 int_size_in_bytes (field_t)))),
7683 build1 (INDIRECT_REF, field_t, u));
7684 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
7687 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
7688 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
7691 COND_EXPR_ELSE (cond2) = t;
7692 addr = fold_convert (build_pointer_type (type), cond1);
7693 addr = build_va_arg_indirect_ref (addr);
7695 if (indirect_p)
7696 addr = build_va_arg_indirect_ref (addr);
7698 return addr;
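/* For instance, a homogeneous floating-point aggregate such as
struct { float x, y, z; } has nregs == 3 and ag_mode == SFmode, so the
is_ha path above copies three single-precision values out of consecutive
16-byte slots of the VR save area into the local temporary "ha" before
returning its address. */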
7701 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
7703 static void
7704 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
7705 tree type, int *pretend_size ATTRIBUTE_UNUSED,
7706 int no_rtl)
7708 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
7709 CUMULATIVE_ARGS local_cum;
7710 int gr_saved, vr_saved;
7712 /* The caller has advanced CUM up to, but not beyond, the last named
7713 argument. Advance a local copy of CUM past the last "real" named
7714 argument, to find out how many registers are left over. */
7715 local_cum = *cum;
7716 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
7718 /* Find out how many registers we need to save. */
7719 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
7720 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
7722 if (TARGET_GENERAL_REGS_ONLY)
7724 if (local_cum.aapcs_nvrn > 0)
7725 sorry ("%qs and floating point or vector arguments",
7726 "-mgeneral-regs-only");
7727 vr_saved = 0;
7730 if (!no_rtl)
7732 if (gr_saved > 0)
7734 rtx ptr, mem;
7736 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
7737 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
7738 - gr_saved * UNITS_PER_WORD);
7739 mem = gen_frame_mem (BLKmode, ptr);
7740 set_mem_alias_set (mem, get_varargs_alias_set ());
7742 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
7743 mem, gr_saved);
7745 if (vr_saved > 0)
7747 /* We can't use move_block_from_reg, because it will use
7748 the wrong mode, storing D regs only. */
7749 machine_mode mode = TImode;
7750 int off, i;
7752 /* Set OFF to the offset from virtual_incoming_args_rtx of
7753 the first vector register. The VR save area lies below
7754 the GR one, and is aligned to 16 bytes. */
7755 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7756 STACK_BOUNDARY / BITS_PER_UNIT);
7757 off -= vr_saved * UNITS_PER_VREG;
7759 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
7761 rtx ptr, mem;
7763 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
7764 mem = gen_frame_mem (mode, ptr);
7765 set_mem_alias_set (mem, get_varargs_alias_set ());
7766 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
7767 off += UNITS_PER_VREG;
7772 /* We don't save the size into *PRETEND_SIZE because we want to avoid
7773 any complication of having crtl->args.pretend_args_size changed. */
7774 cfun->machine->frame.saved_varargs_size
7775 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
7776 STACK_BOUNDARY / BITS_PER_UNIT)
7777 + vr_saved * UNITS_PER_VREG);
7780 static void
7781 aarch64_conditional_register_usage (void)
7783 int i;
7784 if (!TARGET_FLOAT)
7786 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
7788 fixed_regs[i] = 1;
7789 call_used_regs[i] = 1;
7794 /* Walk down the type tree of TYPE counting consecutive base elements.
7795 If *MODEP is VOIDmode, then set it to the first valid floating point
7796 type. If a non-floating point type is found, or if a floating point
7797 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
7798 otherwise return the count in the sub-tree. */
7799 static int
7800 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
7802 machine_mode mode;
7803 HOST_WIDE_INT size;
7805 switch (TREE_CODE (type))
7807 case REAL_TYPE:
7808 mode = TYPE_MODE (type);
7809 if (mode != DFmode && mode != SFmode && mode != TFmode)
7810 return -1;
7812 if (*modep == VOIDmode)
7813 *modep = mode;
7815 if (*modep == mode)
7816 return 1;
7818 break;
7820 case COMPLEX_TYPE:
7821 mode = TYPE_MODE (TREE_TYPE (type));
7822 if (mode != DFmode && mode != SFmode && mode != TFmode)
7823 return -1;
7825 if (*modep == VOIDmode)
7826 *modep = mode;
7828 if (*modep == mode)
7829 return 2;
7831 break;
7833 case VECTOR_TYPE:
7834 /* Use V2SImode and V4SImode as representatives of all 64-bit
7835 and 128-bit vector types. */
7836 size = int_size_in_bytes (type);
7837 switch (size)
7839 case 8:
7840 mode = V2SImode;
7841 break;
7842 case 16:
7843 mode = V4SImode;
7844 break;
7845 default:
7846 return -1;
7849 if (*modep == VOIDmode)
7850 *modep = mode;
7852 /* Vector modes are considered to be opaque: two vectors are
7853 equivalent for the purposes of being homogeneous aggregates
7854 if they are the same size. */
7855 if (*modep == mode)
7856 return 1;
7858 break;
7860 case ARRAY_TYPE:
7862 int count;
7863 tree index = TYPE_DOMAIN (type);
7865 /* Can't handle incomplete types nor sizes that are not
7866 fixed. */
7867 if (!COMPLETE_TYPE_P (type)
7868 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7869 return -1;
7871 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
7872 if (count == -1
7873 || !index
7874 || !TYPE_MAX_VALUE (index)
7875 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
7876 || !TYPE_MIN_VALUE (index)
7877 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
7878 || count < 0)
7879 return -1;
7881 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
7882 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
7884 /* There must be no padding. */
7885 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7886 return -1;
7888 return count;
7891 case RECORD_TYPE:
7893 int count = 0;
7894 int sub_count;
7895 tree field;
7897 /* Can't handle incomplete types nor sizes that are not
7898 fixed. */
7899 if (!COMPLETE_TYPE_P (type)
7900 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7901 return -1;
7903 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7905 if (TREE_CODE (field) != FIELD_DECL)
7906 continue;
7908 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7909 if (sub_count < 0)
7910 return -1;
7911 count += sub_count;
7914 /* There must be no padding. */
7915 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7916 return -1;
7918 return count;
7921 case UNION_TYPE:
7922 case QUAL_UNION_TYPE:
7924 /* These aren't very interesting except in a degenerate case. */
7925 int count = 0;
7926 int sub_count;
7927 tree field;
7929 /* Can't handle incomplete types nor sizes that are not
7930 fixed. */
7931 if (!COMPLETE_TYPE_P (type)
7932 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
7933 return -1;
7935 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
7937 if (TREE_CODE (field) != FIELD_DECL)
7938 continue;
7940 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
7941 if (sub_count < 0)
7942 return -1;
7943 count = count > sub_count ? count : sub_count;
7946 /* There must be no padding. */
7947 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
7948 return -1;
7950 return count;
7953 default:
7954 break;
7957 return -1;
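/* For example, struct { double d[3]; } yields a count of 3 with *MODEP set
to DFmode, whereas struct { float f; double d; } returns -1 because the
field modes do not agree. */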
7960 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
7961 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
7962 array types. The C99 floating-point complex types are also considered
7963 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
7964 types, which are GCC extensions and out of the scope of AAPCS64, are
7965 treated as composite types here as well.
7967 Note that MODE itself is not sufficient in determining whether a type
7968 is such a composite type or not. This is because
7969 stor-layout.c:compute_record_mode may have already changed the MODE
7970 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
7971 structure with only one field may have its MODE set to the mode of the
7972 field. Also an integer mode whose size matches the size of the
7973 RECORD_TYPE type may be used to substitute the original mode
7974 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
7975 solely relied on. */
7977 static bool
7978 aarch64_composite_type_p (const_tree type,
7979 machine_mode mode)
7981 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
7982 return true;
7984 if (mode == BLKmode
7985 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
7986 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
7987 return true;
7989 return false;
7992 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
7993 type as described in AAPCS64 \S 4.1.2.
7995 See the comment above aarch64_composite_type_p for the notes on MODE. */
7997 static bool
7998 aarch64_short_vector_p (const_tree type,
7999 machine_mode mode)
8001 HOST_WIDE_INT size = -1;
8003 if (type && TREE_CODE (type) == VECTOR_TYPE)
8004 size = int_size_in_bytes (type);
8005 else if (!aarch64_composite_type_p (type, mode)
8006 && (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8007 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT))
8008 size = GET_MODE_SIZE (mode);
8010 return (size == 8 || size == 16) ? true : false;
8013 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8014 shall be passed or returned in simd/fp register(s) (providing these
8015 parameter passing registers are available).
8017 Upon successful return, *COUNT returns the number of needed registers,
8018 *BASE_MODE returns the mode of the individual register and, when IS_HA
8019 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8020 floating-point aggregate or a homogeneous short-vector aggregate. */
8022 static bool
8023 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8024 const_tree type,
8025 machine_mode *base_mode,
8026 int *count,
8027 bool *is_ha)
8029 machine_mode new_mode = VOIDmode;
8030 bool composite_p = aarch64_composite_type_p (type, mode);
8032 if (is_ha != NULL) *is_ha = false;
8034 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8035 || aarch64_short_vector_p (type, mode))
8037 *count = 1;
8038 new_mode = mode;
8040 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8042 if (is_ha != NULL) *is_ha = true;
8043 *count = 2;
8044 new_mode = GET_MODE_INNER (mode);
8046 else if (type && composite_p)
8048 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8050 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8052 if (is_ha != NULL) *is_ha = true;
8053 *count = ag_count;
8055 else
8056 return false;
8058 else
8059 return false;
8061 *base_mode = new_mode;
8062 return true;
8065 /* Implement TARGET_STRUCT_VALUE_RTX. */
8067 static rtx
8068 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8069 int incoming ATTRIBUTE_UNUSED)
8071 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8074 /* Implements target hook vector_mode_supported_p. */
8075 static bool
8076 aarch64_vector_mode_supported_p (machine_mode mode)
8078 if (TARGET_SIMD
8079 && (mode == V4SImode || mode == V8HImode
8080 || mode == V16QImode || mode == V2DImode
8081 || mode == V2SImode || mode == V4HImode
8082 || mode == V8QImode || mode == V2SFmode
8083 || mode == V4SFmode || mode == V2DFmode
8084 || mode == V1DFmode))
8085 return true;
8087 return false;
8090 /* Return appropriate SIMD container
8091 for MODE within a vector of WIDTH bits. */
8092 static machine_mode
8093 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8095 gcc_assert (width == 64 || width == 128);
8096 if (TARGET_SIMD)
8098 if (width == 128)
8099 switch (mode)
8101 case DFmode:
8102 return V2DFmode;
8103 case SFmode:
8104 return V4SFmode;
8105 case SImode:
8106 return V4SImode;
8107 case HImode:
8108 return V8HImode;
8109 case QImode:
8110 return V16QImode;
8111 case DImode:
8112 return V2DImode;
8113 default:
8114 break;
8116 else
8117 switch (mode)
8119 case SFmode:
8120 return V2SFmode;
8121 case SImode:
8122 return V2SImode;
8123 case HImode:
8124 return V4HImode;
8125 case QImode:
8126 return V8QImode;
8127 default:
8128 break;
8131 return word_mode;
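/* For example, SImode maps to V4SImode for a 128-bit container and to
V2SImode for a 64-bit one, while DFmode only has the 128-bit V2DFmode
container and otherwise falls back to word_mode. */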
8134 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8135 static machine_mode
8136 aarch64_preferred_simd_mode (machine_mode mode)
8138 return aarch64_simd_container_mode (mode, 128);
8141 /* Return the bitmask of possible vector sizes for the vectorizer
8142 to iterate over. */
8143 static unsigned int
8144 aarch64_autovectorize_vector_sizes (void)
8146 return (16 | 8);
8149 /* Implement TARGET_MANGLE_TYPE. */
8151 static const char *
8152 aarch64_mangle_type (const_tree type)
8154 /* The AArch64 ABI documents say that "__va_list" has to be
8155 mangled as if it were in the "std" namespace. */
8156 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8157 return "St9__va_list";
8159 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8160 builtin types. */
8161 if (TYPE_NAME (type) != NULL)
8162 return aarch64_mangle_builtin_type (type);
8164 /* Use the default mangling. */
8165 return NULL;
8169 /* Return true if the rtx_insn contains a MEM RTX somewhere
8170 in it. */
8172 static bool
8173 has_memory_op (rtx_insn *mem_insn)
8175 subrtx_iterator::array_type array;
8176 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8177 if (MEM_P (*iter))
8178 return true;
8180 return false;
8183 /* Find the first rtx_insn before insn that will generate an assembly
8184 instruction. */
8186 static rtx_insn *
8187 aarch64_prev_real_insn (rtx_insn *insn)
8189 if (!insn)
8190 return NULL;
8194 insn = prev_real_insn (insn);
8196 while (insn && recog_memoized (insn) < 0);
8198 return insn;
8201 static bool
8202 is_madd_op (enum attr_type t1)
8204 unsigned int i;
8205 /* A number of these may be AArch32 only. */
8206 enum attr_type mlatypes[] = {
8207 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8208 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8209 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8212 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8214 if (t1 == mlatypes[i])
8215 return true;
8218 return false;
8221 /* Check if there is a register dependency between a load and the insn
8222 for which we hold recog_data. */
8224 static bool
8225 dep_between_memop_and_curr (rtx memop)
8227 rtx load_reg;
8228 int opno;
8230 gcc_assert (GET_CODE (memop) == SET);
8232 if (!REG_P (SET_DEST (memop)))
8233 return false;
8235 load_reg = SET_DEST (memop);
8236 for (opno = 1; opno < recog_data.n_operands; opno++)
8238 rtx operand = recog_data.operand[opno];
8239 if (REG_P (operand)
8240 && reg_overlap_mentioned_p (load_reg, operand))
8241 return true;
8244 return false;
8248 /* When working around the Cortex-A53 erratum 835769,
8249 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8250 instruction and has a preceding memory instruction such that a NOP
8251 should be inserted between them. */
8253 bool
8254 aarch64_madd_needs_nop (rtx_insn* insn)
8256 enum attr_type attr_type;
8257 rtx_insn *prev;
8258 rtx body;
8260 if (!aarch64_fix_a53_err835769)
8261 return false;
8263 if (recog_memoized (insn) < 0)
8264 return false;
8266 attr_type = get_attr_type (insn);
8267 if (!is_madd_op (attr_type))
8268 return false;
8270 prev = aarch64_prev_real_insn (insn);
8271 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8272 Restore recog state to INSN to avoid state corruption. */
8273 extract_constrain_insn_cached (insn);
8275 if (!prev || !has_memory_op (prev))
8276 return false;
8278 body = single_set (prev);
8280 /* If the previous insn is a memory op and there is no dependency between
8281 it and the DImode madd, emit a NOP between them. If body is NULL then we
8282 have a complex memory operation, probably a load/store pair.
8283 Be conservative for now and emit a NOP. */
8284 if (GET_MODE (recog_data.operand[0]) == DImode
8285 && (!body || !dep_between_memop_and_curr (body)))
8286 return true;
8288 return false;
8293 /* Implement FINAL_PRESCAN_INSN. */
8295 void
8296 aarch64_final_prescan_insn (rtx_insn *insn)
8298 if (aarch64_madd_needs_nop (insn))
8299 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
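/* For instance, with the workaround enabled a sequence along the lines of
     ldr  x2, [x0]
     madd x3, x4, x5, x6
(a load followed by a 64-bit multiply-accumulate with no register
dependency between them) gets the "nop" above emitted between the two
instructions. */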
8303 /* Return the equivalent letter for size. */
8304 static char
8305 sizetochar (int size)
8307 switch (size)
8309 case 64: return 'd';
8310 case 32: return 's';
8311 case 16: return 'h';
8312 case 8 : return 'b';
8313 default: gcc_unreachable ();
8317 /* Return true iff x is a uniform vector of floating-point
8318 constants, and the constant can be represented in
8319 quarter-precision form. Note that, as aarch64_float_const_representable_p
8320 rejects both +0.0 and -0.0, we will also reject them here. */
8321 static bool
8322 aarch64_vect_float_const_representable_p (rtx x)
8324 int i = 0;
8325 REAL_VALUE_TYPE r0, ri;
8326 rtx x0, xi;
8328 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8329 return false;
8331 x0 = CONST_VECTOR_ELT (x, 0);
8332 if (!CONST_DOUBLE_P (x0))
8333 return false;
8335 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8337 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8339 xi = CONST_VECTOR_ELT (x, i);
8340 if (!CONST_DOUBLE_P (xi))
8341 return false;
8343 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8344 if (!REAL_VALUES_EQUAL (r0, ri))
8345 return false;
8348 return aarch64_float_const_representable_p (x0);
8351 /* Return true for valid and false for invalid. */
8352 bool
8353 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8354 struct simd_immediate_info *info)
8356 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8357 matches = 1; \
8358 for (i = 0; i < idx; i += (STRIDE)) \
8359 if (!(TEST)) \
8360 matches = 0; \
8361 if (matches) \
8363 immtype = (CLASS); \
8364 elsize = (ELSIZE); \
8365 eshift = (SHIFT); \
8366 emvn = (NEG); \
8367 break; \
8370 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8371 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8372 unsigned char bytes[16];
8373 int immtype = -1, matches;
8374 unsigned int invmask = inverse ? 0xff : 0;
8375 int eshift, emvn;
8377 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8379 if (! (aarch64_simd_imm_zero_p (op, mode)
8380 || aarch64_vect_float_const_representable_p (op)))
8381 return false;
8383 if (info)
8385 info->value = CONST_VECTOR_ELT (op, 0);
8386 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8387 info->mvn = false;
8388 info->shift = 0;
8391 return true;
8394 /* Splat vector constant out into a byte vector. */
8395 for (i = 0; i < n_elts; i++)
8397 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8398 it must be laid out in the vector register in reverse order. */
8399 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8400 unsigned HOST_WIDE_INT elpart;
8401 unsigned int part, parts;
8403 if (CONST_INT_P (el))
8405 elpart = INTVAL (el);
8406 parts = 1;
8408 else if (GET_CODE (el) == CONST_DOUBLE)
8410 elpart = CONST_DOUBLE_LOW (el);
8411 parts = 2;
8413 else
8414 gcc_unreachable ();
8416 for (part = 0; part < parts; part++)
8418 unsigned int byte;
8419 for (byte = 0; byte < innersize; byte++)
8421 bytes[idx++] = (elpart & 0xff) ^ invmask;
8422 elpart >>= BITS_PER_UNIT;
8424 if (GET_CODE (el) == CONST_DOUBLE)
8425 elpart = CONST_DOUBLE_HIGH (el);
8429 /* Sanity check. */
8430 gcc_assert (idx == GET_MODE_SIZE (mode));
8434 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8435 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8437 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8438 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8440 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8441 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8443 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8444 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8446 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8448 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8450 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8451 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8453 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8454 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8456 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8457 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8459 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8460 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8462 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8464 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8466 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8467 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8469 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8470 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8472 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8473 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8475 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8476 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8478 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8480 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8481 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8483 while (0);
8485 if (immtype == -1)
8486 return false;
8488 if (info)
8490 info->element_width = elsize;
8491 info->mvn = emvn != 0;
8492 info->shift = eshift;
8494 unsigned HOST_WIDE_INT imm = 0;
8496 if (immtype >= 12 && immtype <= 15)
8497 info->msl = true;
8499 /* Un-invert bytes of recognized vector, if necessary. */
8500 if (invmask != 0)
8501 for (i = 0; i < idx; i++)
8502 bytes[i] ^= invmask;
8504 if (immtype == 17)
8506 /* FIXME: Broken on 32-bit H_W_I hosts. */
8507 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
8509 for (i = 0; i < 8; i++)
8510 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
8511 << (i * BITS_PER_UNIT);
8514 info->value = GEN_INT (imm);
8516 else
8518 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
8519 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
8521 /* Construct 'abcdefgh' because the assembler cannot handle
8522 generic constants. */
8523 if (info->mvn)
8524 imm = ~imm;
8525 imm = (imm >> info->shift) & 0xff;
8526 info->value = GEN_INT (imm);
8530 return true;
8531 #undef CHECK
8534 /* Check if immediate shift constants are within range. */
8535 bool
8536 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
8538 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
8539 if (left)
8540 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
8541 else
8542 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
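/* Illustrative example for the shift-immediate check above (not part of the
   original source): for V4SImode the element width is 32 bits, so a constant
   vector of left-shift amounts is accepted in the range [0, 31], while
   right-shift amounts must lie in [1, 32], matching the AdvSIMD SHL and
   SSHR/USHR immediate encodings.  */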
8545 /* Return true if X is a uniform vector where all elements
8546 are either the floating-point constant 0.0 or the
8547 integer constant 0. */
8548 bool
8549 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
8551 return x == CONST0_RTX (mode);
8554 bool
8555 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
8557 HOST_WIDE_INT imm = INTVAL (x);
8558 int i;
8560 for (i = 0; i < 8; i++)
8562 unsigned int byte = imm & 0xff;
8563 if (byte != 0xff && byte != 0)
8564 return false;
8565 imm >>= 8;
8568 return true;
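/* Illustrative example for the scalar-immediate check above (not part of the
   original source): 0x00ff00ff00ff00ff is accepted because each of its eight
   bytes is either 0x00 or 0xff, whereas 0x0102030405060708 is rejected.  */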
8571 bool
8572 aarch64_mov_operand_p (rtx x,
8573 enum aarch64_symbol_context context,
8574 machine_mode mode)
8576 if (GET_CODE (x) == HIGH
8577 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8578 return true;
8580 if (CONST_INT_P (x))
8581 return true;
8583 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
8584 return true;
8586 return aarch64_classify_symbolic_expression (x, context)
8587 == SYMBOL_TINY_ABSOLUTE;
8590 /* Return a const_int vector of VAL. */
8592 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
8594 int nunits = GET_MODE_NUNITS (mode);
8595 rtvec v = rtvec_alloc (nunits);
8596 int i;
8598 for (i = 0; i < nunits; i++)
8599 RTVEC_ELT (v, i) = GEN_INT (val);
8601 return gen_rtx_CONST_VECTOR (mode, v);
8604 /* Check OP is a legal scalar immediate for the MOVI instruction. */
8606 bool
8607 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
8609 machine_mode vmode;
8611 gcc_assert (!VECTOR_MODE_P (mode));
8612 vmode = aarch64_preferred_simd_mode (mode);
8613 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
8614 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
8617 /* Construct and return a PARALLEL RTX vector with elements numbering the
8618 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
8619 the vector - from the perspective of the architecture. This does not
8620 line up with GCC's perspective on lane numbers, so we end up with
8621 different masks depending on our target endian-ness. The diagram
8622 below may help. We must draw the distinction when building masks
8623 which select one half of the vector. An instruction selecting
8624 architectural low-lanes for a big-endian target must be described using
8625 a mask selecting GCC high-lanes.
8627 Big-Endian Little-Endian
8629 GCC 0 1 2 3 3 2 1 0
8630 | x | x | x | x | | x | x | x | x |
8631 Architecture 3 2 1 0 3 2 1 0
8633 Low Mask: { 2, 3 } { 0, 1 }
8634 High Mask: { 0, 1 } { 2, 3 }
8638 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
8640 int nunits = GET_MODE_NUNITS (mode);
8641 rtvec v = rtvec_alloc (nunits / 2);
8642 int high_base = nunits / 2;
8643 int low_base = 0;
8644 int base;
8645 rtx t1;
8646 int i;
8648 if (BYTES_BIG_ENDIAN)
8649 base = high ? low_base : high_base;
8650 else
8651 base = high ? high_base : low_base;
8653 for (i = 0; i < nunits / 2; i++)
8654 RTVEC_ELT (v, i) = GEN_INT (base + i);
8656 t1 = gen_rtx_PARALLEL (mode, v);
8657 return t1;
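/* Worked example for the function above (illustrative, not part of the
   original source): for V4SImode, HIGH == true yields the PARALLEL { 2, 3 }
   on little-endian but { 0, 1 } on big-endian, matching the diagram.  */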
8660 /* Check OP for validity as a PARALLEL RTX vector with elements
8661 numbering the lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE)
8662 half of the vector, from the perspective of the architecture. See the
8663 diagram above aarch64_simd_vect_par_cnst_half for more details. */
8665 bool
8666 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
8667 bool high)
8669 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
8670 HOST_WIDE_INT count_op = XVECLEN (op, 0);
8671 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
8672 int i = 0;
8674 if (!VECTOR_MODE_P (mode))
8675 return false;
8677 if (count_op != count_ideal)
8678 return false;
8680 for (i = 0; i < count_ideal; i++)
8682 rtx elt_op = XVECEXP (op, 0, i);
8683 rtx elt_ideal = XVECEXP (ideal, 0, i);
8685 if (!CONST_INT_P (elt_op)
8686 || INTVAL (elt_ideal) != INTVAL (elt_op))
8687 return false;
8689 return true;
8692 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
8693 HIGH (exclusive). */
8694 void
8695 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
8696 const_tree exp)
8698 HOST_WIDE_INT lane;
8699 gcc_assert (CONST_INT_P (operand));
8700 lane = INTVAL (operand);
8702 if (lane < low || lane >= high)
8704 if (exp)
8705 error ("%Klane %ld out of range %ld - %ld", exp, lane, low, high - 1);
8706 else
8707 error ("lane %ld out of range %ld - %ld", lane, low, high - 1);
8711 /* Return TRUE if OP is a valid vector addressing mode. */
8712 bool
8713 aarch64_simd_mem_operand_p (rtx op)
8715 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
8716 || REG_P (XEXP (op, 0)));
8719 /* Emit a register copy from operand to operand, taking care not to
8720 early-clobber source registers in the process.
8722 COUNT is the number of components into which the copy needs to be
8723 decomposed. */
8724 void
8725 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
8726 unsigned int count)
8728 unsigned int i;
8729 int rdest = REGNO (operands[0]);
8730 int rsrc = REGNO (operands[1]);
8732 if (!reg_overlap_mentioned_p (operands[0], operands[1])
8733 || rdest < rsrc)
8734 for (i = 0; i < count; i++)
8735 emit_move_insn (gen_rtx_REG (mode, rdest + i),
8736 gen_rtx_REG (mode, rsrc + i));
8737 else
8738 for (i = 0; i < count; i++)
8739 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
8740 gen_rtx_REG (mode, rsrc + count - i - 1));
8743 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
8744 one of VSTRUCT modes: OI, CI or XI. */
8746 aarch64_simd_attr_length_move (rtx_insn *insn)
8748 machine_mode mode;
8750 extract_insn_cached (insn);
8752 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
8754 mode = GET_MODE (recog_data.operand[0]);
8755 switch (mode)
8757 case OImode:
8758 return 8;
8759 case CImode:
8760 return 12;
8761 case XImode:
8762 return 16;
8763 default:
8764 gcc_unreachable ();
8767 return 4;
8770 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
8771 one of VSTRUCT modes: OI, CI, EI, or XI. */
8773 aarch64_simd_attr_length_rglist (enum machine_mode mode)
8775 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
8778 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
8779 alignment of a vector to 128 bits. */
8780 static HOST_WIDE_INT
8781 aarch64_simd_vector_alignment (const_tree type)
8783 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
8784 return MIN (align, 128);
8787 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
8788 static bool
8789 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
8791 if (is_packed)
8792 return false;
8794 /* We guarantee alignment for vectors up to 128 bits. */
8795 if (tree_int_cst_compare (TYPE_SIZE (type),
8796 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
8797 return false;
8799 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
8800 return true;
8803 /* If VALS is a vector constant that can be loaded into a register
8804 using DUP, generate instructions to do so and return an RTX to
8805 assign to the register. Otherwise return NULL_RTX. */
8806 static rtx
8807 aarch64_simd_dup_constant (rtx vals)
8809 machine_mode mode = GET_MODE (vals);
8810 machine_mode inner_mode = GET_MODE_INNER (mode);
8811 int n_elts = GET_MODE_NUNITS (mode);
8812 bool all_same = true;
8813 rtx x;
8814 int i;
8816 if (GET_CODE (vals) != CONST_VECTOR)
8817 return NULL_RTX;
8819 for (i = 1; i < n_elts; ++i)
8821 x = CONST_VECTOR_ELT (vals, i);
8822 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
8823 all_same = false;
8826 if (!all_same)
8827 return NULL_RTX;
8829 /* We can load this constant by using DUP and a constant in a
8830 single ARM register. This will be cheaper than a vector
8831 load. */
8832 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
8833 return gen_rtx_VEC_DUPLICATE (mode, x);
8837 /* Generate code to load VALS, which is a PARALLEL containing only
8838 constants (for vec_init) or CONST_VECTOR, efficiently into a
8839 register. Returns an RTX to copy into the register, or NULL_RTX
8840 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
8841 static rtx
8842 aarch64_simd_make_constant (rtx vals)
8844 machine_mode mode = GET_MODE (vals);
8845 rtx const_dup;
8846 rtx const_vec = NULL_RTX;
8847 int n_elts = GET_MODE_NUNITS (mode);
8848 int n_const = 0;
8849 int i;
8851 if (GET_CODE (vals) == CONST_VECTOR)
8852 const_vec = vals;
8853 else if (GET_CODE (vals) == PARALLEL)
8855 /* A CONST_VECTOR must contain only CONST_INTs and
8856 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
8857 Only store valid constants in a CONST_VECTOR. */
8858 for (i = 0; i < n_elts; ++i)
8860 rtx x = XVECEXP (vals, 0, i);
8861 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8862 n_const++;
8864 if (n_const == n_elts)
8865 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
8867 else
8868 gcc_unreachable ();
8870 if (const_vec != NULL_RTX
8871 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
8872 /* Load using MOVI/MVNI. */
8873 return const_vec;
8874 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
8875 /* Loaded using DUP. */
8876 return const_dup;
8877 else if (const_vec != NULL_RTX)
8878 /* Load from constant pool. We cannot take advantage of single-cycle
8879 LD1 because we need a PC-relative addressing mode. */
8880 return const_vec;
8881 else
8882 /* A PARALLEL containing something not valid inside CONST_VECTOR.
8883 We cannot construct an initializer. */
8884 return NULL_RTX;
8887 void
8888 aarch64_expand_vector_init (rtx target, rtx vals)
8890 machine_mode mode = GET_MODE (target);
8891 machine_mode inner_mode = GET_MODE_INNER (mode);
8892 int n_elts = GET_MODE_NUNITS (mode);
8893 int n_var = 0;
8894 rtx any_const = NULL_RTX;
8895 bool all_same = true;
8897 for (int i = 0; i < n_elts; ++i)
8899 rtx x = XVECEXP (vals, 0, i);
8900 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
8901 ++n_var;
8902 else
8903 any_const = x;
8905 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
8906 all_same = false;
8909 if (n_var == 0)
8911 rtx constant = aarch64_simd_make_constant (vals);
8912 if (constant != NULL_RTX)
8914 emit_move_insn (target, constant);
8915 return;
8919 /* Splat a single non-constant element if we can. */
8920 if (all_same)
8922 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
8923 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
8924 return;
8927 /* Half the fields (or fewer) are non-constant. Load the constant part, then
8928 overwrite the varying fields. Hope that this is more efficient than using the stack. */
8929 if (n_var <= n_elts/2)
8931 rtx copy = copy_rtx (vals);
8933 /* Load constant part of vector. We really don't care what goes into the
8934 parts we will overwrite, but we're more likely to be able to load the
8935 constant efficiently if it has fewer, larger, repeating parts
8936 (see aarch64_simd_valid_immediate). */
8937 for (int i = 0; i < n_elts; i++)
8939 rtx x = XVECEXP (vals, 0, i);
8940 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8941 continue;
8942 rtx subst = any_const;
8943 for (int bit = n_elts / 2; bit > 0; bit /= 2)
8945 /* Look in the copied vector, as more elements are const. */
8946 rtx test = XVECEXP (copy, 0, i ^ bit);
8947 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
8949 subst = test;
8950 break;
8953 XVECEXP (copy, 0, i) = subst;
8955 aarch64_expand_vector_init (target, copy);
8957 /* Insert variables. */
8958 enum insn_code icode = optab_handler (vec_set_optab, mode);
8959 gcc_assert (icode != CODE_FOR_nothing);
8961 for (int i = 0; i < n_elts; i++)
8963 rtx x = XVECEXP (vals, 0, i);
8964 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
8965 continue;
8966 x = copy_to_mode_reg (inner_mode, x);
8967 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
8969 return;
8972 /* Construct the vector in memory one field at a time
8973 and load the whole vector. */
8974 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
8975 for (int i = 0; i < n_elts; i++)
8976 emit_move_insn (adjust_address_nv (mem, inner_mode,
8977 i * GET_MODE_SIZE (inner_mode)),
8978 XVECEXP (vals, 0, i));
8979 emit_move_insn (target, mem);
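/* Worked example for the expander above (illustrative, not part of the
   original source): for a V4SImode initialiser { x, 1, 2, 3 } with a single
   variable element, the half-constant path first loads the constant vector
   { 2, 1, 2, 3 } (lane 0 borrows a neighbouring constant from the copied
   vector) and then overwrites lane 0 with x via vec_set, avoiding a round
   trip through the stack.  */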
8983 static unsigned HOST_WIDE_INT
8984 aarch64_shift_truncation_mask (machine_mode mode)
8986 return
8987 (aarch64_vector_mode_supported_p (mode)
8988 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
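/* Illustrative note on the hook above (not part of the original source): for
   scalar DImode it returns 63, so a variable shift by 64 + k is known to
   behave like a shift by k; for AdvSIMD vector modes it returns 0 because
   vector shifts do not truncate the count modulo the element width.  */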
8991 #ifndef TLS_SECTION_ASM_FLAG
8992 #define TLS_SECTION_ASM_FLAG 'T'
8993 #endif
8995 void
8996 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
8997 tree decl ATTRIBUTE_UNUSED)
8999 char flagchars[10], *f = flagchars;
9001 /* If we have already declared this section, we can use an
9002 abbreviated form to switch back to it -- unless this section is
9003 part of a COMDAT group, in which case GAS requires the full
9004 declaration every time. */
9005 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9006 && (flags & SECTION_DECLARED))
9008 fprintf (asm_out_file, "\t.section\t%s\n", name);
9009 return;
9012 if (!(flags & SECTION_DEBUG))
9013 *f++ = 'a';
9014 if (flags & SECTION_WRITE)
9015 *f++ = 'w';
9016 if (flags & SECTION_CODE)
9017 *f++ = 'x';
9018 if (flags & SECTION_SMALL)
9019 *f++ = 's';
9020 if (flags & SECTION_MERGE)
9021 *f++ = 'M';
9022 if (flags & SECTION_STRINGS)
9023 *f++ = 'S';
9024 if (flags & SECTION_TLS)
9025 *f++ = TLS_SECTION_ASM_FLAG;
9026 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9027 *f++ = 'G';
9028 *f = '\0';
9030 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9032 if (!(flags & SECTION_NOTYPE))
9034 const char *type;
9035 const char *format;
9037 if (flags & SECTION_BSS)
9038 type = "nobits";
9039 else
9040 type = "progbits";
9042 #ifdef TYPE_OPERAND_FMT
9043 format = "," TYPE_OPERAND_FMT;
9044 #else
9045 format = ",@%s";
9046 #endif
9048 fprintf (asm_out_file, format, type);
9050 if (flags & SECTION_ENTSIZE)
9051 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9052 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9054 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9055 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9056 else
9057 fprintf (asm_out_file, ",%s,comdat",
9058 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9062 putc ('\n', asm_out_file);
9065 /* Select a format to encode pointers in exception handling data. */
9067 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9069 int type;
9070 switch (aarch64_cmodel)
9072 case AARCH64_CMODEL_TINY:
9073 case AARCH64_CMODEL_TINY_PIC:
9074 case AARCH64_CMODEL_SMALL:
9075 case AARCH64_CMODEL_SMALL_PIC:
9076 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9077 for everything. */
9078 type = DW_EH_PE_sdata4;
9079 break;
9080 default:
9081 /* No assumptions here. 8-byte relocs required. */
9082 type = DW_EH_PE_sdata8;
9083 break;
9085 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9088 /* Emit load exclusive. */
9090 static void
9091 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9092 rtx mem, rtx model_rtx)
9094 rtx (*gen) (rtx, rtx, rtx);
9096 switch (mode)
9098 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9099 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9100 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9101 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9102 default:
9103 gcc_unreachable ();
9106 emit_insn (gen (rval, mem, model_rtx));
9109 /* Emit store exclusive. */
9111 static void
9112 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9113 rtx rval, rtx mem, rtx model_rtx)
9115 rtx (*gen) (rtx, rtx, rtx, rtx);
9117 switch (mode)
9119 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9120 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9121 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9122 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9123 default:
9124 gcc_unreachable ();
9127 emit_insn (gen (bval, rval, mem, model_rtx));
9130 /* Mark the previous jump instruction as unlikely. */
9132 static void
9133 aarch64_emit_unlikely_jump (rtx insn)
9135 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9137 insn = emit_jump_insn (insn);
9138 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9141 /* Expand a compare and swap pattern. */
9143 void
9144 aarch64_expand_compare_and_swap (rtx operands[])
9146 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9147 machine_mode mode, cmp_mode;
9148 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9150 bval = operands[0];
9151 rval = operands[1];
9152 mem = operands[2];
9153 oldval = operands[3];
9154 newval = operands[4];
9155 is_weak = operands[5];
9156 mod_s = operands[6];
9157 mod_f = operands[7];
9158 mode = GET_MODE (mem);
9159 cmp_mode = mode;
9161 /* Normally the succ memory model must be stronger than fail, but in the
9162 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9163 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9165 if (INTVAL (mod_f) == MEMMODEL_ACQUIRE
9166 && INTVAL (mod_s) == MEMMODEL_RELEASE)
9167 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9169 switch (mode)
9171 case QImode:
9172 case HImode:
9173 /* For short modes, we're going to perform the comparison in SImode,
9174 so do the zero-extension now. */
9175 cmp_mode = SImode;
9176 rval = gen_reg_rtx (SImode);
9177 oldval = convert_modes (SImode, mode, oldval, true);
9178 /* Fall through. */
9180 case SImode:
9181 case DImode:
9182 /* Force the value into a register if needed. */
9183 if (!aarch64_plus_operand (oldval, mode))
9184 oldval = force_reg (cmp_mode, oldval);
9185 break;
9187 default:
9188 gcc_unreachable ();
9191 switch (mode)
9193 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9194 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9195 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9196 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9197 default:
9198 gcc_unreachable ();
9201 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9203 if (mode == QImode || mode == HImode)
9204 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9206 x = gen_rtx_REG (CCmode, CC_REGNUM);
9207 x = gen_rtx_EQ (SImode, x, const0_rtx);
9208 emit_insn (gen_rtx_SET (VOIDmode, bval, x));
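/* Illustrative note on the memory-model promotion above (not part of the
   original source): a call such as
   __atomic_compare_exchange_n (p, &expected, desired, 0,
				__ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
   is expanded with a success model of ACQ_REL, so the acquire semantics of
   the failure path are preserved.  */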
9211 /* Split a compare and swap pattern. */
9213 void
9214 aarch64_split_compare_and_swap (rtx operands[])
9216 rtx rval, mem, oldval, newval, scratch;
9217 machine_mode mode;
9218 bool is_weak;
9219 rtx_code_label *label1, *label2;
9220 rtx x, cond;
9222 rval = operands[0];
9223 mem = operands[1];
9224 oldval = operands[2];
9225 newval = operands[3];
9226 is_weak = (operands[4] != const0_rtx);
9227 scratch = operands[7];
9228 mode = GET_MODE (mem);
9230 label1 = NULL;
9231 if (!is_weak)
9233 label1 = gen_label_rtx ();
9234 emit_label (label1);
9236 label2 = gen_label_rtx ();
9238 aarch64_emit_load_exclusive (mode, rval, mem, operands[5]);
9240 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9241 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9242 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9243 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9244 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9246 aarch64_emit_store_exclusive (mode, scratch, mem, newval, operands[5]);
9248 if (!is_weak)
9250 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9251 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9252 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9253 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9255 else
9257 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9258 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9259 emit_insn (gen_rtx_SET (VOIDmode, cond, x));
9262 emit_label (label2);
9265 /* Split an atomic operation. */
9267 void
9268 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9269 rtx value, rtx model_rtx, rtx cond)
9271 machine_mode mode = GET_MODE (mem);
9272 machine_mode wmode = (mode == DImode ? DImode : SImode);
9273 rtx_code_label *label;
9274 rtx x;
9276 label = gen_label_rtx ();
9277 emit_label (label);
9279 if (new_out)
9280 new_out = gen_lowpart (wmode, new_out);
9281 if (old_out)
9282 old_out = gen_lowpart (wmode, old_out);
9283 else
9284 old_out = new_out;
9285 value = simplify_gen_subreg (wmode, value, mode, 0);
9287 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9289 switch (code)
9291 case SET:
9292 new_out = value;
9293 break;
9295 case NOT:
9296 x = gen_rtx_AND (wmode, old_out, value);
9297 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9298 x = gen_rtx_NOT (wmode, new_out);
9299 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9300 break;
9302 case MINUS:
9303 if (CONST_INT_P (value))
9305 value = GEN_INT (-INTVAL (value));
9306 code = PLUS;
9308 /* Fall through. */
9310 default:
9311 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9312 emit_insn (gen_rtx_SET (VOIDmode, new_out, x));
9313 break;
9316 aarch64_emit_store_exclusive (mode, cond, mem,
9317 gen_lowpart (mode, new_out), model_rtx);
9319 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9320 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9321 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9322 aarch64_emit_unlikely_jump (gen_rtx_SET (VOIDmode, pc_rtx, x));
9325 static void
9326 aarch64_print_extension (void)
9328 const struct aarch64_option_extension *opt = NULL;
9330 for (opt = all_extensions; opt->name != NULL; opt++)
9331 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9332 asm_fprintf (asm_out_file, "+%s", opt->name);
9334 asm_fprintf (asm_out_file, "\n");
9337 static void
9338 aarch64_start_file (void)
9340 if (selected_arch)
9342 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9343 aarch64_print_extension ();
9345 else if (selected_cpu)
9347 const char *truncated_name
9348 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9349 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9350 aarch64_print_extension ();
9352 default_file_start ();
9355 /* Target hook for c_mode_for_suffix. */
9356 static machine_mode
9357 aarch64_c_mode_for_suffix (char suffix)
9359 if (suffix == 'q')
9360 return TFmode;
9362 return VOIDmode;
9365 /* We can only represent floating point constants which will fit in
9366 "quarter-precision" values. These values are characterised by
9367 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
9370 (-1)^s * (n/16) * 2^r
9372 Where:
9373 's' is the sign bit.
9374 'n' is an integer in the range 16 <= n <= 31.
9375 'r' is an integer in the range -3 <= r <= 4. */
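/* Worked example (illustrative, not part of the original source):

   1.0   = 16/16 * 2^0
   0.125 = 16/16 * 2^-3   (smallest non-zero magnitude)
   31.0  = 31/16 * 2^4    (largest magnitude)

   whereas 0.0, 0.1 and 1.0/3.0 have no such encoding and are rejected.  */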
9377 /* Return true iff X can be represented by a quarter-precision
9378 floating point immediate operand. Note, we cannot represent 0.0. */
9379 bool
9380 aarch64_float_const_representable_p (rtx x)
9382 /* This represents our current view of how many bits
9383 make up the mantissa. */
9384 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9385 int exponent;
9386 unsigned HOST_WIDE_INT mantissa, mask;
9387 REAL_VALUE_TYPE r, m;
9388 bool fail;
9390 if (!CONST_DOUBLE_P (x))
9391 return false;
9393 if (GET_MODE (x) == VOIDmode)
9394 return false;
9396 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9398 /* We cannot represent infinities, NaNs or +/-zero. We won't
9399 know if we have +zero until we analyse the mantissa, but we
9400 can reject the other invalid values. */
9401 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9402 || REAL_VALUE_MINUS_ZERO (r))
9403 return false;
9405 /* Extract exponent. */
9406 r = real_value_abs (&r);
9407 exponent = REAL_EXP (&r);
9409 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9410 highest (sign) bit, with a fixed binary point at bit point_pos.
9411 m1 holds the low part of the mantissa, m2 the high part.
9412 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9413 bits for the mantissa, this can fail (low bits will be lost). */
9414 real_ldexp (&m, &r, point_pos - exponent);
9415 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9417 /* If the low part of the mantissa has bits set we cannot represent
9418 the value. */
9419 if (w.elt (0) != 0)
9420 return false;
9421 /* We have rejected the lower HOST_WIDE_INT, so update our
9422 understanding of how many bits lie in the mantissa and
9423 look only at the high HOST_WIDE_INT. */
9424 mantissa = w.elt (1);
9425 point_pos -= HOST_BITS_PER_WIDE_INT;
9427 /* We can only represent values with a mantissa of the form 1.xxxx. */
9428 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9429 if ((mantissa & mask) != 0)
9430 return false;
9432 /* Having filtered unrepresentable values, we may now remove all
9433 but the highest 5 bits. */
9434 mantissa >>= point_pos - 5;
9436 /* We cannot represent the value 0.0, so reject it. This is handled
9437 elsewhere. */
9438 if (mantissa == 0)
9439 return false;
9441 /* Then, as bit 4 is always set, we can mask it off, leaving
9442 the mantissa in the range [0, 15]. */
9443 mantissa &= ~(1 << 4);
9444 gcc_assert (mantissa <= 15);
9446 /* GCC internally does not use IEEE754-like encoding (where normalized
9447 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
9448 Our mantissa values are shifted 4 places to the left relative to
9449 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
9450 by 5 places to correct for GCC's representation. */
9451 exponent = 5 - exponent;
9453 return (exponent >= 0 && exponent <= 7);
9456 char*
9457 aarch64_output_simd_mov_immediate (rtx const_vector,
9458 machine_mode mode,
9459 unsigned width)
9461 bool is_valid;
9462 static char templ[40];
9463 const char *mnemonic;
9464 const char *shift_op;
9465 unsigned int lane_count = 0;
9466 char element_char;
9468 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
9470 /* This will return true to show const_vector is legal for use as either
9471 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
9472 also update INFO to show how the immediate should be generated. */
9473 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
9474 gcc_assert (is_valid);
9476 element_char = sizetochar (info.element_width);
9477 lane_count = width / info.element_width;
9479 mode = GET_MODE_INNER (mode);
9480 if (mode == SFmode || mode == DFmode)
9482 gcc_assert (info.shift == 0 && ! info.mvn);
9483 if (aarch64_float_const_zero_rtx_p (info.value))
9484 info.value = GEN_INT (0);
9485 else
9487 #define buf_size 20
9488 REAL_VALUE_TYPE r;
9489 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
9490 char float_buf[buf_size] = {'\0'};
9491 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
9492 #undef buf_size
9494 if (lane_count == 1)
9495 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
9496 else
9497 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
9498 lane_count, element_char, float_buf);
9499 return templ;
9503 mnemonic = info.mvn ? "mvni" : "movi";
9504 shift_op = info.msl ? "msl" : "lsl";
9506 if (lane_count == 1)
9507 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
9508 mnemonic, UINTVAL (info.value));
9509 else if (info.shift)
9510 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
9511 ", %s %d", mnemonic, lane_count, element_char,
9512 UINTVAL (info.value), shift_op, info.shift);
9513 else
9514 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
9515 mnemonic, lane_count, element_char, UINTVAL (info.value));
9516 return templ;
9519 char*
9520 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
9521 machine_mode mode)
9523 machine_mode vmode;
9525 gcc_assert (!VECTOR_MODE_P (mode));
9526 vmode = aarch64_simd_container_mode (mode, 64);
9527 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
9528 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
9531 /* Split operands into moves from op[1] + op[2] into op[0]. */
9533 void
9534 aarch64_split_combinev16qi (rtx operands[3])
9536 unsigned int dest = REGNO (operands[0]);
9537 unsigned int src1 = REGNO (operands[1]);
9538 unsigned int src2 = REGNO (operands[2]);
9539 machine_mode halfmode = GET_MODE (operands[1]);
9540 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
9541 rtx destlo, desthi;
9543 gcc_assert (halfmode == V16QImode);
9545 if (src1 == dest && src2 == dest + halfregs)
9547 /* No-op move. Can't split to nothing; emit something. */
9548 emit_note (NOTE_INSN_DELETED);
9549 return;
9552 /* Preserve register attributes for variable tracking. */
9553 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
9554 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
9555 GET_MODE_SIZE (halfmode));
9557 /* Special case of reversed high/low parts. */
9558 if (reg_overlap_mentioned_p (operands[2], destlo)
9559 && reg_overlap_mentioned_p (operands[1], desthi))
9561 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9562 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
9563 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
9565 else if (!reg_overlap_mentioned_p (operands[2], destlo))
9567 /* Try to avoid unnecessary moves if part of the result
9568 is in the right place already. */
9569 if (src1 != dest)
9570 emit_move_insn (destlo, operands[1]);
9571 if (src2 != dest + halfregs)
9572 emit_move_insn (desthi, operands[2]);
9574 else
9576 if (src2 != dest + halfregs)
9577 emit_move_insn (desthi, operands[2]);
9578 if (src1 != dest)
9579 emit_move_insn (destlo, operands[1]);
9583 /* vec_perm support. */
9585 #define MAX_VECT_LEN 16
9587 struct expand_vec_perm_d
9589 rtx target, op0, op1;
9590 unsigned char perm[MAX_VECT_LEN];
9591 machine_mode vmode;
9592 unsigned char nelt;
9593 bool one_vector_p;
9594 bool testing_p;
9597 /* Generate a variable permutation. */
9599 static void
9600 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
9602 machine_mode vmode = GET_MODE (target);
9603 bool one_vector_p = rtx_equal_p (op0, op1);
9605 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
9606 gcc_checking_assert (GET_MODE (op0) == vmode);
9607 gcc_checking_assert (GET_MODE (op1) == vmode);
9608 gcc_checking_assert (GET_MODE (sel) == vmode);
9609 gcc_checking_assert (TARGET_SIMD);
9611 if (one_vector_p)
9613 if (vmode == V8QImode)
9615 /* Expand the argument to a V16QI mode by duplicating it. */
9616 rtx pair = gen_reg_rtx (V16QImode);
9617 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
9618 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9620 else
9622 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
9625 else
9627 rtx pair;
9629 if (vmode == V8QImode)
9631 pair = gen_reg_rtx (V16QImode);
9632 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
9633 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
9635 else
9637 pair = gen_reg_rtx (OImode);
9638 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
9639 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
9644 void
9645 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
9647 machine_mode vmode = GET_MODE (target);
9648 unsigned int nelt = GET_MODE_NUNITS (vmode);
9649 bool one_vector_p = rtx_equal_p (op0, op1);
9650 rtx mask;
9652 /* The TBL instruction does not use a modulo index, so we must take care
9653 of that ourselves. */
9654 mask = aarch64_simd_gen_const_vector_dup (vmode,
9655 one_vector_p ? nelt - 1 : 2 * nelt - 1);
9656 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
9658 /* For big-endian, we also need to reverse the index within the vector
9659 (but not which vector). */
9660 if (BYTES_BIG_ENDIAN)
9662 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
9663 if (!one_vector_p)
9664 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
9665 sel = expand_simple_binop (vmode, XOR, sel, mask,
9666 NULL, 0, OPTAB_LIB_WIDEN);
9668 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
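/* Worked example for the expander above (illustrative, not part of the
   original source): for a V16QImode two-operand permute the selector is
   ANDed with 31 to give the modulo-index semantics that vec_perm requires
   but TBL does not (TBL writes zero for out-of-range indices); on big-endian
   each index is then XORed with 15 to undo the byte reversal within each
   input register.  */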
9671 /* Recognize patterns suitable for the TRN instructions. */
9672 static bool
9673 aarch64_evpc_trn (struct expand_vec_perm_d *d)
9675 unsigned int i, odd, mask, nelt = d->nelt;
9676 rtx out, in0, in1, x;
9677 rtx (*gen) (rtx, rtx, rtx);
9678 machine_mode vmode = d->vmode;
9680 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9681 return false;
9683 /* Note that these are little-endian tests.
9684 We correct for big-endian later. */
9685 if (d->perm[0] == 0)
9686 odd = 0;
9687 else if (d->perm[0] == 1)
9688 odd = 1;
9689 else
9690 return false;
9691 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9693 for (i = 0; i < nelt; i += 2)
9695 if (d->perm[i] != i + odd)
9696 return false;
9697 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
9698 return false;
9701 /* Success! */
9702 if (d->testing_p)
9703 return true;
9705 in0 = d->op0;
9706 in1 = d->op1;
9707 if (BYTES_BIG_ENDIAN)
9709 x = in0, in0 = in1, in1 = x;
9710 odd = !odd;
9712 out = d->target;
9714 if (odd)
9716 switch (vmode)
9718 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
9719 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
9720 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
9721 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
9722 case V4SImode: gen = gen_aarch64_trn2v4si; break;
9723 case V2SImode: gen = gen_aarch64_trn2v2si; break;
9724 case V2DImode: gen = gen_aarch64_trn2v2di; break;
9725 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
9726 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
9727 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
9728 default:
9729 return false;
9732 else
9734 switch (vmode)
9736 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
9737 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
9738 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
9739 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
9740 case V4SImode: gen = gen_aarch64_trn1v4si; break;
9741 case V2SImode: gen = gen_aarch64_trn1v2si; break;
9742 case V2DImode: gen = gen_aarch64_trn1v2di; break;
9743 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
9744 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
9745 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
9746 default:
9747 return false;
9751 emit_insn (gen (out, in0, in1));
9752 return true;
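/* Worked example (illustrative, not part of the original source): for
   V4SImode with two distinct operands, TRN1 is recognised from the
   little-endian selector { 0, 4, 2, 6 } and TRN2 from { 1, 5, 3, 7 };
   the big-endian correction above swaps the inputs and flips ODD.  */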
9755 /* Recognize patterns suitable for the UZP instructions. */
9756 static bool
9757 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
9759 unsigned int i, odd, mask, nelt = d->nelt;
9760 rtx out, in0, in1, x;
9761 rtx (*gen) (rtx, rtx, rtx);
9762 machine_mode vmode = d->vmode;
9764 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9765 return false;
9767 /* Note that these are little-endian tests.
9768 We correct for big-endian later. */
9769 if (d->perm[0] == 0)
9770 odd = 0;
9771 else if (d->perm[0] == 1)
9772 odd = 1;
9773 else
9774 return false;
9775 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9777 for (i = 0; i < nelt; i++)
9779 unsigned elt = (i * 2 + odd) & mask;
9780 if (d->perm[i] != elt)
9781 return false;
9784 /* Success! */
9785 if (d->testing_p)
9786 return true;
9788 in0 = d->op0;
9789 in1 = d->op1;
9790 if (BYTES_BIG_ENDIAN)
9792 x = in0, in0 = in1, in1 = x;
9793 odd = !odd;
9795 out = d->target;
9797 if (odd)
9799 switch (vmode)
9801 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
9802 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
9803 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
9804 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
9805 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
9806 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
9807 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
9808 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
9809 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
9810 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
9811 default:
9812 return false;
9815 else
9817 switch (vmode)
9819 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
9820 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
9821 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
9822 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
9823 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
9824 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
9825 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
9826 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
9827 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
9828 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
9829 default:
9830 return false;
9834 emit_insn (gen (out, in0, in1));
9835 return true;
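/* Worked example (illustrative, not part of the original source): for
   V4SImode with two distinct operands, UZP1 is recognised from the
   little-endian selector { 0, 2, 4, 6 } and UZP2 from { 1, 3, 5, 7 }.  */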
9838 /* Recognize patterns suitable for the ZIP instructions. */
9839 static bool
9840 aarch64_evpc_zip (struct expand_vec_perm_d *d)
9842 unsigned int i, high, mask, nelt = d->nelt;
9843 rtx out, in0, in1, x;
9844 rtx (*gen) (rtx, rtx, rtx);
9845 machine_mode vmode = d->vmode;
9847 if (GET_MODE_UNIT_SIZE (vmode) > 8)
9848 return false;
9850 /* Note that these are little-endian tests.
9851 We correct for big-endian later. */
9852 high = nelt / 2;
9853 if (d->perm[0] == high)
9854 /* Do Nothing. */
9856 else if (d->perm[0] == 0)
9857 high = 0;
9858 else
9859 return false;
9860 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
9862 for (i = 0; i < nelt / 2; i++)
9864 unsigned elt = (i + high) & mask;
9865 if (d->perm[i * 2] != elt)
9866 return false;
9867 elt = (elt + nelt) & mask;
9868 if (d->perm[i * 2 + 1] != elt)
9869 return false;
9872 /* Success! */
9873 if (d->testing_p)
9874 return true;
9876 in0 = d->op0;
9877 in1 = d->op1;
9878 if (BYTES_BIG_ENDIAN)
9880 x = in0, in0 = in1, in1 = x;
9881 high = !high;
9883 out = d->target;
9885 if (high)
9887 switch (vmode)
9889 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
9890 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
9891 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
9892 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
9893 case V4SImode: gen = gen_aarch64_zip2v4si; break;
9894 case V2SImode: gen = gen_aarch64_zip2v2si; break;
9895 case V2DImode: gen = gen_aarch64_zip2v2di; break;
9896 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
9897 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
9898 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
9899 default:
9900 return false;
9903 else
9905 switch (vmode)
9907 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
9908 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
9909 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
9910 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
9911 case V4SImode: gen = gen_aarch64_zip1v4si; break;
9912 case V2SImode: gen = gen_aarch64_zip1v2si; break;
9913 case V2DImode: gen = gen_aarch64_zip1v2di; break;
9914 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
9915 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
9916 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
9917 default:
9918 return false;
9922 emit_insn (gen (out, in0, in1));
9923 return true;
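/* Worked example (illustrative, not part of the original source): for
   V4SImode with two distinct operands, ZIP1 is recognised from the
   little-endian selector { 0, 4, 1, 5 } and ZIP2 from { 2, 6, 3, 7 }.  */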
9926 /* Recognize patterns for the EXT insn. */
9928 static bool
9929 aarch64_evpc_ext (struct expand_vec_perm_d *d)
9931 unsigned int i, nelt = d->nelt;
9932 rtx (*gen) (rtx, rtx, rtx, rtx);
9933 rtx offset;
9935 unsigned int location = d->perm[0]; /* Always < nelt. */
9937 /* Check if the extracted indices are increasing by one. */
9938 for (i = 1; i < nelt; i++)
9940 unsigned int required = location + i;
9941 if (d->one_vector_p)
9943 /* We'll pass the same vector in twice, so allow indices to wrap. */
9944 required &= (nelt - 1);
9946 if (d->perm[i] != required)
9947 return false;
9950 switch (d->vmode)
9952 case V16QImode: gen = gen_aarch64_extv16qi; break;
9953 case V8QImode: gen = gen_aarch64_extv8qi; break;
9954 case V4HImode: gen = gen_aarch64_extv4hi; break;
9955 case V8HImode: gen = gen_aarch64_extv8hi; break;
9956 case V2SImode: gen = gen_aarch64_extv2si; break;
9957 case V4SImode: gen = gen_aarch64_extv4si; break;
9958 case V2SFmode: gen = gen_aarch64_extv2sf; break;
9959 case V4SFmode: gen = gen_aarch64_extv4sf; break;
9960 case V2DImode: gen = gen_aarch64_extv2di; break;
9961 case V2DFmode: gen = gen_aarch64_extv2df; break;
9962 default:
9963 return false;
9966 /* Success! */
9967 if (d->testing_p)
9968 return true;
9970 /* The case where (location == 0) is a no-op for both big- and little-endian,
9971 and is removed by the mid-end at optimization levels -O1 and higher. */
9973 if (BYTES_BIG_ENDIAN && (location != 0))
9975 /* After setup, we want the high elements of the first vector (stored
9976 at the LSB end of the register), and the low elements of the second
9977 vector (stored at the MSB end of the register). So swap. */
9978 std::swap (d->op0, d->op1);
9979 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
9980 location = nelt - location;
9983 offset = GEN_INT (location);
9984 emit_insn (gen (d->target, d->op0, d->op1, offset));
9985 return true;
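/* Worked example (illustrative, not part of the original source): for
   V8QImode the selector { 3, 4, 5, 6, 7, 8, 9, 10 } is matched as EXT with
   offset 3; on big-endian the operands are swapped and the offset becomes
   nelt - 3 = 5, as described above.  */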
9988 /* Recognize patterns for the REV insns. */
9990 static bool
9991 aarch64_evpc_rev (struct expand_vec_perm_d *d)
9993 unsigned int i, j, diff, nelt = d->nelt;
9994 rtx (*gen) (rtx, rtx);
9996 if (!d->one_vector_p)
9997 return false;
9999 diff = d->perm[0];
10000 switch (diff)
10002 case 7:
10003 switch (d->vmode)
10005 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10006 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10007 default:
10008 return false;
10010 break;
10011 case 3:
10012 switch (d->vmode)
10014 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10015 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10016 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10017 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10018 default:
10019 return false;
10021 break;
10022 case 1:
10023 switch (d->vmode)
10025 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10026 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10027 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10028 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10029 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10030 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10031 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10032 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10033 default:
10034 return false;
10036 break;
10037 default:
10038 return false;
10041 for (i = 0; i < nelt ; i += diff + 1)
10042 for (j = 0; j <= diff; j += 1)
10044 /* This is guaranteed to be true as the value of diff
10045 is 7, 3 or 1 and we should have enough elements in the
10046 queue to generate this. Getting a vector mask with a
10047 value of diff other than these values implies that
10048 something is wrong by the time we get here. */
10049 gcc_assert (i + j < nelt);
10050 if (d->perm[i + j] != i + diff - j)
10051 return false;
10054 /* Success! */
10055 if (d->testing_p)
10056 return true;
10058 emit_insn (gen (d->target, d->op0));
10059 return true;
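/* Worked example (illustrative, not part of the original source): for
   V8HImode the selector { 3, 2, 1, 0, 7, 6, 5, 4 } has diff == 3 and is
   matched as REV64, reversing the 16-bit elements within each 64-bit
   chunk.  */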
10062 static bool
10063 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10065 rtx (*gen) (rtx, rtx, rtx);
10066 rtx out = d->target;
10067 rtx in0;
10068 machine_mode vmode = d->vmode;
10069 unsigned int i, elt, nelt = d->nelt;
10070 rtx lane;
10072 elt = d->perm[0];
10073 for (i = 1; i < nelt; i++)
10075 if (elt != d->perm[i])
10076 return false;
10079 /* The generic preparation in aarch64_expand_vec_perm_const_1
10080 swaps the operand order and the permute indices if it finds
10081 d->perm[0] to be in the second operand. Thus, we can always
10082 use d->op0 and need not do any extra arithmetic to get the
10083 correct lane number. */
10084 in0 = d->op0;
10085 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10087 switch (vmode)
10089 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10090 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10091 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10092 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10093 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10094 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10095 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10096 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10097 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10098 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10099 default:
10100 return false;
10103 emit_insn (gen (out, in0, lane));
10104 return true;
10107 static bool
10108 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10110 rtx rperm[MAX_VECT_LEN], sel;
10111 machine_mode vmode = d->vmode;
10112 unsigned int i, nelt = d->nelt;
10114 if (d->testing_p)
10115 return true;
10117 /* Generic code will try constant permutation twice: once with the
10118 original mode and again with the elements lowered to QImode.
10119 So wait and don't do the selector expansion ourselves. */
10120 if (vmode != V8QImode && vmode != V16QImode)
10121 return false;
10123 for (i = 0; i < nelt; ++i)
10125 int nunits = GET_MODE_NUNITS (vmode);
10127 /* If big-endian and two vectors, we end up with a weird mixed-endian
10128 mode on NEON. Reverse the index within each word but not the word
10129 itself. */
10130 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10131 : d->perm[i]);
10133 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10134 sel = force_reg (vmode, sel);
10136 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10137 return true;
10140 static bool
10141 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10143 /* The pattern matching functions above are written to look for a small
10144 number to begin the sequence (0, 1, N/2). If we begin with an index
10145 from the second operand, we can swap the operands. */
10146 if (d->perm[0] >= d->nelt)
10148 unsigned i, nelt = d->nelt;
10150 gcc_assert (nelt == (nelt & -nelt));
10151 for (i = 0; i < nelt; ++i)
10152 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10154 std::swap (d->op0, d->op1);
10157 if (TARGET_SIMD)
10159 if (aarch64_evpc_rev (d))
10160 return true;
10161 else if (aarch64_evpc_ext (d))
10162 return true;
10163 else if (aarch64_evpc_dup (d))
10164 return true;
10165 else if (aarch64_evpc_zip (d))
10166 return true;
10167 else if (aarch64_evpc_uzp (d))
10168 return true;
10169 else if (aarch64_evpc_trn (d))
10170 return true;
10171 return aarch64_evpc_tbl (d);
10173 return false;
10176 /* Expand a vec_perm_const pattern. */
10178 bool
10179 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10181 struct expand_vec_perm_d d;
10182 int i, nelt, which;
10184 d.target = target;
10185 d.op0 = op0;
10186 d.op1 = op1;
10188 d.vmode = GET_MODE (target);
10189 gcc_assert (VECTOR_MODE_P (d.vmode));
10190 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10191 d.testing_p = false;
10193 for (i = which = 0; i < nelt; ++i)
10195 rtx e = XVECEXP (sel, 0, i);
10196 int ei = INTVAL (e) & (2 * nelt - 1);
10197 which |= (ei < nelt ? 1 : 2);
10198 d.perm[i] = ei;
10201 switch (which)
10203 default:
10204 gcc_unreachable ();
10206 case 3:
10207 d.one_vector_p = false;
10208 if (!rtx_equal_p (op0, op1))
10209 break;
10211 /* The elements of PERM do not suggest that only the first operand
10212 is used, but both operands are identical. Allow easier matching
10213 of the permutation by folding the permutation into the single
10214 input vector. */
10215 /* Fall Through. */
10216 case 2:
10217 for (i = 0; i < nelt; ++i)
10218 d.perm[i] &= nelt - 1;
10219 d.op0 = op1;
10220 d.one_vector_p = true;
10221 break;
10223 case 1:
10224 d.op1 = op0;
10225 d.one_vector_p = true;
10226 break;
10229 return aarch64_expand_vec_perm_const_1 (&d);
10232 static bool
10233 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10234 const unsigned char *sel)
10236 struct expand_vec_perm_d d;
10237 unsigned int i, nelt, which;
10238 bool ret;
10240 d.vmode = vmode;
10241 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10242 d.testing_p = true;
10243 memcpy (d.perm, sel, nelt);
10245 /* Calculate whether all elements are in one vector. */
10246 for (i = which = 0; i < nelt; ++i)
10248 unsigned char e = d.perm[i];
10249 gcc_assert (e < 2 * nelt);
10250 which |= (e < nelt ? 1 : 2);
10253 /* If all elements are from the second vector, reindex as if from the
10254 first vector. */
10255 if (which == 2)
10256 for (i = 0; i < nelt; ++i)
10257 d.perm[i] -= nelt;
10259 /* Check whether the mask can be applied to a single vector. */
10260 d.one_vector_p = (which != 3);
10262 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10263 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10264 if (!d.one_vector_p)
10265 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10267 start_sequence ();
10268 ret = aarch64_expand_vec_perm_const_1 (&d);
10269 end_sequence ();
10271 return ret;
10275 aarch64_reverse_mask (enum machine_mode mode)
10277 /* We have to reverse each vector because we don't have
10278 a permuted load that can reverse-load according to ABI rules. */
10279 rtx mask;
10280 rtvec v = rtvec_alloc (16);
10281 int i, j;
10282 int nunits = GET_MODE_NUNITS (mode);
10283 int usize = GET_MODE_UNIT_SIZE (mode);
10285 gcc_assert (BYTES_BIG_ENDIAN);
10286 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10288 for (i = 0; i < nunits; i++)
10289 for (j = 0; j < usize; j++)
10290 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
10291 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10292 return force_reg (V16QImode, mask);
10295 /* Implement MODES_TIEABLE_P. */
10297 bool
10298 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10300 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10301 return true;
10303 /* We specifically want to allow elements of "structure" modes to
10304 be tieable to the structure. This more general condition allows
10305 other rarer situations too. */
10306 if (TARGET_SIMD
10307 && aarch64_vector_mode_p (mode1)
10308 && aarch64_vector_mode_p (mode2))
10309 return true;
10311 return false;
10314 /* Return a new RTX holding the result of moving POINTER forward by
10315 AMOUNT bytes. */
10317 static rtx
10318 aarch64_move_pointer (rtx pointer, int amount)
10320 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10322 return adjust_automodify_address (pointer, GET_MODE (pointer),
10323 next, amount);
10326 /* Return a new RTX holding the result of moving POINTER forward by the
10327 size of the mode it points to. */
10329 static rtx
10330 aarch64_progress_pointer (rtx pointer)
10332 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10334 return aarch64_move_pointer (pointer, amount);
10337 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10338 MODE bytes. */
10340 static void
10341 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10342 machine_mode mode)
10344 rtx reg = gen_reg_rtx (mode);
10346 /* "Cast" the pointers to the correct mode. */
10347 *src = adjust_address (*src, mode, 0);
10348 *dst = adjust_address (*dst, mode, 0);
10349 /* Emit the memcpy. */
10350 emit_move_insn (reg, *src);
10351 emit_move_insn (*dst, reg);
10352 /* Move the pointers forward. */
10353 *src = aarch64_progress_pointer (*src);
10354 *dst = aarch64_progress_pointer (*dst);
10357 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10358 we succeed, otherwise return false. */
10360 bool
10361 aarch64_expand_movmem (rtx *operands)
10363 unsigned int n;
10364 rtx dst = operands[0];
10365 rtx src = operands[1];
10366 rtx base;
10367 bool speed_p = !optimize_function_for_size_p (cfun);
10369 /* When optimizing for size, give a better estimate of the length of a
10370 memcpy call, but use the default otherwise. */
10371 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10373 /* We can't do anything smart if the amount to copy is not constant. */
10374 if (!CONST_INT_P (operands[2]))
10375 return false;
10377 n = UINTVAL (operands[2]);
10379 /* Try to keep the number of instructions low. For cases below 16 bytes we
10380 need to make at most two moves. For cases above 16 bytes it will be one
10381 move for each 16-byte chunk, then at most two additional moves. */
10382 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10383 return false;
10385 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10386 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10388 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10389 src = adjust_automodify_address (src, VOIDmode, base, 0);
10391 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10392 1-byte chunk. */
10393 if (n < 4)
10395 if (n >= 2)
10397 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10398 n -= 2;
10401 if (n == 1)
10402 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10404 return true;
10407 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10408 4-byte chunk, partially overlapping with the previously copied chunk. */
10409 if (n < 8)
10411 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10412 n -= 4;
10413 if (n > 0)
10415 int move = n - 4;
10417 src = aarch64_move_pointer (src, move);
10418 dst = aarch64_move_pointer (dst, move);
10419 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10421 return true;
10424 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10425 them, then (if applicable) an 8-byte chunk. */
10426 while (n >= 8)
10428 if (n / 16)
10430 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10431 n -= 16;
10433 else
10435 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10436 n -= 8;
10440 /* Finish the final bytes of the copy. We can always do this in one
10441 instruction. We either copy the exact amount we need, or partially
10442 overlap with the previous chunk we copied and copy 8 bytes. */
10443 if (n == 0)
10444 return true;
10445 else if (n == 1)
10446 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10447 else if (n == 2)
10448 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10449 else if (n == 4)
10450 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10451 else
10453 if (n == 3)
10455 src = aarch64_move_pointer (src, -1);
10456 dst = aarch64_move_pointer (dst, -1);
10457 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10459 else
10461 int move = n - 8;
10463 src = aarch64_move_pointer (src, move);
10464 dst = aarch64_move_pointer (dst, move);
10465 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10469 return true;
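/* Worked example for the expander above (illustrative, not part of the
   original source): a 7-byte copy emits an SImode load/store of bytes 0-3
   followed by a second SImode load/store of bytes 3-6, overlapping one byte
   rather than issuing separate 2-byte and 1-byte tail copies.  */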
10472 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
10474 static unsigned HOST_WIDE_INT
10475 aarch64_asan_shadow_offset (void)
10477 return (HOST_WIDE_INT_1 << 36);
10480 static bool
10481 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
10482 unsigned int align,
10483 enum by_pieces_operation op,
10484 bool speed_p)
10486 /* STORE_BY_PIECES can be used when copying a constant string, but
10487 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
10488 For now we always fail this and let the move_by_pieces code copy
10489 the string from read-only memory. */
10490 if (op == STORE_BY_PIECES)
10491 return false;
10493 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
10496 static enum machine_mode
10497 aarch64_code_to_ccmode (enum rtx_code code)
10499 switch (code)
10501 case NE:
10502 return CC_DNEmode;
10504 case EQ:
10505 return CC_DEQmode;
10507 case LE:
10508 return CC_DLEmode;
10510 case LT:
10511 return CC_DLTmode;
10513 case GE:
10514 return CC_DGEmode;
10516 case GT:
10517 return CC_DGTmode;
10519 case LEU:
10520 return CC_DLEUmode;
10522 case LTU:
10523 return CC_DLTUmode;
10525 case GEU:
10526 return CC_DGEUmode;
10528 case GTU:
10529 return CC_DGTUmode;
10531 default:
10532 return CCmode;
10536 static rtx
10537 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
10538 int code, tree treeop0, tree treeop1)
10540 enum machine_mode op_mode, cmp_mode, cc_mode;
10541 rtx op0, op1, cmp, target;
10542 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10543 enum insn_code icode;
10544 struct expand_operand ops[4];
10546 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
10547 if (cc_mode == CCmode)
10548 return NULL_RTX;
10550 start_sequence ();
10551 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10553 op_mode = GET_MODE (op0);
10554 if (op_mode == VOIDmode)
10555 op_mode = GET_MODE (op1);
10557 switch (op_mode)
10559 case QImode:
10560 case HImode:
10561 case SImode:
10562 cmp_mode = SImode;
10563 icode = CODE_FOR_cmpsi;
10564 break;
10566 case DImode:
10567 cmp_mode = DImode;
10568 icode = CODE_FOR_cmpdi;
10569 break;
10571 default:
10572 end_sequence ();
10573 return NULL_RTX;
10576 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10577 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10578 if (!op0 || !op1)
10580 end_sequence ();
10581 return NULL_RTX;
10583 *prep_seq = get_insns ();
10584 end_sequence ();
10586 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
10587 target = gen_rtx_REG (CCmode, CC_REGNUM);
10589 create_output_operand (&ops[0], target, CCmode);
10590 create_fixed_operand (&ops[1], cmp);
10591 create_fixed_operand (&ops[2], op0);
10592 create_fixed_operand (&ops[3], op1);
10594 start_sequence ();
10595 if (!maybe_expand_insn (icode, 4, ops))
10597 end_sequence ();
10598 return NULL_RTX;
10600 *gen_seq = get_insns ();
10601 end_sequence ();
10603 return gen_rtx_REG (cc_mode, CC_REGNUM);
10606 static rtx
10607 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
10608 tree treeop0, tree treeop1, int bit_code)
10610 rtx op0, op1, cmp0, cmp1, target;
10611 enum machine_mode op_mode, cmp_mode, cc_mode;
10612 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
10613 enum insn_code icode = CODE_FOR_ccmp_andsi;
10614 struct expand_operand ops[6];
10616 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
10617 if (cc_mode == CCmode)
10618 return NULL_RTX;
10620 push_to_sequence ((rtx_insn*) *prep_seq);
10621 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
10623 op_mode = GET_MODE (op0);
10624 if (op_mode == VOIDmode)
10625 op_mode = GET_MODE (op1);
10627 switch (op_mode)
10629 case QImode:
10630 case HImode:
10631 case SImode:
10632 cmp_mode = SImode;
10633 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
10634 : CODE_FOR_ccmp_iorsi;
10635 break;
10637 case DImode:
10638 cmp_mode = DImode;
10639 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
10640 : CODE_FOR_ccmp_iordi;
10641 break;
10643 default:
10644 end_sequence ();
10645 return NULL_RTX;
10648 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
10649 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
10650 if (!op0 || !op1)
10652 end_sequence ();
10653 return NULL_RTX;
10655 *prep_seq = get_insns ();
10656 end_sequence ();
10658 target = gen_rtx_REG (cc_mode, CC_REGNUM);
10659 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
10660 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
10662 create_fixed_operand (&ops[0], prev);
10663 create_fixed_operand (&ops[1], target);
10664 create_fixed_operand (&ops[2], op0);
10665 create_fixed_operand (&ops[3], op1);
10666 create_fixed_operand (&ops[4], cmp0);
10667 create_fixed_operand (&ops[5], cmp1);
10669 push_to_sequence ((rtx_insn*) *gen_seq);
10670 if (!maybe_expand_insn (icode, 6, ops))
10672 end_sequence ();
10673 return NULL_RTX;
10676 *gen_seq = get_insns ();
10677 end_sequence ();
10679 return target;
10682 #undef TARGET_GEN_CCMP_FIRST
10683 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
10685 #undef TARGET_GEN_CCMP_NEXT
10686 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
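/* Illustrative sketch, not part of the original source and guarded out of
   the build: the two hooks above let a chain of comparisons joined by && or
   || be expanded as one CMP followed by conditional compares, so a function
   like the one below can be evaluated without intermediate branches.  The
   assembly in the comment is only an approximation of what the ccmp
   patterns produce.  */
#if 0
static int
ccmp_example (int a, int b)
{
  /* Roughly:
       cmp   w0, #0
       ccmp  w1, #42, #4, eq   // compare b only if a == 0, else force "gt" false
       cset  w0, gt  */
  return a == 0 && b > 42;
}
#endif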
10688 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target supports
10689 instruction fusion of some sort. */
10691 static bool
10692 aarch64_macro_fusion_p (void)
10694 return aarch64_tune_params->fuseable_ops != AARCH64_FUSE_NOTHING;
10698 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
10699 should be kept together during scheduling. */
10701 static bool
10702 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
10704 rtx set_dest;
10705 rtx prev_set = single_set (prev);
10706 rtx curr_set = single_set (curr);
10707 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
10708 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
10710 if (!aarch64_macro_fusion_p ())
10711 return false;
10713 if (simple_sets_p
10714 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOV_MOVK))
10716 /* We are trying to match:
10717 prev (mov) == (set (reg r0) (const_int imm16))
10718 curr (movk) == (set (zero_extract (reg r0)
10719 (const_int 16)
10720 (const_int 16))
10721 (const_int imm16_1)) */
10723 set_dest = SET_DEST (curr_set);
10725 if (GET_CODE (set_dest) == ZERO_EXTRACT
10726 && CONST_INT_P (SET_SRC (curr_set))
10727 && CONST_INT_P (SET_SRC (prev_set))
10728 && CONST_INT_P (XEXP (set_dest, 2))
10729 && INTVAL (XEXP (set_dest, 2)) == 16
10730 && REG_P (XEXP (set_dest, 0))
10731 && REG_P (SET_DEST (prev_set))
10732 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
10734 return true;
10738 if (simple_sets_p
10739 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_ADD))
10742 /* We're trying to match:
10743 prev (adrp) == (set (reg r1)
10744 (high (symbol_ref ("SYM"))))
10745 curr (add) == (set (reg r0)
10746 (lo_sum (reg r1)
10747 (symbol_ref ("SYM"))))
10748 Note that r0 need not necessarily be the same as r1, especially
10749 during pre-regalloc scheduling. */
10751 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10752 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10754 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
10755 && REG_P (XEXP (SET_SRC (curr_set), 0))
10756 && REGNO (XEXP (SET_SRC (curr_set), 0))
10757 == REGNO (SET_DEST (prev_set))
10758 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
10759 XEXP (SET_SRC (curr_set), 1)))
10760 return true;
10764 if (simple_sets_p
10765 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_MOVK_MOVK))
10768 /* We're trying to match:
10769 prev (movk) == (set (zero_extract (reg r0)
10770 (const_int 16)
10771 (const_int 32))
10772 (const_int imm16_1))
10773 curr (movk) == (set (zero_extract (reg r0)
10774 (const_int 16)
10775 (const_int 48))
10776 (const_int imm16_2)) */
10778 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
10779 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
10780 && REG_P (XEXP (SET_DEST (prev_set), 0))
10781 && REG_P (XEXP (SET_DEST (curr_set), 0))
10782 && REGNO (XEXP (SET_DEST (prev_set), 0))
10783 == REGNO (XEXP (SET_DEST (curr_set), 0))
10784 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
10785 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
10786 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
10787 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
10788 && CONST_INT_P (SET_SRC (prev_set))
10789 && CONST_INT_P (SET_SRC (curr_set)))
10790 return true;
10793 if (simple_sets_p
10794 && (aarch64_tune_params->fuseable_ops & AARCH64_FUSE_ADRP_LDR))
10796 /* We're trying to match:
10797 prev (adrp) == (set (reg r0)
10798 (high (symbol_ref ("SYM"))))
10799 curr (ldr) == (set (reg r1)
10800 (mem (lo_sum (reg r0)
10801 (symbol_ref ("SYM")))))
10803 curr (ldr) == (set (reg r1)
10804 (zero_extend (mem
10805 (lo_sum (reg r0)
10806 (symbol_ref ("SYM")))))) */
10807 if (satisfies_constraint_Ush (SET_SRC (prev_set))
10808 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
10810 rtx curr_src = SET_SRC (curr_set);
10812 if (GET_CODE (curr_src) == ZERO_EXTEND)
10813 curr_src = XEXP (curr_src, 0);
10815 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
10816 && REG_P (XEXP (XEXP (curr_src, 0), 0))
10817 && REGNO (XEXP (XEXP (curr_src, 0), 0))
10818 == REGNO (SET_DEST (prev_set))
10819 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
10820 XEXP (SET_SRC (prev_set), 0)))
10821 return true;
10825 if ((aarch64_tune_params->fuseable_ops & AARCH64_FUSE_CMP_BRANCH)
10826 && any_condjump_p (curr))
10828 enum attr_type prev_type = get_attr_type (prev);
10830 /* FIXME: this misses some cases that are considered simple arithmetic
10831 instructions for ThunderX. Simple shifts are missed here. */
10832 if (prev_type == TYPE_ALUS_SREG
10833 || prev_type == TYPE_ALUS_IMM
10834 || prev_type == TYPE_LOGICS_REG
10835 || prev_type == TYPE_LOGICS_IMM)
10836 return true;
10839 return false;
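/* Illustrative sketch, not part of the original source and guarded out of
   the build: the fusion pairs recognised above correspond to back-to-back
   instruction sequences such as the ones sketched in the comments below.
   The variable name and the exact assembly are assumptions shown only for
   illustration.  */
#if 0
static unsigned int some_global;

static unsigned int
fusion_candidates_example (void)
{
  /* AARCH64_FUSE_MOV_MOVK: building a 32-bit immediate, e.g.
       mov  w0, #0x5678
       movk w0, #0x1234, lsl #16  */
  unsigned int imm = 0x12345678u;

  /* AARCH64_FUSE_ADRP_ADD / AARCH64_FUSE_ADRP_LDR: forming or loading a
     global address, e.g.
       adrp x1, some_global
       add  x1, x1, :lo12:some_global  */
  return imm + some_global;
}
#endif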
10842 /* If MEM is in the form of [base+offset], extract the two parts
10843 of the address and store them in BASE and OFFSET; otherwise return false
10844 after clearing BASE and OFFSET. */
10846 bool
10847 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
10849 rtx addr;
10851 gcc_assert (MEM_P (mem));
10853 addr = XEXP (mem, 0);
10855 if (REG_P (addr))
10857 *base = addr;
10858 *offset = const0_rtx;
10859 return true;
10862 if (GET_CODE (addr) == PLUS
10863 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
10865 *base = XEXP (addr, 0);
10866 *offset = XEXP (addr, 1);
10867 return true;
10870 *base = NULL_RTX;
10871 *offset = NULL_RTX;
10873 return false;
10876 /* Types for scheduling fusion. */
10877 enum sched_fusion_type
10879 SCHED_FUSION_NONE = 0,
10880 SCHED_FUSION_LD_SIGN_EXTEND,
10881 SCHED_FUSION_LD_ZERO_EXTEND,
10882 SCHED_FUSION_LD,
10883 SCHED_FUSION_ST,
10884 SCHED_FUSION_NUM
10887 /* If INSN is a load or store whose address is in the form [base+offset],
10888 extract the two parts and store them in BASE and OFFSET. Return the
10889 scheduling fusion type of this INSN. */
10891 static enum sched_fusion_type
10892 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
10894 rtx x, dest, src;
10895 enum sched_fusion_type fusion = SCHED_FUSION_LD;
10897 gcc_assert (INSN_P (insn));
10898 x = PATTERN (insn);
10899 if (GET_CODE (x) != SET)
10900 return SCHED_FUSION_NONE;
10902 src = SET_SRC (x);
10903 dest = SET_DEST (x);
10905 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
10906 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
10907 return SCHED_FUSION_NONE;
10909 if (GET_CODE (src) == SIGN_EXTEND)
10911 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
10912 src = XEXP (src, 0);
10913 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10914 return SCHED_FUSION_NONE;
10916 else if (GET_CODE (src) == ZERO_EXTEND)
10918 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
10919 src = XEXP (src, 0);
10920 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
10921 return SCHED_FUSION_NONE;
10924 if (GET_CODE (src) == MEM && REG_P (dest))
10925 extract_base_offset_in_addr (src, base, offset);
10926 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
10928 fusion = SCHED_FUSION_ST;
10929 extract_base_offset_in_addr (dest, base, offset);
10931 else
10932 return SCHED_FUSION_NONE;
10934 if (*base == NULL_RTX || *offset == NULL_RTX)
10935 fusion = SCHED_FUSION_NONE;
10937 return fusion;
10940 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
10942 Currently we only support fusing ldr and str instructions, so FUSION_PRI
10943 and PRI are only calculated for these instructions. For other instructions,
10944 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
10945 other instruction types can be added by returning different priorities.
10947 It's important that irrelevant instructions get the largest FUSION_PRI. */
10949 static void
10950 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
10951 int *fusion_pri, int *pri)
10953 int tmp, off_val;
10954 rtx base, offset;
10955 enum sched_fusion_type fusion;
10957 gcc_assert (INSN_P (insn));
10959 tmp = max_pri - 1;
10960 fusion = fusion_load_store (insn, &base, &offset);
10961 if (fusion == SCHED_FUSION_NONE)
10963 *pri = tmp;
10964 *fusion_pri = tmp;
10965 return;
10968 /* Set FUSION_PRI according to fusion type and base register. */
10969 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
10971 /* Calculate PRI. */
10972 tmp /= 2;
10974 /* INSN with smaller offset goes first. */
10975 off_val = (int)(INTVAL (offset));
10976 if (off_val >= 0)
10977 tmp -= (off_val & 0xfffff);
10978 else
10979 tmp += ((- off_val) & 0xfffff);
10981 *pri = tmp;
10982 return;
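/* Illustrative sketch, not part of the original source and guarded out of
   the build: the priority scheme above groups loads/stores first by fusion
   type and base register (FUSION_PRI) and then orders accesses to the same
   base by ascending offset (PRI).  The helper below redoes the arithmetic
   with a made-up stand-in constant so the grouping is easier to see.  */
#if 0
static void
fusion_priority_example (int max_pri, int fusion_type, int base_regno,
			 long long offset, int *fusion_pri, int *pri)
{
  const int first_pseudo_register = 100;  /* Stand-in for FIRST_PSEUDO_REGISTER.  */
  int tmp = max_pri - 1;

  /* Same fusion type and same base register land in the same bucket.  */
  *fusion_pri = tmp - fusion_type * first_pseudo_register - base_regno;

  /* Within a bucket, smaller offsets get a larger PRI and go first.  */
  tmp /= 2;
  if (offset >= 0)
    tmp -= (int) (offset & 0xfffff);
  else
    tmp += (int) ((-offset) & 0xfffff);
  *pri = tmp;
}
#endif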
10985 /* Given OPERANDS of consecutive load/store, check if we can merge
10986 them into ldp/stp. LOAD is true if they are load instructions.
10987 MODE is the mode of memory operands. */
10989 bool
10990 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
10991 enum machine_mode mode)
10993 HOST_WIDE_INT offval_1, offval_2, msize;
10994 enum reg_class rclass_1, rclass_2;
10995 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
10997 if (load)
10999 mem_1 = operands[1];
11000 mem_2 = operands[3];
11001 reg_1 = operands[0];
11002 reg_2 = operands[2];
11003 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11004 if (REGNO (reg_1) == REGNO (reg_2))
11005 return false;
11007 else
11009 mem_1 = operands[0];
11010 mem_2 = operands[2];
11011 reg_1 = operands[1];
11012 reg_2 = operands[3];
11015 /* The mems cannot be volatile. */
11016 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11017 return false;
11019 /* Check if the addresses are in the form of [base+offset]. */
11020 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11021 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11022 return false;
11023 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11024 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11025 return false;
11027 /* Check if the bases are the same. */
11028 if (!rtx_equal_p (base_1, base_2))
11029 return false;
11031 offval_1 = INTVAL (offset_1);
11032 offval_2 = INTVAL (offset_2);
11033 msize = GET_MODE_SIZE (mode);
11034 /* Check if the offsets are consecutive. */
11035 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11036 return false;
11038 /* Check if the addresses are clobbered by load. */
11039 if (load)
11041 if (reg_mentioned_p (reg_1, mem_1))
11042 return false;
11044 /* In increasing order, the last load can clobber the address. */
11045 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11046 return false;
11049 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11050 rclass_1 = FP_REGS;
11051 else
11052 rclass_1 = GENERAL_REGS;
11054 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11055 rclass_2 = FP_REGS;
11056 else
11057 rclass_2 = GENERAL_REGS;
11059 /* Check if the registers are of the same class. */
11060 if (rclass_1 != rclass_2)
11061 return false;
11063 return true;
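/* Illustrative sketch, not part of the original source and guarded out of
   the build: when the checks above succeed, two accesses with the same base
   and consecutive offsets can be emitted as a single ldp/stp.  The assembly
   in the comment is an assumption shown only for illustration.  */
#if 0
static long long
ldp_candidate_example (const int *p)
{
  /* Two SImode loads from [p] and [p, #4], e.g.
       ldr w1, [x0]
       ldr w2, [x0, 4]
     can be paired into
       ldp w1, w2, [x0]  */
  return (long long) p[0] + p[1];
}
#endif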
11066 /* Given OPERANDS of consecutive load/store, check if we can merge
11067 them into ldp/stp by adjusting the offset. LOAD is true if they
11068 are load instructions. MODE is the mode of memory operands.
11070 Given the following consecutive stores:
11072 str w1, [xb, 0x100]
11073 str w1, [xb, 0x104]
11074 str w1, [xb, 0x108]
11075 str w1, [xb, 0x10c]
11077 Though the offsets are out of the range supported by stp, we can
11078 still pair them after adjusting the offset, like:
11080 add scratch, xb, 0x100
11081 stp w1, w1, [scratch]
11082 stp w1, w1, [scratch, 0x8]
11084 The peephole patterns detecting this opportunity should guarantee
11085 the scratch register is available. */
11087 bool
11088 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11089 enum machine_mode mode)
11091 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11092 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11093 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11094 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11096 if (load)
11098 reg_1 = operands[0];
11099 mem_1 = operands[1];
11100 reg_2 = operands[2];
11101 mem_2 = operands[3];
11102 reg_3 = operands[4];
11103 mem_3 = operands[5];
11104 reg_4 = operands[6];
11105 mem_4 = operands[7];
11106 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11107 && REG_P (reg_3) && REG_P (reg_4));
11108 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11109 return false;
11111 else
11113 mem_1 = operands[0];
11114 reg_1 = operands[1];
11115 mem_2 = operands[2];
11116 reg_2 = operands[3];
11117 mem_3 = operands[4];
11118 reg_3 = operands[5];
11119 mem_4 = operands[6];
11120 reg_4 = operands[7];
11122 /* Skip if the memory operand is by itself valid for ldp/stp. */
11123 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11124 return false;
11126 /* The mems cannot be volatile. */
11127 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11128 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11129 return false;
11131 /* Check if the addresses are in the form of [base+offset]. */
11132 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11133 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11134 return false;
11135 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11136 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11137 return false;
11138 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11139 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11140 return false;
11141 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11142 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11143 return false;
11145 /* Check if the bases are the same. */
11146 if (!rtx_equal_p (base_1, base_2)
11147 || !rtx_equal_p (base_2, base_3)
11148 || !rtx_equal_p (base_3, base_4))
11149 return false;
11151 offval_1 = INTVAL (offset_1);
11152 offval_2 = INTVAL (offset_2);
11153 offval_3 = INTVAL (offset_3);
11154 offval_4 = INTVAL (offset_4);
11155 msize = GET_MODE_SIZE (mode);
11156 /* Check if the offsets are consecutive. */
11157 if ((offval_1 != (offval_2 + msize)
11158 || offval_1 != (offval_3 + msize * 2)
11159 || offval_1 != (offval_4 + msize * 3))
11160 && (offval_4 != (offval_3 + msize)
11161 || offval_4 != (offval_2 + msize * 2)
11162 || offval_4 != (offval_1 + msize * 3)))
11163 return false;
11165 /* Check if the addresses are clobbered by load. */
11166 if (load)
11168 if (reg_mentioned_p (reg_1, mem_1)
11169 || reg_mentioned_p (reg_2, mem_2)
11170 || reg_mentioned_p (reg_3, mem_3))
11171 return false;
11173 /* In increasing order, the last load can clobber the address. */
11174 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11175 return false;
11178 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11179 rclass_1 = FP_REGS;
11180 else
11181 rclass_1 = GENERAL_REGS;
11183 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11184 rclass_2 = FP_REGS;
11185 else
11186 rclass_2 = GENERAL_REGS;
11188 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11189 rclass_3 = FP_REGS;
11190 else
11191 rclass_3 = GENERAL_REGS;
11193 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11194 rclass_4 = FP_REGS;
11195 else
11196 rclass_4 = GENERAL_REGS;
11198 /* Check if the registers are of the same class. */
11199 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11200 return false;
11202 return true;
11205 /* Given OPERANDS of consecutive load/store, this function pairs them
11206 into ldp/stp after adjusting the offset. It depends on the fact
11207 that addresses of load/store instructions are in increasing order.
11208 MODE is the mode of the memory operands. CODE is the rtl operator
11209 which should be applied to all memory operands; it is SIGN_EXTEND,
11210 ZERO_EXTEND or UNKNOWN. */
11212 bool
11213 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11214 enum machine_mode mode, RTX_CODE code)
11216 rtx base, offset, t1, t2;
11217 rtx mem_1, mem_2, mem_3, mem_4;
11218 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11220 if (load)
11222 mem_1 = operands[1];
11223 mem_2 = operands[3];
11224 mem_3 = operands[5];
11225 mem_4 = operands[7];
11227 else
11229 mem_1 = operands[0];
11230 mem_2 = operands[2];
11231 mem_3 = operands[4];
11232 mem_4 = operands[6];
11233 gcc_assert (code == UNKNOWN);
11236 extract_base_offset_in_addr (mem_1, &base, &offset);
11237 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11239 /* Adjust the offset so it can fit in an ldp/stp instruction. */
11240 msize = GET_MODE_SIZE (mode);
11241 stp_off_limit = msize * 0x40;
11242 off_val = INTVAL (offset);
11243 abs_off = (off_val < 0) ? -off_val : off_val;
11244 new_off = abs_off % stp_off_limit;
11245 adj_off = abs_off - new_off;
11247 /* Further adjust to make sure all offsets are OK. */
11248 if ((new_off + msize * 2) >= stp_off_limit)
11250 adj_off += stp_off_limit;
11251 new_off -= stp_off_limit;
11254 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11255 if (adj_off >= 0x1000)
11256 return false;
11258 if (off_val < 0)
11260 adj_off = -adj_off;
11261 new_off = -new_off;
11264 /* Create new memory references. */
11265 mem_1 = change_address (mem_1, VOIDmode,
11266 plus_constant (DImode, operands[8], new_off));
11268 /* Check if the adjusted address is OK for ldp/stp. */
11269 if (!aarch64_mem_pair_operand (mem_1, mode))
11270 return false;
11272 msize = GET_MODE_SIZE (mode);
11273 mem_2 = change_address (mem_2, VOIDmode,
11274 plus_constant (DImode,
11275 operands[8],
11276 new_off + msize));
11277 mem_3 = change_address (mem_3, VOIDmode,
11278 plus_constant (DImode,
11279 operands[8],
11280 new_off + msize * 2));
11281 mem_4 = change_address (mem_4, VOIDmode,
11282 plus_constant (DImode,
11283 operands[8],
11284 new_off + msize * 3));
11286 if (code == ZERO_EXTEND)
11288 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11289 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11290 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11291 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11293 else if (code == SIGN_EXTEND)
11295 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11296 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11297 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11298 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11301 if (load)
11303 operands[1] = mem_1;
11304 operands[3] = mem_2;
11305 operands[5] = mem_3;
11306 operands[7] = mem_4;
11308 else
11310 operands[0] = mem_1;
11311 operands[2] = mem_2;
11312 operands[4] = mem_3;
11313 operands[6] = mem_4;
11316 /* Emit adjusting instruction. */
11317 emit_insn (gen_rtx_SET (VOIDmode, operands[8],
11318 plus_constant (DImode, base, adj_off)));
11319 /* Emit ldp/stp instructions. */
11320 t1 = gen_rtx_SET (VOIDmode, operands[0], operands[1]);
11321 t2 = gen_rtx_SET (VOIDmode, operands[2], operands[3]);
11322 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11323 t1 = gen_rtx_SET (VOIDmode, operands[4], operands[5]);
11324 t2 = gen_rtx_SET (VOIDmode, operands[6], operands[7]);
11325 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11326 return true;
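/* Illustrative sketch, not part of the original source and guarded out of
   the build: the offset adjustment above splits a large offset into a base
   adjustment that an ADD/SUB immediate can materialise and a small residue
   that fits the ldp/stp immediate range.  The helper below repeats that
   arithmetic for a single offset; the names are made up for the example.  */
#if 0
static int
split_ldpstp_offset (long long off_val, long long msize,
		     long long *adj_off, long long *new_off)
{
  long long stp_off_limit = msize * 0x40;
  long long abs_off = (off_val < 0) ? -off_val : off_val;

  *new_off = abs_off % stp_off_limit;
  *adj_off = abs_off - *new_off;

  /* Keep room for the second pair at NEW_OFF + 2 * MSIZE.  */
  if (*new_off + msize * 2 >= stp_off_limit)
    {
      *adj_off += stp_off_limit;
      *new_off -= stp_off_limit;
    }

  /* The adjustment itself must be a legal ADD/SUB immediate.  */
  if (*adj_off >= 0x1000)
    return 0;

  if (off_val < 0)
    {
      *adj_off = -*adj_off;
      *new_off = -*new_off;
    }
  return 1;
}
#endif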
11329 #undef TARGET_ADDRESS_COST
11330 #define TARGET_ADDRESS_COST aarch64_address_cost
11332 /* This hook determines whether unnamed bitfields affect the alignment
11333 of the containing structure. The hook returns true if the structure
11334 should inherit the alignment requirements of an unnamed bitfield's
11335 type. */
11336 #undef TARGET_ALIGN_ANON_BITFIELD
11337 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11339 #undef TARGET_ASM_ALIGNED_DI_OP
11340 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11342 #undef TARGET_ASM_ALIGNED_HI_OP
11343 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11345 #undef TARGET_ASM_ALIGNED_SI_OP
11346 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11348 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11349 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11350 hook_bool_const_tree_hwi_hwi_const_tree_true
11352 #undef TARGET_ASM_FILE_START
11353 #define TARGET_ASM_FILE_START aarch64_start_file
11355 #undef TARGET_ASM_OUTPUT_MI_THUNK
11356 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11358 #undef TARGET_ASM_SELECT_RTX_SECTION
11359 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11361 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11362 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11364 #undef TARGET_BUILD_BUILTIN_VA_LIST
11365 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11367 #undef TARGET_CALLEE_COPIES
11368 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11370 #undef TARGET_CAN_ELIMINATE
11371 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11373 #undef TARGET_CANNOT_FORCE_CONST_MEM
11374 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11376 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11377 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11379 /* Only the least significant bit is used for initialization guard
11380 variables. */
11381 #undef TARGET_CXX_GUARD_MASK_BIT
11382 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11384 #undef TARGET_C_MODE_FOR_SUFFIX
11385 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11387 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11388 #undef TARGET_DEFAULT_TARGET_FLAGS
11389 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11390 #endif
11392 #undef TARGET_CLASS_MAX_NREGS
11393 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11395 #undef TARGET_BUILTIN_DECL
11396 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11398 #undef TARGET_EXPAND_BUILTIN
11399 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11401 #undef TARGET_EXPAND_BUILTIN_VA_START
11402 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11404 #undef TARGET_FOLD_BUILTIN
11405 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11407 #undef TARGET_FUNCTION_ARG
11408 #define TARGET_FUNCTION_ARG aarch64_function_arg
11410 #undef TARGET_FUNCTION_ARG_ADVANCE
11411 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11413 #undef TARGET_FUNCTION_ARG_BOUNDARY
11414 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11416 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11417 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11419 #undef TARGET_FUNCTION_VALUE
11420 #define TARGET_FUNCTION_VALUE aarch64_function_value
11422 #undef TARGET_FUNCTION_VALUE_REGNO_P
11423 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11425 #undef TARGET_FRAME_POINTER_REQUIRED
11426 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11428 #undef TARGET_GIMPLE_FOLD_BUILTIN
11429 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11431 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11432 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11434 #undef TARGET_INIT_BUILTINS
11435 #define TARGET_INIT_BUILTINS aarch64_init_builtins
11437 #undef TARGET_LEGITIMATE_ADDRESS_P
11438 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
11440 #undef TARGET_LEGITIMATE_CONSTANT_P
11441 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
11443 #undef TARGET_LIBGCC_CMP_RETURN_MODE
11444 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
11446 #undef TARGET_LRA_P
11447 #define TARGET_LRA_P hook_bool_void_true
11449 #undef TARGET_MANGLE_TYPE
11450 #define TARGET_MANGLE_TYPE aarch64_mangle_type
11452 #undef TARGET_MEMORY_MOVE_COST
11453 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
11455 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
11456 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
11458 #undef TARGET_MUST_PASS_IN_STACK
11459 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
11461 /* This target hook should return true if accesses to volatile bitfields
11462 should use the narrowest mode possible. It should return false if these
11463 accesses should use the bitfield container type. */
11464 #undef TARGET_NARROW_VOLATILE_BITFIELD
11465 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
11467 #undef TARGET_OPTION_OVERRIDE
11468 #define TARGET_OPTION_OVERRIDE aarch64_override_options
11470 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
11471 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
11472 aarch64_override_options_after_change
11474 #undef TARGET_PASS_BY_REFERENCE
11475 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
11477 #undef TARGET_PREFERRED_RELOAD_CLASS
11478 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
11480 #undef TARGET_SCHED_REASSOCIATION_WIDTH
11481 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
11483 #undef TARGET_SECONDARY_RELOAD
11484 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
11486 #undef TARGET_SHIFT_TRUNCATION_MASK
11487 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
11489 #undef TARGET_SETUP_INCOMING_VARARGS
11490 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
11492 #undef TARGET_STRUCT_VALUE_RTX
11493 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
11495 #undef TARGET_REGISTER_MOVE_COST
11496 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
11498 #undef TARGET_RETURN_IN_MEMORY
11499 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
11501 #undef TARGET_RETURN_IN_MSB
11502 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
11504 #undef TARGET_RTX_COSTS
11505 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
11507 #undef TARGET_SCHED_ISSUE_RATE
11508 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
11510 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
11511 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
11512 aarch64_sched_first_cycle_multipass_dfa_lookahead
11514 #undef TARGET_TRAMPOLINE_INIT
11515 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
11517 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
11518 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
11520 #undef TARGET_VECTOR_MODE_SUPPORTED_P
11521 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
11523 #undef TARGET_ARRAY_MODE_SUPPORTED_P
11524 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
11526 #undef TARGET_VECTORIZE_ADD_STMT_COST
11527 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
11529 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
11530 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
11531 aarch64_builtin_vectorization_cost
11533 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
11534 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
11536 #undef TARGET_VECTORIZE_BUILTINS
11537 #define TARGET_VECTORIZE_BUILTINS
11539 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
11540 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
11541 aarch64_builtin_vectorized_function
11543 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
11544 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
11545 aarch64_autovectorize_vector_sizes
11547 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
11548 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
11549 aarch64_atomic_assign_expand_fenv
11551 /* Section anchor support. */
11553 #undef TARGET_MIN_ANCHOR_OFFSET
11554 #define TARGET_MIN_ANCHOR_OFFSET -256
11556 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
11557 byte offset; we can do much more for larger data types, but have no way
11558 to determine the size of the access. We assume accesses are aligned. */
11559 #undef TARGET_MAX_ANCHOR_OFFSET
11560 #define TARGET_MAX_ANCHOR_OFFSET 4095
11562 #undef TARGET_VECTOR_ALIGNMENT
11563 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
11565 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
11566 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
11567 aarch64_simd_vector_alignment_reachable
11569 /* vec_perm support. */
11571 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
11572 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
11573 aarch64_vectorize_vec_perm_const_ok
11576 #undef TARGET_FIXED_CONDITION_CODE_REGS
11577 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
11579 #undef TARGET_FLAGS_REGNUM
11580 #define TARGET_FLAGS_REGNUM CC_REGNUM
11582 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
11583 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
11585 #undef TARGET_ASAN_SHADOW_OFFSET
11586 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
11588 #undef TARGET_LEGITIMIZE_ADDRESS
11589 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
11591 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
11592 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
11593 aarch64_use_by_pieces_infrastructure_p
11595 #undef TARGET_CAN_USE_DOLOOP_P
11596 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
11598 #undef TARGET_SCHED_MACRO_FUSION_P
11599 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
11601 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
11602 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
11604 #undef TARGET_SCHED_FUSION_PRIORITY
11605 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
11607 struct gcc_target targetm = TARGET_INITIALIZER;
11609 #include "gt-aarch64.h"