[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "tree.h"
26 #include "gimple.h"
27 #include "rtl.h"
28 #include "df.h"
29 #include "insn-codes.h"
30 #include "insn-attr.h"
31 #include "alias.h"
32 #include "fold-const.h"
33 #include "stringpool.h"
34 #include "stor-layout.h"
35 #include "calls.h"
36 #include "varasm.h"
37 #include "regs.h"
38 #include "cfgrtl.h"
39 #include "cfganal.h"
40 #include "lcm.h"
41 #include "cfgbuild.h"
42 #include "cfgcleanup.h"
43 #include "output.h"
44 #include "flags.h"
45 #include "insn-config.h"
46 #include "expmed.h"
47 #include "dojump.h"
48 #include "explow.h"
49 #include "emit-rtl.h"
50 #include "stmt.h"
51 #include "expr.h"
52 #include "reload.h"
53 #include "toplev.h"
54 #include "target.h"
55 #include "targhooks.h"
56 #include "tm_p.h"
57 #include "recog.h"
58 #include "langhooks.h"
59 #include "diagnostic-core.h"
60 #include "internal-fn.h"
61 #include "gimple-fold.h"
62 #include "tree-eh.h"
63 #include "gimplify.h"
64 #include "optabs.h"
65 #include "dwarf2.h"
66 #include "cfgloop.h"
67 #include "tree-vectorizer.h"
68 #include "aarch64-cost-tables.h"
69 #include "dumpfile.h"
70 #include "builtins.h"
71 #include "rtl-iter.h"
72 #include "tm-constrs.h"
73 #include "sched-int.h"
74 #include "cortex-a57-fma-steering.h"
76 /* This file should be included last. */
77 #include "target-def.h"
79 /* Defined for convenience. */
80 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
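/* For illustration: with the LP64 ABI POINTER_SIZE is 64, so POINTER_BYTES
   is 8; with -mabi=ilp32 POINTER_SIZE is 32 and POINTER_BYTES is 4.  */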
82 /* Classifies an address.
84 ADDRESS_REG_IMM
85 A simple base register plus immediate offset.
87 ADDRESS_REG_WB
88 A base register indexed by immediate offset with writeback.
90 ADDRESS_REG_REG
91 A base register indexed by (optionally scaled) register.
93 ADDRESS_REG_UXTW
94 A base register indexed by (optionally scaled) zero-extended register.
96 ADDRESS_REG_SXTW
97 A base register indexed by (optionally scaled) sign-extended register.
99 ADDRESS_LO_SUM
100 A LO_SUM rtx with a base register and "LO12" symbol relocation.
102 ADDRESS_SYMBOLIC
103 A constant symbolic address, in pc-relative literal pool. */
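/* For illustration, typical assembly forms of these address classes
   (syntax is approximate) are:
     ADDRESS_REG_IMM    ldr x0, [x1, #16]
     ADDRESS_REG_WB     ldr x0, [x1, #16]!   or   ldr x0, [x1], #16
     ADDRESS_REG_REG    ldr x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr x0, .LC0   (pc-relative literal load)  */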
105 enum aarch64_address_type {
106 ADDRESS_REG_IMM,
107 ADDRESS_REG_WB,
108 ADDRESS_REG_REG,
109 ADDRESS_REG_UXTW,
110 ADDRESS_REG_SXTW,
111 ADDRESS_LO_SUM,
112 ADDRESS_SYMBOLIC
115 struct aarch64_address_info {
116 enum aarch64_address_type type;
117 rtx base;
118 rtx offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 struct simd_immediate_info
125 rtx value;
126 int shift;
127 int element_width;
128 bool mvn;
129 bool msl;
132 /* The current code model. */
133 enum aarch64_code_model aarch64_cmodel;
135 #ifdef HAVE_AS_TLS
136 #undef TARGET_HAVE_TLS
137 #define TARGET_HAVE_TLS 1
138 #endif
140 static bool aarch64_composite_type_p (const_tree, machine_mode);
141 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
142 const_tree,
143 machine_mode *, int *,
144 bool *);
145 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
146 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
147 static void aarch64_override_options_after_change (void);
148 static bool aarch64_vector_mode_supported_p (machine_mode);
149 static unsigned bit_count (unsigned HOST_WIDE_INT);
150 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
151 const unsigned char *sel);
152 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
154 /* Major revision number of the ARM Architecture implemented by the target. */
155 unsigned aarch64_architecture_version;
157 /* The processor for which instructions should be scheduled. */
158 enum aarch64_processor aarch64_tune = cortexa53;
160 /* Mask to specify which instructions we are allowed to generate. */
161 unsigned long aarch64_isa_flags = 0;
163 /* Mask to specify which instruction scheduling options should be used. */
164 unsigned long aarch64_tune_flags = 0;
166 /* Support for command line parsing of boolean flags in the tuning
167 structures. */
168 struct aarch64_flag_desc
170 const char* name;
171 unsigned int flag;
174 #define AARCH64_FUSION_PAIR(name, internal_name, y) \
175 { name, AARCH64_FUSE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
178 { "none", AARCH64_FUSE_NOTHING },
179 #include "aarch64-fusion-pairs.def"
180 { "all", AARCH64_FUSE_ALL },
181 { NULL, AARCH64_FUSE_NOTHING }
183 #undef AARCH64_FUSION_PAIR
185 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name, y) \
186 { name, AARCH64_EXTRA_TUNE_##internal_name },
187 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
189 { "none", AARCH64_EXTRA_TUNE_NONE },
190 #include "aarch64-tuning-flags.def"
191 { "all", AARCH64_EXTRA_TUNE_ALL },
192 { NULL, AARCH64_EXTRA_TUNE_NONE }
194 #undef AARCH64_EXTRA_TUNING_OPTION
196 /* Tuning parameters. */
198 static const struct cpu_addrcost_table generic_addrcost_table =
201 0, /* hi */
202 0, /* si */
203 0, /* di */
204 0, /* ti */
206 0, /* pre_modify */
207 0, /* post_modify */
208 0, /* register_offset */
209 0, /* register_extend */
210 0 /* imm_offset */
213 static const struct cpu_addrcost_table cortexa57_addrcost_table =
216 1, /* hi */
217 0, /* si */
218 0, /* di */
219 1, /* ti */
221 0, /* pre_modify */
222 0, /* post_modify */
223 0, /* register_offset */
224 0, /* register_extend */
225 0, /* imm_offset */
228 static const struct cpu_addrcost_table xgene1_addrcost_table =
231 1, /* hi */
232 0, /* si */
233 0, /* di */
234 1, /* ti */
236 1, /* pre_modify */
237 0, /* post_modify */
238 0, /* register_offset */
239 1, /* register_extend */
240 0, /* imm_offset */
243 static const struct cpu_regmove_cost generic_regmove_cost =
245 1, /* GP2GP */
246 /* Avoid the use of slow int<->fp moves for spilling by setting
247 their cost higher than memmov_cost. */
248 5, /* GP2FP */
249 5, /* FP2GP */
250 2 /* FP2FP */
253 static const struct cpu_regmove_cost cortexa57_regmove_cost =
255 1, /* GP2GP */
256 /* Avoid the use of slow int<->fp moves for spilling by setting
257 their cost higher than memmov_cost. */
258 5, /* GP2FP */
259 5, /* FP2GP */
260 2 /* FP2FP */
263 static const struct cpu_regmove_cost cortexa53_regmove_cost =
265 1, /* GP2GP */
266 /* Avoid the use of slow int<->fp moves for spilling by setting
267 their cost higher than memmov_cost. */
268 5, /* GP2FP */
269 5, /* FP2GP */
270 2 /* FP2FP */
273 static const struct cpu_regmove_cost thunderx_regmove_cost =
275 2, /* GP2GP */
276 2, /* GP2FP */
277 6, /* FP2GP */
278 4 /* FP2FP */
281 static const struct cpu_regmove_cost xgene1_regmove_cost =
283 1, /* GP2GP */
284 /* Avoid the use of slow int<->fp moves for spilling by setting
285 their cost higher than memmov_cost. */
286 8, /* GP2FP */
287 8, /* FP2GP */
288 2 /* FP2FP */
291 /* Generic costs for vector insn classes. */
292 static const struct cpu_vector_cost generic_vector_cost =
294 1, /* scalar_stmt_cost */
295 1, /* scalar_load_cost */
296 1, /* scalar_store_cost */
297 1, /* vec_stmt_cost */
298 1, /* vec_to_scalar_cost */
299 1, /* scalar_to_vec_cost */
300 1, /* vec_align_load_cost */
301 1, /* vec_unalign_load_cost */
302 1, /* vec_unalign_store_cost */
303 1, /* vec_store_cost */
304 3, /* cond_taken_branch_cost */
305 1 /* cond_not_taken_branch_cost */
308 /* Cortex-A57 costs for vector insn classes. */
309 static const struct cpu_vector_cost cortexa57_vector_cost =
311 1, /* scalar_stmt_cost */
312 4, /* scalar_load_cost */
313 1, /* scalar_store_cost */
314 3, /* vec_stmt_cost */
315 8, /* vec_to_scalar_cost */
316 8, /* scalar_to_vec_cost */
317 5, /* vec_align_load_cost */
318 5, /* vec_unalign_load_cost */
319 1, /* vec_unalign_store_cost */
320 1, /* vec_store_cost */
321 1, /* cond_taken_branch_cost */
322 1 /* cond_not_taken_branch_cost */
325 /* X-Gene 1 costs for vector insn classes. */
326 static const struct cpu_vector_cost xgene1_vector_cost =
328 1, /* scalar_stmt_cost */
329 5, /* scalar_load_cost */
330 1, /* scalar_store_cost */
331 2, /* vec_stmt_cost */
332 4, /* vec_to_scalar_cost */
333 4, /* scalar_to_vec_cost */
334 10, /* vec_align_load_cost */
335 10, /* vec_unalign_load_cost */
336 2, /* vec_unalign_store_cost */
337 2, /* vec_store_cost */
338 2, /* cond_taken_branch_cost */
339 1 /* cond_not_taken_branch_cost */
342 /* Generic costs for branch instructions. */
343 static const struct cpu_branch_cost generic_branch_cost =
345 2, /* Predictable. */
346 2 /* Unpredictable. */
349 static const struct tune_params generic_tunings =
351 &cortexa57_extra_costs,
352 &generic_addrcost_table,
353 &generic_regmove_cost,
354 &generic_vector_cost,
355 &generic_branch_cost,
356 4, /* memmov_cost */
357 2, /* issue_rate */
358 AARCH64_FUSE_NOTHING, /* fusible_ops */
359 8, /* function_align. */
360 8, /* jump_align. */
361 4, /* loop_align. */
362 2, /* int_reassoc_width. */
363 4, /* fp_reassoc_width. */
364 1, /* vec_reassoc_width. */
365 2, /* min_div_recip_mul_sf. */
366 2, /* min_div_recip_mul_df. */
367 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
370 static const struct tune_params cortexa53_tunings =
372 &cortexa53_extra_costs,
373 &generic_addrcost_table,
374 &cortexa53_regmove_cost,
375 &generic_vector_cost,
376 &generic_branch_cost,
377 4, /* memmov_cost */
378 2, /* issue_rate */
379 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
380 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
381 8, /* function_align. */
382 8, /* jump_align. */
383 4, /* loop_align. */
384 2, /* int_reassoc_width. */
385 4, /* fp_reassoc_width. */
386 1, /* vec_reassoc_width. */
387 2, /* min_div_recip_mul_sf. */
388 2, /* min_div_recip_mul_df. */
389 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
392 static const struct tune_params cortexa57_tunings =
394 &cortexa57_extra_costs,
395 &cortexa57_addrcost_table,
396 &cortexa57_regmove_cost,
397 &cortexa57_vector_cost,
398 &generic_branch_cost,
399 4, /* memmov_cost */
400 3, /* issue_rate */
401 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
402 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
403 16, /* function_align. */
404 8, /* jump_align. */
405 4, /* loop_align. */
406 2, /* int_reassoc_width. */
407 4, /* fp_reassoc_width. */
408 1, /* vec_reassoc_width. */
409 2, /* min_div_recip_mul_sf. */
410 2, /* min_div_recip_mul_df. */
411 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
414 static const struct tune_params cortexa72_tunings =
416 &cortexa57_extra_costs,
417 &cortexa57_addrcost_table,
418 &cortexa57_regmove_cost,
419 &cortexa57_vector_cost,
420 &generic_branch_cost,
421 4, /* memmov_cost */
422 3, /* issue_rate */
423 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
424 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
425 16, /* function_align. */
426 8, /* jump_align. */
427 4, /* loop_align. */
428 2, /* int_reassoc_width. */
429 4, /* fp_reassoc_width. */
430 1, /* vec_reassoc_width. */
431 2, /* min_div_recip_mul_sf. */
432 2, /* min_div_recip_mul_df. */
433 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
436 static const struct tune_params thunderx_tunings =
438 &thunderx_extra_costs,
439 &generic_addrcost_table,
440 &thunderx_regmove_cost,
441 &generic_vector_cost,
442 &generic_branch_cost,
443 6, /* memmov_cost */
444 2, /* issue_rate */
445 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
446 8, /* function_align. */
447 8, /* jump_align. */
448 8, /* loop_align. */
449 2, /* int_reassoc_width. */
450 4, /* fp_reassoc_width. */
451 1, /* vec_reassoc_width. */
452 2, /* min_div_recip_mul_sf. */
453 2, /* min_div_recip_mul_df. */
454 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
457 static const struct tune_params xgene1_tunings =
459 &xgene1_extra_costs,
460 &xgene1_addrcost_table,
461 &xgene1_regmove_cost,
462 &xgene1_vector_cost,
463 &generic_branch_cost,
464 6, /* memmov_cost */
465 4, /* issue_rate */
466 AARCH64_FUSE_NOTHING, /* fusible_ops */
467 16, /* function_align. */
468 8, /* jump_align. */
469 16, /* loop_align. */
470 2, /* int_reassoc_width. */
471 4, /* fp_reassoc_width. */
472 1, /* vec_reassoc_width. */
473 2, /* min_div_recip_mul_sf. */
474 2, /* min_div_recip_mul_df. */
475 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
478 /* Support for fine-grained override of the tuning structures. */
479 struct aarch64_tuning_override_function
481 const char* name;
482 void (*parse_override)(const char*, struct tune_params*);
485 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
486 static void aarch64_parse_tune_string (const char*, struct tune_params*);
488 static const struct aarch64_tuning_override_function
489 aarch64_tuning_override_functions[] =
491 { "fuse", aarch64_parse_fuse_string },
492 { "tune", aarch64_parse_tune_string },
493 { NULL, NULL }
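/* A sketch of the intended use, assuming the documented -moverride syntax:
   the option value is made up of <name>=<string> pairs keyed by the "fuse"
   and "tune" entries above, e.g.
     -moverride=fuse=all
     -moverride=tune=none
   where the string after '=' is looked up in aarch64_fusible_pairs or
   aarch64_tuning_flags respectively.  */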
496 /* A processor implementing AArch64. */
497 struct processor
499 const char *const name;
500 enum aarch64_processor core;
501 const char *arch;
502 unsigned architecture_version;
503 const unsigned long flags;
504 const struct tune_params *const tune;
507 /* Processor cores implementing AArch64. */
508 static const struct processor all_cores[] =
510 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
511 {NAME, SCHED, #ARCH, ARCH, FLAGS, &COSTS##_tunings},
512 #include "aarch64-cores.def"
513 #undef AARCH64_CORE
514 {"generic", cortexa53, "8", 8, AARCH64_FL_FOR_ARCH8, &generic_tunings},
515 {NULL, aarch64_none, NULL, 0, 0, NULL}
518 /* Architectures implementing AArch64. */
519 static const struct processor all_architectures[] =
521 #define AARCH64_ARCH(NAME, CORE, ARCH, FLAGS) \
522 {NAME, CORE, #ARCH, ARCH, FLAGS, NULL},
523 #include "aarch64-arches.def"
524 #undef AARCH64_ARCH
525 {NULL, aarch64_none, NULL, 0, 0, NULL}
528 /* Target specification. These are populated as command-line arguments
529 are processed, or NULL if not specified. */
530 static const struct processor *selected_arch;
531 static const struct processor *selected_cpu;
532 static const struct processor *selected_tune;
534 /* The current tuning set. */
535 struct tune_params aarch64_tune_params = generic_tunings;
537 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
539 /* An ISA extension in the co-processor and main instruction set space. */
540 struct aarch64_option_extension
542 const char *const name;
543 const unsigned long flags_on;
544 const unsigned long flags_off;
547 /* ISA extensions in AArch64. */
548 static const struct aarch64_option_extension all_extensions[] =
550 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
551 {NAME, FLAGS_ON, FLAGS_OFF},
552 #include "aarch64-option-extensions.def"
553 #undef AARCH64_OPT_EXTENSION
554 {NULL, 0, 0}
557 /* Used to track the size of an address when generating a pre/post
558 increment address. */
559 static machine_mode aarch64_memory_reference_mode;
561 /* A table of valid AArch64 "bitmask immediate" values for
562 logical instructions. */
564 #define AARCH64_NUM_BITMASKS 5334
565 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
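/* For illustration: a bitmask immediate is a rotated run of contiguous set
   bits replicated across the register in equal chunks of 2, 4, 8, 16, 32
   or 64 bits.  0x00ff00ff00ff00ff and 0x7fff7fff7fff7fff qualify, for
   example, while 0x0000000012345678 does not; there are 5334 distinct
   64-bit values of this form, hence the table size.  */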
567 typedef enum aarch64_cond_code
569 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
570 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
571 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
573 aarch64_cc;
575 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
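/* For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT:
   the codes are laid out so that each even/odd pair of values are each
   other's inverse, so flipping bit 0 inverts the condition.  */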
577 /* The condition codes of the processor, and the inverse function. */
578 static const char * const aarch64_condition_codes[] =
580 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
581 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
584 void
585 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
587 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
588 if (TARGET_GENERAL_REGS_ONLY)
589 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
590 else
591 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
594 static unsigned int
595 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
597 if (GET_MODE_UNIT_SIZE (mode) == 4)
598 return aarch64_tune_params.min_div_recip_mul_sf;
599 return aarch64_tune_params.min_div_recip_mul_df;
602 static int
603 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
604 enum machine_mode mode)
606 if (VECTOR_MODE_P (mode))
607 return aarch64_tune_params.vec_reassoc_width;
608 if (INTEGRAL_MODE_P (mode))
609 return aarch64_tune_params.int_reassoc_width;
610 if (FLOAT_MODE_P (mode))
611 return aarch64_tune_params.fp_reassoc_width;
612 return 1;
615 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
616 unsigned
617 aarch64_dbx_register_number (unsigned regno)
619 if (GP_REGNUM_P (regno))
620 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
621 else if (regno == SP_REGNUM)
622 return AARCH64_DWARF_SP;
623 else if (FP_REGNUM_P (regno))
624 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
626 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
627 equivalent DWARF register. */
628 return DWARF_FRAME_REGISTERS;
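/* For illustration, under the AArch64 DWARF register numbering this maps
   x0-x30 to 0-30, sp to 31 and v0-v31 to 64-95.  */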
631 /* Return TRUE if MODE is any of the large INT modes. */
632 static bool
633 aarch64_vect_struct_mode_p (machine_mode mode)
635 return mode == OImode || mode == CImode || mode == XImode;
638 /* Return TRUE if MODE is any of the vector modes. */
639 static bool
640 aarch64_vector_mode_p (machine_mode mode)
642 return aarch64_vector_mode_supported_p (mode)
643 || aarch64_vect_struct_mode_p (mode);
646 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
647 static bool
648 aarch64_array_mode_supported_p (machine_mode mode,
649 unsigned HOST_WIDE_INT nelems)
651 if (TARGET_SIMD
652 && AARCH64_VALID_SIMD_QREG_MODE (mode)
653 && (nelems >= 2 && nelems <= 4))
654 return true;
656 return false;
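/* For example, an array of three V4SImode vectors (as used by the ld3/st3
   intrinsics) is supported and is represented by the 48-byte CImode
   structure mode.  */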
659 /* Implement HARD_REGNO_NREGS. */
662 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
664 switch (aarch64_regno_regclass (regno))
666 case FP_REGS:
667 case FP_LO_REGS:
668 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
669 default:
670 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
672 gcc_unreachable ();
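/* For illustration: a 16-byte TFmode value occupies a single 128-bit
   vector register (UNITS_PER_VREG == 16) but two X registers
   (UNITS_PER_WORD == 8), so the answer depends on the register class
   of REGNO.  */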
675 /* Implement HARD_REGNO_MODE_OK. */
678 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
680 if (GET_MODE_CLASS (mode) == MODE_CC)
681 return regno == CC_REGNUM;
683 if (regno == SP_REGNUM)
684 /* The purpose of comparing with ptr_mode is to support the
685 global register variable associated with the stack pointer
686 register via the syntax of asm ("wsp") in ILP32. */
687 return mode == Pmode || mode == ptr_mode;
689 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
690 return mode == Pmode;
692 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
693 return 1;
695 if (FP_REGNUM_P (regno))
697 if (aarch64_vect_struct_mode_p (mode))
698 return
699 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
700 else
701 return 1;
704 return 0;
707 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
708 machine_mode
709 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
710 machine_mode mode)
712 /* Handle modes that fit within single registers. */
713 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
715 if (GET_MODE_SIZE (mode) >= 4)
716 return mode;
717 else
718 return SImode;
720 /* Fall back to generic for multi-reg and very large modes. */
721 else
722 return choose_hard_reg_mode (regno, nregs, false);
725 /* Return true if calls to DECL should be treated as
726 long-calls (ie called via a register). */
727 static bool
728 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
730 return false;
733 /* Return true if calls to symbol-ref SYM should be treated as
734 long-calls (ie called via a register). */
735 bool
736 aarch64_is_long_call_p (rtx sym)
738 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
741 /* Return true if the offsets to a zero/sign-extract operation
742 represent an expression that matches an extend operation. The
743 operands represent the parameters from
745 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
746 bool
747 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
748 rtx extract_imm)
750 HOST_WIDE_INT mult_val, extract_val;
752 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
753 return false;
755 mult_val = INTVAL (mult_imm);
756 extract_val = INTVAL (extract_imm);
758 if (extract_val > 8
759 && extract_val < GET_MODE_BITSIZE (mode)
760 && exact_log2 (extract_val & ~7) > 0
761 && (extract_val & 7) <= 4
762 && mult_val == (1 << (extract_val & 7)))
763 return true;
765 return false;
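/* For illustration, with arbitrarily chosen values: MULT_IMM == 4 and
   EXTRACT_IMM == 34 satisfy the test above, since extracting the low 34
   bits of (reg * 4) is the same as zero-extending the low 32 bits of REG
   and shifting the result left by 2, i.e. a "uxtw #2" style
   extend-and-shift operand.  */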
768 /* Emit an insn that's a simple single-set. Both the operands must be
769 known to be valid. */
770 inline static rtx
771 emit_set_insn (rtx x, rtx y)
773 return emit_insn (gen_rtx_SET (x, y));
776 /* X and Y are two things to compare using CODE. Emit the compare insn and
777 return the rtx for register 0 in the proper mode. */
779 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
781 machine_mode mode = SELECT_CC_MODE (code, x, y);
782 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
784 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
785 return cc_reg;
788 /* Build the SYMBOL_REF for __tls_get_addr. */
790 static GTY(()) rtx tls_get_addr_libfunc;
793 aarch64_tls_get_addr (void)
795 if (!tls_get_addr_libfunc)
796 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
797 return tls_get_addr_libfunc;
800 /* Return the TLS model to use for ADDR. */
802 static enum tls_model
803 tls_symbolic_operand_type (rtx addr)
805 enum tls_model tls_kind = TLS_MODEL_NONE;
806 rtx sym, addend;
808 if (GET_CODE (addr) == CONST)
810 split_const (addr, &sym, &addend);
811 if (GET_CODE (sym) == SYMBOL_REF)
812 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
814 else if (GET_CODE (addr) == SYMBOL_REF)
815 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
817 return tls_kind;
820 /* We'll allow lo_sum's in addresses in our legitimate addresses
821 so that combine can take care of combining addresses where
822 necessary, but for generation purposes, we'll generate the address
823 as:
824 RTL Absolute
825 tmp = hi (symbol_ref); adrp x1, foo
826 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
829 PIC TLS
830 adrp x1, :got:foo adrp tmp, :tlsgd:foo
831 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
832 bl __tls_get_addr
835 Load TLS symbol, depending on TLS mechanism and TLS access model.
837 Global Dynamic - Traditional TLS:
838 adrp tmp, :tlsgd:imm
839 add dest, tmp, #:tlsgd_lo12:imm
840 bl __tls_get_addr
842 Global Dynamic - TLS Descriptors:
843 adrp dest, :tlsdesc:imm
844 ldr tmp, [dest, #:tlsdesc_lo12:imm]
845 add dest, dest, #:tlsdesc_lo12:imm
846 blr tmp
847 mrs tp, tpidr_el0
848 add dest, dest, tp
850 Initial Exec:
851 mrs tp, tpidr_el0
852 adrp tmp, :gottprel:imm
853 ldr dest, [tmp, #:gottprel_lo12:imm]
854 add dest, dest, tp
856 Local Exec:
857 mrs tp, tpidr_el0
858 add t0, tp, #:tprel_hi12:imm, lsl #12
859 add t0, t0, #:tprel_lo12_nc:imm
862 static void
863 aarch64_load_symref_appropriately (rtx dest, rtx imm,
864 enum aarch64_symbol_type type)
866 switch (type)
868 case SYMBOL_SMALL_ABSOLUTE:
870 /* In ILP32, the mode of dest can be either SImode or DImode. */
871 rtx tmp_reg = dest;
872 machine_mode mode = GET_MODE (dest);
874 gcc_assert (mode == Pmode || mode == ptr_mode);
876 if (can_create_pseudo_p ())
877 tmp_reg = gen_reg_rtx (mode);
879 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
880 emit_insn (gen_add_losym (dest, tmp_reg, imm));
881 return;
884 case SYMBOL_TINY_ABSOLUTE:
885 emit_insn (gen_rtx_SET (dest, imm));
886 return;
888 case SYMBOL_SMALL_GOT_28K:
890 machine_mode mode = GET_MODE (dest);
891 rtx gp_rtx = pic_offset_table_rtx;
893 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
894 here before rtl expansion. Tree IVOPTs will generate an rtl pattern
895 to compute rtx costs, in which case pic_offset_table_rtx is not
896 initialized. In that case there is no need to generate the first
897 adrp instruction, as the final cost for global variable access is
898 one instruction. */
899 if (gp_rtx != NULL)
901 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
902 use the page base as the GOT base, the first page may be wasted; in
903 the worst case there is only 28K of space for the GOT).
905 The generated instruction sequence for accessing a global variable is:
908 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
910 Only one instruction is needed. But we must initialize
911 pic_offset_table_rtx properly. We generate an initialization insn for
912 every global access, and let CSE remove all the redundant ones.
914 The final instruction sequence will look like the following
915 when several global variables are accessed.
917 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
919 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
920 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
921 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
922 ... */
924 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
925 crtl->uses_pic_offset_table = 1;
926 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
928 if (mode != GET_MODE (gp_rtx))
929 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
932 if (mode == ptr_mode)
934 if (mode == DImode)
935 emit_insn (gen_ldr_got_small_28k_di (dest, gp_rtx, imm));
936 else
937 emit_insn (gen_ldr_got_small_28k_si (dest, gp_rtx, imm));
939 else
941 gcc_assert (mode == Pmode);
942 emit_insn (gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm));
945 return;
948 case SYMBOL_SMALL_GOT_4G:
950 /* In ILP32, the mode of dest can be either SImode or DImode,
951 while the got entry is always of SImode size. The mode of
952 dest depends on how dest is used: if dest is assigned to a
953 pointer (e.g. in the memory), it has SImode; it may have
954 DImode if dest is dereferenced to access the memory.
955 This is why we have to handle three different ldr_got_small
956 patterns here (two patterns for ILP32). */
957 rtx tmp_reg = dest;
958 machine_mode mode = GET_MODE (dest);
960 if (can_create_pseudo_p ())
961 tmp_reg = gen_reg_rtx (mode);
963 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
964 if (mode == ptr_mode)
966 if (mode == DImode)
967 emit_insn (gen_ldr_got_small_di (dest, tmp_reg, imm));
968 else
969 emit_insn (gen_ldr_got_small_si (dest, tmp_reg, imm));
971 else
973 gcc_assert (mode == Pmode);
974 emit_insn (gen_ldr_got_small_sidi (dest, tmp_reg, imm));
977 return;
980 case SYMBOL_SMALL_TLSGD:
982 rtx_insn *insns;
983 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
985 start_sequence ();
986 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
987 insns = get_insns ();
988 end_sequence ();
990 RTL_CONST_CALL_P (insns) = 1;
991 emit_libcall_block (insns, dest, result, imm);
992 return;
995 case SYMBOL_SMALL_TLSDESC:
997 machine_mode mode = GET_MODE (dest);
998 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
999 rtx tp;
1001 gcc_assert (mode == Pmode || mode == ptr_mode);
1003 /* In ILP32, the got entry is always of SImode size. Unlike
1004 small GOT, the dest is fixed at reg 0. */
1005 if (TARGET_ILP32)
1006 emit_insn (gen_tlsdesc_small_si (imm));
1007 else
1008 emit_insn (gen_tlsdesc_small_di (imm));
1009 tp = aarch64_load_tp (NULL);
1011 if (mode != Pmode)
1012 tp = gen_lowpart (mode, tp);
1014 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1015 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1016 return;
1019 case SYMBOL_SMALL_GOTTPREL:
1021 /* In ILP32, the mode of dest can be either SImode or DImode,
1022 while the got entry is always of SImode size. The mode of
1023 dest depends on how dest is used: if dest is assigned to a
1024 pointer (e.g. in the memory), it has SImode; it may have
1025 DImode if dest is dereferenced to access the memory.
1026 This is why we have to handle three different tlsie_small
1027 patterns here (two patterns for ILP32). */
1028 machine_mode mode = GET_MODE (dest);
1029 rtx tmp_reg = gen_reg_rtx (mode);
1030 rtx tp = aarch64_load_tp (NULL);
1032 if (mode == ptr_mode)
1034 if (mode == DImode)
1035 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1036 else
1038 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1039 tp = gen_lowpart (mode, tp);
1042 else
1044 gcc_assert (mode == Pmode);
1045 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1048 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1049 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1050 return;
1053 case SYMBOL_TLSLE:
1055 rtx tp = aarch64_load_tp (NULL);
1057 if (GET_MODE (dest) != Pmode)
1058 tp = gen_lowpart (GET_MODE (dest), tp);
1060 emit_insn (gen_tlsle (dest, tp, imm));
1061 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1062 return;
1065 case SYMBOL_TINY_GOT:
1066 emit_insn (gen_ldr_got_tiny (dest, imm));
1067 return;
1069 default:
1070 gcc_unreachable ();
1074 /* Emit a move from SRC to DEST. Assume that the move expanders can
1075 handle all moves if !can_create_pseudo_p (). The distinction is
1076 important because, unlike emit_move_insn, the move expanders know
1077 how to force Pmode objects into the constant pool even when the
1078 constant pool address is not itself legitimate. */
1079 static rtx
1080 aarch64_emit_move (rtx dest, rtx src)
1082 return (can_create_pseudo_p ()
1083 ? emit_move_insn (dest, src)
1084 : emit_move_insn_1 (dest, src));
1087 /* Split a 128-bit move operation into two 64-bit move operations,
1088 taking care to handle partial overlap of register to register
1089 copies. Special cases are needed when moving between GP regs and
1090 FP regs. SRC can be a register, constant or memory; DST a register
1091 or memory. If either operand is memory it must not have any side
1092 effects. */
1093 void
1094 aarch64_split_128bit_move (rtx dst, rtx src)
1096 rtx dst_lo, dst_hi;
1097 rtx src_lo, src_hi;
1099 machine_mode mode = GET_MODE (dst);
1101 gcc_assert (mode == TImode || mode == TFmode);
1102 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1103 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1105 if (REG_P (dst) && REG_P (src))
1107 int src_regno = REGNO (src);
1108 int dst_regno = REGNO (dst);
1110 /* Handle FP <-> GP regs. */
1111 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1113 src_lo = gen_lowpart (word_mode, src);
1114 src_hi = gen_highpart (word_mode, src);
1116 if (mode == TImode)
1118 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1119 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1121 else
1123 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1124 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1126 return;
1128 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1130 dst_lo = gen_lowpart (word_mode, dst);
1131 dst_hi = gen_highpart (word_mode, dst);
1133 if (mode == TImode)
1135 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1136 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1138 else
1140 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1141 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1143 return;
1147 dst_lo = gen_lowpart (word_mode, dst);
1148 dst_hi = gen_highpart (word_mode, dst);
1149 src_lo = gen_lowpart (word_mode, src);
1150 src_hi = gen_highpart_mode (word_mode, mode, src);
1152 /* At most one pairing may overlap. */
1153 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1155 aarch64_emit_move (dst_hi, src_hi);
1156 aarch64_emit_move (dst_lo, src_lo);
1158 else
1160 aarch64_emit_move (dst_lo, src_lo);
1161 aarch64_emit_move (dst_hi, src_hi);
1165 bool
1166 aarch64_split_128bit_move_p (rtx dst, rtx src)
1168 return (! REG_P (src)
1169 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1172 /* Split a complex SIMD combine. */
1174 void
1175 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1177 machine_mode src_mode = GET_MODE (src1);
1178 machine_mode dst_mode = GET_MODE (dst);
1180 gcc_assert (VECTOR_MODE_P (dst_mode));
1182 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1184 rtx (*gen) (rtx, rtx, rtx);
1186 switch (src_mode)
1188 case V8QImode:
1189 gen = gen_aarch64_simd_combinev8qi;
1190 break;
1191 case V4HImode:
1192 gen = gen_aarch64_simd_combinev4hi;
1193 break;
1194 case V2SImode:
1195 gen = gen_aarch64_simd_combinev2si;
1196 break;
1197 case V2SFmode:
1198 gen = gen_aarch64_simd_combinev2sf;
1199 break;
1200 case DImode:
1201 gen = gen_aarch64_simd_combinedi;
1202 break;
1203 case DFmode:
1204 gen = gen_aarch64_simd_combinedf;
1205 break;
1206 default:
1207 gcc_unreachable ();
1210 emit_insn (gen (dst, src1, src2));
1211 return;
1215 /* Split a complex SIMD move. */
1217 void
1218 aarch64_split_simd_move (rtx dst, rtx src)
1220 machine_mode src_mode = GET_MODE (src);
1221 machine_mode dst_mode = GET_MODE (dst);
1223 gcc_assert (VECTOR_MODE_P (dst_mode));
1225 if (REG_P (dst) && REG_P (src))
1227 rtx (*gen) (rtx, rtx);
1229 gcc_assert (VECTOR_MODE_P (src_mode));
1231 switch (src_mode)
1233 case V16QImode:
1234 gen = gen_aarch64_split_simd_movv16qi;
1235 break;
1236 case V8HImode:
1237 gen = gen_aarch64_split_simd_movv8hi;
1238 break;
1239 case V4SImode:
1240 gen = gen_aarch64_split_simd_movv4si;
1241 break;
1242 case V2DImode:
1243 gen = gen_aarch64_split_simd_movv2di;
1244 break;
1245 case V4SFmode:
1246 gen = gen_aarch64_split_simd_movv4sf;
1247 break;
1248 case V2DFmode:
1249 gen = gen_aarch64_split_simd_movv2df;
1250 break;
1251 default:
1252 gcc_unreachable ();
1255 emit_insn (gen (dst, src));
1256 return;
1260 static rtx
1261 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1263 if (can_create_pseudo_p ())
1264 return force_reg (mode, value);
1265 else
1267 x = aarch64_emit_move (x, value);
1268 return x;
1273 static rtx
1274 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1276 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1278 rtx high;
1279 /* Load the full offset into a register. This
1280 might be improvable in the future. */
1281 high = GEN_INT (offset);
1282 offset = 0;
1283 high = aarch64_force_temporary (mode, temp, high);
1284 reg = aarch64_force_temporary (mode, temp,
1285 gen_rtx_PLUS (mode, high, reg));
1287 return plus_constant (mode, reg, offset);
1290 static int
1291 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1292 machine_mode mode)
1294 unsigned HOST_WIDE_INT mask;
1295 int i;
1296 bool first;
1297 unsigned HOST_WIDE_INT val;
1298 bool subtargets;
1299 rtx subtarget;
1300 int one_match, zero_match, first_not_ffff_match;
1301 int num_insns = 0;
1303 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1305 if (generate)
1306 emit_insn (gen_rtx_SET (dest, imm));
1307 num_insns++;
1308 return num_insns;
1311 if (mode == SImode)
1313 /* We know we can't do this in 1 insn, and we must be able to do it
1314 in two; so don't mess around looking for sequences that don't buy
1315 us anything. */
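/* For example, an SImode constant such as 0x12345678 becomes
   mov w0, #0x5678 followed by movk w0, #0x1234, lsl #16.  */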
1316 if (generate)
1318 emit_insn (gen_rtx_SET (dest, GEN_INT (INTVAL (imm) & 0xffff)));
1319 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1320 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1322 num_insns += 2;
1323 return num_insns;
1326 /* Remaining cases are all for DImode. */
1328 val = INTVAL (imm);
1329 subtargets = optimize && can_create_pseudo_p ();
1331 one_match = 0;
1332 zero_match = 0;
1333 mask = 0xffff;
1334 first_not_ffff_match = -1;
1336 for (i = 0; i < 64; i += 16, mask <<= 16)
1338 if ((val & mask) == mask)
1339 one_match++;
1340 else
1342 if (first_not_ffff_match < 0)
1343 first_not_ffff_match = i;
1344 if ((val & mask) == 0)
1345 zero_match++;
1349 if (one_match == 2)
1351 /* Set one of the quarters and then insert back into result. */
1352 mask = 0xffffll << first_not_ffff_match;
1353 if (generate)
1355 emit_insn (gen_rtx_SET (dest, GEN_INT (val | mask)));
1356 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1357 GEN_INT ((val >> first_not_ffff_match)
1358 & 0xffff)));
1360 num_insns += 2;
1361 return num_insns;
1364 if (zero_match == 2)
1365 goto simple_sequence;
1367 mask = 0x0ffff0000UL;
1368 for (i = 16; i < 64; i += 16, mask <<= 16)
1370 HOST_WIDE_INT comp = mask & ~(mask - 1);
1372 if (aarch64_uimm12_shift (val - (val & mask)))
1374 if (generate)
1376 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1377 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val & mask)));
1378 emit_insn (gen_adddi3 (dest, subtarget,
1379 GEN_INT (val - (val & mask))));
1381 num_insns += 2;
1382 return num_insns;
1384 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1386 if (generate)
1388 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1389 emit_insn (gen_rtx_SET (subtarget,
1390 GEN_INT ((val + comp) & mask)));
1391 emit_insn (gen_adddi3 (dest, subtarget,
1392 GEN_INT (val - ((val + comp) & mask))));
1394 num_insns += 2;
1395 return num_insns;
1397 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1399 if (generate)
1401 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1402 emit_insn (gen_rtx_SET (subtarget,
1403 GEN_INT ((val - comp) | ~mask)));
1404 emit_insn (gen_adddi3 (dest, subtarget,
1405 GEN_INT (val - ((val - comp) | ~mask))));
1407 num_insns += 2;
1408 return num_insns;
1410 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1412 if (generate)
1414 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1415 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val | ~mask)));
1416 emit_insn (gen_adddi3 (dest, subtarget,
1417 GEN_INT (val - (val | ~mask))));
1419 num_insns += 2;
1420 return num_insns;
1424 /* See if we can do it by arithmetically combining two
1425 immediates. */
1426 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1428 int j;
1429 mask = 0xffff;
1431 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1432 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1434 if (generate)
1436 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1437 emit_insn (gen_rtx_SET (subtarget,
1438 GEN_INT (aarch64_bitmasks[i])));
1439 emit_insn (gen_adddi3 (dest, subtarget,
1440 GEN_INT (val - aarch64_bitmasks[i])));
1442 num_insns += 2;
1443 return num_insns;
1446 for (j = 0; j < 64; j += 16, mask <<= 16)
1448 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1450 if (generate)
1452 emit_insn (gen_rtx_SET (dest,
1453 GEN_INT (aarch64_bitmasks[i])));
1454 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1455 GEN_INT ((val >> j) & 0xffff)));
1457 num_insns += 2;
1458 return num_insns;
1463 /* See if we can do it by logically combining two immediates. */
1464 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1466 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1468 int j;
1470 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1471 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1473 if (generate)
1475 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1476 emit_insn (gen_rtx_SET (subtarget,
1477 GEN_INT (aarch64_bitmasks[i])));
1478 emit_insn (gen_iordi3 (dest, subtarget,
1479 GEN_INT (aarch64_bitmasks[j])));
1481 num_insns += 2;
1482 return num_insns;
1485 else if ((val & aarch64_bitmasks[i]) == val)
1487 int j;
1489 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1490 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1492 if (generate)
1494 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1495 emit_insn (gen_rtx_SET (subtarget,
1496 GEN_INT (aarch64_bitmasks[j])));
1497 emit_insn (gen_anddi3 (dest, subtarget,
1498 GEN_INT (aarch64_bitmasks[i])));
1500 num_insns += 2;
1501 return num_insns;
1506 if (one_match > zero_match)
1508 /* Set either first three quarters or all but the third. */
1509 mask = 0xffffll << (16 - first_not_ffff_match);
1510 if (generate)
1511 emit_insn (gen_rtx_SET (dest,
1512 GEN_INT (val | mask | 0xffffffff00000000ull)));
1513 num_insns ++;
1515 /* Now insert the other two quarters. */
1516 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1517 i < 64; i += 16, mask <<= 16)
1519 if ((val & mask) != mask)
1521 if (generate)
1522 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1523 GEN_INT ((val >> i) & 0xffff)));
1524 num_insns ++;
1527 return num_insns;
1530 simple_sequence:
1531 first = true;
1532 mask = 0xffff;
1533 for (i = 0; i < 64; i += 16, mask <<= 16)
1535 if ((val & mask) != 0)
1537 if (first)
1539 if (generate)
1540 emit_insn (gen_rtx_SET (dest, GEN_INT (val & mask)));
1541 num_insns ++;
1542 first = false;
1544 else
1546 if (generate)
1547 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1548 GEN_INT ((val >> i) & 0xffff)));
1549 num_insns ++;
1554 return num_insns;
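/* For illustration, the simple_sequence fallback above materializes an
   arbitrary DImode constant such as 0x1234567890abcdef as
     mov  x0, #0xcdef
     movk x0, #0x90ab, lsl #16
     movk x0, #0x5678, lsl #32
     movk x0, #0x1234, lsl #48
   i.e. at most four instructions, one per non-zero 16-bit quarter.  */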
1558 void
1559 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1561 machine_mode mode = GET_MODE (dest);
1563 gcc_assert (mode == SImode || mode == DImode);
1565 /* Check on what type of symbol it is. */
1566 if (GET_CODE (imm) == SYMBOL_REF
1567 || GET_CODE (imm) == LABEL_REF
1568 || GET_CODE (imm) == CONST)
1570 rtx mem, base, offset;
1571 enum aarch64_symbol_type sty;
1573 /* If we have (const (plus symbol offset)), separate out the offset
1574 before we start classifying the symbol. */
1575 split_const (imm, &base, &offset);
1577 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1578 switch (sty)
1580 case SYMBOL_FORCE_TO_MEM:
1581 if (offset != const0_rtx
1582 && targetm.cannot_force_const_mem (mode, imm))
1584 gcc_assert (can_create_pseudo_p ());
1585 base = aarch64_force_temporary (mode, dest, base);
1586 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1587 aarch64_emit_move (dest, base);
1588 return;
1590 mem = force_const_mem (ptr_mode, imm);
1591 gcc_assert (mem);
1592 if (mode != ptr_mode)
1593 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1594 emit_insn (gen_rtx_SET (dest, mem));
1595 return;
1597 case SYMBOL_SMALL_TLSGD:
1598 case SYMBOL_SMALL_TLSDESC:
1599 case SYMBOL_SMALL_GOTTPREL:
1600 case SYMBOL_SMALL_GOT_28K:
1601 case SYMBOL_SMALL_GOT_4G:
1602 case SYMBOL_TINY_GOT:
1603 if (offset != const0_rtx)
1605 gcc_assert (can_create_pseudo_p ());
1606 base = aarch64_force_temporary (mode, dest, base);
1607 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1608 aarch64_emit_move (dest, base);
1609 return;
1611 /* FALLTHRU */
1613 case SYMBOL_SMALL_ABSOLUTE:
1614 case SYMBOL_TINY_ABSOLUTE:
1615 case SYMBOL_TLSLE:
1616 aarch64_load_symref_appropriately (dest, imm, sty);
1617 return;
1619 default:
1620 gcc_unreachable ();
1624 if (!CONST_INT_P (imm))
1626 if (GET_CODE (imm) == HIGH)
1627 emit_insn (gen_rtx_SET (dest, imm));
1628 else
1630 rtx mem = force_const_mem (mode, imm);
1631 gcc_assert (mem);
1632 emit_insn (gen_rtx_SET (dest, mem));
1635 return;
1638 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1641 static bool
1642 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1643 tree exp ATTRIBUTE_UNUSED)
1645 /* Currently, always true. */
1646 return true;
1649 /* Implement TARGET_PASS_BY_REFERENCE. */
1651 static bool
1652 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1653 machine_mode mode,
1654 const_tree type,
1655 bool named ATTRIBUTE_UNUSED)
1657 HOST_WIDE_INT size;
1658 machine_mode dummymode;
1659 int nregs;
1661 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1662 size = (mode == BLKmode && type)
1663 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1665 /* Aggregates are passed by reference based on their size. */
1666 if (type && AGGREGATE_TYPE_P (type))
1668 size = int_size_in_bytes (type);
1671 /* Variable-sized arguments are always passed by reference. */
1672 if (size < 0)
1673 return true;
1675 /* Can this be a candidate to be passed in fp/simd register(s)? */
1676 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1677 &dummymode, &nregs,
1678 NULL))
1679 return false;
1681 /* Arguments which are variable sized or larger than 2 registers are
1682 passed by reference unless they are a homogenous floating point
1683 aggregate. */
1684 return size > 2 * UNITS_PER_WORD;
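/* For illustration: a plain 24-byte structure is passed by reference,
   while a 32-byte homogeneous aggregate of four doubles is not, since it
   qualifies for the SIMD/FP registers and is caught by the check above.  */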
1687 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1688 static bool
1689 aarch64_return_in_msb (const_tree valtype)
1691 machine_mode dummy_mode;
1692 int dummy_int;
1694 /* Never happens in little-endian mode. */
1695 if (!BYTES_BIG_ENDIAN)
1696 return false;
1698 /* Only composite types smaller than or equal to 16 bytes can
1699 be potentially returned in registers. */
1700 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1701 || int_size_in_bytes (valtype) <= 0
1702 || int_size_in_bytes (valtype) > 16)
1703 return false;
1705 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1706 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1707 is always passed/returned in the least significant bits of fp/simd
1708 register(s). */
1709 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1710 &dummy_mode, &dummy_int, NULL))
1711 return false;
1713 return true;
1716 /* Implement TARGET_FUNCTION_VALUE.
1717 Define how to find the value returned by a function. */
1719 static rtx
1720 aarch64_function_value (const_tree type, const_tree func,
1721 bool outgoing ATTRIBUTE_UNUSED)
1723 machine_mode mode;
1724 int unsignedp;
1725 int count;
1726 machine_mode ag_mode;
1728 mode = TYPE_MODE (type);
1729 if (INTEGRAL_TYPE_P (type))
1730 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1732 if (aarch64_return_in_msb (type))
1734 HOST_WIDE_INT size = int_size_in_bytes (type);
1736 if (size % UNITS_PER_WORD != 0)
1738 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1739 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1743 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1744 &ag_mode, &count, NULL))
1746 if (!aarch64_composite_type_p (type, mode))
1748 gcc_assert (count == 1 && mode == ag_mode);
1749 return gen_rtx_REG (mode, V0_REGNUM);
1751 else
1753 int i;
1754 rtx par;
1756 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1757 for (i = 0; i < count; i++)
1759 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1760 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1761 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1762 XVECEXP (par, 0, i) = tmp;
1764 return par;
1767 else
1768 return gen_rtx_REG (mode, R0_REGNUM);
1771 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1772 Return true if REGNO is the number of a hard register in which the values
1773 of called function may come back. */
1775 static bool
1776 aarch64_function_value_regno_p (const unsigned int regno)
1778 /* Maximum of 16 bytes can be returned in the general registers. Examples
1779 of 16-byte return values are: 128-bit integers and 16-byte small
1780 structures (excluding homogeneous floating-point aggregates). */
1781 if (regno == R0_REGNUM || regno == R1_REGNUM)
1782 return true;
1784 /* Up to four fp/simd registers can return a function value, e.g. a
1785 homogeneous floating-point aggregate having four members. */
1786 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1787 return TARGET_FLOAT;
1789 return false;
1792 /* Implement TARGET_RETURN_IN_MEMORY.
1794 If the type T of the result of a function is such that
1795 void func (T arg)
1796 would require that arg be passed as a value in a register (or set of
1797 registers) according to the parameter passing rules, then the result
1798 is returned in the same registers as would be used for such an
1799 argument. */
1801 static bool
1802 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1804 HOST_WIDE_INT size;
1805 machine_mode ag_mode;
1806 int count;
1808 if (!AGGREGATE_TYPE_P (type)
1809 && TREE_CODE (type) != COMPLEX_TYPE
1810 && TREE_CODE (type) != VECTOR_TYPE)
1811 /* Simple scalar types always returned in registers. */
1812 return false;
1814 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1815 type,
1816 &ag_mode,
1817 &count,
1818 NULL))
1819 return false;
1821 /* Types larger than 2 registers returned in memory. */
1822 size = int_size_in_bytes (type);
1823 return (size < 0 || size > 2 * UNITS_PER_WORD);
1826 static bool
1827 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1828 const_tree type, int *nregs)
1830 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1831 return aarch64_vfp_is_call_or_return_candidate (mode,
1832 type,
1833 &pcum->aapcs_vfp_rmode,
1834 nregs,
1835 NULL);
1838 /* Given MODE and TYPE of a function argument, return the alignment in
1839 bits. The idea is to suppress any stronger alignment requested by
1840 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1841 This is a helper function for local use only. */
1843 static unsigned int
1844 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1846 unsigned int alignment;
1848 if (type)
1850 if (!integer_zerop (TYPE_SIZE (type)))
1852 if (TYPE_MODE (type) == mode)
1853 alignment = TYPE_ALIGN (type);
1854 else
1855 alignment = GET_MODE_ALIGNMENT (mode);
1857 else
1858 alignment = 0;
1860 else
1861 alignment = GET_MODE_ALIGNMENT (mode);
1863 return alignment;
1866 /* Layout a function argument according to the AAPCS64 rules. The rule
1867 numbers refer to the rule numbers in the AAPCS64. */
1869 static void
1870 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1871 const_tree type,
1872 bool named ATTRIBUTE_UNUSED)
1874 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1875 int ncrn, nvrn, nregs;
1876 bool allocate_ncrn, allocate_nvrn;
1877 HOST_WIDE_INT size;
1879 /* We need to do this once per argument. */
1880 if (pcum->aapcs_arg_processed)
1881 return;
1883 pcum->aapcs_arg_processed = true;
1885 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1886 size
1887 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1888 UNITS_PER_WORD);
1890 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1891 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1892 mode,
1893 type,
1894 &nregs);
1896 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1897 The following code thus handles passing by SIMD/FP registers first. */
1899 nvrn = pcum->aapcs_nvrn;
1901 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1902 and homogeneous short-vector aggregates (HVA). */
1903 if (allocate_nvrn)
1905 if (!TARGET_FLOAT)
1906 aarch64_err_no_fpadvsimd (mode, "argument");
1908 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1910 pcum->aapcs_nextnvrn = nvrn + nregs;
1911 if (!aarch64_composite_type_p (type, mode))
1913 gcc_assert (nregs == 1);
1914 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1916 else
1918 rtx par;
1919 int i;
1920 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1921 for (i = 0; i < nregs; i++)
1923 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1924 V0_REGNUM + nvrn + i);
1925 tmp = gen_rtx_EXPR_LIST
1926 (VOIDmode, tmp,
1927 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1928 XVECEXP (par, 0, i) = tmp;
1930 pcum->aapcs_reg = par;
1932 return;
1934 else
1936 /* C.3 NSRN is set to 8. */
1937 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1938 goto on_stack;
1942 ncrn = pcum->aapcs_ncrn;
1943 nregs = size / UNITS_PER_WORD;
1945 /* C6 - C9, though the sign and zero extension semantics are
1946 handled elsewhere. This is the case where the argument fits
1947 entirely in general registers. */
1948 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1950 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1952 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1954 /* C.8 if the argument has an alignment of 16 then the NGRN is
1955 rounded up to the next even number. */
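/* For example, a 16-byte argument with 16-byte alignment arriving when
   NGRN is 1 is passed in x2 and x3, leaving x1 unused.  */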
1956 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1958 ++ncrn;
1959 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1961 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1962 A reg is still generated for it, but the caller should be smart
1963 enough not to use it. */
1964 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1966 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1968 else
1970 rtx par;
1971 int i;
1973 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1974 for (i = 0; i < nregs; i++)
1976 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1977 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1978 GEN_INT (i * UNITS_PER_WORD));
1979 XVECEXP (par, 0, i) = tmp;
1981 pcum->aapcs_reg = par;
1984 pcum->aapcs_nextncrn = ncrn + nregs;
1985 return;
1988 /* C.11 */
1989 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1991 /* The argument is passed on the stack; record the needed number of words for
1992 this argument and align the total size if necessary. */
1993 on_stack:
1994 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1995 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1996 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
1997 16 / UNITS_PER_WORD);
1998 return;
2001 /* Implement TARGET_FUNCTION_ARG. */
2003 static rtx
2004 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2005 const_tree type, bool named)
2007 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2008 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2010 if (mode == VOIDmode)
2011 return NULL_RTX;
2013 aarch64_layout_arg (pcum_v, mode, type, named);
2014 return pcum->aapcs_reg;
2017 void
2018 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2019 const_tree fntype ATTRIBUTE_UNUSED,
2020 rtx libname ATTRIBUTE_UNUSED,
2021 const_tree fndecl ATTRIBUTE_UNUSED,
2022 unsigned n_named ATTRIBUTE_UNUSED)
2024 pcum->aapcs_ncrn = 0;
2025 pcum->aapcs_nvrn = 0;
2026 pcum->aapcs_nextncrn = 0;
2027 pcum->aapcs_nextnvrn = 0;
2028 pcum->pcs_variant = ARM_PCS_AAPCS64;
2029 pcum->aapcs_reg = NULL_RTX;
2030 pcum->aapcs_arg_processed = false;
2031 pcum->aapcs_stack_words = 0;
2032 pcum->aapcs_stack_size = 0;
2034 if (!TARGET_FLOAT
2035 && fndecl && TREE_PUBLIC (fndecl)
2036 && fntype && fntype != error_mark_node)
2038 const_tree type = TREE_TYPE (fntype);
2039 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2040 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2041 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2042 &mode, &nregs, NULL))
2043 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2045 return;
2048 static void
2049 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2050 machine_mode mode,
2051 const_tree type,
2052 bool named)
2054 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2055 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2057 aarch64_layout_arg (pcum_v, mode, type, named);
2058 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2059 != (pcum->aapcs_stack_words != 0));
2060 pcum->aapcs_arg_processed = false;
2061 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2062 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2063 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2064 pcum->aapcs_stack_words = 0;
2065 pcum->aapcs_reg = NULL_RTX;
2069 bool
2070 aarch64_function_arg_regno_p (unsigned regno)
2072 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2073 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2076 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2077 PARM_BOUNDARY bits of alignment, but will be given anything up
2078 to STACK_BOUNDARY bits if the type requires it. This makes sure
2079 that both before and after the layout of each argument, the Next
2080 Stacked Argument Address (NSAA) will have a minimum alignment of
2081 8 bytes. */
2083 static unsigned int
2084 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2086 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2088 if (alignment < PARM_BOUNDARY)
2089 alignment = PARM_BOUNDARY;
2090 if (alignment > STACK_BOUNDARY)
2091 alignment = STACK_BOUNDARY;
2092 return alignment;
2095 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2097 Return true if an argument passed on the stack should be padded upwards,
2098 i.e. if the least-significant byte of the stack slot has useful data.
2100 Small aggregate types are placed in the lowest memory address.
2102 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2104 bool
2105 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2107 /* On little-endian targets, the least significant byte of every stack
2108 argument is passed at the lowest byte address of the stack slot. */
2109 if (!BYTES_BIG_ENDIAN)
2110 return true;
2112 /* Otherwise, integral, floating-point and pointer types are padded downward:
2113 the least significant byte of a stack argument is passed at the highest
2114 byte address of the stack slot. */
2115 if (type
2116 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2117 || POINTER_TYPE_P (type))
2118 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2119 return false;
2121 /* Everything else is padded upward, i.e. data in the first byte of the stack slot. */
2122 return true;
2125 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2127 It specifies the padding for the last (possibly the only)
2128 element of a block move between registers and memory. Viewing
2129 the block as it sits in memory, padding upward means that
2130 the last element is padded after its most significant byte,
2131 while with downward padding the last element is padded on
2132 its least significant byte side.
2134 Small aggregates and small complex types are always padded
2135 upwards.
2137 We don't need to worry about homogeneous floating-point or
2138 short-vector aggregates; their move is not affected by the
2139 padding direction determined here. Regardless of endianness,
2140 each element of such an aggregate is put in the least
2141 significant bits of an fp/simd register.
2143 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2144 register has useful data, and return the opposite if the most
2145 significant byte does. */
2147 bool
2148 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2149 bool first ATTRIBUTE_UNUSED)
2152 /* Small composite types are always padded upward. */
2153 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2155 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2156 : GET_MODE_SIZE (mode));
2157 if (size < 2 * UNITS_PER_WORD)
2158 return true;
2161 /* Otherwise, use the default padding. */
2162 return !BYTES_BIG_ENDIAN;
2165 static machine_mode
2166 aarch64_libgcc_cmp_return_mode (void)
2168 return SImode;
2171 static bool
2172 aarch64_frame_pointer_required (void)
2174 /* In aarch64_override_options_after_change
2175 flag_omit_leaf_frame_pointer turns off the frame pointer by
2176 default. Turn it back on now if we've not got a leaf
2177 function. */
2178 if (flag_omit_leaf_frame_pointer
2179 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2180 return true;
2182 return false;
2185 /* Mark the registers that need to be saved by the callee and calculate
2186 the size of the callee-saved registers area and frame record (both FP
2187 and LR may be omitted). */
2188 static void
2189 aarch64_layout_frame (void)
2191 HOST_WIDE_INT offset = 0;
2192 int regno;
2194 if (reload_completed && cfun->machine->frame.laid_out)
2195 return;
2197 #define SLOT_NOT_REQUIRED (-2)
2198 #define SLOT_REQUIRED (-1)
2200 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2201 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2203 /* First mark all the registers that really need to be saved... */
2204 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2205 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2207 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2208 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2210 /* ... that includes the eh data registers (if needed)... */
2211 if (crtl->calls_eh_return)
2212 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2213 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2214 = SLOT_REQUIRED;
2216 /* ... and any callee saved register that dataflow says is live. */
2217 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2218 if (df_regs_ever_live_p (regno)
2219 && (regno == R30_REGNUM
2220 || !call_used_regs[regno]))
2221 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2223 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2224 if (df_regs_ever_live_p (regno)
2225 && !call_used_regs[regno])
2226 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2228 if (frame_pointer_needed)
2230 /* FP and LR are placed in the linkage record. */
2231 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2232 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2233 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2234 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2235 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2236 offset += 2 * UNITS_PER_WORD;
2239 /* Now assign stack slots for them. */
2240 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2241 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2243 cfun->machine->frame.reg_offset[regno] = offset;
2244 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2245 cfun->machine->frame.wb_candidate1 = regno;
2246 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2247 cfun->machine->frame.wb_candidate2 = regno;
2248 offset += UNITS_PER_WORD;
2251 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2252 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2254 cfun->machine->frame.reg_offset[regno] = offset;
2255 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2256 cfun->machine->frame.wb_candidate1 = regno;
2257 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2258 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2259 cfun->machine->frame.wb_candidate2 = regno;
2260 offset += UNITS_PER_WORD;
2263 cfun->machine->frame.padding0 =
2264 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2265 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2267 cfun->machine->frame.saved_regs_size = offset;
2269 cfun->machine->frame.hard_fp_offset
2270 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2271 + get_frame_size ()
2272 + cfun->machine->frame.saved_regs_size,
2273 STACK_BOUNDARY / BITS_PER_UNIT);
2275 cfun->machine->frame.frame_size
2276 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2277 + crtl->outgoing_args_size,
2278 STACK_BOUNDARY / BITS_PER_UNIT);
2280 cfun->machine->frame.laid_out = true;
2283 static bool
2284 aarch64_register_saved_on_entry (int regno)
2286 return cfun->machine->frame.reg_offset[regno] >= 0;
2289 static unsigned
2290 aarch64_next_callee_save (unsigned regno, unsigned limit)
2292 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2293 regno ++;
2294 return regno;
2297 static void
2298 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2299 HOST_WIDE_INT adjustment)
2301 rtx base_rtx = stack_pointer_rtx;
2302 rtx insn, reg, mem;
2304 reg = gen_rtx_REG (mode, regno);
2305 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2306 plus_constant (Pmode, base_rtx, -adjustment));
2307 mem = gen_rtx_MEM (mode, mem);
2309 insn = emit_move_insn (mem, reg);
2310 RTX_FRAME_RELATED_P (insn) = 1;
2313 static rtx
2314 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2315 HOST_WIDE_INT adjustment)
2317 switch (mode)
2319 case DImode:
2320 return gen_storewb_pairdi_di (base, base, reg, reg2,
2321 GEN_INT (-adjustment),
2322 GEN_INT (UNITS_PER_WORD - adjustment));
2323 case DFmode:
2324 return gen_storewb_pairdf_di (base, base, reg, reg2,
2325 GEN_INT (-adjustment),
2326 GEN_INT (UNITS_PER_WORD - adjustment));
2327 default:
2328 gcc_unreachable ();
2332 static void
2333 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2334 unsigned regno2, HOST_WIDE_INT adjustment)
2336 rtx_insn *insn;
2337 rtx reg1 = gen_rtx_REG (mode, regno1);
2338 rtx reg2 = gen_rtx_REG (mode, regno2);
2340 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2341 reg2, adjustment));
2342 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2343 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2344 RTX_FRAME_RELATED_P (insn) = 1;
2347 static rtx
2348 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2349 HOST_WIDE_INT adjustment)
2351 switch (mode)
2353 case DImode:
2354 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2355 GEN_INT (UNITS_PER_WORD));
2356 case DFmode:
2357 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2358 GEN_INT (UNITS_PER_WORD));
2359 default:
2360 gcc_unreachable ();
2364 static rtx
2365 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2366 rtx reg2)
2368 switch (mode)
2370 case DImode:
2371 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2373 case DFmode:
2374 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2376 default:
2377 gcc_unreachable ();
2381 static rtx
2382 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2383 rtx mem2)
2385 switch (mode)
2387 case DImode:
2388 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2390 case DFmode:
2391 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2393 default:
2394 gcc_unreachable ();
2399 static void
2400 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2401 unsigned start, unsigned limit, bool skip_wb)
2403 rtx_insn *insn;
2404 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2405 ? gen_frame_mem : gen_rtx_MEM);
2406 unsigned regno;
2407 unsigned regno2;
2409 for (regno = aarch64_next_callee_save (start, limit);
2410 regno <= limit;
2411 regno = aarch64_next_callee_save (regno + 1, limit))
2413 rtx reg, mem;
2414 HOST_WIDE_INT offset;
2416 if (skip_wb
2417 && (regno == cfun->machine->frame.wb_candidate1
2418 || regno == cfun->machine->frame.wb_candidate2))
2419 continue;
2421 reg = gen_rtx_REG (mode, regno);
2422 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2423 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2424 offset));
2426 regno2 = aarch64_next_callee_save (regno + 1, limit);
2428 if (regno2 <= limit
2429 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2430 == cfun->machine->frame.reg_offset[regno2]))
2433 rtx reg2 = gen_rtx_REG (mode, regno2);
2434 rtx mem2;
2436 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2437 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2438 offset));
2439 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2440 reg2));
2442 /* The first part of a frame-related parallel insn is
2443 always assumed to be relevant to the frame
2444 calculations; subsequent parts are only
2445 frame-related if explicitly marked. */
2446 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2447 regno = regno2;
2449 else
2450 insn = emit_move_insn (mem, reg);
2452 RTX_FRAME_RELATED_P (insn) = 1;
2456 static void
2457 aarch64_restore_callee_saves (machine_mode mode,
2458 HOST_WIDE_INT start_offset, unsigned start,
2459 unsigned limit, bool skip_wb, rtx *cfi_ops)
2461 rtx base_rtx = stack_pointer_rtx;
2462 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2463 ? gen_frame_mem : gen_rtx_MEM);
2464 unsigned regno;
2465 unsigned regno2;
2466 HOST_WIDE_INT offset;
2468 for (regno = aarch64_next_callee_save (start, limit);
2469 regno <= limit;
2470 regno = aarch64_next_callee_save (regno + 1, limit))
2472 rtx reg, mem;
2474 if (skip_wb
2475 && (regno == cfun->machine->frame.wb_candidate1
2476 || regno == cfun->machine->frame.wb_candidate2))
2477 continue;
2479 reg = gen_rtx_REG (mode, regno);
2480 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2481 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2483 regno2 = aarch64_next_callee_save (regno + 1, limit);
2485 if (regno2 <= limit
2486 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2487 == cfun->machine->frame.reg_offset[regno2]))
2489 rtx reg2 = gen_rtx_REG (mode, regno2);
2490 rtx mem2;
2492 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2493 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2494 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2496 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2497 regno = regno2;
2499 else
2500 emit_move_insn (reg, mem);
2501 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2505 /* AArch64 stack frames generated by this compiler look like:
2507 +-------------------------------+
2509 | incoming stack arguments |
2511 +-------------------------------+
2512 | | <-- incoming stack pointer (aligned)
2513 | callee-allocated save area |
2514 | for register varargs |
2516 +-------------------------------+
2517 | local variables | <-- frame_pointer_rtx
2519 +-------------------------------+
2520 | padding0 | \
2521 +-------------------------------+ |
2522 | callee-saved registers | | frame.saved_regs_size
2523 +-------------------------------+ |
2524 | LR' | |
2525 +-------------------------------+ |
2526 | FP' | / <- hard_frame_pointer_rtx (aligned)
2527 +-------------------------------+
2528 | dynamic allocation |
2529 +-------------------------------+
2530 | padding |
2531 +-------------------------------+
2532 | outgoing stack arguments | <-- arg_pointer
2534 +-------------------------------+
2535 | | <-- stack_pointer_rtx (aligned)
2537 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2538 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2539 unchanged. */
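/* As a concrete example: with no varargs save area, 16 bytes of local
   variables, two callee-saved GPRs plus the FP/LR frame record and no
   outgoing arguments, aarch64_layout_frame computes saved_regs_size == 32,
   hard_fp_offset == 48 and frame_size == 48 (each value rounded up to the
   16-byte STACK_BOUNDARY).  */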
2541 /* Generate the prologue instructions for entry into a function.
2542 Establish the stack frame by decreasing the stack pointer with a
2543 properly calculated size and, if necessary, create a frame record
2544 filled with the values of LR and previous frame pointer. The
2545 current FP is also set up if it is in use. */
2547 void
2548 aarch64_expand_prologue (void)
2550 /* sub sp, sp, #<frame_size>
2551 stp {fp, lr}, [sp, #<frame_size> - 16]
2552 add fp, sp, #<frame_size> - hardfp_offset
2553 stp {cs_reg}, [fp, #-16] etc.
2555 sub sp, sp, <final_adjustment_if_any>
2557 HOST_WIDE_INT frame_size, offset;
2558 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2559 HOST_WIDE_INT hard_fp_offset;
2560 rtx_insn *insn;
2562 aarch64_layout_frame ();
2564 offset = frame_size = cfun->machine->frame.frame_size;
2565 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2566 fp_offset = frame_size - hard_fp_offset;
2568 if (flag_stack_usage_info)
2569 current_function_static_stack_size = frame_size;
2571 /* Store-pair and load-pair instructions have an offset range of only -512 to 504. */
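/* (The ldp/stp offset is a signed 7-bit immediate scaled by the access
   size, i.e. -64*8 to 63*8 for 64-bit registers, hence -512 to 504.)  */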
2572 if (offset >= 512)
2574 /* When the frame has a large size, an initial decrease is done on
2575 the stack pointer to jump over the callee-allocated save area for
2576 register varargs, the local variable area and/or the callee-saved
2577 register area. This will allow the pre-index write-back
2578 store pair instructions to be used for setting up the stack frame
2579 efficiently. */
2580 offset = hard_fp_offset;
2581 if (offset >= 512)
2582 offset = cfun->machine->frame.saved_regs_size;
2584 frame_size -= (offset + crtl->outgoing_args_size);
2585 fp_offset = 0;
2587 if (frame_size >= 0x1000000)
2589 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2590 emit_move_insn (op0, GEN_INT (-frame_size));
2591 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2593 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2594 gen_rtx_SET (stack_pointer_rtx,
2595 plus_constant (Pmode, stack_pointer_rtx,
2596 -frame_size)));
2597 RTX_FRAME_RELATED_P (insn) = 1;
2599 else if (frame_size > 0)
2601 int hi_ofs = frame_size & 0xfff000;
2602 int lo_ofs = frame_size & 0x000fff;
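/* For example, a frame_size of 0x12345 is split into hi_ofs == 0x12000
   and lo_ofs == 0x345; each part fits the 12-bit (optionally shifted)
   immediate of a single add/sub instruction.  */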
2604 if (hi_ofs)
2606 insn = emit_insn (gen_add2_insn
2607 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2608 RTX_FRAME_RELATED_P (insn) = 1;
2610 if (lo_ofs)
2612 insn = emit_insn (gen_add2_insn
2613 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2614 RTX_FRAME_RELATED_P (insn) = 1;
2618 else
2619 frame_size = -1;
2621 if (offset > 0)
2623 bool skip_wb = false;
2625 if (frame_pointer_needed)
2627 skip_wb = true;
2629 if (fp_offset)
2631 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2632 GEN_INT (-offset)));
2633 RTX_FRAME_RELATED_P (insn) = 1;
2635 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2636 R30_REGNUM, false);
2638 else
2639 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2641 /* Set up frame pointer to point to the location of the
2642 previous frame pointer on the stack. */
2643 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2644 stack_pointer_rtx,
2645 GEN_INT (fp_offset)));
2646 RTX_FRAME_RELATED_P (insn) = 1;
2647 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2649 else
2651 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2652 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2654 if (fp_offset
2655 || reg1 == FIRST_PSEUDO_REGISTER
2656 || (reg2 == FIRST_PSEUDO_REGISTER
2657 && offset >= 256))
2659 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2660 GEN_INT (-offset)));
2661 RTX_FRAME_RELATED_P (insn) = 1;
2663 else
2665 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2667 skip_wb = true;
2669 if (reg2 == FIRST_PSEUDO_REGISTER)
2670 aarch64_pushwb_single_reg (mode1, reg1, offset);
2671 else
2672 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2676 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2677 skip_wb);
2678 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2679 skip_wb);
2682 /* When offset >= 512,
2683 sub sp, sp, #<outgoing_args_size> */
2684 if (frame_size > -1)
2686 if (crtl->outgoing_args_size > 0)
2688 insn = emit_insn (gen_add2_insn
2689 (stack_pointer_rtx,
2690 GEN_INT (- crtl->outgoing_args_size)));
2691 RTX_FRAME_RELATED_P (insn) = 1;
2696 /* Return TRUE if we can use a simple_return insn.
2698 This function checks whether the callee-saved stack is empty, which
2699 means no restore actions are needed. The pro_and_epilogue pass uses
2700 this to check whether the shrink-wrapping optimization is feasible. */
2702 bool
2703 aarch64_use_return_insn_p (void)
2705 if (!reload_completed)
2706 return false;
2708 if (crtl->profile)
2709 return false;
2711 aarch64_layout_frame ();
2713 return cfun->machine->frame.frame_size == 0;
2716 /* Generate the epilogue instructions for returning from a function. */
2717 void
2718 aarch64_expand_epilogue (bool for_sibcall)
2720 HOST_WIDE_INT frame_size, offset;
2721 HOST_WIDE_INT fp_offset;
2722 HOST_WIDE_INT hard_fp_offset;
2723 rtx_insn *insn;
2724 /* We need a memory barrier to prevent reads from the deallocated stack. */
2725 bool need_barrier_p = (get_frame_size () != 0
2726 || cfun->machine->frame.saved_varargs_size);
2728 aarch64_layout_frame ();
2730 offset = frame_size = cfun->machine->frame.frame_size;
2731 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2732 fp_offset = frame_size - hard_fp_offset;
2734 /* Store-pair and load-pair instructions have an offset range of only -512 to 504. */
2735 if (offset >= 512)
2737 offset = hard_fp_offset;
2738 if (offset >= 512)
2739 offset = cfun->machine->frame.saved_regs_size;
2741 frame_size -= (offset + crtl->outgoing_args_size);
2742 fp_offset = 0;
2743 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2745 insn = emit_insn (gen_add2_insn
2746 (stack_pointer_rtx,
2747 GEN_INT (crtl->outgoing_args_size)));
2748 RTX_FRAME_RELATED_P (insn) = 1;
2751 else
2752 frame_size = -1;
2754 /* If there were outgoing arguments or we've done dynamic stack
2755 allocation, then restore the stack pointer from the frame
2756 pointer. This is at most one insn and more efficient than using
2757 GCC's internal mechanism. */
2758 if (frame_pointer_needed
2759 && (crtl->outgoing_args_size || cfun->calls_alloca))
2761 if (cfun->calls_alloca)
2762 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2764 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2765 hard_frame_pointer_rtx,
2766 GEN_INT (0)));
2767 offset = offset - fp_offset;
2770 if (offset > 0)
2772 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2773 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2774 bool skip_wb = true;
2775 rtx cfi_ops = NULL;
2777 if (frame_pointer_needed)
2778 fp_offset = 0;
2779 else if (fp_offset
2780 || reg1 == FIRST_PSEUDO_REGISTER
2781 || (reg2 == FIRST_PSEUDO_REGISTER
2782 && offset >= 256))
2783 skip_wb = false;
2785 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2786 skip_wb, &cfi_ops);
2787 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2788 skip_wb, &cfi_ops);
2790 if (need_barrier_p)
2791 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2793 if (skip_wb)
2795 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2796 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2798 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2799 if (reg2 == FIRST_PSEUDO_REGISTER)
2801 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2802 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2803 mem = gen_rtx_MEM (mode1, mem);
2804 insn = emit_move_insn (rreg1, mem);
2806 else
2808 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2810 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2811 insn = emit_insn (aarch64_gen_loadwb_pair
2812 (mode1, stack_pointer_rtx, rreg1,
2813 rreg2, offset));
2816 else
2818 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2819 GEN_INT (offset)));
2822 /* Reset the CFA to be SP + FRAME_SIZE. */
2823 rtx new_cfa = stack_pointer_rtx;
2824 if (frame_size > 0)
2825 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2826 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2827 REG_NOTES (insn) = cfi_ops;
2828 RTX_FRAME_RELATED_P (insn) = 1;
2831 if (frame_size > 0)
2833 if (need_barrier_p)
2834 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2836 if (frame_size >= 0x1000000)
2838 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2839 emit_move_insn (op0, GEN_INT (frame_size));
2840 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2842 else
2844 int hi_ofs = frame_size & 0xfff000;
2845 int lo_ofs = frame_size & 0x000fff;
2847 if (hi_ofs && lo_ofs)
2849 insn = emit_insn (gen_add2_insn
2850 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2851 RTX_FRAME_RELATED_P (insn) = 1;
2852 frame_size = lo_ofs;
2854 insn = emit_insn (gen_add2_insn
2855 (stack_pointer_rtx, GEN_INT (frame_size)));
2858 /* Reset the CFA to be SP + 0. */
2859 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2860 RTX_FRAME_RELATED_P (insn) = 1;
2863 /* Stack adjustment for exception handler. */
2864 if (crtl->calls_eh_return)
2866 /* We need to unwind the stack by the offset computed by
2867 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2868 to be SP; letting the CFA move during this adjustment
2869 is just as correct as retaining the CFA from the body
2870 of the function. Therefore, do nothing special. */
2871 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2874 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2875 if (!for_sibcall)
2876 emit_jump_insn (ret_rtx);
2879 /* Return the place to copy the exception unwinding return address to.
2880 This will probably be a stack slot, but could (in theory) be the
2881 return register. */
2883 aarch64_final_eh_return_addr (void)
2885 HOST_WIDE_INT fp_offset;
2887 aarch64_layout_frame ();
2889 fp_offset = cfun->machine->frame.frame_size
2890 - cfun->machine->frame.hard_fp_offset;
2892 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2893 return gen_rtx_REG (DImode, LR_REGNUM);
2895 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2896 result in a store to save LR introduced by builtin_eh_return () being
2897 incorrectly deleted because the alias is not detected.
2898 So in the calculation of the address to copy the exception unwinding
2899 return address to, we note 2 cases.
2900 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2901 we return a SP-relative location since all the addresses are SP-relative
2902 in this case. This prevents the store from being optimized away.
2903 If the fp_offset is not 0, then the addresses will be FP-relative and
2904 therefore we return a FP-relative location. */
2906 if (frame_pointer_needed)
2908 if (fp_offset)
2909 return gen_frame_mem (DImode,
2910 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2911 else
2912 return gen_frame_mem (DImode,
2913 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2916 /* If FP is not needed, we calculate the location of LR, which would be
2917 at the top of the saved registers block. */
2919 return gen_frame_mem (DImode,
2920 plus_constant (Pmode,
2921 stack_pointer_rtx,
2922 fp_offset
2923 + cfun->machine->frame.saved_regs_size
2924 - 2 * UNITS_PER_WORD));
2927 /* Possibly output code to build up a constant in a register. For
2928 the benefit of the costs infrastructure, returns the number of
2929 instructions which would be emitted. GENERATE inhibits or
2930 enables code generation. */
2932 static int
2933 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2935 int insns = 0;
2937 if (aarch64_bitmask_imm (val, DImode))
2939 if (generate)
2940 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2941 insns = 1;
2943 else
2945 int i;
2946 int ncount = 0;
2947 int zcount = 0;
2948 HOST_WIDE_INT valp = val >> 16;
2949 HOST_WIDE_INT valm;
2950 HOST_WIDE_INT tval;
2952 for (i = 16; i < 64; i += 16)
2954 valm = (valp & 0xffff);
2956 if (valm != 0)
2957 ++ zcount;
2959 if (valm != 0xffff)
2960 ++ ncount;
2962 valp >>= 16;
2965 /* zcount contains the number of additional MOVK instructions
2966 required if the constant is built up with an initial MOVZ instruction,
2967 while ncount is the number of MOVK instructions required if starting
2968 with a MOVN instruction. Choose the sequence that yields the fewer
2969 instructions, preferring the MOVZ-based sequence when the two counts
2970 are equal. */
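/* For example, 0xffffffffffff1234 needs only the initial MOVN,
   0x1234 needs only the initial MOVZ, while 0x0001000200030004
   needs a MOVZ followed by three MOVKs.  */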
2971 if (ncount < zcount)
2973 if (generate)
2974 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2975 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2976 tval = 0xffff;
2977 insns++;
2979 else
2981 if (generate)
2982 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2983 GEN_INT (val & 0xffff));
2984 tval = 0;
2985 insns++;
2988 val >>= 16;
2990 for (i = 16; i < 64; i += 16)
2992 if ((val & 0xffff) != tval)
2994 if (generate)
2995 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2996 GEN_INT (i),
2997 GEN_INT (val & 0xffff)));
2998 insns++;
3000 val >>= 16;
3003 return insns;
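/* Add DELTA to register REGNUM, clobbering SCRATCHREG if necessary.
   Deltas of 2^24 or more are first built in SCRATCHREG with
   aarch64_build_constant and then added; smaller deltas are split into
   a multiple of 4096, added via SCRATCHREG shifted left by 12, plus a
   remainder below 4096 added as an immediate. For example, a delta of
   0x3456 is added as (3 << 12) followed by 0x456.  */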
3006 static void
3007 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
3009 HOST_WIDE_INT mdelta = delta;
3010 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
3011 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
3013 if (mdelta < 0)
3014 mdelta = -mdelta;
3016 if (mdelta >= 4096 * 4096)
3018 (void) aarch64_build_constant (scratchreg, delta, true);
3019 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
3021 else if (mdelta > 0)
3023 if (mdelta >= 4096)
3025 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
3026 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
3027 if (delta < 0)
3028 emit_insn (gen_rtx_SET (this_rtx,
3029 gen_rtx_MINUS (Pmode, this_rtx, shift)));
3030 else
3031 emit_insn (gen_rtx_SET (this_rtx,
3032 gen_rtx_PLUS (Pmode, this_rtx, shift)));
3034 if (mdelta % 4096 != 0)
3036 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3037 emit_insn (gen_rtx_SET (this_rtx,
3038 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
3043 /* Output code to add DELTA to the first argument, and then jump
3044 to FUNCTION. Used for C++ multiple inheritance. */
3045 static void
3046 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3047 HOST_WIDE_INT delta,
3048 HOST_WIDE_INT vcall_offset,
3049 tree function)
3051 /* The this pointer is always in x0. Note that this differs from
3052 Arm, where the this pointer may be bumped to r1 if r0 is required
3053 to return a pointer to an aggregate. On AArch64 a result value
3054 pointer will be in x8. */
3055 int this_regno = R0_REGNUM;
3056 rtx this_rtx, temp0, temp1, addr, funexp;
3057 rtx_insn *insn;
3059 reload_completed = 1;
3060 emit_note (NOTE_INSN_PROLOGUE_END);
3062 if (vcall_offset == 0)
3063 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3064 else
3066 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3068 this_rtx = gen_rtx_REG (Pmode, this_regno);
3069 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3070 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3072 addr = this_rtx;
3073 if (delta != 0)
3075 if (delta >= -256 && delta < 256)
3076 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3077 plus_constant (Pmode, this_rtx, delta));
3078 else
3079 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3082 if (Pmode == ptr_mode)
3083 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3084 else
3085 aarch64_emit_move (temp0,
3086 gen_rtx_ZERO_EXTEND (Pmode,
3087 gen_rtx_MEM (ptr_mode, addr)));
3089 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3090 addr = plus_constant (Pmode, temp0, vcall_offset);
3091 else
3093 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3094 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3097 if (Pmode == ptr_mode)
3098 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3099 else
3100 aarch64_emit_move (temp1,
3101 gen_rtx_SIGN_EXTEND (Pmode,
3102 gen_rtx_MEM (ptr_mode, addr)));
3104 emit_insn (gen_add2_insn (this_rtx, temp1));
3107 /* Generate a tail call to the target function. */
3108 if (!TREE_USED (function))
3110 assemble_external (function);
3111 TREE_USED (function) = 1;
3113 funexp = XEXP (DECL_RTL (function), 0);
3114 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3115 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3116 SIBLING_CALL_P (insn) = 1;
3118 insn = get_insns ();
3119 shorten_branches (insn);
3120 final_start_function (insn, file, 1);
3121 final (insn, file, 1);
3122 final_end_function ();
3124 /* Stop pretending to be a post-reload pass. */
3125 reload_completed = 0;
3128 static bool
3129 aarch64_tls_referenced_p (rtx x)
3131 if (!TARGET_HAVE_TLS)
3132 return false;
3133 subrtx_iterator::array_type array;
3134 FOR_EACH_SUBRTX (iter, array, x, ALL)
3136 const_rtx x = *iter;
3137 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3138 return true;
3139 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3140 TLS offsets, not real symbol references. */
3141 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3142 iter.skip_subrtxes ();
3144 return false;
3148 static int
3149 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3151 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3152 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3154 if (*imm1 < *imm2)
3155 return -1;
3156 if (*imm1 > *imm2)
3157 return +1;
3158 return 0;
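/* Populate aarch64_bitmasks with every value representable as an AArch64
   bitmask immediate: for each element size E in {2,4,8,16,32,64}, a run
   of S consecutive ones (1 <= S < E) rotated right by R (0 <= R < E) and
   replicated across 64 bits. For example, E == 8, S == 4, R == 1 gives
   the byte 0x87 replicated to 0x8787878787878787. The table is sorted
   so that aarch64_bitmask_imm can look values up with bsearch.  */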
3162 static void
3163 aarch64_build_bitmask_table (void)
3165 unsigned HOST_WIDE_INT mask, imm;
3166 unsigned int log_e, e, s, r;
3167 unsigned int nimms = 0;
3169 for (log_e = 1; log_e <= 6; log_e++)
3171 e = 1 << log_e;
3172 if (e == 64)
3173 mask = ~(HOST_WIDE_INT) 0;
3174 else
3175 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3176 for (s = 1; s < e; s++)
3178 for (r = 0; r < e; r++)
3180 /* set s consecutive bits to 1 (s < 64) */
3181 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3182 /* rotate right by r */
3183 if (r != 0)
3184 imm = ((imm >> r) | (imm << (e - r))) & mask;
3185 /* replicate the constant depending on SIMD size */
3186 switch (log_e) {
3187 case 1: imm |= (imm << 2);
3188 case 2: imm |= (imm << 4);
3189 case 3: imm |= (imm << 8);
3190 case 4: imm |= (imm << 16);
3191 case 5: imm |= (imm << 32);
3192 case 6:
3193 break;
3194 default:
3195 gcc_unreachable ();
3197 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3198 aarch64_bitmasks[nimms++] = imm;
3203 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3204 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3205 aarch64_bitmasks_cmp);
3209 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3210 a left shift of 0 or 12 bits. */
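/* For example, 0xabc and 0xabc000 can be encoded, but 0xabcd
   and 0xabc0000 cannot.  */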
3211 bool
3212 aarch64_uimm12_shift (HOST_WIDE_INT val)
3214 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3215 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
3220 /* Return true if val is an immediate that can be loaded into a
3221 register by a MOVZ instruction. */
3222 static bool
3223 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3225 if (GET_MODE_SIZE (mode) > 4)
3227 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3228 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3229 return 1;
3231 else
3233 /* Ignore sign extension. */
3234 val &= (HOST_WIDE_INT) 0xffffffff;
3236 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3237 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3241 /* Return true if val is a valid bitmask immediate. */
3242 bool
3243 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3245 if (GET_MODE_SIZE (mode) < 8)
3247 /* Replicate bit pattern. */
3248 val &= (HOST_WIDE_INT) 0xffffffff;
3249 val |= val << 32;
3251 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3252 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3256 /* Return true if val is an immediate that can be loaded into a
3257 register in a single instruction. */
3258 bool
3259 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3261 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3262 return 1;
3263 return aarch64_bitmask_imm (val, mode);
3266 static bool
3267 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3269 rtx base, offset;
3271 if (GET_CODE (x) == HIGH)
3272 return true;
3274 split_const (x, &base, &offset);
3275 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3277 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3278 != SYMBOL_FORCE_TO_MEM)
3279 return true;
3280 else
3281 /* Avoid generating a 64-bit relocation in ILP32; leave it
3282 to aarch64_expand_mov_immediate to handle properly. */
3283 return mode != ptr_mode;
3286 return aarch64_tls_referenced_p (x);
3289 /* Return true if register REGNO is a valid index register.
3290 STRICT_P is true if REG_OK_STRICT is in effect. */
3292 bool
3293 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3295 if (!HARD_REGISTER_NUM_P (regno))
3297 if (!strict_p)
3298 return true;
3300 if (!reg_renumber)
3301 return false;
3303 regno = reg_renumber[regno];
3305 return GP_REGNUM_P (regno);
3308 /* Return true if register REGNO is a valid base register.
3309 STRICT_P is true if REG_OK_STRICT is in effect. */
3311 bool
3312 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3314 if (!HARD_REGISTER_NUM_P (regno))
3316 if (!strict_p)
3317 return true;
3319 if (!reg_renumber)
3320 return false;
3322 regno = reg_renumber[regno];
3325 /* The fake registers will be eliminated to either the stack or
3326 hard frame pointer, both of which are usually valid base registers.
3327 Reload deals with the cases where the eliminated form isn't valid. */
3328 return (GP_REGNUM_P (regno)
3329 || regno == SP_REGNUM
3330 || regno == FRAME_POINTER_REGNUM
3331 || regno == ARG_POINTER_REGNUM);
3334 /* Return true if X is a valid base register.
3335 STRICT_P is true if REG_OK_STRICT is in effect. */
3337 static bool
3338 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3340 if (!strict_p && GET_CODE (x) == SUBREG)
3341 x = SUBREG_REG (x);
3343 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3346 /* Return true if address offset is a valid index. If it is, fill in INFO
3347 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3349 static bool
3350 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3351 machine_mode mode, bool strict_p)
3353 enum aarch64_address_type type;
3354 rtx index;
3355 int shift;
3357 /* (reg:P) */
3358 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3359 && GET_MODE (x) == Pmode)
3361 type = ADDRESS_REG_REG;
3362 index = x;
3363 shift = 0;
3365 /* (sign_extend:DI (reg:SI)) */
3366 else if ((GET_CODE (x) == SIGN_EXTEND
3367 || GET_CODE (x) == ZERO_EXTEND)
3368 && GET_MODE (x) == DImode
3369 && GET_MODE (XEXP (x, 0)) == SImode)
3371 type = (GET_CODE (x) == SIGN_EXTEND)
3372 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3373 index = XEXP (x, 0);
3374 shift = 0;
3376 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3377 else if (GET_CODE (x) == MULT
3378 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3379 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3380 && GET_MODE (XEXP (x, 0)) == DImode
3381 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3382 && CONST_INT_P (XEXP (x, 1)))
3384 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3385 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3386 index = XEXP (XEXP (x, 0), 0);
3387 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3389 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3390 else if (GET_CODE (x) == ASHIFT
3391 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3392 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3393 && GET_MODE (XEXP (x, 0)) == DImode
3394 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3395 && CONST_INT_P (XEXP (x, 1)))
3397 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3398 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3399 index = XEXP (XEXP (x, 0), 0);
3400 shift = INTVAL (XEXP (x, 1));
3402 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3403 else if ((GET_CODE (x) == SIGN_EXTRACT
3404 || GET_CODE (x) == ZERO_EXTRACT)
3405 && GET_MODE (x) == DImode
3406 && GET_CODE (XEXP (x, 0)) == MULT
3407 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3408 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3410 type = (GET_CODE (x) == SIGN_EXTRACT)
3411 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3412 index = XEXP (XEXP (x, 0), 0);
3413 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3414 if (INTVAL (XEXP (x, 1)) != 32 + shift
3415 || INTVAL (XEXP (x, 2)) != 0)
3416 shift = -1;
3418 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3419 (const_int 0xffffffff<<shift)) */
3420 else if (GET_CODE (x) == AND
3421 && GET_MODE (x) == DImode
3422 && GET_CODE (XEXP (x, 0)) == MULT
3423 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3424 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3425 && CONST_INT_P (XEXP (x, 1)))
3427 type = ADDRESS_REG_UXTW;
3428 index = XEXP (XEXP (x, 0), 0);
3429 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3430 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3431 shift = -1;
3433 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3434 else if ((GET_CODE (x) == SIGN_EXTRACT
3435 || GET_CODE (x) == ZERO_EXTRACT)
3436 && GET_MODE (x) == DImode
3437 && GET_CODE (XEXP (x, 0)) == ASHIFT
3438 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3439 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3441 type = (GET_CODE (x) == SIGN_EXTRACT)
3442 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3443 index = XEXP (XEXP (x, 0), 0);
3444 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3445 if (INTVAL (XEXP (x, 1)) != 32 + shift
3446 || INTVAL (XEXP (x, 2)) != 0)
3447 shift = -1;
3449 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3450 (const_int 0xffffffff<<shift)) */
3451 else if (GET_CODE (x) == AND
3452 && GET_MODE (x) == DImode
3453 && GET_CODE (XEXP (x, 0)) == ASHIFT
3454 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3455 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3456 && CONST_INT_P (XEXP (x, 1)))
3458 type = ADDRESS_REG_UXTW;
3459 index = XEXP (XEXP (x, 0), 0);
3460 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3461 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3462 shift = -1;
3464 /* (mult:P (reg:P) (const_int scale)) */
3465 else if (GET_CODE (x) == MULT
3466 && GET_MODE (x) == Pmode
3467 && GET_MODE (XEXP (x, 0)) == Pmode
3468 && CONST_INT_P (XEXP (x, 1)))
3470 type = ADDRESS_REG_REG;
3471 index = XEXP (x, 0);
3472 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3474 /* (ashift:P (reg:P) (const_int shift)) */
3475 else if (GET_CODE (x) == ASHIFT
3476 && GET_MODE (x) == Pmode
3477 && GET_MODE (XEXP (x, 0)) == Pmode
3478 && CONST_INT_P (XEXP (x, 1)))
3480 type = ADDRESS_REG_REG;
3481 index = XEXP (x, 0);
3482 shift = INTVAL (XEXP (x, 1));
3484 else
3485 return false;
3487 if (GET_CODE (index) == SUBREG)
3488 index = SUBREG_REG (index);
3490 if ((shift == 0 ||
3491 (shift > 0 && shift <= 3
3492 && (1 << shift) == GET_MODE_SIZE (mode)))
3493 && REG_P (index)
3494 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3496 info->type = type;
3497 info->offset = index;
3498 info->shift = shift;
3499 return true;
3502 return false;
3505 bool
3506 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3508 return (offset >= -64 * GET_MODE_SIZE (mode)
3509 && offset < 64 * GET_MODE_SIZE (mode)
3510 && offset % GET_MODE_SIZE (mode) == 0);
3513 static inline bool
3514 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3515 HOST_WIDE_INT offset)
3517 return offset >= -256 && offset < 256;
3520 static inline bool
3521 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3523 return (offset >= 0
3524 && offset < 4096 * GET_MODE_SIZE (mode)
3525 && offset % GET_MODE_SIZE (mode) == 0);
3528 /* Return true if X is a valid address for machine mode MODE. If it is,
3529 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3530 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3532 static bool
3533 aarch64_classify_address (struct aarch64_address_info *info,
3534 rtx x, machine_mode mode,
3535 RTX_CODE outer_code, bool strict_p)
3537 enum rtx_code code = GET_CODE (x);
3538 rtx op0, op1;
3540 /* On BE, we use load/store pair for all large int mode load/stores. */
3541 bool load_store_pair_p = (outer_code == PARALLEL
3542 || (BYTES_BIG_ENDIAN
3543 && aarch64_vect_struct_mode_p (mode)));
3545 bool allow_reg_index_p =
3546 !load_store_pair_p
3547 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3548 && !aarch64_vect_struct_mode_p (mode);
3550 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3551 REG addressing. */
3552 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3553 && (code != POST_INC && code != REG))
3554 return false;
3556 switch (code)
3558 case REG:
3559 case SUBREG:
3560 info->type = ADDRESS_REG_IMM;
3561 info->base = x;
3562 info->offset = const0_rtx;
3563 return aarch64_base_register_rtx_p (x, strict_p);
3565 case PLUS:
3566 op0 = XEXP (x, 0);
3567 op1 = XEXP (x, 1);
3569 if (! strict_p
3570 && REG_P (op0)
3571 && (op0 == virtual_stack_vars_rtx
3572 || op0 == frame_pointer_rtx
3573 || op0 == arg_pointer_rtx)
3574 && CONST_INT_P (op1))
3576 info->type = ADDRESS_REG_IMM;
3577 info->base = op0;
3578 info->offset = op1;
3580 return true;
3583 if (GET_MODE_SIZE (mode) != 0
3584 && CONST_INT_P (op1)
3585 && aarch64_base_register_rtx_p (op0, strict_p))
3587 HOST_WIDE_INT offset = INTVAL (op1);
3589 info->type = ADDRESS_REG_IMM;
3590 info->base = op0;
3591 info->offset = op1;
3593 /* TImode and TFmode values are allowed in both pairs of X
3594 registers and individual Q registers. The available
3595 address modes are:
3596 X,X: 7-bit signed scaled offset
3597 Q: 9-bit signed offset
3598 We conservatively require an offset representable in both modes.
3600 if (mode == TImode || mode == TFmode)
3601 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3602 && offset_9bit_signed_unscaled_p (mode, offset));
3604 /* A 7-bit offset check because OImode will emit an ldp/stp
3605 instruction (only big endian will get here).
3606 For ldp/stp instructions, the offset is scaled for the size of a
3607 single element of the pair. */
3608 if (mode == OImode)
3609 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3611 /* Three 9/12-bit offset checks because CImode will emit three
3612 ldr/str instructions (only big endian will get here). */
3613 if (mode == CImode)
3614 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3615 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3616 || offset_12bit_unsigned_scaled_p (V16QImode,
3617 offset + 32)));
3619 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3620 instructions (only big endian will get here). */
3621 if (mode == XImode)
3622 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3623 && aarch64_offset_7bit_signed_scaled_p (TImode,
3624 offset + 32));
3626 if (load_store_pair_p)
3627 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3628 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3629 else
3630 return (offset_9bit_signed_unscaled_p (mode, offset)
3631 || offset_12bit_unsigned_scaled_p (mode, offset));
3634 if (allow_reg_index_p)
3636 /* Look for base + (scaled/extended) index register. */
3637 if (aarch64_base_register_rtx_p (op0, strict_p)
3638 && aarch64_classify_index (info, op1, mode, strict_p))
3640 info->base = op0;
3641 return true;
3643 if (aarch64_base_register_rtx_p (op1, strict_p)
3644 && aarch64_classify_index (info, op0, mode, strict_p))
3646 info->base = op1;
3647 return true;
3651 return false;
3653 case POST_INC:
3654 case POST_DEC:
3655 case PRE_INC:
3656 case PRE_DEC:
3657 info->type = ADDRESS_REG_WB;
3658 info->base = XEXP (x, 0);
3659 info->offset = NULL_RTX;
3660 return aarch64_base_register_rtx_p (info->base, strict_p);
3662 case POST_MODIFY:
3663 case PRE_MODIFY:
3664 info->type = ADDRESS_REG_WB;
3665 info->base = XEXP (x, 0);
3666 if (GET_CODE (XEXP (x, 1)) == PLUS
3667 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3668 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3669 && aarch64_base_register_rtx_p (info->base, strict_p))
3671 HOST_WIDE_INT offset;
3672 info->offset = XEXP (XEXP (x, 1), 1);
3673 offset = INTVAL (info->offset);
3675 /* TImode and TFmode values are allowed in both pairs of X
3676 registers and individual Q registers. The available
3677 address modes are:
3678 X,X: 7-bit signed scaled offset
3679 Q: 9-bit signed offset
3680 We conservatively require an offset representable in both modes.
3682 if (mode == TImode || mode == TFmode)
3683 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3684 && offset_9bit_signed_unscaled_p (mode, offset));
3686 if (load_store_pair_p)
3687 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3688 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3689 else
3690 return offset_9bit_signed_unscaled_p (mode, offset);
3692 return false;
3694 case CONST:
3695 case SYMBOL_REF:
3696 case LABEL_REF:
3697 /* load literal: pc-relative constant pool entry. Only supported
3698 for SI mode or larger. */
3699 info->type = ADDRESS_SYMBOLIC;
3701 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3703 rtx sym, addend;
3705 split_const (x, &sym, &addend);
3706 return (GET_CODE (sym) == LABEL_REF
3707 || (GET_CODE (sym) == SYMBOL_REF
3708 && CONSTANT_POOL_ADDRESS_P (sym)));
3710 return false;
3712 case LO_SUM:
3713 info->type = ADDRESS_LO_SUM;
3714 info->base = XEXP (x, 0);
3715 info->offset = XEXP (x, 1);
3716 if (allow_reg_index_p
3717 && aarch64_base_register_rtx_p (info->base, strict_p))
3719 rtx sym, offs;
3720 split_const (info->offset, &sym, &offs);
3721 if (GET_CODE (sym) == SYMBOL_REF
3722 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3723 == SYMBOL_SMALL_ABSOLUTE))
3725 /* The symbol and offset must be aligned to the access size. */
3726 unsigned int align;
3727 unsigned int ref_size;
3729 if (CONSTANT_POOL_ADDRESS_P (sym))
3730 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3731 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3733 tree exp = SYMBOL_REF_DECL (sym);
3734 align = TYPE_ALIGN (TREE_TYPE (exp));
3735 align = CONSTANT_ALIGNMENT (exp, align);
3737 else if (SYMBOL_REF_DECL (sym))
3738 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3739 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3740 && SYMBOL_REF_BLOCK (sym) != NULL)
3741 align = SYMBOL_REF_BLOCK (sym)->alignment;
3742 else
3743 align = BITS_PER_UNIT;
3745 ref_size = GET_MODE_SIZE (mode);
3746 if (ref_size == 0)
3747 ref_size = GET_MODE_SIZE (DImode);
3749 return ((INTVAL (offs) & (ref_size - 1)) == 0
3750 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3753 return false;
3755 default:
3756 return false;
3760 bool
3761 aarch64_symbolic_address_p (rtx x)
3763 rtx offset;
3765 split_const (x, &x, &offset);
3766 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3769 /* Classify the base of symbolic expression X, given that X appears in
3770 context CONTEXT. */
3772 enum aarch64_symbol_type
3773 aarch64_classify_symbolic_expression (rtx x,
3774 enum aarch64_symbol_context context)
3776 rtx offset;
3778 split_const (x, &x, &offset);
3779 return aarch64_classify_symbol (x, offset, context);
3783 /* Return TRUE if X is a legitimate address for accessing memory in
3784 mode MODE. */
3785 static bool
3786 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3788 struct aarch64_address_info addr;
3790 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3793 /* Return TRUE if X is a legitimate address for accessing memory in
3794 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3795 pair operation. */
3796 bool
3797 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3798 RTX_CODE outer_code, bool strict_p)
3800 struct aarch64_address_info addr;
3802 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3805 /* Return TRUE if rtx X is immediate constant 0.0 */
3806 bool
3807 aarch64_float_const_zero_rtx_p (rtx x)
3809 REAL_VALUE_TYPE r;
3811 if (GET_MODE (x) == VOIDmode)
3812 return false;
3814 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3815 if (REAL_VALUE_MINUS_ZERO (r))
3816 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3817 return REAL_VALUES_EQUAL (r, dconst0);
3820 /* Return the fixed registers used for condition codes. */
3822 static bool
3823 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3825 *p1 = CC_REGNUM;
3826 *p2 = INVALID_REGNUM;
3827 return true;
3830 /* Emit call insn with PAT and do aarch64-specific handling. */
3832 void
3833 aarch64_emit_call_insn (rtx pat)
3835 rtx insn = emit_call_insn (pat);
3837 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3838 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3839 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3842 machine_mode
3843 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3845 /* Floating point compares return CCFPE for LT, LE, GT and GE, which
3846 signal on unordered operands, and CCFP for all other comparisons. */
3847 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3849 switch (code)
3851 case EQ:
3852 case NE:
3853 case UNORDERED:
3854 case ORDERED:
3855 case UNLT:
3856 case UNLE:
3857 case UNGT:
3858 case UNGE:
3859 case UNEQ:
3860 case LTGT:
3861 return CCFPmode;
3863 case LT:
3864 case LE:
3865 case GT:
3866 case GE:
3867 return CCFPEmode;
3869 default:
3870 gcc_unreachable ();
3874 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3875 && y == const0_rtx
3876 && (code == EQ || code == NE || code == LT || code == GE)
3877 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3878 || GET_CODE (x) == NEG))
3879 return CC_NZmode;
3881 /* A compare with a shifted operand. Because of canonicalization,
3882 the comparison will have to be swapped when we emit the assembly
3883 code. */
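/* For example, (compare (ashift x 2) y) yields CC_SWPmode: the comparison
   is ultimately emitted with its operands swapped, so a GE test of the
   original comparison must be printed as LE (see the CC_SWPmode entries
   in aarch64_get_condition_code_1).  */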
3884 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3885 && (REG_P (y) || GET_CODE (y) == SUBREG)
3886 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3887 || GET_CODE (x) == LSHIFTRT
3888 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3889 return CC_SWPmode;
3891 /* Similarly for a negated operand, but we can only do this for
3892 equalities. */
3893 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3894 && (REG_P (y) || GET_CODE (y) == SUBREG)
3895 && (code == EQ || code == NE)
3896 && GET_CODE (x) == NEG)
3897 return CC_Zmode;
3899 /* A compare of a mode narrower than SI mode against zero can be done
3900 by extending the value in the comparison. */
3901 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3902 && y == const0_rtx)
3903 /* Only use sign-extension if we really need it. */
3904 return ((code == GT || code == GE || code == LE || code == LT)
3905 ? CC_SESWPmode : CC_ZESWPmode);
3907 /* For everything else, return CCmode. */
3908 return CCmode;
3911 static int
3912 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3915 aarch64_get_condition_code (rtx x)
3917 machine_mode mode = GET_MODE (XEXP (x, 0));
3918 enum rtx_code comp_code = GET_CODE (x);
3920 if (GET_MODE_CLASS (mode) != MODE_CC)
3921 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3922 return aarch64_get_condition_code_1 (mode, comp_code);
3925 static int
3926 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3928 int ne = -1, eq = -1;
3929 switch (mode)
3931 case CCFPmode:
3932 case CCFPEmode:
3933 switch (comp_code)
3935 case GE: return AARCH64_GE;
3936 case GT: return AARCH64_GT;
3937 case LE: return AARCH64_LS;
3938 case LT: return AARCH64_MI;
3939 case NE: return AARCH64_NE;
3940 case EQ: return AARCH64_EQ;
3941 case ORDERED: return AARCH64_VC;
3942 case UNORDERED: return AARCH64_VS;
3943 case UNLT: return AARCH64_LT;
3944 case UNLE: return AARCH64_LE;
3945 case UNGT: return AARCH64_HI;
3946 case UNGE: return AARCH64_PL;
3947 default: return -1;
3949 break;
3951 case CC_DNEmode:
3952 ne = AARCH64_NE;
3953 eq = AARCH64_EQ;
3954 break;
3956 case CC_DEQmode:
3957 ne = AARCH64_EQ;
3958 eq = AARCH64_NE;
3959 break;
3961 case CC_DGEmode:
3962 ne = AARCH64_GE;
3963 eq = AARCH64_LT;
3964 break;
3966 case CC_DLTmode:
3967 ne = AARCH64_LT;
3968 eq = AARCH64_GE;
3969 break;
3971 case CC_DGTmode:
3972 ne = AARCH64_GT;
3973 eq = AARCH64_LE;
3974 break;
3976 case CC_DLEmode:
3977 ne = AARCH64_LE;
3978 eq = AARCH64_GT;
3979 break;
3981 case CC_DGEUmode:
3982 ne = AARCH64_CS;
3983 eq = AARCH64_CC;
3984 break;
3986 case CC_DLTUmode:
3987 ne = AARCH64_CC;
3988 eq = AARCH64_CS;
3989 break;
3991 case CC_DGTUmode:
3992 ne = AARCH64_HI;
3993 eq = AARCH64_LS;
3994 break;
3996 case CC_DLEUmode:
3997 ne = AARCH64_LS;
3998 eq = AARCH64_HI;
3999 break;
4001 case CCmode:
4002 switch (comp_code)
4004 case NE: return AARCH64_NE;
4005 case EQ: return AARCH64_EQ;
4006 case GE: return AARCH64_GE;
4007 case GT: return AARCH64_GT;
4008 case LE: return AARCH64_LE;
4009 case LT: return AARCH64_LT;
4010 case GEU: return AARCH64_CS;
4011 case GTU: return AARCH64_HI;
4012 case LEU: return AARCH64_LS;
4013 case LTU: return AARCH64_CC;
4014 default: return -1;
4016 break;
4018 case CC_SWPmode:
4019 case CC_ZESWPmode:
4020 case CC_SESWPmode:
4021 switch (comp_code)
4023 case NE: return AARCH64_NE;
4024 case EQ: return AARCH64_EQ;
4025 case GE: return AARCH64_LE;
4026 case GT: return AARCH64_LT;
4027 case LE: return AARCH64_GE;
4028 case LT: return AARCH64_GT;
4029 case GEU: return AARCH64_LS;
4030 case GTU: return AARCH64_CC;
4031 case LEU: return AARCH64_CS;
4032 case LTU: return AARCH64_HI;
4033 default: return -1;
4035 break;
4037 case CC_NZmode:
4038 switch (comp_code)
4040 case NE: return AARCH64_NE;
4041 case EQ: return AARCH64_EQ;
4042 case GE: return AARCH64_PL;
4043 case LT: return AARCH64_MI;
4044 default: return -1;
4046 break;
4048 case CC_Zmode:
4049 switch (comp_code)
4051 case NE: return AARCH64_NE;
4052 case EQ: return AARCH64_EQ;
4053 default: return -1;
4055 break;
4057 default:
4058 return -1;
4059 break;
4062 if (comp_code == NE)
4063 return ne;
4065 if (comp_code == EQ)
4066 return eq;
4068 return -1;
4071 bool
4072 aarch64_const_vec_all_same_in_range_p (rtx x,
4073 HOST_WIDE_INT minval,
4074 HOST_WIDE_INT maxval)
4076 HOST_WIDE_INT firstval;
4077 int count, i;
4079 if (GET_CODE (x) != CONST_VECTOR
4080 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4081 return false;
4083 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4084 if (firstval < minval || firstval > maxval)
4085 return false;
4087 count = CONST_VECTOR_NUNITS (x);
4088 for (i = 1; i < count; i++)
4089 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4090 return false;
4092 return true;
4095 bool
4096 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4098 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4101 static unsigned
4102 bit_count (unsigned HOST_WIDE_INT value)
4104 unsigned count = 0;
4106 while (value)
4108 count++;
4109 value &= value - 1;
4112 return count;
4115 /* N Z C V. */
4116 #define AARCH64_CC_V 1
4117 #define AARCH64_CC_C (1 << 1)
4118 #define AARCH64_CC_Z (1 << 2)
4119 #define AARCH64_CC_N (1 << 3)
4121 /* N Z C V flags for ccmp. The first value is for the AND op and the
4122 second is for the IOR op. Indexed by AARCH64_COND_CODE. */
4123 static const int aarch64_nzcv_codes[][2] =
4125 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4126 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4127 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4128 {0, AARCH64_CC_C}, /* CC, C == 0. */
4129 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4130 {0, AARCH64_CC_N}, /* PL, N == 0. */
4131 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4132 {0, AARCH64_CC_V}, /* VC, V == 0. */
4133 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4134 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4135 {0, AARCH64_CC_V}, /* GE, N == V. */
4136 {AARCH64_CC_V, 0}, /* LT, N != V. */
4137 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4138 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4139 {0, 0}, /* AL, Any. */
4140 {0, 0}, /* NV, Any. */
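/* Illustrative note: the 'K' and 'k' output modifiers below index this
   table by the AArch64 condition code of a comparison and print column
   0 (AND) or column 1 (IOR) respectively as the CCMP nzcv immediate.
   For an EQ comparison, for example, '%K' prints AARCH64_CC_Z (4) and
   '%k' prints 0.  */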
4144 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4146 switch (mode)
4148 case CC_DNEmode:
4149 return NE;
4151 case CC_DEQmode:
4152 return EQ;
4154 case CC_DLEmode:
4155 return LE;
4157 case CC_DGTmode:
4158 return GT;
4160 case CC_DLTmode:
4161 return LT;
4163 case CC_DGEmode:
4164 return GE;
4166 case CC_DLEUmode:
4167 return LEU;
4169 case CC_DGTUmode:
4170 return GTU;
4172 case CC_DLTUmode:
4173 return LTU;
4175 case CC_DGEUmode:
4176 return GEU;
4178 default:
4179 gcc_unreachable ();
4184 void
4185 aarch64_print_operand (FILE *f, rtx x, char code)
4187 switch (code)
4189 /* An integer or symbol address without a preceding # sign. */
4190 case 'c':
4191 switch (GET_CODE (x))
4193 case CONST_INT:
4194 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4195 break;
4197 case SYMBOL_REF:
4198 output_addr_const (f, x);
4199 break;
4201 case CONST:
4202 if (GET_CODE (XEXP (x, 0)) == PLUS
4203 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4205 output_addr_const (f, x);
4206 break;
4208 /* Fall through. */
4210 default:
4211 output_operand_lossage ("Unsupported operand for code '%c'", code);
4213 break;
4215 case 'e':
4216 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4218 int n;
4220 if (!CONST_INT_P (x)
4221 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4223 output_operand_lossage ("invalid operand for '%%%c'", code);
4224 return;
4227 switch (n)
4229 case 3:
4230 fputc ('b', f);
4231 break;
4232 case 4:
4233 fputc ('h', f);
4234 break;
4235 case 5:
4236 fputc ('w', f);
4237 break;
4238 default:
4239 output_operand_lossage ("invalid operand for '%%%c'", code);
4240 return;
4243 break;
4245 case 'p':
4247 int n;
4249 /* Print N such that 2^N == X. */
4250 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4252 output_operand_lossage ("invalid operand for '%%%c'", code);
4253 return;
4256 asm_fprintf (f, "%d", n);
4258 break;
4260 case 'P':
4261 /* Print the number of non-zero bits in X (a const_int). */
4262 if (!CONST_INT_P (x))
4264 output_operand_lossage ("invalid operand for '%%%c'", code);
4265 return;
4268 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4269 break;
4271 case 'H':
4272 /* Print the higher numbered register of a pair (TImode) of regs. */
4273 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4275 output_operand_lossage ("invalid operand for '%%%c'", code);
4276 return;
4279 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4280 break;
4282 case 'm':
4284 int cond_code;
4285 /* Print a condition (eq, ne, etc). */
4287 /* CONST_TRUE_RTX means always -- that's the default. */
4288 if (x == const_true_rtx)
4289 return;
4291 if (!COMPARISON_P (x))
4293 output_operand_lossage ("invalid operand for '%%%c'", code);
4294 return;
4297 cond_code = aarch64_get_condition_code (x);
4298 gcc_assert (cond_code >= 0);
4299 fputs (aarch64_condition_codes[cond_code], f);
4301 break;
4303 case 'M':
4305 int cond_code;
4306 /* Print the inverse of a condition (eq <-> ne, etc). */
4308 /* CONST_TRUE_RTX means never -- that's the default. */
4309 if (x == const_true_rtx)
4311 fputs ("nv", f);
4312 return;
4315 if (!COMPARISON_P (x))
4317 output_operand_lossage ("invalid operand for '%%%c'", code);
4318 return;
4320 cond_code = aarch64_get_condition_code (x);
4321 gcc_assert (cond_code >= 0);
4322 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4323 (cond_code)], f);
4325 break;
4327 case 'b':
4328 case 'h':
4329 case 's':
4330 case 'd':
4331 case 'q':
4332 /* Print a scalar FP/SIMD register name. */
4333 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4335 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4336 return;
4338 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4339 break;
4341 case 'S':
4342 case 'T':
4343 case 'U':
4344 case 'V':
4345 /* Print the first FP/SIMD register name in a list. */
4346 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4348 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4349 return;
4351 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4352 break;
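/* For example, if operand X is the vector register v4, '%S' through
   '%V' print "v4", "v5", "v6" and "v7" respectively, naming the
   consecutive registers of a multi-register list.  */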
4354 case 'R':
4355 /* Print a scalar FP/SIMD register name + 1. */
4356 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4358 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4359 return;
4361 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4362 break;
4364 case 'X':
4365 /* Print bottom 16 bits of integer constant in hex. */
4366 if (!CONST_INT_P (x))
4368 output_operand_lossage ("invalid operand for '%%%c'", code);
4369 return;
4371 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4372 break;
4374 case 'w':
4375 case 'x':
4376 /* Print a general register name or the zero register (32-bit or
4377 64-bit). */
4378 if (x == const0_rtx
4379 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4381 asm_fprintf (f, "%czr", code);
4382 break;
4385 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4387 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4388 break;
4391 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4393 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4394 break;
4397 /* Fall through */
4399 case 0:
4400 /* Print a normal operand. If it's a general register, then we
4401 assume DImode. */
4402 if (x == NULL)
4404 output_operand_lossage ("missing operand");
4405 return;
4408 switch (GET_CODE (x))
4410 case REG:
4411 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4412 break;
4414 case MEM:
4415 aarch64_memory_reference_mode = GET_MODE (x);
4416 output_address (XEXP (x, 0));
4417 break;
4419 case LABEL_REF:
4420 case SYMBOL_REF:
4421 output_addr_const (asm_out_file, x);
4422 break;
4424 case CONST_INT:
4425 asm_fprintf (f, "%wd", INTVAL (x));
4426 break;
4428 case CONST_VECTOR:
4429 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4431 gcc_assert (
4432 aarch64_const_vec_all_same_in_range_p (x,
4433 HOST_WIDE_INT_MIN,
4434 HOST_WIDE_INT_MAX));
4435 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4437 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4439 fputc ('0', f);
4441 else
4442 gcc_unreachable ();
4443 break;
4445 case CONST_DOUBLE:
4446 /* CONST_DOUBLE can represent a double-width integer.
4447 In this case, the mode of x is VOIDmode. */
4448 if (GET_MODE (x) == VOIDmode)
4449 ; /* Do Nothing. */
4450 else if (aarch64_float_const_zero_rtx_p (x))
4452 fputc ('0', f);
4453 break;
4455 else if (aarch64_float_const_representable_p (x))
4457 #define buf_size 20
4458 char float_buf[buf_size] = {'\0'};
4459 REAL_VALUE_TYPE r;
4460 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4461 real_to_decimal_for_mode (float_buf, &r,
4462 buf_size, buf_size,
4463 1, GET_MODE (x));
4464 asm_fprintf (asm_out_file, "%s", float_buf);
4465 break;
4466 #undef buf_size
4468 output_operand_lossage ("invalid constant");
4469 return;
4470 default:
4471 output_operand_lossage ("invalid operand");
4472 return;
4474 break;
4476 case 'A':
4477 if (GET_CODE (x) == HIGH)
4478 x = XEXP (x, 0);
4480 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4482 case SYMBOL_SMALL_GOT_4G:
4483 asm_fprintf (asm_out_file, ":got:");
4484 break;
4486 case SYMBOL_SMALL_TLSGD:
4487 asm_fprintf (asm_out_file, ":tlsgd:");
4488 break;
4490 case SYMBOL_SMALL_TLSDESC:
4491 asm_fprintf (asm_out_file, ":tlsdesc:");
4492 break;
4494 case SYMBOL_SMALL_GOTTPREL:
4495 asm_fprintf (asm_out_file, ":gottprel:");
4496 break;
4498 case SYMBOL_TLSLE:
4499 asm_fprintf (asm_out_file, ":tprel:");
4500 break;
4502 case SYMBOL_TINY_GOT:
4503 gcc_unreachable ();
4504 break;
4506 default:
4507 break;
4509 output_addr_const (asm_out_file, x);
4510 break;
4512 case 'L':
4513 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4515 case SYMBOL_SMALL_GOT_4G:
4516 asm_fprintf (asm_out_file, ":lo12:");
4517 break;
4519 case SYMBOL_SMALL_TLSGD:
4520 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4521 break;
4523 case SYMBOL_SMALL_TLSDESC:
4524 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4525 break;
4527 case SYMBOL_SMALL_GOTTPREL:
4528 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4529 break;
4531 case SYMBOL_TLSLE:
4532 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4533 break;
4535 case SYMBOL_TINY_GOT:
4536 asm_fprintf (asm_out_file, ":got:");
4537 break;
4539 default:
4540 break;
4542 output_addr_const (asm_out_file, x);
4543 break;
4545 case 'G':
4547 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4549 case SYMBOL_TLSLE:
4550 asm_fprintf (asm_out_file, ":tprel_hi12:");
4551 break;
4552 default:
4553 break;
4555 output_addr_const (asm_out_file, x);
4556 break;
4558 case 'K':
4560 int cond_code;
4561 /* Print nzcv. */
4563 if (!COMPARISON_P (x))
4565 output_operand_lossage ("invalid operand for '%%%c'", code);
4566 return;
4569 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4570 gcc_assert (cond_code >= 0);
4571 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4573 break;
4575 case 'k':
4577 int cond_code;
4578 /* Print nzcv. */
4580 if (!COMPARISON_P (x))
4582 output_operand_lossage ("invalid operand for '%%%c'", code);
4583 return;
4586 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4587 gcc_assert (cond_code >= 0);
4588 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4590 break;
4592 default:
4593 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4594 return;
4598 void
4599 aarch64_print_operand_address (FILE *f, rtx x)
4601 struct aarch64_address_info addr;
4603 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4604 MEM, true))
4605 switch (addr.type)
4607 case ADDRESS_REG_IMM:
4608 if (addr.offset == const0_rtx)
4609 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4610 else
4611 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4612 INTVAL (addr.offset));
4613 return;
4615 case ADDRESS_REG_REG:
4616 if (addr.shift == 0)
4617 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4618 reg_names [REGNO (addr.offset)]);
4619 else
4620 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4621 reg_names [REGNO (addr.offset)], addr.shift);
4622 return;
4624 case ADDRESS_REG_UXTW:
4625 if (addr.shift == 0)
4626 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4627 REGNO (addr.offset) - R0_REGNUM);
4628 else
4629 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4630 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4631 return;
4633 case ADDRESS_REG_SXTW:
4634 if (addr.shift == 0)
4635 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4636 REGNO (addr.offset) - R0_REGNUM);
4637 else
4638 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4639 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4640 return;
4642 case ADDRESS_REG_WB:
4643 switch (GET_CODE (x))
4645 case PRE_INC:
4646 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4647 GET_MODE_SIZE (aarch64_memory_reference_mode));
4648 return;
4649 case POST_INC:
4650 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4651 GET_MODE_SIZE (aarch64_memory_reference_mode));
4652 return;
4653 case PRE_DEC:
4654 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4655 GET_MODE_SIZE (aarch64_memory_reference_mode));
4656 return;
4657 case POST_DEC:
4658 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4659 GET_MODE_SIZE (aarch64_memory_reference_mode));
4660 return;
4661 case PRE_MODIFY:
4662 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4663 INTVAL (addr.offset));
4664 return;
4665 case POST_MODIFY:
4666 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4667 INTVAL (addr.offset));
4668 return;
4669 default:
4670 break;
4672 break;
4674 case ADDRESS_LO_SUM:
4675 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4676 output_addr_const (f, addr.offset);
4677 asm_fprintf (f, "]");
4678 return;
4680 case ADDRESS_SYMBOLIC:
4681 break;
4684 output_addr_const (f, x);
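/* For illustration, the address forms printed above look like:
     ADDRESS_REG_IMM    [x0] or [x0, 16]
     ADDRESS_REG_REG    [x0, x1] or [x0, x1, lsl 3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw] or [x0, w1, uxtw 2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw] or [x0, w1, sxtw 2]
     ADDRESS_REG_WB     [x0, 16]! (pre) or [x0], 16 (post)
     ADDRESS_LO_SUM     [x0, #:lo12:symbol]
   with ADDRESS_SYMBOLIC and unclassified addresses falling back to
   output_addr_const.  (The register numbers here are just examples.)  */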
4687 bool
4688 aarch64_label_mentioned_p (rtx x)
4690 const char *fmt;
4691 int i;
4693 if (GET_CODE (x) == LABEL_REF)
4694 return true;
4696 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4697 referencing instruction, but they are constant offsets, not
4698 symbols. */
4699 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4700 return false;
4702 fmt = GET_RTX_FORMAT (GET_CODE (x));
4703 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4705 if (fmt[i] == 'E')
4707 int j;
4709 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4710 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4711 return 1;
4713 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4714 return 1;
4717 return 0;
4720 /* Implement REGNO_REG_CLASS. */
4722 enum reg_class
4723 aarch64_regno_regclass (unsigned regno)
4725 if (GP_REGNUM_P (regno))
4726 return GENERAL_REGS;
4728 if (regno == SP_REGNUM)
4729 return STACK_REG;
4731 if (regno == FRAME_POINTER_REGNUM
4732 || regno == ARG_POINTER_REGNUM)
4733 return POINTER_REGS;
4735 if (FP_REGNUM_P (regno))
4736 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4738 return NO_REGS;
4741 static rtx
4742 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4744 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4745 where mask is selected by alignment and size of the offset.
4746 We try to pick as large a range for the offset as possible to
4747 maximize the chance of a CSE. However, for aligned addresses
4748 we limit the range to 4k so that structures with different sized
4749 elements are likely to use the same base. */
4751 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4753 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4754 HOST_WIDE_INT base_offset;
4756 /* Does it look like we'll need a load/store-pair operation? */
4757 if (GET_MODE_SIZE (mode) > 16
4758 || mode == TImode)
4759 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4760 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4761 /* For offsets that aren't a multiple of the access size, the limit is
4762 -256...255. */
4763 else if (offset & (GET_MODE_SIZE (mode) - 1))
4764 base_offset = (offset + 0x100) & ~0x1ff;
4765 else
4766 base_offset = offset & ~0xfff;
4768 if (base_offset == 0)
4769 return x;
4771 offset -= base_offset;
4772 rtx base_reg = gen_reg_rtx (Pmode);
4773 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4774 NULL_RTX);
4775 emit_move_insn (base_reg, val);
4776 x = plus_constant (Pmode, base_reg, offset);
4779 return x;
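/* Worked example (an illustrative sketch): for a DImode access at
   X + 0x12340 the aligned case above picks
   base_offset = 0x12340 & ~0xfff = 0x12000, so we emit
   "base = X + 0x12000" and rewrite the address as base + 0x340.
   Keeping the residual offset within the 4k immediate range lets
   other accesses near X CSE the same base register.  */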
4782 /* Try a machine-dependent way of reloading an illegitimate address
4783 operand. If we find one, push the reload and return the new rtx. */
4786 aarch64_legitimize_reload_address (rtx *x_p,
4787 machine_mode mode,
4788 int opnum, int type,
4789 int ind_levels ATTRIBUTE_UNUSED)
4791 rtx x = *x_p;
4793 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4794 if (aarch64_vect_struct_mode_p (mode)
4795 && GET_CODE (x) == PLUS
4796 && REG_P (XEXP (x, 0))
4797 && CONST_INT_P (XEXP (x, 1)))
4799 rtx orig_rtx = x;
4800 x = copy_rtx (x);
4801 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4802 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4803 opnum, (enum reload_type) type);
4804 return x;
4807 /* We must recognize output that we have already generated ourselves. */
4808 if (GET_CODE (x) == PLUS
4809 && GET_CODE (XEXP (x, 0)) == PLUS
4810 && REG_P (XEXP (XEXP (x, 0), 0))
4811 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4812 && CONST_INT_P (XEXP (x, 1)))
4814 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4815 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4816 opnum, (enum reload_type) type);
4817 return x;
4820 /* We wish to handle large displacements off a base register by splitting
4821 the addend across an add and the mem insn. This can cut the number of
4822 extra insns needed from 3 to 1. It is only useful for load/store of a
4823 single register with 12 bit offset field. */
4824 if (GET_CODE (x) == PLUS
4825 && REG_P (XEXP (x, 0))
4826 && CONST_INT_P (XEXP (x, 1))
4827 && HARD_REGISTER_P (XEXP (x, 0))
4828 && mode != TImode
4829 && mode != TFmode
4830 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4832 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4833 HOST_WIDE_INT low = val & 0xfff;
4834 HOST_WIDE_INT high = val - low;
4835 HOST_WIDE_INT offs;
4836 rtx cst;
4837 machine_mode xmode = GET_MODE (x);
4839 /* In ILP32, xmode can be either DImode or SImode. */
4840 gcc_assert (xmode == DImode || xmode == SImode);
4842 /* Don't attempt this for BLKmode offsets, because we cannot ascertain
4843 BLKmode alignment; leave those to the generic reload code. */
4844 if (GET_MODE_SIZE (mode) == 0)
4845 return NULL_RTX;
4847 offs = low % GET_MODE_SIZE (mode);
4849 /* Align misaligned offset by adjusting high part to compensate. */
4850 if (offs != 0)
4852 if (aarch64_uimm12_shift (high + offs))
4854 /* Align down. */
4855 low = low - offs;
4856 high = high + offs;
4858 else
4860 /* Align up. */
4861 offs = GET_MODE_SIZE (mode) - offs;
4862 low = low + offs;
4863 high = high + (low & 0x1000) - offs;
4864 low &= 0xfff;
4868 /* Check for overflow. */
4869 if (high + low != val)
4870 return NULL_RTX;
4872 cst = GEN_INT (high);
4873 if (!aarch64_uimm12_shift (high))
4874 cst = force_const_mem (xmode, cst);
4876 /* Reload high part into base reg, leaving the low part
4877 in the mem instruction.
4878 Note that replacing this gen_rtx_PLUS with plus_constant is
4879 wrong in this case because we rely on the
4880 (plus (plus reg c1) c2) structure being preserved so that
4881 XEXP (*p, 0) in push_reload below uses the correct term. */
4882 x = gen_rtx_PLUS (xmode,
4883 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4884 GEN_INT (low));
4886 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4887 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4888 opnum, (enum reload_type) type);
4889 return x;
4892 return NULL_RTX;
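/* Worked example (illustrative): for an SImode access at reg + 0x1234
   the code above splits val = 0x1234 into high = 0x1000 and
   low = 0x234.  low is already a multiple of the access size, and
   high is a valid 12-bit (shifted) ADD immediate, so reload
   materializes reg + 0x1000 in a base register while the memory
   reference keeps the offset 0x234, costing one ADD rather than a
   full constant build.  */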
4896 static reg_class_t
4897 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4898 reg_class_t rclass,
4899 machine_mode mode,
4900 secondary_reload_info *sri)
4902 /* Without the TARGET_SIMD instructions we cannot move a Q register
4903 to a Q register directly. We need a scratch. */
4904 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4905 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4906 && reg_class_subset_p (rclass, FP_REGS))
4908 if (mode == TFmode)
4909 sri->icode = CODE_FOR_aarch64_reload_movtf;
4910 else if (mode == TImode)
4911 sri->icode = CODE_FOR_aarch64_reload_movti;
4912 return NO_REGS;
4915 /* A TFmode or TImode memory access should be handled via an FP_REG,
4916 because AArch64 has richer addressing modes for LDR/STR instructions
4917 than for LDP/STP instructions. */
4918 if (TARGET_FLOAT && rclass == GENERAL_REGS
4919 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4920 return FP_REGS;
4922 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4923 return GENERAL_REGS;
4925 return NO_REGS;
4928 static bool
4929 aarch64_can_eliminate (const int from, const int to)
4931 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4932 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4934 if (frame_pointer_needed)
4936 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4937 return true;
4938 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4939 return false;
4940 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4941 && !cfun->calls_alloca)
4942 return true;
4943 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4944 return true;
4946 return false;
4948 else
4950 /* If we decided that we didn't need a leaf frame pointer but then used
4951 LR in the function, then we'll want a frame pointer after all, so
4952 prevent this elimination to ensure a frame pointer is used. */
4953 if (to == STACK_POINTER_REGNUM
4954 && flag_omit_leaf_frame_pointer
4955 && df_regs_ever_live_p (LR_REGNUM))
4956 return false;
4959 return true;
4962 HOST_WIDE_INT
4963 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4965 aarch64_layout_frame ();
4967 if (to == HARD_FRAME_POINTER_REGNUM)
4969 if (from == ARG_POINTER_REGNUM)
4970 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4972 if (from == FRAME_POINTER_REGNUM)
4973 return (cfun->machine->frame.hard_fp_offset
4974 - cfun->machine->frame.saved_varargs_size);
4977 if (to == STACK_POINTER_REGNUM)
4979 if (from == FRAME_POINTER_REGNUM)
4980 return (cfun->machine->frame.frame_size
4981 - cfun->machine->frame.saved_varargs_size);
4984 return cfun->machine->frame.frame_size;
4987 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4988 previous frame. */
4991 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
4993 if (count != 0)
4994 return const0_rtx;
4995 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
4999 static void
5000 aarch64_asm_trampoline_template (FILE *f)
5002 if (TARGET_ILP32)
5004 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5005 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5007 else
5009 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5010 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5012 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5013 assemble_aligned_integer (4, const0_rtx);
5014 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5015 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
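/* Illustrative layout of the resulting trampoline in the LP64 case
   (offsets are bytes from the start of the trampoline):
      0 - 11   ldr IP1, .+16 ; ldr STATIC_CHAIN, .+20 ; br IP1
     12 - 15   zero padding
     16 - 23   target function address (stored by the init hook below)
     24 - 31   static chain value      (stored by the init hook below)
   so the two PC-relative loads pick up the data words that follow the
   16 bytes of code.  */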
5018 static void
5019 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5021 rtx fnaddr, mem, a_tramp;
5022 const int tramp_code_sz = 16;
5024 /* Don't need to copy the trailing D-words; we fill those in below. */
5025 emit_block_move (m_tramp, assemble_trampoline_template (),
5026 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5027 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5028 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5029 if (GET_MODE (fnaddr) != ptr_mode)
5030 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5031 emit_move_insn (mem, fnaddr);
5033 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5034 emit_move_insn (mem, chain_value);
5036 /* XXX We should really define a "clear_cache" pattern and use
5037 gen_clear_cache(). */
5038 a_tramp = XEXP (m_tramp, 0);
5039 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5040 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5041 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5042 ptr_mode);
5045 static unsigned char
5046 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5048 switch (regclass)
5050 case CALLER_SAVE_REGS:
5051 case POINTER_REGS:
5052 case GENERAL_REGS:
5053 case ALL_REGS:
5054 case FP_REGS:
5055 case FP_LO_REGS:
5056 return
5057 aarch64_vector_mode_p (mode)
5058 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5059 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5060 case STACK_REG:
5061 return 1;
5063 case NO_REGS:
5064 return 0;
5066 default:
5067 break;
5069 gcc_unreachable ();
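/* For example, with 128-bit vector registers and 64-bit general
   registers this returns 1 for V4SImode in FP_REGS, 1 for DFmode,
   and 2 for TImode in GENERAL_REGS (a register pair).  */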
5072 static reg_class_t
5073 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5075 if (regclass == POINTER_REGS)
5076 return GENERAL_REGS;
5078 if (regclass == STACK_REG)
5080 if (REG_P(x)
5081 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5082 return regclass;
5084 return NO_REGS;
5087 /* If it's an integer immediate that MOVI can't handle, then
5088 FP_REGS is not an option, so we return NO_REGS instead. */
5089 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5090 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5091 return NO_REGS;
5093 /* Register elimination can result in a request for
5094 SP+constant->FP_REGS. We cannot support such operations, which
5095 use SP as the source and an FP_REG as the destination, so reject
5096 them outright. */
5097 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5099 rtx lhs = XEXP (x, 0);
5101 /* Look through a possible SUBREG introduced by ILP32. */
5102 if (GET_CODE (lhs) == SUBREG)
5103 lhs = SUBREG_REG (lhs);
5105 gcc_assert (REG_P (lhs));
5106 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5107 POINTER_REGS));
5108 return NO_REGS;
5111 return regclass;
5114 void
5115 aarch64_asm_output_labelref (FILE* f, const char *name)
5117 asm_fprintf (f, "%U%s", name);
5120 static void
5121 aarch64_elf_asm_constructor (rtx symbol, int priority)
5123 if (priority == DEFAULT_INIT_PRIORITY)
5124 default_ctor_section_asm_out_constructor (symbol, priority);
5125 else
5127 section *s;
5128 char buf[18];
5129 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5130 s = get_section (buf, SECTION_WRITE, NULL);
5131 switch_to_section (s);
5132 assemble_align (POINTER_SIZE);
5133 assemble_aligned_integer (POINTER_BYTES, symbol);
5137 static void
5138 aarch64_elf_asm_destructor (rtx symbol, int priority)
5140 if (priority == DEFAULT_INIT_PRIORITY)
5141 default_dtor_section_asm_out_destructor (symbol, priority);
5142 else
5144 section *s;
5145 char buf[18];
5146 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5147 s = get_section (buf, SECTION_WRITE, NULL);
5148 switch_to_section (s);
5149 assemble_align (POINTER_SIZE);
5150 assemble_aligned_integer (POINTER_BYTES, symbol);
5154 const char*
5155 aarch64_output_casesi (rtx *operands)
5157 char buf[100];
5158 char label[100];
5159 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5160 int index;
5161 static const char *const patterns[4][2] =
5164 "ldrb\t%w3, [%0,%w1,uxtw]",
5165 "add\t%3, %4, %w3, sxtb #2"
5168 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5169 "add\t%3, %4, %w3, sxth #2"
5172 "ldr\t%w3, [%0,%w1,uxtw #2]",
5173 "add\t%3, %4, %w3, sxtw #2"
5175 /* We assume that DImode is only generated when not optimizing and
5176 that we don't really need 64-bit address offsets. That would
5177 imply an object file with 8GB of code in a single function! */
5179 "ldr\t%w3, [%0,%w1,uxtw #2]",
5180 "add\t%3, %4, %w3, sxtw #2"
5184 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5186 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5188 gcc_assert (index >= 0 && index <= 3);
5190 /* Need to implement table size reduction, by changing the code below. */
5191 output_asm_insn (patterns[index][0], operands);
5192 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5193 snprintf (buf, sizeof (buf),
5194 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5195 output_asm_insn (buf, operands);
5196 output_asm_insn (patterns[index][1], operands);
5197 output_asm_insn ("br\t%3", operands);
5198 assemble_label (asm_out_file, label);
5199 return "";
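/* For a byte-sized dispatch table the code emitted above looks like
   (register numbers are illustrative):
       ldrb    w3, [x0, w1, uxtw]
       adr     x4, .Lrtx<N>
       add     x3, x4, w3, sxtb #2
       br      x3
     .Lrtx<N>:
   i.e. the table entries hold label offsets scaled down by 4, which
   the "#2" extend-and-shift in the add restores before the indirect
   branch.  */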
5203 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5204 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5205 operator. */
5208 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5210 if (shift >= 0 && shift <= 3)
5212 int size;
5213 for (size = 8; size <= 32; size *= 2)
5215 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5216 if (mask == bits << shift)
5217 return size;
5220 return 0;
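/* For example, aarch64_uxt_size (0, 0xffff) returns 16 and
   aarch64_uxt_size (2, 0x3fc) returns 8 (0xff shifted left by 2);
   any mask that is not an 8-, 16- or 32-bit field starting at the
   shift amount yields 0.  */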
5223 static bool
5224 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5225 const_rtx x ATTRIBUTE_UNUSED)
5227 /* We can't use blocks for constants when we're using a per-function
5228 constant pool. */
5229 return false;
5232 static section *
5233 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5234 rtx x ATTRIBUTE_UNUSED,
5235 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5237 /* Force all constant pool entries into the current function section. */
5238 return function_section (current_function_decl);
5242 /* Costs. */
5244 /* Helper function for rtx cost calculation. Strip a shift expression
5245 from X. Returns the inner operand if successful, or the original
5246 expression on failure. */
5247 static rtx
5248 aarch64_strip_shift (rtx x)
5250 rtx op = x;
5252 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5253 we can convert both to ROR during final output. */
5254 if ((GET_CODE (op) == ASHIFT
5255 || GET_CODE (op) == ASHIFTRT
5256 || GET_CODE (op) == LSHIFTRT
5257 || GET_CODE (op) == ROTATERT
5258 || GET_CODE (op) == ROTATE)
5259 && CONST_INT_P (XEXP (op, 1)))
5260 return XEXP (op, 0);
5262 if (GET_CODE (op) == MULT
5263 && CONST_INT_P (XEXP (op, 1))
5264 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5265 return XEXP (op, 0);
5267 return x;
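/* For example, both (ashift (reg x1) (const_int 3)) and
   (mult (reg x1) (const_int 8)) strip to (reg x1), since a multiply
   by a power of two is output as a shift; any other expression is
   returned unchanged.  */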
5270 /* Helper function for rtx cost calculation. Strip an extend
5271 expression from X. Returns the inner operand if successful, or the
5272 original expression on failure. We deal with a number of possible
5273 canonicalization variations here. */
5274 static rtx
5275 aarch64_strip_extend (rtx x)
5277 rtx op = x;
5279 /* Zero and sign extraction of a widened value. */
5280 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5281 && XEXP (op, 2) == const0_rtx
5282 && GET_CODE (XEXP (op, 0)) == MULT
5283 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5284 XEXP (op, 1)))
5285 return XEXP (XEXP (op, 0), 0);
5287 /* It can also be represented (for zero-extend) as an AND with an
5288 immediate. */
5289 if (GET_CODE (op) == AND
5290 && GET_CODE (XEXP (op, 0)) == MULT
5291 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5292 && CONST_INT_P (XEXP (op, 1))
5293 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5294 INTVAL (XEXP (op, 1))) != 0)
5295 return XEXP (XEXP (op, 0), 0);
5297 /* Now handle extended register, as this may also have an optional
5298 left shift by 1..4. */
5299 if (GET_CODE (op) == ASHIFT
5300 && CONST_INT_P (XEXP (op, 1))
5301 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5302 op = XEXP (op, 0);
5304 if (GET_CODE (op) == ZERO_EXTEND
5305 || GET_CODE (op) == SIGN_EXTEND)
5306 op = XEXP (op, 0);
5308 if (op != x)
5309 return op;
5311 return x;
5314 /* Return true iff CODE is a shift supported in combination
5315 with arithmetic instructions. */
5317 static bool
5318 aarch64_shift_p (enum rtx_code code)
5320 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5323 /* Helper function for rtx cost calculation. Calculate the cost of
5324 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5325 Return the calculated cost of the expression, recursing manually in to
5326 operands where needed. */
5328 static int
5329 aarch64_rtx_mult_cost (rtx x, int code, int outer, bool speed)
5331 rtx op0, op1;
5332 const struct cpu_cost_table *extra_cost
5333 = aarch64_tune_params.insn_extra_cost;
5334 int cost = 0;
5335 bool compound_p = (outer == PLUS || outer == MINUS);
5336 machine_mode mode = GET_MODE (x);
5338 gcc_checking_assert (code == MULT);
5340 op0 = XEXP (x, 0);
5341 op1 = XEXP (x, 1);
5343 if (VECTOR_MODE_P (mode))
5344 mode = GET_MODE_INNER (mode);
5346 /* Integer multiply/fma. */
5347 if (GET_MODE_CLASS (mode) == MODE_INT)
5349 /* The multiply will be canonicalized as a shift, cost it as such. */
5350 if (aarch64_shift_p (GET_CODE (x))
5351 || (CONST_INT_P (op1)
5352 && exact_log2 (INTVAL (op1)) > 0))
5354 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5355 || GET_CODE (op0) == SIGN_EXTEND;
5356 if (speed)
5358 if (compound_p)
5360 if (REG_P (op1))
5361 /* ARITH + shift-by-register. */
5362 cost += extra_cost->alu.arith_shift_reg;
5363 else if (is_extend)
5364 /* ARITH + extended register. We don't have a cost field
5365 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5366 cost += extra_cost->alu.extend_arith;
5367 else
5368 /* ARITH + shift-by-immediate. */
5369 cost += extra_cost->alu.arith_shift;
5371 else
5372 /* LSL (immediate). */
5373 cost += extra_cost->alu.shift;
5376 /* Strip extends as we will have costed them in the case above. */
5377 if (is_extend)
5378 op0 = aarch64_strip_extend (op0);
5380 cost += rtx_cost (op0, GET_CODE (op0), 0, speed);
5382 return cost;
5385 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5386 compound and let the below cases handle it. After all, MNEG is a
5387 special-case alias of MSUB. */
5388 if (GET_CODE (op0) == NEG)
5390 op0 = XEXP (op0, 0);
5391 compound_p = true;
5394 /* Integer multiplies or FMAs have zero/sign extending variants. */
5395 if ((GET_CODE (op0) == ZERO_EXTEND
5396 && GET_CODE (op1) == ZERO_EXTEND)
5397 || (GET_CODE (op0) == SIGN_EXTEND
5398 && GET_CODE (op1) == SIGN_EXTEND))
5400 cost += rtx_cost (XEXP (op0, 0), MULT, 0, speed)
5401 + rtx_cost (XEXP (op1, 0), MULT, 1, speed);
5403 if (speed)
5405 if (compound_p)
5406 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5407 cost += extra_cost->mult[0].extend_add;
5408 else
5409 /* MUL/SMULL/UMULL. */
5410 cost += extra_cost->mult[0].extend;
5413 return cost;
5416 /* This is either an integer multiply or a MADD. In both cases
5417 we want to recurse and cost the operands. */
5418 cost += rtx_cost (op0, MULT, 0, speed)
5419 + rtx_cost (op1, MULT, 1, speed);
5421 if (speed)
5423 if (compound_p)
5424 /* MADD/MSUB. */
5425 cost += extra_cost->mult[mode == DImode].add;
5426 else
5427 /* MUL. */
5428 cost += extra_cost->mult[mode == DImode].simple;
5431 return cost;
5433 else
5435 if (speed)
5437 /* Floating-point FMA/FMUL can also support negations of the
5438 operands. */
5439 if (GET_CODE (op0) == NEG)
5440 op0 = XEXP (op0, 0);
5441 if (GET_CODE (op1) == NEG)
5442 op1 = XEXP (op1, 0);
5444 if (compound_p)
5445 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5446 cost += extra_cost->fp[mode == DFmode].fma;
5447 else
5448 /* FMUL/FNMUL. */
5449 cost += extra_cost->fp[mode == DFmode].mult;
5452 cost += rtx_cost (op0, MULT, 0, speed)
5453 + rtx_cost (op1, MULT, 1, speed);
5454 return cost;
5458 static int
5459 aarch64_address_cost (rtx x,
5460 machine_mode mode,
5461 addr_space_t as ATTRIBUTE_UNUSED,
5462 bool speed)
5464 enum rtx_code c = GET_CODE (x);
5465 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5466 struct aarch64_address_info info;
5467 int cost = 0;
5468 info.shift = 0;
5470 if (!aarch64_classify_address (&info, x, mode, c, false))
5472 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5474 /* This is a CONST or SYMBOL ref which will be split
5475 in a different way depending on the code model in use.
5476 Cost it through the generic infrastructure. */
5477 int cost_symbol_ref = rtx_cost (x, MEM, 1, speed);
5478 /* Divide through by the cost of one instruction to
5479 bring it to the same units as the address costs. */
5480 cost_symbol_ref /= COSTS_N_INSNS (1);
5481 /* The cost is then the cost of preparing the address,
5482 followed by an immediate (possibly 0) offset. */
5483 return cost_symbol_ref + addr_cost->imm_offset;
5485 else
5487 /* This is most likely a jump table from a case
5488 statement. */
5489 return addr_cost->register_offset;
5493 switch (info.type)
5495 case ADDRESS_LO_SUM:
5496 case ADDRESS_SYMBOLIC:
5497 case ADDRESS_REG_IMM:
5498 cost += addr_cost->imm_offset;
5499 break;
5501 case ADDRESS_REG_WB:
5502 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5503 cost += addr_cost->pre_modify;
5504 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5505 cost += addr_cost->post_modify;
5506 else
5507 gcc_unreachable ();
5509 break;
5511 case ADDRESS_REG_REG:
5512 cost += addr_cost->register_offset;
5513 break;
5515 case ADDRESS_REG_UXTW:
5516 case ADDRESS_REG_SXTW:
5517 cost += addr_cost->register_extend;
5518 break;
5520 default:
5521 gcc_unreachable ();
5525 if (info.shift > 0)
5527 /* For the sake of calculating the cost of the shifted register
5528 component, we can treat same sized modes in the same way. */
5529 switch (GET_MODE_BITSIZE (mode))
5531 case 16:
5532 cost += addr_cost->addr_scale_costs.hi;
5533 break;
5535 case 32:
5536 cost += addr_cost->addr_scale_costs.si;
5537 break;
5539 case 64:
5540 cost += addr_cost->addr_scale_costs.di;
5541 break;
5543 /* We can't tell, or this is a 128-bit vector. */
5544 default:
5545 cost += addr_cost->addr_scale_costs.ti;
5546 break;
5550 return cost;
5553 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5554 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5555 to be taken. */
5558 aarch64_branch_cost (bool speed_p, bool predictable_p)
5560 /* When optimizing for speed, use the cost of unpredictable branches. */
5561 const struct cpu_branch_cost *branch_costs =
5562 aarch64_tune_params.branch_costs;
5564 if (!speed_p || predictable_p)
5565 return branch_costs->predictable;
5566 else
5567 return branch_costs->unpredictable;
5570 /* Return true if the RTX X in mode MODE is a zero or sign extract
5571 usable in an ADD or SUB (extended register) instruction. */
5572 static bool
5573 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5575 /* Catch add with a sign extract.
5576 This is add_<optab><mode>_multp2. */
5577 if (GET_CODE (x) == SIGN_EXTRACT
5578 || GET_CODE (x) == ZERO_EXTRACT)
5580 rtx op0 = XEXP (x, 0);
5581 rtx op1 = XEXP (x, 1);
5582 rtx op2 = XEXP (x, 2);
5584 if (GET_CODE (op0) == MULT
5585 && CONST_INT_P (op1)
5586 && op2 == const0_rtx
5587 && CONST_INT_P (XEXP (op0, 1))
5588 && aarch64_is_extend_from_extract (mode,
5589 XEXP (op0, 1),
5590 op1))
5592 return true;
5596 return false;
5599 static bool
5600 aarch64_frint_unspec_p (unsigned int u)
5602 switch (u)
5604 case UNSPEC_FRINTZ:
5605 case UNSPEC_FRINTP:
5606 case UNSPEC_FRINTM:
5607 case UNSPEC_FRINTA:
5608 case UNSPEC_FRINTN:
5609 case UNSPEC_FRINTX:
5610 case UNSPEC_FRINTI:
5611 return true;
5613 default:
5614 return false;
5618 /* Return true iff X is an rtx that will match an extr instruction
5619 i.e. as described in the *extr<mode>5_insn family of patterns.
5620 OP0 and OP1 will be set to the operands of the shifts involved
5621 on success and will be NULL_RTX otherwise. */
5623 static bool
5624 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5626 rtx op0, op1;
5627 machine_mode mode = GET_MODE (x);
5629 *res_op0 = NULL_RTX;
5630 *res_op1 = NULL_RTX;
5632 if (GET_CODE (x) != IOR)
5633 return false;
5635 op0 = XEXP (x, 0);
5636 op1 = XEXP (x, 1);
5638 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5639 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5641 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5642 if (GET_CODE (op1) == ASHIFT)
5643 std::swap (op0, op1);
5645 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5646 return false;
5648 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5649 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5651 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5652 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5654 *res_op0 = XEXP (op0, 0);
5655 *res_op1 = XEXP (op1, 0);
5656 return true;
5660 return false;
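/* For example, in DImode
     (ior (ashift (reg a) (const_int 48)) (lshiftrt (reg b) (const_int 16)))
   is accepted because 48 + 16 == 64; *RES_OP0 is set to a and
   *RES_OP1 to b, and the whole expression maps onto a single EXTR
   instruction.  */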
5663 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5664 storing it in *COST. Result is true if the total cost of the operation
5665 has now been calculated. */
5666 static bool
5667 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5669 rtx inner;
5670 rtx comparator;
5671 enum rtx_code cmpcode;
5673 if (COMPARISON_P (op0))
5675 inner = XEXP (op0, 0);
5676 comparator = XEXP (op0, 1);
5677 cmpcode = GET_CODE (op0);
5679 else
5681 inner = op0;
5682 comparator = const0_rtx;
5683 cmpcode = NE;
5686 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5688 /* Conditional branch. */
5689 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5690 return true;
5691 else
5693 if (cmpcode == NE || cmpcode == EQ)
5695 if (comparator == const0_rtx)
5697 /* TBZ/TBNZ/CBZ/CBNZ. */
5698 if (GET_CODE (inner) == ZERO_EXTRACT)
5699 /* TBZ/TBNZ. */
5700 *cost += rtx_cost (XEXP (inner, 0), ZERO_EXTRACT,
5701 0, speed);
5702 else
5703 /* CBZ/CBNZ. */
5704 *cost += rtx_cost (inner, cmpcode, 0, speed);
5706 return true;
5709 else if (cmpcode == LT || cmpcode == GE)
5711 /* TBZ/TBNZ. */
5712 if (comparator == const0_rtx)
5713 return true;
5717 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5719 /* It's a conditional operation based on the status flags,
5720 so it must be some flavor of CSEL. */
5722 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5723 if (GET_CODE (op1) == NEG
5724 || GET_CODE (op1) == NOT
5725 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5726 op1 = XEXP (op1, 0);
5728 *cost += rtx_cost (op1, IF_THEN_ELSE, 1, speed);
5729 *cost += rtx_cost (op2, IF_THEN_ELSE, 2, speed);
5730 return true;
5733 /* We don't know what this is, so cost all operands. */
5734 return false;
5737 /* Calculate the cost of calculating X, storing it in *COST. Result
5738 is true if the total cost of the operation has now been calculated. */
5739 static bool
5740 aarch64_rtx_costs (rtx x, int code, int outer ATTRIBUTE_UNUSED,
5741 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5743 rtx op0, op1, op2;
5744 const struct cpu_cost_table *extra_cost
5745 = aarch64_tune_params.insn_extra_cost;
5746 machine_mode mode = GET_MODE (x);
5748 /* By default, assume that everything has equivalent cost to the
5749 cheapest instruction. Any additional costs are applied as a delta
5750 above this default. */
5751 *cost = COSTS_N_INSNS (1);
5753 switch (code)
5755 case SET:
5756 /* The cost depends entirely on the operands to SET. */
5757 *cost = 0;
5758 op0 = SET_DEST (x);
5759 op1 = SET_SRC (x);
5761 switch (GET_CODE (op0))
5763 case MEM:
5764 if (speed)
5766 rtx address = XEXP (op0, 0);
5767 if (VECTOR_MODE_P (mode))
5768 *cost += extra_cost->ldst.storev;
5769 else if (GET_MODE_CLASS (mode) == MODE_INT)
5770 *cost += extra_cost->ldst.store;
5771 else if (mode == SFmode)
5772 *cost += extra_cost->ldst.storef;
5773 else if (mode == DFmode)
5774 *cost += extra_cost->ldst.stored;
5776 *cost +=
5777 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5778 0, speed));
5781 *cost += rtx_cost (op1, SET, 1, speed);
5782 return true;
5784 case SUBREG:
5785 if (! REG_P (SUBREG_REG (op0)))
5786 *cost += rtx_cost (SUBREG_REG (op0), SET, 0, speed);
5788 /* Fall through. */
5789 case REG:
5790 /* The cost is one per vector-register copied. */
5791 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5793 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5794 / GET_MODE_SIZE (V4SImode);
5795 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5797 /* const0_rtx is in general free, but we will use an
5798 instruction to set a register to 0. */
5799 else if (REG_P (op1) || op1 == const0_rtx)
5801 /* The cost is 1 per register copied. */
5802 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5803 / UNITS_PER_WORD;
5804 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5806 else
5807 /* Cost is just the cost of the RHS of the set. */
5808 *cost += rtx_cost (op1, SET, 1, speed);
5809 return true;
5811 case ZERO_EXTRACT:
5812 case SIGN_EXTRACT:
5813 /* Bit-field insertion. Strip any redundant widening of
5814 the RHS to meet the width of the target. */
5815 if (GET_CODE (op1) == SUBREG)
5816 op1 = SUBREG_REG (op1);
5817 if ((GET_CODE (op1) == ZERO_EXTEND
5818 || GET_CODE (op1) == SIGN_EXTEND)
5819 && CONST_INT_P (XEXP (op0, 1))
5820 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5821 >= INTVAL (XEXP (op0, 1))))
5822 op1 = XEXP (op1, 0);
5824 if (CONST_INT_P (op1))
5826 /* MOV immediate is assumed to always be cheap. */
5827 *cost = COSTS_N_INSNS (1);
5829 else
5831 /* BFM. */
5832 if (speed)
5833 *cost += extra_cost->alu.bfi;
5834 *cost += rtx_cost (op1, (enum rtx_code) code, 1, speed);
5837 return true;
5839 default:
5840 /* We can't make sense of this, assume default cost. */
5841 *cost = COSTS_N_INSNS (1);
5842 return false;
5844 return false;
5846 case CONST_INT:
5847 /* If an instruction can incorporate a constant within the
5848 instruction, the instruction's expression avoids calling
5849 rtx_cost() on the constant. If rtx_cost() is called on a
5850 constant, then it is usually because the constant must be
5851 moved into a register by one or more instructions.
5853 The exception is constant 0, which can be expressed
5854 as XZR/WZR and is therefore free. The one case where it is not
5855 free is (set (reg) (const0_rtx)), in which case we must cost
5856 the move. However, we can catch that when we cost the SET, so
5857 we don't need to consider it here. */
5858 if (x == const0_rtx)
5859 *cost = 0;
5860 else
5862 /* To an approximation, building any other constant is
5863 proportionally expensive to the number of instructions
5864 required to build that constant. This is true whether we
5865 are compiling for SPEED or otherwise. */
5866 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5867 (NULL_RTX, x, false, mode));
5869 return true;
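/* For example, const0_rtx is costed as 0 (it is just XZR/WZR), a
   constant such as 0x1234 that fits a single MOVZ costs one
   instruction, and a full 64-bit constant typically needs a MOVZ
   plus up to three MOVKs, so it can cost as much as four
   instructions.  */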
5871 case CONST_DOUBLE:
5872 if (speed)
5874 /* mov[df,sf]_aarch64. */
5875 if (aarch64_float_const_representable_p (x))
5876 /* FMOV (scalar immediate). */
5877 *cost += extra_cost->fp[mode == DFmode].fpconst;
5878 else if (!aarch64_float_const_zero_rtx_p (x))
5880 /* This will be a load from memory. */
5881 if (mode == DFmode)
5882 *cost += extra_cost->ldst.loadd;
5883 else
5884 *cost += extra_cost->ldst.loadf;
5886 else
5887 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5888 or MOV v0.s[0], wzr, neither of which is modeled by the
5889 cost tables. Just use the default cost. */
5894 return true;
5896 case MEM:
5897 if (speed)
5899 /* For loads we want the base cost of a load, plus an
5900 approximation for the additional cost of the addressing
5901 mode. */
5902 rtx address = XEXP (x, 0);
5903 if (VECTOR_MODE_P (mode))
5904 *cost += extra_cost->ldst.loadv;
5905 else if (GET_MODE_CLASS (mode) == MODE_INT)
5906 *cost += extra_cost->ldst.load;
5907 else if (mode == SFmode)
5908 *cost += extra_cost->ldst.loadf;
5909 else if (mode == DFmode)
5910 *cost += extra_cost->ldst.loadd;
5912 *cost +=
5913 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5914 0, speed));
5917 return true;
5919 case NEG:
5920 op0 = XEXP (x, 0);
5922 if (VECTOR_MODE_P (mode))
5924 if (speed)
5926 /* FNEG. */
5927 *cost += extra_cost->vect.alu;
5929 return false;
5932 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
5934 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5935 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5937 /* CSETM. */
5938 *cost += rtx_cost (XEXP (op0, 0), NEG, 0, speed);
5939 return true;
5942 /* Cost this as SUB wzr, X. */
5943 op0 = CONST0_RTX (GET_MODE (x));
5944 op1 = XEXP (x, 0);
5945 goto cost_minus;
5948 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
5950 /* Support (neg(fma...)) as a single instruction only if
5951 sign of zeros is unimportant. This matches the decision
5952 making in aarch64.md. */
5953 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5955 /* FNMADD. */
5956 *cost = rtx_cost (op0, NEG, 0, speed);
5957 return true;
5959 if (speed)
5960 /* FNEG. */
5961 *cost += extra_cost->fp[mode == DFmode].neg;
5962 return false;
5965 return false;
5967 case CLRSB:
5968 case CLZ:
5969 if (speed)
5971 if (VECTOR_MODE_P (mode))
5972 *cost += extra_cost->vect.alu;
5973 else
5974 *cost += extra_cost->alu.clz;
5977 return false;
5979 case COMPARE:
5980 op0 = XEXP (x, 0);
5981 op1 = XEXP (x, 1);
5983 if (op1 == const0_rtx
5984 && GET_CODE (op0) == AND)
5986 x = op0;
5987 goto cost_logic;
5990 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
5992 /* TODO: A write to the CC flags possibly costs extra; this
5993 needs encoding in the cost tables. */
5995 /* CC_ZESWPmode supports zero extend for free. */
5996 if (GET_MODE (x) == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
5997 op0 = XEXP (op0, 0);
5999 /* ANDS. */
6000 if (GET_CODE (op0) == AND)
6002 x = op0;
6003 goto cost_logic;
6006 if (GET_CODE (op0) == PLUS)
6008 /* ADDS (and CMN alias). */
6009 x = op0;
6010 goto cost_plus;
6013 if (GET_CODE (op0) == MINUS)
6015 /* SUBS. */
6016 x = op0;
6017 goto cost_minus;
6020 if (GET_CODE (op1) == NEG)
6022 /* CMN. */
6023 if (speed)
6024 *cost += extra_cost->alu.arith;
6026 *cost += rtx_cost (op0, COMPARE, 0, speed);
6027 *cost += rtx_cost (XEXP (op1, 0), NEG, 1, speed);
6028 return true;
6031 /* CMP.
6033 Compare can freely swap the order of operands, and
6034 canonicalization puts the more complex operation first.
6035 But the integer MINUS logic expects the shift/extend
6036 operation in op1. */
6037 if (! (REG_P (op0)
6038 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6040 op0 = XEXP (x, 1);
6041 op1 = XEXP (x, 0);
6043 goto cost_minus;
6046 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6048 /* FCMP. */
6049 if (speed)
6050 *cost += extra_cost->fp[mode == DFmode].compare;
6052 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6054 *cost += rtx_cost (op0, COMPARE, 0, speed);
6055 /* FCMP supports constant 0.0 for no extra cost. */
6056 return true;
6058 return false;
6061 if (VECTOR_MODE_P (mode))
6063 /* Vector compare. */
6064 if (speed)
6065 *cost += extra_cost->vect.alu;
6067 if (aarch64_float_const_zero_rtx_p (op1))
6069 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6070 cost. */
6071 return true;
6073 return false;
6075 return false;
6077 case MINUS:
6079 op0 = XEXP (x, 0);
6080 op1 = XEXP (x, 1);
6082 cost_minus:
6083 *cost += rtx_cost (op0, MINUS, 0, speed);
6085 /* Detect valid immediates. */
6086 if ((GET_MODE_CLASS (mode) == MODE_INT
6087 || (GET_MODE_CLASS (mode) == MODE_CC
6088 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6089 && CONST_INT_P (op1)
6090 && aarch64_uimm12_shift (INTVAL (op1)))
6092 if (speed)
6093 /* SUB(S) (immediate). */
6094 *cost += extra_cost->alu.arith;
6095 return true;
6098 /* Look for SUB (extended register). */
6099 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6101 if (speed)
6102 *cost += extra_cost->alu.extend_arith;
6104 *cost += rtx_cost (XEXP (XEXP (op1, 0), 0),
6105 (enum rtx_code) GET_CODE (op1),
6106 0, speed);
6107 return true;
6110 rtx new_op1 = aarch64_strip_extend (op1);
6112 /* Cost this as an FMA-alike operation. */
6113 if ((GET_CODE (new_op1) == MULT
6114 || aarch64_shift_p (GET_CODE (new_op1)))
6115 && code != COMPARE)
6117 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6118 (enum rtx_code) code,
6119 speed);
6120 return true;
6123 *cost += rtx_cost (new_op1, MINUS, 1, speed);
6125 if (speed)
6127 if (VECTOR_MODE_P (mode))
6129 /* Vector SUB. */
6130 *cost += extra_cost->vect.alu;
6132 else if (GET_MODE_CLASS (mode) == MODE_INT)
6134 /* SUB(S). */
6135 *cost += extra_cost->alu.arith;
6137 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6139 /* FSUB. */
6140 *cost += extra_cost->fp[mode == DFmode].addsub;
6143 return true;
6146 case PLUS:
6148 rtx new_op0;
6150 op0 = XEXP (x, 0);
6151 op1 = XEXP (x, 1);
6153 cost_plus:
6154 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6155 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6157 /* CSINC. */
6158 *cost += rtx_cost (XEXP (op0, 0), PLUS, 0, speed);
6159 *cost += rtx_cost (op1, PLUS, 1, speed);
6160 return true;
6163 if (GET_MODE_CLASS (mode) == MODE_INT
6164 && CONST_INT_P (op1)
6165 && aarch64_uimm12_shift (INTVAL (op1)))
6167 *cost += rtx_cost (op0, PLUS, 0, speed);
6169 if (speed)
6170 /* ADD (immediate). */
6171 *cost += extra_cost->alu.arith;
6172 return true;
6175 *cost += rtx_cost (op1, PLUS, 1, speed);
6177 /* Look for ADD (extended register). */
6178 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6180 if (speed)
6181 *cost += extra_cost->alu.extend_arith;
6183 *cost += rtx_cost (XEXP (XEXP (op0, 0), 0),
6184 (enum rtx_code) GET_CODE (op0),
6185 0, speed);
6186 return true;
6189 /* Strip any extend, leave shifts behind as we will
6190 cost them through mult_cost. */
6191 new_op0 = aarch64_strip_extend (op0);
6193 if (GET_CODE (new_op0) == MULT
6194 || aarch64_shift_p (GET_CODE (new_op0)))
6196 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6197 speed);
6198 return true;
6201 *cost += rtx_cost (new_op0, PLUS, 0, speed);
6203 if (speed)
6205 if (VECTOR_MODE_P (mode))
6207 /* Vector ADD. */
6208 *cost += extra_cost->vect.alu;
6210 else if (GET_MODE_CLASS (mode) == MODE_INT)
6212 /* ADD. */
6213 *cost += extra_cost->alu.arith;
6215 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6217 /* FADD. */
6218 *cost += extra_cost->fp[mode == DFmode].addsub;
6221 return true;
6224 case BSWAP:
6225 *cost = COSTS_N_INSNS (1);
6227 if (speed)
6229 if (VECTOR_MODE_P (mode))
6230 *cost += extra_cost->vect.alu;
6231 else
6232 *cost += extra_cost->alu.rev;
6234 return false;
6236 case IOR:
6237 if (aarch_rev16_p (x))
6239 *cost = COSTS_N_INSNS (1);
6241 if (speed)
6243 if (VECTOR_MODE_P (mode))
6244 *cost += extra_cost->vect.alu;
6245 else
6246 *cost += extra_cost->alu.rev;
6248 return true;
6251 if (aarch64_extr_rtx_p (x, &op0, &op1))
6253 *cost += rtx_cost (op0, IOR, 0, speed)
6254 + rtx_cost (op1, IOR, 1, speed);
6255 if (speed)
6256 *cost += extra_cost->alu.shift;
6258 return true;
6260 /* Fall through. */
6261 case XOR:
6262 case AND:
6263 cost_logic:
6264 op0 = XEXP (x, 0);
6265 op1 = XEXP (x, 1);
6267 if (VECTOR_MODE_P (mode))
6269 if (speed)
6270 *cost += extra_cost->vect.alu;
6271 return true;
6274 if (code == AND
6275 && GET_CODE (op0) == MULT
6276 && CONST_INT_P (XEXP (op0, 1))
6277 && CONST_INT_P (op1)
6278 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6279 INTVAL (op1)) != 0)
6281 /* This is a UBFM/SBFM. */
6282 *cost += rtx_cost (XEXP (op0, 0), ZERO_EXTRACT, 0, speed);
6283 if (speed)
6284 *cost += extra_cost->alu.bfx;
6285 return true;
6288 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6290 /* We possibly get the immediate for free; this is not
6291 modelled. */
6292 if (CONST_INT_P (op1)
6293 && aarch64_bitmask_imm (INTVAL (op1), GET_MODE (x)))
6295 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6297 if (speed)
6298 *cost += extra_cost->alu.logical;
6300 return true;
6302 else
6304 rtx new_op0 = op0;
6306 /* Handle ORN, EON, or BIC. */
6307 if (GET_CODE (op0) == NOT)
6308 op0 = XEXP (op0, 0);
6310 new_op0 = aarch64_strip_shift (op0);
6312 /* If we had a shift on op0 then this is a logical-shift-
6313 by-register/immediate operation. Otherwise, this is just
6314 a logical operation. */
6315 if (speed)
6317 if (new_op0 != op0)
6319 /* Shift by immediate. */
6320 if (CONST_INT_P (XEXP (op0, 1)))
6321 *cost += extra_cost->alu.log_shift;
6322 else
6323 *cost += extra_cost->alu.log_shift_reg;
6325 else
6326 *cost += extra_cost->alu.logical;
6329 /* In both cases we want to cost both operands. */
6330 *cost += rtx_cost (new_op0, (enum rtx_code) code, 0, speed)
6331 + rtx_cost (op1, (enum rtx_code) code, 1, speed);
6333 return true;
6336 return false;
6338 case NOT:
6339 x = XEXP (x, 0);
6340 op0 = aarch64_strip_shift (x);
6342 if (VECTOR_MODE_P (mode))
6344 /* Vector NOT. */
6345 *cost += extra_cost->vect.alu;
6346 return false;
6349 /* MVN-shifted-reg. */
6350 if (op0 != x)
6352 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6354 if (speed)
6355 *cost += extra_cost->alu.log_shift;
6357 return true;
6359 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6360 Handle the second form here taking care that 'a' in the above can
6361 be a shift. */
6362 else if (GET_CODE (op0) == XOR)
6364 rtx newop0 = XEXP (op0, 0);
6365 rtx newop1 = XEXP (op0, 1);
6366 rtx op0_stripped = aarch64_strip_shift (newop0);
6368 *cost += rtx_cost (newop1, (enum rtx_code) code, 1, speed)
6369 + rtx_cost (op0_stripped, XOR, 0, speed);
6371 if (speed)
6373 if (op0_stripped != newop0)
6374 *cost += extra_cost->alu.log_shift;
6375 else
6376 *cost += extra_cost->alu.logical;
6379 return true;
6381 /* MVN. */
6382 if (speed)
6383 *cost += extra_cost->alu.logical;
6385 return false;
6387 case ZERO_EXTEND:
6389 op0 = XEXP (x, 0);
6390 /* If a value is written in SI mode, then zero extended to DI
6391 mode, the operation will in general be free as a write to
6392 a 'w' register implicitly zeroes the upper bits of an 'x'
6393 register. However, if this is
6395 (set (reg) (zero_extend (reg)))
6397 we must cost the explicit register move. */
6398 if (mode == DImode
6399 && GET_MODE (op0) == SImode
6400 && outer == SET)
6402 int op_cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, 0, speed);
6404 if (!op_cost && speed)
6405 /* MOV. */
6406 *cost += extra_cost->alu.extend;
6407 else
6408 /* Free, the cost is that of the SI mode operation. */
6409 *cost = op_cost;
6411 return true;
6413 else if (MEM_P (XEXP (x, 0)))
6415 /* All loads can zero extend to any size for free. */
6416 *cost = rtx_cost (XEXP (x, 0), ZERO_EXTEND, param, speed);
6417 return true;
6420 if (speed)
6422 if (VECTOR_MODE_P (mode))
6424 /* UMOV. */
6425 *cost += extra_cost->vect.alu;
6427 else
6429 /* UXTB/UXTH. */
6430 *cost += extra_cost->alu.extend;
6433 return false;
6435 case SIGN_EXTEND:
6436 if (MEM_P (XEXP (x, 0)))
6438 /* LDRSH. */
6439 if (speed)
6441 rtx address = XEXP (XEXP (x, 0), 0);
6442 *cost += extra_cost->ldst.load_sign_extend;
6444 *cost +=
6445 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6446 0, speed));
6448 return true;
6451 if (speed)
6453 if (VECTOR_MODE_P (mode))
6454 *cost += extra_cost->vect.alu;
6455 else
6456 *cost += extra_cost->alu.extend;
6458 return false;
6460 case ASHIFT:
6461 op0 = XEXP (x, 0);
6462 op1 = XEXP (x, 1);
6464 if (CONST_INT_P (op1))
6466 if (speed)
6468 if (VECTOR_MODE_P (mode))
6470 /* Vector shift (immediate). */
6471 *cost += extra_cost->vect.alu;
6473 else
6475 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6476 aliases. */
6477 *cost += extra_cost->alu.shift;
6481 /* We can incorporate zero/sign extend for free. */
6482 if (GET_CODE (op0) == ZERO_EXTEND
6483 || GET_CODE (op0) == SIGN_EXTEND)
6484 op0 = XEXP (op0, 0);
6486 *cost += rtx_cost (op0, ASHIFT, 0, speed);
6487 return true;
6489 else
6491 if (speed)
6493 if (VECTOR_MODE_P (mode))
6495 /* Vector shift (register). */
6496 *cost += extra_cost->vect.alu;
6498 else
6500 /* LSLV. */
6501 *cost += extra_cost->alu.shift_reg;
6504 return false; /* All arguments need to be in registers. */
6507 case ROTATE:
6508 case ROTATERT:
6509 case LSHIFTRT:
6510 case ASHIFTRT:
6511 op0 = XEXP (x, 0);
6512 op1 = XEXP (x, 1);
6514 if (CONST_INT_P (op1))
6516 /* ASR (immediate) and friends. */
6517 if (speed)
6519 if (VECTOR_MODE_P (mode))
6520 *cost += extra_cost->vect.alu;
6521 else
6522 *cost += extra_cost->alu.shift;
6525 *cost += rtx_cost (op0, (enum rtx_code) code, 0, speed);
6526 return true;
6528 else
6531 /* ASR (register) and friends. */
6532 if (speed)
6534 if (VECTOR_MODE_P (mode))
6535 *cost += extra_cost->vect.alu;
6536 else
6537 *cost += extra_cost->alu.shift_reg;
6539 return false; /* All arguments need to be in registers. */
6542 case SYMBOL_REF:
6544 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6545 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6547 /* LDR. */
6548 if (speed)
6549 *cost += extra_cost->ldst.load;
6551 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6552 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6554 /* ADRP, followed by ADD. */
6555 *cost += COSTS_N_INSNS (1);
6556 if (speed)
6557 *cost += 2 * extra_cost->alu.arith;
6559 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6560 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6562 /* ADR. */
6563 if (speed)
6564 *cost += extra_cost->alu.arith;
6567 if (flag_pic)
6569 /* One extra load instruction, after accessing the GOT. */
6570 *cost += COSTS_N_INSNS (1);
6571 if (speed)
6572 *cost += extra_cost->ldst.load;
6574 return true;
6576 case HIGH:
6577 case LO_SUM:
6578 /* ADRP/ADD (immediate). */
6579 if (speed)
6580 *cost += extra_cost->alu.arith;
6581 return true;
6583 case ZERO_EXTRACT:
6584 case SIGN_EXTRACT:
6585 /* UBFX/SBFX. */
6586 if (speed)
6588 if (VECTOR_MODE_P (mode))
6589 *cost += extra_cost->vect.alu;
6590 else
6591 *cost += extra_cost->alu.bfx;
6594 /* We can trust that the immediates used will be correct (there
6595 are no by-register forms), so we need only cost op0. */
6596 *cost += rtx_cost (XEXP (x, 0), (enum rtx_code) code, 0, speed);
6597 return true;
6599 case MULT:
6600 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6601 /* aarch64_rtx_mult_cost always handles recursion to its
6602 operands. */
6603 return true;
6605 case MOD:
6606 case UMOD:
6607 if (speed)
6609 if (VECTOR_MODE_P (mode))
6610 *cost += extra_cost->vect.alu;
6611 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_INT)
6612 *cost += (extra_cost->mult[GET_MODE (x) == DImode].add
6613 + extra_cost->mult[GET_MODE (x) == DImode].idiv);
6614 else if (GET_MODE (x) == DFmode)
6615 *cost += (extra_cost->fp[1].mult
6616 + extra_cost->fp[1].div);
6617 else if (GET_MODE (x) == SFmode)
6618 *cost += (extra_cost->fp[0].mult
6619 + extra_cost->fp[0].div);
6621 return false; /* All arguments need to be in registers. */
6623 case DIV:
6624 case UDIV:
6625 case SQRT:
6626 if (speed)
6628 if (VECTOR_MODE_P (mode))
6629 *cost += extra_cost->vect.alu;
6630 else if (GET_MODE_CLASS (mode) == MODE_INT)
6631 /* There is no integer SQRT, so only DIV and UDIV can get
6632 here. */
6633 *cost += extra_cost->mult[mode == DImode].idiv;
6634 else
6635 *cost += extra_cost->fp[mode == DFmode].div;
6637 return false; /* All arguments need to be in registers. */
6639 case IF_THEN_ELSE:
6640 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6641 XEXP (x, 2), cost, speed);
6643 case EQ:
6644 case NE:
6645 case GT:
6646 case GTU:
6647 case LT:
6648 case LTU:
6649 case GE:
6650 case GEU:
6651 case LE:
6652 case LEU:
6654 return false; /* All arguments must be in registers. */
6656 case FMA:
6657 op0 = XEXP (x, 0);
6658 op1 = XEXP (x, 1);
6659 op2 = XEXP (x, 2);
6661 if (speed)
6663 if (VECTOR_MODE_P (mode))
6664 *cost += extra_cost->vect.alu;
6665 else
6666 *cost += extra_cost->fp[mode == DFmode].fma;
6669 /* FMSUB, FNMADD, and FNMSUB are free. */
6670 if (GET_CODE (op0) == NEG)
6671 op0 = XEXP (op0, 0);
6673 if (GET_CODE (op2) == NEG)
6674 op2 = XEXP (op2, 0);
6676 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6677 and the by-element operand as operand 0. */
6678 if (GET_CODE (op1) == NEG)
6679 op1 = XEXP (op1, 0);
6681 /* Catch vector-by-element operations. The by-element operand can
6682 either be (vec_duplicate (vec_select (x))) or just
6683 (vec_select (x)), depending on whether we are multiplying by
6684 a vector or a scalar.
6686 Canonicalization is not very good in these cases: FMA4 will put the
6687 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
6688 if (GET_CODE (op0) == VEC_DUPLICATE)
6689 op0 = XEXP (op0, 0);
6690 else if (GET_CODE (op1) == VEC_DUPLICATE)
6691 op1 = XEXP (op1, 0);
6693 if (GET_CODE (op0) == VEC_SELECT)
6694 op0 = XEXP (op0, 0);
6695 else if (GET_CODE (op1) == VEC_SELECT)
6696 op1 = XEXP (op1, 0);
6698 /* If the remaining parameters are not registers,
6699 get the cost to put them into registers. */
6700 *cost += rtx_cost (op0, FMA, 0, speed);
6701 *cost += rtx_cost (op1, FMA, 1, speed);
6702 *cost += rtx_cost (op2, FMA, 2, speed);
6703 return true;
6705 case FLOAT:
6706 case UNSIGNED_FLOAT:
6707 if (speed)
6708 *cost += extra_cost->fp[mode == DFmode].fromint;
6709 return false;
6711 case FLOAT_EXTEND:
6712 if (speed)
6714 if (VECTOR_MODE_P (mode))
6716 /* Vector widening conversion. */
6717 *cost += extra_cost->vect.alu;
6719 else
6720 *cost += extra_cost->fp[mode == DFmode].widen;
6722 return false;
6724 case FLOAT_TRUNCATE:
6725 if (speed)
6727 if (VECTOR_MODE_P (mode))
6729 /* Vector narrowing conversion. */
6730 *cost += extra_cost->vect.alu;
6732 else
6733 *cost += extra_cost->fp[mode == DFmode].narrow;
6735 return false;
6737 case FIX:
6738 case UNSIGNED_FIX:
6739 x = XEXP (x, 0);
6740 /* Strip the rounding part. They will all be implemented
6741 by the fcvt* family of instructions anyway. */
6742 if (GET_CODE (x) == UNSPEC)
6744 unsigned int uns_code = XINT (x, 1);
6746 if (uns_code == UNSPEC_FRINTA
6747 || uns_code == UNSPEC_FRINTM
6748 || uns_code == UNSPEC_FRINTN
6749 || uns_code == UNSPEC_FRINTP
6750 || uns_code == UNSPEC_FRINTZ)
6751 x = XVECEXP (x, 0, 0);
6754 if (speed)
6756 if (VECTOR_MODE_P (mode))
6757 *cost += extra_cost->vect.alu;
6758 else
6759 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6761 *cost += rtx_cost (x, (enum rtx_code) code, 0, speed);
6762 return true;
6764 case ABS:
6765 if (VECTOR_MODE_P (mode))
6767 /* ABS (vector). */
6768 if (speed)
6769 *cost += extra_cost->vect.alu;
6771 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6773 op0 = XEXP (x, 0);
6775 /* FABD, which is analogous to FADD. */
6776 if (GET_CODE (op0) == MINUS)
6778 *cost += rtx_cost (XEXP (op0, 0), MINUS, 0, speed)
6779 + rtx_cost (XEXP (op0, 1), MINUS, 1, speed);
6780 if (speed)
6781 *cost += extra_cost->fp[mode == DFmode].addsub;
6783 return true;
6785 /* Simple FABS is analogous to FNEG. */
6786 if (speed)
6787 *cost += extra_cost->fp[mode == DFmode].neg;
6789 else
6791 /* Integer ABS will either be split to
6792 two arithmetic instructions, or will be an ABS
6793 (scalar), which we don't model. */
6794 *cost = COSTS_N_INSNS (2);
6795 if (speed)
6796 *cost += 2 * extra_cost->alu.arith;
6798 return false;
6800 case SMAX:
6801 case SMIN:
6802 if (speed)
6804 if (VECTOR_MODE_P (mode))
6805 *cost += extra_cost->vect.alu;
6806 else
6808 /* FMAXNM/FMINNM/FMAX/FMIN.
6809 TODO: This may not be accurate for all implementations, but
6810 we do not model this in the cost tables. */
6811 *cost += extra_cost->fp[mode == DFmode].addsub;
6814 return false;
6816 case UNSPEC:
6817 /* The floating point round to integer frint* instructions. */
6818 if (aarch64_frint_unspec_p (XINT (x, 1)))
6820 if (speed)
6821 *cost += extra_cost->fp[mode == DFmode].roundint;
6823 return false;
6826 if (XINT (x, 1) == UNSPEC_RBIT)
6828 if (speed)
6829 *cost += extra_cost->alu.rev;
6831 return false;
6833 break;
6835 case TRUNCATE:
6837 /* Decompose <su>muldi3_highpart. */
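/* E.g. the signed form matched below looks roughly like
   (truncate:DI
     (lshiftrt:TI
       (mult:TI (sign_extend:TI (reg:DI)) (sign_extend:TI (reg:DI)))
       (const_int 64)))
   which is a single SMULH.  */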
6838 if (/* (truncate:DI */
6839 mode == DImode
6840 /* (lshiftrt:TI */
6841 && GET_MODE (XEXP (x, 0)) == TImode
6842 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6843 /* (mult:TI */
6844 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6845 /* (ANY_EXTEND:TI (reg:DI))
6846 (ANY_EXTEND:TI (reg:DI))) */
6847 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6848 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6849 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6850 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6851 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6852 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6853 /* (const_int 64) */
6854 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6855 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6857 /* UMULH/SMULH. */
6858 if (speed)
6859 *cost += extra_cost->mult[mode == DImode].extend;
6860 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6861 MULT, 0, speed);
6862 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6863 MULT, 1, speed);
6864 return true;
6867 /* Fall through. */
6868 default:
6869 break;
6872 if (dump_file && (dump_flags & TDF_DETAILS))
6873 fprintf (dump_file,
6874 "\nFailed to cost RTX. Assuming default cost.\n");
6876 return true;
6879 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6880 calculated for X. This cost is stored in *COST. Returns true
6881 if the total cost of X was calculated. */
6882 static bool
6883 aarch64_rtx_costs_wrapper (rtx x, int code, int outer,
6884 int param, int *cost, bool speed)
6886 bool result = aarch64_rtx_costs (x, code, outer, param, cost, speed);
6888 if (dump_file && (dump_flags & TDF_DETAILS))
6890 print_rtl_single (dump_file, x);
6891 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6892 speed ? "Hot" : "Cold",
6893 *cost, result ? "final" : "partial");
6896 return result;
6899 static int
6900 aarch64_register_move_cost (machine_mode mode,
6901 reg_class_t from_i, reg_class_t to_i)
6903 enum reg_class from = (enum reg_class) from_i;
6904 enum reg_class to = (enum reg_class) to_i;
6905 const struct cpu_regmove_cost *regmove_cost
6906 = aarch64_tune_params.regmove_cost;
6908 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6909 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6910 to = GENERAL_REGS;
6912 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6913 from = GENERAL_REGS;
6915 /* Moving between GPR and stack cost is the same as GP2GP. */
6916 if ((from == GENERAL_REGS && to == STACK_REG)
6917 || (to == GENERAL_REGS && from == STACK_REG))
6918 return regmove_cost->GP2GP;
6920 /* To/From the stack register, we move via the gprs. */
6921 if (to == STACK_REG || from == STACK_REG)
6922 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6923 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
6925 if (GET_MODE_SIZE (mode) == 16)
6927 /* 128-bit operations on general registers require 2 instructions. */
6928 if (from == GENERAL_REGS && to == GENERAL_REGS)
6929 return regmove_cost->GP2GP * 2;
6930 else if (from == GENERAL_REGS)
6931 return regmove_cost->GP2FP * 2;
6932 else if (to == GENERAL_REGS)
6933 return regmove_cost->FP2GP * 2;
6935 /* When AdvSIMD instructions are disabled it is not possible to move
6936 a 128-bit value directly between Q registers. This is handled in
6937 secondary reload. A general register is used as a scratch to move
6938 the upper DI value and the lower DI value is moved directly,
6939 hence the cost is the sum of three moves. */
6940 if (! TARGET_SIMD)
6941 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6943 return regmove_cost->FP2FP;
6946 if (from == GENERAL_REGS && to == GENERAL_REGS)
6947 return regmove_cost->GP2GP;
6948 else if (from == GENERAL_REGS)
6949 return regmove_cost->GP2FP;
6950 else if (to == GENERAL_REGS)
6951 return regmove_cost->FP2GP;
6953 return regmove_cost->FP2FP;
6956 static int
6957 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6958 reg_class_t rclass ATTRIBUTE_UNUSED,
6959 bool in ATTRIBUTE_UNUSED)
6961 return aarch64_tune_params.memmov_cost;
6964 /* Return the number of instructions that can be issued per cycle. */
6965 static int
6966 aarch64_sched_issue_rate (void)
6968 return aarch64_tune_params.issue_rate;
6971 static int
6972 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
6974 int issue_rate = aarch64_sched_issue_rate ();
6976 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
6979 /* Vectorizer cost model target hooks. */
6981 /* Implement targetm.vectorize.builtin_vectorization_cost. */
6982 static int
6983 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
6984 tree vectype,
6985 int misalign ATTRIBUTE_UNUSED)
6987 unsigned elements;
6989 switch (type_of_cost)
6991 case scalar_stmt:
6992 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
6994 case scalar_load:
6995 return aarch64_tune_params.vec_costs->scalar_load_cost;
6997 case scalar_store:
6998 return aarch64_tune_params.vec_costs->scalar_store_cost;
7000 case vector_stmt:
7001 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7003 case vector_load:
7004 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7006 case vector_store:
7007 return aarch64_tune_params.vec_costs->vec_store_cost;
7009 case vec_to_scalar:
7010 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7012 case scalar_to_vec:
7013 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7015 case unaligned_load:
7016 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7018 case unaligned_store:
7019 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7021 case cond_branch_taken:
7022 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7024 case cond_branch_not_taken:
7025 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7027 case vec_perm:
7028 case vec_promote_demote:
7029 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7031 case vec_construct:
7032 elements = TYPE_VECTOR_SUBPARTS (vectype);
7033 return elements / 2 + 1;
7035 default:
7036 gcc_unreachable ();
7040 /* Implement targetm.vectorize.add_stmt_cost. */
7041 static unsigned
7042 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7043 struct _stmt_vec_info *stmt_info, int misalign,
7044 enum vect_cost_model_location where)
7046 unsigned *cost = (unsigned *) data;
7047 unsigned retval = 0;
7049 if (flag_vect_cost_model)
7051 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7052 int stmt_cost =
7053 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7055 /* Statements in an inner loop relative to the loop being
7056 vectorized are weighted more heavily. The value here is
7057 a function (linear for now) of the loop nest level. */
7058 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7060 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
7061 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
7062 unsigned nest_level = loop_depth (loop);
7064 count *= nest_level;
7067 retval = (unsigned) (count * stmt_cost);
7068 cost[where] += retval;
7071 return retval;
7074 static void initialize_aarch64_code_model (void);
7076 /* Parse the architecture extension string. */
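/* For example, with -march=armv8-a+crc+nocrypto this is called on the
   substring "+crc+nocrypto": "+crc" enables the CRC extension and
   "+nocrypto" disables the crypto extension.  (The names here are only
   illustrative; the full list lives in aarch64-option-extensions.def.)  */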
7078 static void
7079 aarch64_parse_extension (char *str)
7081 /* The extension string is parsed left to right. */
7082 const struct aarch64_option_extension *opt = NULL;
7084 /* Flag to say whether we are adding or removing an extension. */
7085 int adding_ext = -1;
7087 while (str != NULL && *str != 0)
7089 char *ext;
7090 size_t len;
7092 str++;
7093 ext = strchr (str, '+');
7095 if (ext != NULL)
7096 len = ext - str;
7097 else
7098 len = strlen (str);
7100 if (len >= 2 && strncmp (str, "no", 2) == 0)
7102 adding_ext = 0;
7103 len -= 2;
7104 str += 2;
7106 else if (len > 0)
7107 adding_ext = 1;
7109 if (len == 0)
7111 error ("missing feature modifier after %qs", adding_ext ? "+"
7112 : "+no");
7113 return;
7116 /* Scan over the extensions table trying to find an exact match. */
7117 for (opt = all_extensions; opt->name != NULL; opt++)
7119 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
7121 /* Add or remove the extension. */
7122 if (adding_ext)
7123 aarch64_isa_flags |= opt->flags_on;
7124 else
7125 aarch64_isa_flags &= ~(opt->flags_off);
7126 break;
7130 if (opt->name == NULL)
7132 /* Extension not found in list. */
7133 error ("unknown feature modifier %qs", str);
7134 return;
7137 str = ext;
7140 return;
7143 /* Parse the ARCH string. */
7145 static void
7146 aarch64_parse_arch (void)
7148 char *ext;
7149 const struct processor *arch;
7150 char *str = (char *) alloca (strlen (aarch64_arch_string) + 1);
7151 size_t len;
7153 strcpy (str, aarch64_arch_string);
7155 ext = strchr (str, '+');
7157 if (ext != NULL)
7158 len = ext - str;
7159 else
7160 len = strlen (str);
7162 if (len == 0)
7164 error ("missing arch name in -march=%qs", str);
7165 return;
7168 /* Loop through the list of supported ARCHs to find a match. */
7169 for (arch = all_architectures; arch->name != NULL; arch++)
7171 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7173 selected_arch = arch;
7174 aarch64_isa_flags = selected_arch->flags;
7176 if (!selected_cpu)
7177 selected_cpu = &all_cores[selected_arch->core];
7179 if (ext != NULL)
7181 /* ARCH string contains at least one extension. */
7182 aarch64_parse_extension (ext);
7185 if (strcmp (selected_arch->arch, selected_cpu->arch))
7187 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7188 selected_cpu->name, selected_arch->name);
7191 return;
7195 /* ARCH name not found in list. */
7196 error ("unknown value %qs for -march", str);
7197 return;
7200 /* Parse the CPU string. */
7202 static void
7203 aarch64_parse_cpu (void)
7205 char *ext;
7206 const struct processor *cpu;
7207 char *str = (char *) alloca (strlen (aarch64_cpu_string) + 1);
7208 size_t len;
7210 strcpy (str, aarch64_cpu_string);
7212 ext = strchr (str, '+');
7214 if (ext != NULL)
7215 len = ext - str;
7216 else
7217 len = strlen (str);
7219 if (len == 0)
7221 error ("missing cpu name in -mcpu=%qs", str);
7222 return;
7225 /* Loop through the list of supported CPUs to find a match. */
7226 for (cpu = all_cores; cpu->name != NULL; cpu++)
7228 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7230 selected_cpu = cpu;
7231 aarch64_isa_flags = selected_cpu->flags;
7233 if (ext != NULL)
7235 /* CPU string contains at least one extension. */
7236 aarch64_parse_extension (ext);
7239 return;
7243 /* CPU name not found in list. */
7244 error ("unknown value %qs for -mcpu", str);
7245 return;
7248 /* Parse the TUNE string. */
7250 static void
7251 aarch64_parse_tune (void)
7253 const struct processor *cpu;
7254 char *str = (char *) alloca (strlen (aarch64_tune_string) + 1);
7255 strcpy (str, aarch64_tune_string);
7257 /* Loop through the list of supported CPUs to find a match. */
7258 for (cpu = all_cores; cpu->name != NULL; cpu++)
7260 if (strcmp (cpu->name, str) == 0)
7262 selected_tune = cpu;
7263 return;
7267 /* CPU name not found in list. */
7268 error ("unknown value %qs for -mtune", str);
7269 return;
7272 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7273 described in FLAG. If it is, return the index bit for that fusion type.
7274 If not, error (printing OPTION_NAME) and return zero. */
7276 static unsigned int
7277 aarch64_parse_one_option_token (const char *token,
7278 size_t length,
7279 const struct aarch64_flag_desc *flag,
7280 const char *option_name)
7282 for (; flag->name != NULL; flag++)
7284 if (length == strlen (flag->name)
7285 && !strncmp (flag->name, token, length))
7286 return flag->flag;
7289 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7290 return 0;
7293 /* Parse OPTION which is a comma-separated list of flags to enable.
7294 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7295 default state we inherit from the CPU tuning structures. OPTION_NAME
7296 gives the top-level option we are parsing in the -moverride string,
7297 for use in error messages. */
7299 static unsigned int
7300 aarch64_parse_boolean_options (const char *option,
7301 const struct aarch64_flag_desc *flags,
7302 unsigned int initial_state,
7303 const char *option_name)
7305 const char separator = '.';
7306 const char* specs = option;
7307 const char* ntoken = option;
7308 unsigned int found_flags = initial_state;
7310 while ((ntoken = strchr (specs, separator)))
7312 size_t token_length = ntoken - specs;
7313 unsigned token_ops = aarch64_parse_one_option_token (specs,
7314 token_length,
7315 flags,
7316 option_name);
7317 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7318 in the token stream, reset the supported operations. So:
7320 adrp+add.cmp+branch.none.adrp+add
7322 would have the result of turning on only adrp+add fusion. */
7323 if (!token_ops)
7324 found_flags = 0;
7326 found_flags |= token_ops;
7327 specs = ++ntoken;
7330 /* The string ended with a trailing separator; diagnose the ill-formed string. */
7331 if (!(*specs))
7333 error ("%s string ill-formed\n", option_name);
7334 return 0;
7337 /* We still have one more token to parse. */
7338 size_t token_length = strlen (specs);
7339 unsigned token_ops = aarch64_parse_one_option_token (specs,
7340 token_length,
7341 flags,
7342 option_name);
7343 if (!token_ops)
7344 found_flags = 0;
7346 found_flags |= token_ops;
7347 return found_flags;
7350 /* Support for overriding instruction fusion. */
7352 static void
7353 aarch64_parse_fuse_string (const char *fuse_string,
7354 struct tune_params *tune)
7356 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7357 aarch64_fusible_pairs,
7358 tune->fusible_ops,
7359 "fuse=");
7362 /* Support for overriding other tuning flags. */
7364 static void
7365 aarch64_parse_tune_string (const char *tune_string,
7366 struct tune_params *tune)
7368 tune->extra_tuning_flags
7369 = aarch64_parse_boolean_options (tune_string,
7370 aarch64_tuning_flags,
7371 tune->extra_tuning_flags,
7372 "tune=");
7375 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
7376 we understand. If it is, extract the option string and hand off to
7377 the appropriate function. */
7379 void
7380 aarch64_parse_one_override_token (const char* token,
7381 size_t length,
7382 struct tune_params *tune)
7384 const struct aarch64_tuning_override_function *fn
7385 = aarch64_tuning_override_functions;
7387 const char *option_part = strchr (token, '=');
7388 if (!option_part)
7390 error ("tuning string missing in option (%s)", token);
7391 return;
7394 /* Get the length of the option name. */
7395 length = option_part - token;
7396 /* Skip the '=' to get to the option string. */
7397 option_part++;
7399 for (; fn->name != NULL; fn++)
7401 if (!strncmp (fn->name, token, length))
7403 fn->parse_override (option_part, tune);
7404 return;
7408 error ("unknown tuning option (%s)",token);
7409 return;
7412 /* Parse STRING looking for options in the format:
7413 string :: option:string
7414 option :: name=substring
7415 name :: {a-z}
7416 substring :: defined by option. */
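/* For example, -moverride=fuse=adrp+add.cmp+branch is split at ':' into
   a single "fuse=..." option, whose value is then handled by
   aarch64_parse_fuse_string above.  */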
7418 static void
7419 aarch64_parse_override_string (const char* input_string,
7420 struct tune_params* tune)
7422 const char separator = ':';
7423 size_t string_length = strlen (input_string) + 1;
7424 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
7425 char *string = string_root;
7426 strncpy (string, input_string, string_length);
7427 string[string_length - 1] = '\0';
7429 char* ntoken = string;
7431 while ((ntoken = strchr (string, separator)))
7433 size_t token_length = ntoken - string;
7434 /* Make this substring look like a string. */
7435 *ntoken = '\0';
7436 aarch64_parse_one_override_token (string, token_length, tune);
7437 string = ++ntoken;
7440 /* One last option to parse. */
7441 aarch64_parse_one_override_token (string, strlen (string), tune);
7442 free (string_root);
7445 /* Implement TARGET_OPTION_OVERRIDE. */
7447 static void
7448 aarch64_override_options (void)
7450 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7451 If either of -march or -mtune is given, they override their
7452 respective component of -mcpu.
7454 So, first parse AARCH64_CPU_STRING, then the others. Be careful
7455 with -march: if -mcpu is not present on the command line, -march
7456 must set a sensible default CPU. */
7457 if (aarch64_cpu_string)
7459 aarch64_parse_cpu ();
7462 if (aarch64_arch_string)
7464 aarch64_parse_arch ();
7467 if (aarch64_tune_string)
7469 aarch64_parse_tune ();
7472 #ifndef HAVE_AS_MABI_OPTION
7473 /* The compiler may have been configured with 2.23.* binutils, which does
7474 not have support for ILP32. */
7475 if (TARGET_ILP32)
7476 error ("Assembler does not support -mabi=ilp32");
7477 #endif
7479 initialize_aarch64_code_model ();
7481 aarch64_build_bitmask_table ();
7483 /* This target defaults to strict volatile bitfields. */
7484 if (flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7485 flag_strict_volatile_bitfields = 1;
7487 /* If the user did not specify a processor, choose the default
7488 one for them. This will be the CPU set during configuration using
7489 --with-cpu; otherwise it is "generic". */
7490 if (!selected_cpu)
7492 selected_cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7493 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7496 gcc_assert (selected_cpu);
7498 if (!selected_tune)
7499 selected_tune = selected_cpu;
7501 aarch64_tune_flags = selected_tune->flags;
7502 aarch64_tune = selected_tune->core;
7503 /* Make a copy of the tuning parameters attached to the core, which
7504 we may later overwrite. */
7505 aarch64_tune_params = *(selected_tune->tune);
7506 aarch64_architecture_version = selected_cpu->architecture_version;
7508 if (aarch64_override_tune_string)
7509 aarch64_parse_override_string (aarch64_override_tune_string,
7510 &aarch64_tune_params);
7512 if (aarch64_fix_a53_err835769 == 2)
7514 #ifdef TARGET_FIX_ERR_A53_835769_DEFAULT
7515 aarch64_fix_a53_err835769 = 1;
7516 #else
7517 aarch64_fix_a53_err835769 = 0;
7518 #endif
7521 aarch64_register_fma_steering ();
7523 aarch64_override_options_after_change ();
7526 /* Implement targetm.override_options_after_change. */
7528 static void
7529 aarch64_override_options_after_change (void)
7531 if (flag_omit_frame_pointer)
7532 flag_omit_leaf_frame_pointer = false;
7533 else if (flag_omit_leaf_frame_pointer)
7534 flag_omit_frame_pointer = true;
7536 /* If not optimizing for size, set the default
7537 alignment to what the target wants. */
7538 if (!optimize_size)
7540 if (align_loops <= 0)
7541 align_loops = aarch64_tune_params.loop_align;
7542 if (align_jumps <= 0)
7543 align_jumps = aarch64_tune_params.jump_align;
7544 if (align_functions <= 0)
7545 align_functions = aarch64_tune_params.function_align;
7549 static struct machine_function *
7550 aarch64_init_machine_status (void)
7552 struct machine_function *machine;
7553 machine = ggc_cleared_alloc<machine_function> ();
7554 return machine;
7557 void
7558 aarch64_init_expanders (void)
7560 init_machine_status = aarch64_init_machine_status;
7563 /* A checking mechanism for the implementation of the various code models. */
7564 static void
7565 initialize_aarch64_code_model (void)
7567 if (flag_pic)
7569 switch (aarch64_cmodel_var)
7571 case AARCH64_CMODEL_TINY:
7572 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7573 break;
7574 case AARCH64_CMODEL_SMALL:
7575 #ifdef HAVE_AS_SMALL_PIC_RELOCS
7576 aarch64_cmodel = (flag_pic == 2
7577 ? AARCH64_CMODEL_SMALL_PIC
7578 : AARCH64_CMODEL_SMALL_SPIC);
7579 #else
7580 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7581 #endif
7582 break;
7583 case AARCH64_CMODEL_LARGE:
7584 sorry ("code model %qs with -f%s", "large",
7585 flag_pic > 1 ? "PIC" : "pic");
7586 default:
7587 gcc_unreachable ();
7590 else
7591 aarch64_cmodel = aarch64_cmodel_var;
7594 /* Return true if SYMBOL_REF X binds locally. */
7596 static bool
7597 aarch64_symbol_binds_local_p (const_rtx x)
7599 return (SYMBOL_REF_DECL (x)
7600 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
7601 : SYMBOL_REF_LOCAL_P (x));
7604 /* Return true if SYMBOL_REF X is thread local */
7605 static bool
7606 aarch64_tls_symbol_p (rtx x)
7608 if (! TARGET_HAVE_TLS)
7609 return false;
7611 if (GET_CODE (x) != SYMBOL_REF)
7612 return false;
7614 return SYMBOL_REF_TLS_MODEL (x) != 0;
7617 /* Classify a TLS symbol into one of the TLS kinds. */
7618 enum aarch64_symbol_type
7619 aarch64_classify_tls_symbol (rtx x)
7621 enum tls_model tls_kind = tls_symbolic_operand_type (x);
7623 switch (tls_kind)
7625 case TLS_MODEL_GLOBAL_DYNAMIC:
7626 case TLS_MODEL_LOCAL_DYNAMIC:
7627 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
7629 case TLS_MODEL_INITIAL_EXEC:
7630 return SYMBOL_SMALL_GOTTPREL;
7632 case TLS_MODEL_LOCAL_EXEC:
7633 return SYMBOL_TLSLE;
7635 case TLS_MODEL_EMULATED:
7636 case TLS_MODEL_NONE:
7637 return SYMBOL_FORCE_TO_MEM;
7639 default:
7640 gcc_unreachable ();
7644 /* Return the method that should be used to access SYMBOL_REF or
7645 LABEL_REF X in context CONTEXT. */
7647 enum aarch64_symbol_type
7648 aarch64_classify_symbol (rtx x, rtx offset,
7649 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
7651 if (GET_CODE (x) == LABEL_REF)
7653 switch (aarch64_cmodel)
7655 case AARCH64_CMODEL_LARGE:
7656 return SYMBOL_FORCE_TO_MEM;
7658 case AARCH64_CMODEL_TINY_PIC:
7659 case AARCH64_CMODEL_TINY:
7660 return SYMBOL_TINY_ABSOLUTE;
7662 case AARCH64_CMODEL_SMALL_SPIC:
7663 case AARCH64_CMODEL_SMALL_PIC:
7664 case AARCH64_CMODEL_SMALL:
7665 return SYMBOL_SMALL_ABSOLUTE;
7667 default:
7668 gcc_unreachable ();
7672 if (GET_CODE (x) == SYMBOL_REF)
7674 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
7675 return SYMBOL_FORCE_TO_MEM;
7677 if (aarch64_tls_symbol_p (x))
7678 return aarch64_classify_tls_symbol (x);
7680 switch (aarch64_cmodel)
7682 case AARCH64_CMODEL_TINY:
7683 /* When we retrieve a symbol + offset address, we have to make sure
7684 the offset does not cause overflow of the final address. But
7685 we have no way of knowing the address of symbol at compile time
7686 so we can't accurately say if the distance between the PC and
7687 symbol + offset is outside the addressable range of +/-1M in the
7688 TINY code model. So we rely on images not being greater than
7689 1M and cap the offset at 1M and anything beyond 1M will have to
7690 be loaded using an alternative mechanism. */
7691 if (SYMBOL_REF_WEAK (x)
7692 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
7693 return SYMBOL_FORCE_TO_MEM;
7694 return SYMBOL_TINY_ABSOLUTE;
7696 case AARCH64_CMODEL_SMALL:
7697 /* Same reasoning as the tiny code model, but the offset cap here is
7698 4G. */
7699 if (SYMBOL_REF_WEAK (x)
7700 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
7701 HOST_WIDE_INT_C (4294967264)))
7702 return SYMBOL_FORCE_TO_MEM;
7703 return SYMBOL_SMALL_ABSOLUTE;
7705 case AARCH64_CMODEL_TINY_PIC:
7706 if (!aarch64_symbol_binds_local_p (x))
7707 return SYMBOL_TINY_GOT;
7708 return SYMBOL_TINY_ABSOLUTE;
7710 case AARCH64_CMODEL_SMALL_SPIC:
7711 case AARCH64_CMODEL_SMALL_PIC:
7712 if (!aarch64_symbol_binds_local_p (x))
7713 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
7714 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
7715 return SYMBOL_SMALL_ABSOLUTE;
7717 default:
7718 gcc_unreachable ();
7722 /* By default push everything into the constant pool. */
7723 return SYMBOL_FORCE_TO_MEM;
7726 bool
7727 aarch64_constant_address_p (rtx x)
7729 return (CONSTANT_P (x) && memory_address_p (DImode, x));
7732 bool
7733 aarch64_legitimate_pic_operand_p (rtx x)
7735 if (GET_CODE (x) == SYMBOL_REF
7736 || (GET_CODE (x) == CONST
7737 && GET_CODE (XEXP (x, 0)) == PLUS
7738 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
7739 return false;
7741 return true;
7744 /* Return true if X holds either a quarter-precision or
7745 floating-point +0.0 constant. */
7746 static bool
7747 aarch64_valid_floating_const (machine_mode mode, rtx x)
7749 if (!CONST_DOUBLE_P (x))
7750 return false;
7752 if (aarch64_float_const_zero_rtx_p (x))
7753 return true;
7755 /* We only handle moving 0.0 to a TFmode register. */
7756 if (!(mode == SFmode || mode == DFmode))
7757 return false;
7759 return aarch64_float_const_representable_p (x);
7762 static bool
7763 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
7765 /* Do not allow vector struct mode constants. We could support
7766 0 and -1 easily, but they need support in aarch64-simd.md. */
7767 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
7768 return false;
7770 /* This could probably go away because
7771 we now decompose CONST_INTs according to expand_mov_immediate. */
7772 if ((GET_CODE (x) == CONST_VECTOR
7773 && aarch64_simd_valid_immediate (x, mode, false, NULL))
7774 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
7775 return !targetm.cannot_force_const_mem (mode, x);
7777 if (GET_CODE (x) == HIGH
7778 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
7779 return true;
7781 return aarch64_constant_address_p (x);
7785 aarch64_load_tp (rtx target)
7787 if (!target
7788 || GET_MODE (target) != Pmode
7789 || !register_operand (target, Pmode))
7790 target = gen_reg_rtx (Pmode);
7792 /* Can return in any reg. */
7793 emit_insn (gen_aarch64_load_tp_hard (target));
7794 return target;
7797 /* On AAPCS systems, this is the "struct __va_list". */
7798 static GTY(()) tree va_list_type;
7800 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
7801 Return the type to use as __builtin_va_list.
7803 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
7805 struct __va_list
7807 void *__stack;
7808 void *__gr_top;
7809 void *__vr_top;
7810 int __gr_offs;
7811 int __vr_offs;
7812 }; */
7814 static tree
7815 aarch64_build_builtin_va_list (void)
7817 tree va_list_name;
7818 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7820 /* Create the type. */
7821 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
7822 /* Give it the required name. */
7823 va_list_name = build_decl (BUILTINS_LOCATION,
7824 TYPE_DECL,
7825 get_identifier ("__va_list"),
7826 va_list_type);
7827 DECL_ARTIFICIAL (va_list_name) = 1;
7828 TYPE_NAME (va_list_type) = va_list_name;
7829 TYPE_STUB_DECL (va_list_type) = va_list_name;
7831 /* Create the fields. */
7832 f_stack = build_decl (BUILTINS_LOCATION,
7833 FIELD_DECL, get_identifier ("__stack"),
7834 ptr_type_node);
7835 f_grtop = build_decl (BUILTINS_LOCATION,
7836 FIELD_DECL, get_identifier ("__gr_top"),
7837 ptr_type_node);
7838 f_vrtop = build_decl (BUILTINS_LOCATION,
7839 FIELD_DECL, get_identifier ("__vr_top"),
7840 ptr_type_node);
7841 f_groff = build_decl (BUILTINS_LOCATION,
7842 FIELD_DECL, get_identifier ("__gr_offs"),
7843 integer_type_node);
7844 f_vroff = build_decl (BUILTINS_LOCATION,
7845 FIELD_DECL, get_identifier ("__vr_offs"),
7846 integer_type_node);
7848 DECL_ARTIFICIAL (f_stack) = 1;
7849 DECL_ARTIFICIAL (f_grtop) = 1;
7850 DECL_ARTIFICIAL (f_vrtop) = 1;
7851 DECL_ARTIFICIAL (f_groff) = 1;
7852 DECL_ARTIFICIAL (f_vroff) = 1;
7854 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
7855 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
7856 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
7857 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
7858 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
7860 TYPE_FIELDS (va_list_type) = f_stack;
7861 DECL_CHAIN (f_stack) = f_grtop;
7862 DECL_CHAIN (f_grtop) = f_vrtop;
7863 DECL_CHAIN (f_vrtop) = f_groff;
7864 DECL_CHAIN (f_groff) = f_vroff;
7866 /* Compute its layout. */
7867 layout_type (va_list_type);
7869 return va_list_type;
7872 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
7873 static void
7874 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
7876 const CUMULATIVE_ARGS *cum;
7877 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7878 tree stack, grtop, vrtop, groff, vroff;
7879 tree t;
7880 int gr_save_area_size;
7881 int vr_save_area_size;
7882 int vr_offset;
7884 cum = &crtl->args.info;
7885 gr_save_area_size
7886 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
7887 vr_save_area_size
7888 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
7890 if (!TARGET_FLOAT)
7892 gcc_assert (cum->aapcs_nvrn == 0);
7893 vr_save_area_size = 0;
7896 f_stack = TYPE_FIELDS (va_list_type_node);
7897 f_grtop = DECL_CHAIN (f_stack);
7898 f_vrtop = DECL_CHAIN (f_grtop);
7899 f_groff = DECL_CHAIN (f_vrtop);
7900 f_vroff = DECL_CHAIN (f_groff);
7902 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
7903 NULL_TREE);
7904 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
7905 NULL_TREE);
7906 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
7907 NULL_TREE);
7908 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
7909 NULL_TREE);
7910 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
7911 NULL_TREE);
7913 /* Emit code to initialize STACK, which points to the next varargs stack
7914 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
7915 by named arguments. STACK is 8-byte aligned. */
7916 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
7917 if (cum->aapcs_stack_size > 0)
7918 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
7919 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
7920 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7922 /* Emit code to initialize GRTOP, the top of the GR save area.
7923 virtual_incoming_args_rtx should have been 16 byte aligned. */
7924 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
7925 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
7926 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7928 /* Emit code to initialize VRTOP, the top of the VR save area.
7929 This address is gr_save_area_bytes below GRTOP, rounded
7930 down to the next 16-byte boundary. */
7931 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
7932 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
7933 STACK_BOUNDARY / BITS_PER_UNIT);
7935 if (vr_offset)
7936 t = fold_build_pointer_plus_hwi (t, -vr_offset);
7937 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
7938 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7940 /* Emit code to initialize GROFF, the offset from GRTOP of the
7941 next GPR argument. */
7942 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
7943 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
7944 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7946 /* Likewise emit code to initialize VROFF, the offset from VRTOP
7947 of the next VR argument. */
7948 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
7949 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
7950 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
7953 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
7955 static tree
7956 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
7957 gimple_seq *post_p ATTRIBUTE_UNUSED)
7959 tree addr;
7960 bool indirect_p;
7961 bool is_ha; /* is HFA or HVA. */
7962 bool dw_align; /* double-word align. */
7963 machine_mode ag_mode = VOIDmode;
7964 int nregs;
7965 machine_mode mode;
7967 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
7968 tree stack, f_top, f_off, off, arg, roundup, on_stack;
7969 HOST_WIDE_INT size, rsize, adjust, align;
7970 tree t, u, cond1, cond2;
7972 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
7973 if (indirect_p)
7974 type = build_pointer_type (type);
7976 mode = TYPE_MODE (type);
7978 f_stack = TYPE_FIELDS (va_list_type_node);
7979 f_grtop = DECL_CHAIN (f_stack);
7980 f_vrtop = DECL_CHAIN (f_grtop);
7981 f_groff = DECL_CHAIN (f_vrtop);
7982 f_vroff = DECL_CHAIN (f_groff);
7984 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
7985 f_stack, NULL_TREE);
7986 size = int_size_in_bytes (type);
7987 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
7989 dw_align = false;
7990 adjust = 0;
7991 if (aarch64_vfp_is_call_or_return_candidate (mode,
7992 type,
7993 &ag_mode,
7994 &nregs,
7995 &is_ha))
7997 /* TYPE passed in fp/simd registers. */
7998 if (!TARGET_FLOAT)
7999 aarch64_err_no_fpadvsimd (mode, "varargs");
8001 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
8002 unshare_expr (valist), f_vrtop, NULL_TREE);
8003 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
8004 unshare_expr (valist), f_vroff, NULL_TREE);
8006 rsize = nregs * UNITS_PER_VREG;
8008 if (is_ha)
8010 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
8011 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
8013 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
8014 && size < UNITS_PER_VREG)
8016 adjust = UNITS_PER_VREG - size;
8019 else
8021 /* TYPE passed in general registers. */
8022 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
8023 unshare_expr (valist), f_grtop, NULL_TREE);
8024 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
8025 unshare_expr (valist), f_groff, NULL_TREE);
8026 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
8027 nregs = rsize / UNITS_PER_WORD;
8029 if (align > 8)
8030 dw_align = true;
8032 if (BLOCK_REG_PADDING (mode, type, 1) == downward
8033 && size < UNITS_PER_WORD)
8035 adjust = UNITS_PER_WORD - size;
8039 /* Get a local temporary for the field value. */
8040 off = get_initialized_tmp_var (f_off, pre_p, NULL);
8042 /* Emit code to branch if off >= 0. */
8043 t = build2 (GE_EXPR, boolean_type_node, off,
8044 build_int_cst (TREE_TYPE (off), 0));
8045 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
8047 if (dw_align)
8049 /* Emit: offs = (offs + 15) & -16. */
8050 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
8051 build_int_cst (TREE_TYPE (off), 15));
8052 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
8053 build_int_cst (TREE_TYPE (off), -16));
8054 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
8056 else
8057 roundup = NULL;
8059 /* Update ap.__[g|v]r_offs */
8060 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
8061 build_int_cst (TREE_TYPE (off), rsize));
8062 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
8064 /* String up. */
8065 if (roundup)
8066 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
8068 /* [cond2] if (ap.__[g|v]r_offs > 0) */
8069 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
8070 build_int_cst (TREE_TYPE (f_off), 0));
8071 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
8073 /* String up: make sure the assignment happens before the use. */
8074 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
8075 COND_EXPR_ELSE (cond1) = t;
8077 /* Prepare the trees handling the argument that is passed on the stack;
8078 the top level node will store in ON_STACK. */
8079 arg = get_initialized_tmp_var (stack, pre_p, NULL);
8080 if (align > 8)
8082 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
8083 t = fold_convert (intDI_type_node, arg);
8084 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
8085 build_int_cst (TREE_TYPE (t), 15));
8086 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8087 build_int_cst (TREE_TYPE (t), -16));
8088 t = fold_convert (TREE_TYPE (arg), t);
8089 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
8091 else
8092 roundup = NULL;
8093 /* Advance ap.__stack */
8094 t = fold_convert (intDI_type_node, arg);
8095 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
8096 build_int_cst (TREE_TYPE (t), size + 7));
8097 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
8098 build_int_cst (TREE_TYPE (t), -8));
8099 t = fold_convert (TREE_TYPE (arg), t);
8100 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
8101 /* String up roundup and advance. */
8102 if (roundup)
8103 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
8104 /* String up with arg */
8105 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
8106 /* Big-endianness related address adjustment. */
8107 if (BLOCK_REG_PADDING (mode, type, 1) == downward
8108 && size < UNITS_PER_WORD)
8110 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
8111 size_int (UNITS_PER_WORD - size));
8112 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
8115 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
8116 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
8118 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
8119 t = off;
8120 if (adjust)
8121 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
8122 build_int_cst (TREE_TYPE (off), adjust));
8124 t = fold_convert (sizetype, t);
8125 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
8127 if (is_ha)
8129 /* type ha; // treat as "struct {ftype field[n];}"
8130 ... [computing offs]
8131 for (i = 0; i < nregs; ++i, offs += 16)
8132 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
8133 return ha; */
8134 int i;
8135 tree tmp_ha, field_t, field_ptr_t;
8137 /* Declare a local variable. */
8138 tmp_ha = create_tmp_var_raw (type, "ha");
8139 gimple_add_tmp_var (tmp_ha);
8141 /* Establish the base type. */
8142 switch (ag_mode)
8144 case SFmode:
8145 field_t = float_type_node;
8146 field_ptr_t = float_ptr_type_node;
8147 break;
8148 case DFmode:
8149 field_t = double_type_node;
8150 field_ptr_t = double_ptr_type_node;
8151 break;
8152 case TFmode:
8153 field_t = long_double_type_node;
8154 field_ptr_t = long_double_ptr_type_node;
8155 break;
8156 /* Half precision and quad precision are not fully supported yet.
8157 Enable the following code once support is complete; the correct
8158 type node for __fp16 * still needs to be found. */
8159 #if 0
8160 case HFmode:
8161 field_t = float_type_node;
8162 field_ptr_t = float_ptr_type_node;
8163 break;
8164 #endif
8165 case V2SImode:
8166 case V4SImode:
8168 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
8169 field_t = build_vector_type_for_mode (innertype, ag_mode);
8170 field_ptr_t = build_pointer_type (field_t);
8172 break;
8173 default:
8174 gcc_assert (0);
8177 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
8178 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
8179 addr = t;
8180 t = fold_convert (field_ptr_t, addr);
8181 t = build2 (MODIFY_EXPR, field_t,
8182 build1 (INDIRECT_REF, field_t, tmp_ha),
8183 build1 (INDIRECT_REF, field_t, t));
8185 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
8186 for (i = 1; i < nregs; ++i)
8188 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
8189 u = fold_convert (field_ptr_t, addr);
8190 u = build2 (MODIFY_EXPR, field_t,
8191 build2 (MEM_REF, field_t, tmp_ha,
8192 build_int_cst (field_ptr_t,
8193 (i *
8194 int_size_in_bytes (field_t)))),
8195 build1 (INDIRECT_REF, field_t, u));
8196 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
8199 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
8200 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
8203 COND_EXPR_ELSE (cond2) = t;
8204 addr = fold_convert (build_pointer_type (type), cond1);
8205 addr = build_va_arg_indirect_ref (addr);
8207 if (indirect_p)
8208 addr = build_va_arg_indirect_ref (addr);
8210 return addr;
8213 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
8215 static void
8216 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
8217 tree type, int *pretend_size ATTRIBUTE_UNUSED,
8218 int no_rtl)
8220 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
8221 CUMULATIVE_ARGS local_cum;
8222 int gr_saved, vr_saved;
8224 /* The caller has advanced CUM up to, but not beyond, the last named
8225 argument. Advance a local copy of CUM past the last "real" named
8226 argument, to find out how many registers are left over. */
8227 local_cum = *cum;
8228 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
8230 /* Find out how many registers we need to save. */
8231 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
8232 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
8234 if (!TARGET_FLOAT)
8236 gcc_assert (local_cum.aapcs_nvrn == 0);
8237 vr_saved = 0;
8240 if (!no_rtl)
8242 if (gr_saved > 0)
8244 rtx ptr, mem;
8246 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
8247 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
8248 - gr_saved * UNITS_PER_WORD);
8249 mem = gen_frame_mem (BLKmode, ptr);
8250 set_mem_alias_set (mem, get_varargs_alias_set ());
8252 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
8253 mem, gr_saved);
8255 if (vr_saved > 0)
8257 /* We can't use move_block_from_reg, because it will use
8258 the wrong mode, storing D regs only. */
8259 machine_mode mode = TImode;
8260 int off, i;
8262 /* Set OFF to the offset from virtual_incoming_args_rtx of
8263 the first vector register. The VR save area lies below
8264 the GR one, and is aligned to 16 bytes. */
8265 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
8266 STACK_BOUNDARY / BITS_PER_UNIT);
8267 off -= vr_saved * UNITS_PER_VREG;
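/* For example, with gr_saved == 3 and vr_saved == 2:
   3 * 8 bytes rounds up to 32, so OFF becomes -32 - 2 * 16 = -64.  */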
8269 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
8271 rtx ptr, mem;
8273 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
8274 mem = gen_frame_mem (mode, ptr);
8275 set_mem_alias_set (mem, get_varargs_alias_set ());
8276 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
8277 off += UNITS_PER_VREG;
8282 /* We don't save the size into *PRETEND_SIZE because we want to avoid
8283 any complication of having crtl->args.pretend_args_size changed. */
8284 cfun->machine->frame.saved_varargs_size
8285 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
8286 STACK_BOUNDARY / BITS_PER_UNIT)
8287 + vr_saved * UNITS_PER_VREG);
8290 static void
8291 aarch64_conditional_register_usage (void)
8293 int i;
8294 if (!TARGET_FLOAT)
8296 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
8298 fixed_regs[i] = 1;
8299 call_used_regs[i] = 1;
8304 /* Walk down the type tree of TYPE counting consecutive base elements.
8305 If *MODEP is VOIDmode, then set it to the first valid floating point
8306 type. If a non-floating point type is found, or if a floating point
8307 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
8308 otherwise return the count in the sub-tree. */
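/* For example, for
   struct { double x; double y; };
   each field contributes one DFmode element, so this returns 2 with
   *MODEP set to DFmode, i.e. a homogeneous floating-point aggregate of
   two doubles.  */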
8309 static int
8310 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
8312 machine_mode mode;
8313 HOST_WIDE_INT size;
8315 switch (TREE_CODE (type))
8317 case REAL_TYPE:
8318 mode = TYPE_MODE (type);
8319 if (mode != DFmode && mode != SFmode && mode != TFmode)
8320 return -1;
8322 if (*modep == VOIDmode)
8323 *modep = mode;
8325 if (*modep == mode)
8326 return 1;
8328 break;
8330 case COMPLEX_TYPE:
8331 mode = TYPE_MODE (TREE_TYPE (type));
8332 if (mode != DFmode && mode != SFmode && mode != TFmode)
8333 return -1;
8335 if (*modep == VOIDmode)
8336 *modep = mode;
8338 if (*modep == mode)
8339 return 2;
8341 break;
8343 case VECTOR_TYPE:
8344 /* Use V2SImode and V4SImode as representatives of all 64-bit
8345 and 128-bit vector types. */
8346 size = int_size_in_bytes (type);
8347 switch (size)
8349 case 8:
8350 mode = V2SImode;
8351 break;
8352 case 16:
8353 mode = V4SImode;
8354 break;
8355 default:
8356 return -1;
8359 if (*modep == VOIDmode)
8360 *modep = mode;
8362 /* Vector modes are considered to be opaque: two vectors are
8363 equivalent for the purposes of being homogeneous aggregates
8364 if they are the same size. */
8365 if (*modep == mode)
8366 return 1;
8368 break;
8370 case ARRAY_TYPE:
8372 int count;
8373 tree index = TYPE_DOMAIN (type);
8375 /* Can't handle incomplete types nor sizes that are not
8376 fixed. */
8377 if (!COMPLETE_TYPE_P (type)
8378 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8379 return -1;
8381 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
8382 if (count == -1
8383 || !index
8384 || !TYPE_MAX_VALUE (index)
8385 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
8386 || !TYPE_MIN_VALUE (index)
8387 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
8388 || count < 0)
8389 return -1;
8391 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
8392 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
8394 /* There must be no padding. */
8395 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8396 return -1;
8398 return count;
8401 case RECORD_TYPE:
8403 int count = 0;
8404 int sub_count;
8405 tree field;
8407 /* Can't handle incomplete types nor sizes that are not
8408 fixed. */
8409 if (!COMPLETE_TYPE_P (type)
8410 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8411 return -1;
8413 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8415 if (TREE_CODE (field) != FIELD_DECL)
8416 continue;
8418 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8419 if (sub_count < 0)
8420 return -1;
8421 count += sub_count;
8424 /* There must be no padding. */
8425 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8426 return -1;
8428 return count;
8431 case UNION_TYPE:
8432 case QUAL_UNION_TYPE:
8434 /* These aren't very interesting except in a degenerate case. */
8435 int count = 0;
8436 int sub_count;
8437 tree field;
8439 /* Can't handle incomplete types nor sizes that are not
8440 fixed. */
8441 if (!COMPLETE_TYPE_P (type)
8442 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
8443 return -1;
8445 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
8447 if (TREE_CODE (field) != FIELD_DECL)
8448 continue;
8450 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
8451 if (sub_count < 0)
8452 return -1;
8453 count = count > sub_count ? count : sub_count;
8456 /* There must be no padding. */
8457 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
8458 return -1;
8460 return count;
8463 default:
8464 break;
8467 return -1;
8470 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
8471 type as described in AAPCS64 \S 4.1.2.
8473 See the comment above aarch64_composite_type_p for the notes on MODE. */
8475 static bool
8476 aarch64_short_vector_p (const_tree type,
8477 machine_mode mode)
8479 HOST_WIDE_INT size = -1;
8481 if (type && TREE_CODE (type) == VECTOR_TYPE)
8482 size = int_size_in_bytes (type);
8483 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
8484 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8485 size = GET_MODE_SIZE (mode);
8487 return (size == 8 || size == 16);
8490 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
8491 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
8492 array types. The C99 floating-point complex types are also considered
8493 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
8494 types, which are GCC extensions and out of the scope of AAPCS64, are
8495 treated as composite types here as well.
8497 Note that MODE itself is not sufficient in determining whether a type
8498 is such a composite type or not. This is because
8499 stor-layout.c:compute_record_mode may have already changed the MODE
8500 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
8501 structure with only one field may have its MODE set to the mode of the
8502 field. Also an integer mode whose size matches the size of the
8503 RECORD_TYPE type may be used to substitute the original mode
8504 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
8505 solely relied on. */
8507 static bool
8508 aarch64_composite_type_p (const_tree type,
8509 machine_mode mode)
8511 if (aarch64_short_vector_p (type, mode))
8512 return false;
8514 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
8515 return true;
8517 if (mode == BLKmode
8518 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
8519 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
8520 return true;
8522 return false;
8525 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
8526 shall be passed or returned in simd/fp register(s) (providing these
8527 parameter passing registers are available).
8529 Upon successful return, *COUNT returns the number of needed registers,
8530 *BASE_MODE returns the mode of the individual register and when IS_HA
8531 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
8532 floating-point aggregate or a homogeneous short-vector aggregate. */
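/* For example, under the AAPCS64 a type such as

     struct { float x, y, z; };

   is a homogeneous floating-point aggregate: *BASE_MODE is set to
   SFmode, *COUNT to 3 and *IS_HA to true.  A structure mixing float
   and integer members is not, and is passed under the ordinary
   integer/stack rules instead.  */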
8534 static bool
8535 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
8536 const_tree type,
8537 machine_mode *base_mode,
8538 int *count,
8539 bool *is_ha)
8541 machine_mode new_mode = VOIDmode;
8542 bool composite_p = aarch64_composite_type_p (type, mode);
8544 if (is_ha != NULL) *is_ha = false;
8546 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
8547 || aarch64_short_vector_p (type, mode))
8549 *count = 1;
8550 new_mode = mode;
8552 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
8554 if (is_ha != NULL) *is_ha = true;
8555 *count = 2;
8556 new_mode = GET_MODE_INNER (mode);
8558 else if (type && composite_p)
8560 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
8562 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
8564 if (is_ha != NULL) *is_ha = true;
8565 *count = ag_count;
8567 else
8568 return false;
8570 else
8571 return false;
8573 *base_mode = new_mode;
8574 return true;
8577 /* Implement TARGET_STRUCT_VALUE_RTX. */
8579 static rtx
8580 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
8581 int incoming ATTRIBUTE_UNUSED)
8583 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
8586 /* Implements target hook vector_mode_supported_p. */
8587 static bool
8588 aarch64_vector_mode_supported_p (machine_mode mode)
8590 if (TARGET_SIMD
8591 && (mode == V4SImode || mode == V8HImode
8592 || mode == V16QImode || mode == V2DImode
8593 || mode == V2SImode || mode == V4HImode
8594 || mode == V8QImode || mode == V2SFmode
8595 || mode == V4SFmode || mode == V2DFmode
8596 || mode == V1DFmode))
8597 return true;
8599 return false;
8602 /* Return appropriate SIMD container
8603 for MODE within a vector of WIDTH bits. */
8604 static machine_mode
8605 aarch64_simd_container_mode (machine_mode mode, unsigned width)
8607 gcc_assert (width == 64 || width == 128);
8608 if (TARGET_SIMD)
8610 if (width == 128)
8611 switch (mode)
8613 case DFmode:
8614 return V2DFmode;
8615 case SFmode:
8616 return V4SFmode;
8617 case SImode:
8618 return V4SImode;
8619 case HImode:
8620 return V8HImode;
8621 case QImode:
8622 return V16QImode;
8623 case DImode:
8624 return V2DImode;
8625 default:
8626 break;
8628 else
8629 switch (mode)
8631 case SFmode:
8632 return V2SFmode;
8633 case SImode:
8634 return V2SImode;
8635 case HImode:
8636 return V4HImode;
8637 case QImode:
8638 return V8QImode;
8639 default:
8640 break;
8643 return word_mode;
8646 /* Return 128-bit container as the preferred SIMD mode for MODE. */
8647 static machine_mode
8648 aarch64_preferred_simd_mode (machine_mode mode)
8650 return aarch64_simd_container_mode (mode, 128);
8653 /* Return the bitmask of possible vector sizes for the vectorizer
8654 to iterate over. */
8655 static unsigned int
8656 aarch64_autovectorize_vector_sizes (void)
8658 return (16 | 8);
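/* That is, the returned mask 16 | 8 == 24 tells the vectorizer that
   both 128-bit and 64-bit Advanced SIMD vectors may be tried.  */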
8661 /* Implement TARGET_MANGLE_TYPE. */
8663 static const char *
8664 aarch64_mangle_type (const_tree type)
8666 /* The AArch64 ABI documents say that "__va_list" has to be
8667 mangled as if it is in the "std" namespace. */
8668 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
8669 return "St9__va_list";
8671 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
8672 builtin types. */
8673 if (TYPE_NAME (type) != NULL)
8674 return aarch64_mangle_builtin_type (type);
8676 /* Use the default mangling. */
8677 return NULL;
8681 /* Return true if the rtx_insn contains a MEM RTX somewhere
8682 in it. */
8684 static bool
8685 has_memory_op (rtx_insn *mem_insn)
8687 subrtx_iterator::array_type array;
8688 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
8689 if (MEM_P (*iter))
8690 return true;
8692 return false;
8695 /* Find the first rtx_insn before insn that will generate an assembly
8696 instruction. */
8698 static rtx_insn *
8699 aarch64_prev_real_insn (rtx_insn *insn)
8701 if (!insn)
8702 return NULL;
8706 insn = prev_real_insn (insn);
8708 while (insn && recog_memoized (insn) < 0);
8710 return insn;
8713 static bool
8714 is_madd_op (enum attr_type t1)
8716 unsigned int i;
8717 /* A number of these may be AArch32 only. */
8718 enum attr_type mlatypes[] = {
8719 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
8720 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
8720 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
8724 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
8726 if (t1 == mlatypes[i])
8727 return true;
8730 return false;
8733 /* Check if there is a register dependency between a load and the insn
8734 for which we hold recog_data. */
8736 static bool
8737 dep_between_memop_and_curr (rtx memop)
8739 rtx load_reg;
8740 int opno;
8742 gcc_assert (GET_CODE (memop) == SET);
8744 if (!REG_P (SET_DEST (memop)))
8745 return false;
8747 load_reg = SET_DEST (memop);
8748 for (opno = 1; opno < recog_data.n_operands; opno++)
8750 rtx operand = recog_data.operand[opno];
8751 if (REG_P (operand)
8752 && reg_overlap_mentioned_p (load_reg, operand))
8753 return true;
8756 return false;
8760 /* When working around the Cortex-A53 erratum 835769,
8761 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
8762 instruction and has a preceding memory instruction such that a NOP
8763 should be inserted between them. */
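/* For example, in a sequence such as

     ldr  x1, [x2]
     madd x0, x3, x4, x5

   where the multiply-accumulate does not depend on the loaded value,
   the workaround below emits

     nop     // between mem op and mult-accumulate

   between the two instructions (register choices are illustrative).  */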
8765 bool
8766 aarch64_madd_needs_nop (rtx_insn* insn)
8768 enum attr_type attr_type;
8769 rtx_insn *prev;
8770 rtx body;
8772 if (!aarch64_fix_a53_err835769)
8773 return false;
8775 if (recog_memoized (insn) < 0)
8776 return false;
8778 attr_type = get_attr_type (insn);
8779 if (!is_madd_op (attr_type))
8780 return false;
8782 prev = aarch64_prev_real_insn (insn);
8783 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
8784 Restore recog state to INSN to avoid state corruption. */
8785 extract_constrain_insn_cached (insn);
8787 if (!prev || !has_memory_op (prev))
8788 return false;
8790 body = single_set (prev);
8792 /* If the previous insn is a memory op and there is no dependency between
8793 it and the DImode madd, emit a NOP between them. If body is NULL then we
8794 have a complex memory operation, probably a load/store pair.
8795 Be conservative for now and emit a NOP. */
8796 if (GET_MODE (recog_data.operand[0]) == DImode
8797 && (!body || !dep_between_memop_and_curr (body)))
8798 return true;
8800 return false;
8805 /* Implement FINAL_PRESCAN_INSN. */
8807 void
8808 aarch64_final_prescan_insn (rtx_insn *insn)
8810 if (aarch64_madd_needs_nop (insn))
8811 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
8815 /* Return the equivalent letter for size. */
8816 static char
8817 sizetochar (int size)
8819 switch (size)
8821 case 64: return 'd';
8822 case 32: return 's';
8823 case 16: return 'h';
8824 case 8 : return 'b';
8825 default: gcc_unreachable ();
8829 /* Return true iff x is a uniform vector of floating-point
8830 constants, and the constant can be represented in
8831 quarter-precision form. Note, as aarch64_float_const_representable_p
8832 rejects both +0.0 and -0.0, this function also rejects +0.0 and -0.0. */
8833 static bool
8834 aarch64_vect_float_const_representable_p (rtx x)
8836 int i = 0;
8837 REAL_VALUE_TYPE r0, ri;
8838 rtx x0, xi;
8840 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
8841 return false;
8843 x0 = CONST_VECTOR_ELT (x, 0);
8844 if (!CONST_DOUBLE_P (x0))
8845 return false;
8847 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
8849 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
8851 xi = CONST_VECTOR_ELT (x, i);
8852 if (!CONST_DOUBLE_P (xi))
8853 return false;
8855 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
8856 if (!REAL_VALUES_EQUAL (r0, ri))
8857 return false;
8860 return aarch64_float_const_representable_p (x0);
8863 /* Return true for valid and false for invalid. */
8864 bool
8865 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
8866 struct simd_immediate_info *info)
8868 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
8869 matches = 1; \
8870 for (i = 0; i < idx; i += (STRIDE)) \
8871 if (!(TEST)) \
8872 matches = 0; \
8873 if (matches) \
8875 immtype = (CLASS); \
8876 elsize = (ELSIZE); \
8877 eshift = (SHIFT); \
8878 emvn = (NEG); \
8879 break; \
8882 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
8883 unsigned int innersize = GET_MODE_SIZE (GET_MODE_INNER (mode));
8884 unsigned char bytes[16];
8885 int immtype = -1, matches;
8886 unsigned int invmask = inverse ? 0xff : 0;
8887 int eshift, emvn;
8889 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
8891 if (! (aarch64_simd_imm_zero_p (op, mode)
8892 || aarch64_vect_float_const_representable_p (op)))
8893 return false;
8895 if (info)
8897 info->value = CONST_VECTOR_ELT (op, 0);
8898 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
8899 info->mvn = false;
8900 info->shift = 0;
8903 return true;
8906 /* Splat vector constant out into a byte vector. */
8907 for (i = 0; i < n_elts; i++)
8909 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
8910 it must be laid out in the vector register in reverse order. */
8911 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
8912 unsigned HOST_WIDE_INT elpart;
8913 unsigned int part, parts;
8915 if (CONST_INT_P (el))
8917 elpart = INTVAL (el);
8918 parts = 1;
8920 else if (GET_CODE (el) == CONST_DOUBLE)
8922 elpart = CONST_DOUBLE_LOW (el);
8923 parts = 2;
8925 else
8926 gcc_unreachable ();
8928 for (part = 0; part < parts; part++)
8930 unsigned int byte;
8931 for (byte = 0; byte < innersize; byte++)
8933 bytes[idx++] = (elpart & 0xff) ^ invmask;
8934 elpart >>= BITS_PER_UNIT;
8936 if (GET_CODE (el) == CONST_DOUBLE)
8937 elpart = CONST_DOUBLE_HIGH (el);
8941 /* Sanity check. */
8942 gcc_assert (idx == GET_MODE_SIZE (mode));
8946 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
8947 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
8949 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8950 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8952 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
8953 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8955 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
8956 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
8958 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
8960 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
8962 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
8963 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
8965 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8966 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8968 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
8969 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8971 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
8972 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
8974 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
8976 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
8978 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
8979 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
8981 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
8982 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
8984 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
8985 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
8987 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
8988 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
8990 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
8992 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
8993 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
8995 while (0);
8997 if (immtype == -1)
8998 return false;
9000 if (info)
9002 info->element_width = elsize;
9003 info->mvn = emvn != 0;
9004 info->shift = eshift;
9006 unsigned HOST_WIDE_INT imm = 0;
9008 if (immtype >= 12 && immtype <= 15)
9009 info->msl = true;
9011 /* Un-invert bytes of recognized vector, if necessary. */
9012 if (invmask != 0)
9013 for (i = 0; i < idx; i++)
9014 bytes[i] ^= invmask;
9016 if (immtype == 17)
9018 /* FIXME: Broken on 32-bit H_W_I hosts. */
9019 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
9021 for (i = 0; i < 8; i++)
9022 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
9023 << (i * BITS_PER_UNIT);
9026 info->value = GEN_INT (imm);
9028 else
9030 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
9031 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
9033 /* Construct 'abcdefgh' because the assembler cannot handle
9034 generic constants. */
9035 if (info->mvn)
9036 imm = ~imm;
9037 imm = (imm >> info->shift) & 0xff;
9038 info->value = GEN_INT (imm);
9042 return true;
9043 #undef CHECK
9046 /* Check if immediate shift constants are within range. */
9047 bool
9048 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
9050 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
9051 if (left)
9052 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
9053 else
9054 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
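/* For a V4SImode operand, for instance, the element width is 32 bits,
   so left-shift immediates 0..31 and right-shift immediates 1..32 are
   accepted.  */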
9057 /* Return true if X is a uniform vector where all elements
9058 are either the floating-point constant 0.0 or the
9059 integer constant 0. */
9060 bool
9061 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
9063 return x == CONST0_RTX (mode);
9066 bool
9067 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
9069 HOST_WIDE_INT imm = INTVAL (x);
9070 int i;
9072 for (i = 0; i < 8; i++)
9074 unsigned int byte = imm & 0xff;
9075 if (byte != 0xff && byte != 0)
9076 return false;
9077 imm >>= 8;
9080 return true;
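/* For example, 0x00ff00ff00ff00ff and 0xffffffff00000000 are accepted
   (every byte is either 0x00 or 0xff), while 0x0000000000000001 is
   rejected.  */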
9083 bool
9084 aarch64_mov_operand_p (rtx x,
9085 enum aarch64_symbol_context context,
9086 machine_mode mode)
9088 if (GET_CODE (x) == HIGH
9089 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9090 return true;
9092 if (CONST_INT_P (x))
9093 return true;
9095 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
9096 return true;
9098 return aarch64_classify_symbolic_expression (x, context)
9099 == SYMBOL_TINY_ABSOLUTE;
9102 /* Return a const_int vector of VAL. */
9104 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
9106 int nunits = GET_MODE_NUNITS (mode);
9107 rtvec v = rtvec_alloc (nunits);
9108 int i;
9110 for (i = 0; i < nunits; i++)
9111 RTVEC_ELT (v, i) = GEN_INT (val);
9113 return gen_rtx_CONST_VECTOR (mode, v);
9116 /* Check OP is a legal scalar immediate for the MOVI instruction. */
9118 bool
9119 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
9121 machine_mode vmode;
9123 gcc_assert (!VECTOR_MODE_P (mode));
9124 vmode = aarch64_preferred_simd_mode (mode);
9125 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
9126 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
9129 /* Construct and return a PARALLEL RTX vector with elements numbering the
9130 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
9131 the vector - from the perspective of the architecture. This does not
9132 line up with GCC's perspective on lane numbers, so we end up with
9133 different masks depending on our target endian-ness. The diagram
9134 below may help. We must draw the distinction when building masks
9135 which select one half of the vector. An instruction selecting
9136 architectural low-lanes for a big-endian target must be described using
9137 a mask selecting GCC high-lanes.
9139 Big-Endian Little-Endian
9141 GCC 0 1 2 3 3 2 1 0
9142 | x | x | x | x | | x | x | x | x |
9143 Architecture 3 2 1 0 3 2 1 0
9145 Low Mask: { 2, 3 } { 0, 1 }
9146 High Mask: { 0, 1 } { 2, 3 }
9150 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
9152 int nunits = GET_MODE_NUNITS (mode);
9153 rtvec v = rtvec_alloc (nunits / 2);
9154 int high_base = nunits / 2;
9155 int low_base = 0;
9156 int base;
9157 rtx t1;
9158 int i;
9160 if (BYTES_BIG_ENDIAN)
9161 base = high ? low_base : high_base;
9162 else
9163 base = high ? high_base : low_base;
9165 for (i = 0; i < nunits / 2; i++)
9166 RTVEC_ELT (v, i) = GEN_INT (base + i);
9168 t1 = gen_rtx_PARALLEL (mode, v);
9169 return t1;
9172 /* Check OP for validity as a PARALLEL RTX vector with elements
9173 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
9174 from the perspective of the architecture. See the diagram above
9175 aarch64_simd_vect_par_cnst_half for more details. */
9177 bool
9178 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
9179 bool high)
9181 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
9182 HOST_WIDE_INT count_op = XVECLEN (op, 0);
9183 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
9184 int i = 0;
9186 if (!VECTOR_MODE_P (mode))
9187 return false;
9189 if (count_op != count_ideal)
9190 return false;
9192 for (i = 0; i < count_ideal; i++)
9194 rtx elt_op = XVECEXP (op, 0, i);
9195 rtx elt_ideal = XVECEXP (ideal, 0, i);
9197 if (!CONST_INT_P (elt_op)
9198 || INTVAL (elt_ideal) != INTVAL (elt_op))
9199 return false;
9201 return true;
9204 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
9205 HIGH (exclusive). */
9206 void
9207 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
9208 const_tree exp)
9210 HOST_WIDE_INT lane;
9211 gcc_assert (CONST_INT_P (operand));
9212 lane = INTVAL (operand);
9214 if (lane < low || lane >= high)
9216 if (exp)
9217 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
9218 else
9219 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
9223 /* Return TRUE if OP is a valid vector addressing mode. */
9224 bool
9225 aarch64_simd_mem_operand_p (rtx op)
9227 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
9228 || REG_P (XEXP (op, 0)));
9231 /* Emit a register copy from operand to operand, taking care not to
9232 early-clobber source registers in the process.
9234 COUNT is the number of components into which the copy needs to be
9235 decomposed. */
9236 void
9237 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
9238 unsigned int count)
9240 unsigned int i;
9241 int rdest = REGNO (operands[0]);
9242 int rsrc = REGNO (operands[1]);
9244 if (!reg_overlap_mentioned_p (operands[0], operands[1])
9245 || rdest < rsrc)
9246 for (i = 0; i < count; i++)
9247 emit_move_insn (gen_rtx_REG (mode, rdest + i),
9248 gen_rtx_REG (mode, rsrc + i));
9249 else
9250 for (i = 0; i < count; i++)
9251 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
9252 gen_rtx_REG (mode, rsrc + count - i - 1));
9255 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
9256 one of VSTRUCT modes: OI, CI or XI. */
9258 aarch64_simd_attr_length_move (rtx_insn *insn)
9260 machine_mode mode;
9262 extract_insn_cached (insn);
9264 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
9266 mode = GET_MODE (recog_data.operand[0]);
9267 switch (mode)
9269 case OImode:
9270 return 8;
9271 case CImode:
9272 return 12;
9273 case XImode:
9274 return 16;
9275 default:
9276 gcc_unreachable ();
9279 return 4;
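/* That is, a register-to-register OImode move is split into two 4-byte
   vector moves, CImode into three and XImode into four, while the
   memory forms are a single load/store-multiple instruction.  */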
9282 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
9283 one of VSTRUCT modes: OI, CI, EI, or XI. */
9285 aarch64_simd_attr_length_rglist (enum machine_mode mode)
9287 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
9290 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
9291 alignment of a vector to 128 bits. */
9292 static HOST_WIDE_INT
9293 aarch64_simd_vector_alignment (const_tree type)
9295 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
9296 return MIN (align, 128);
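/* So a GNU vector type declared with vector_size (32) reports an
   alignment of 128 bits here rather than its full 256-bit size.  */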
9299 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
9300 static bool
9301 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
9303 if (is_packed)
9304 return false;
9306 /* We guarantee alignment for vectors up to 128 bits. */
9307 if (tree_int_cst_compare (TYPE_SIZE (type),
9308 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
9309 return false;
9311 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
9312 return true;
9315 /* If VALS is a vector constant that can be loaded into a register
9316 using DUP, generate instructions to do so and return an RTX to
9317 assign to the register. Otherwise return NULL_RTX. */
9318 static rtx
9319 aarch64_simd_dup_constant (rtx vals)
9321 machine_mode mode = GET_MODE (vals);
9322 machine_mode inner_mode = GET_MODE_INNER (mode);
9323 int n_elts = GET_MODE_NUNITS (mode);
9324 bool all_same = true;
9325 rtx x;
9326 int i;
9328 if (GET_CODE (vals) != CONST_VECTOR)
9329 return NULL_RTX;
9331 for (i = 1; i < n_elts; ++i)
9333 x = CONST_VECTOR_ELT (vals, i);
9334 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
9335 all_same = false;
9338 if (!all_same)
9339 return NULL_RTX;
9341 /* We can load this constant by using DUP and a constant in a
9342 single ARM register. This will be cheaper than a vector
9343 load. */
9344 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
9345 return gen_rtx_VEC_DUPLICATE (mode, x);
9349 /* Generate code to load VALS, which is a PARALLEL containing only
9350 constants (for vec_init) or CONST_VECTOR, efficiently into a
9351 register. Returns an RTX to copy into the register, or NULL_RTX
9352 for a PARALLEL that can not be converted into a CONST_VECTOR. */
9353 static rtx
9354 aarch64_simd_make_constant (rtx vals)
9356 machine_mode mode = GET_MODE (vals);
9357 rtx const_dup;
9358 rtx const_vec = NULL_RTX;
9359 int n_elts = GET_MODE_NUNITS (mode);
9360 int n_const = 0;
9361 int i;
9363 if (GET_CODE (vals) == CONST_VECTOR)
9364 const_vec = vals;
9365 else if (GET_CODE (vals) == PARALLEL)
9367 /* A CONST_VECTOR must contain only CONST_INTs and
9368 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
9369 Only store valid constants in a CONST_VECTOR. */
9370 for (i = 0; i < n_elts; ++i)
9372 rtx x = XVECEXP (vals, 0, i);
9373 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9374 n_const++;
9376 if (n_const == n_elts)
9377 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
9379 else
9380 gcc_unreachable ();
9382 if (const_vec != NULL_RTX
9383 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
9384 /* Load using MOVI/MVNI. */
9385 return const_vec;
9386 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
9387 /* Loaded using DUP. */
9388 return const_dup;
9389 else if (const_vec != NULL_RTX)
9390 /* Load from constant pool. We can not take advantage of single-cycle
9391 LD1 because we need a PC-relative addressing mode. */
9392 return const_vec;
9393 else
9394 /* A PARALLEL containing something not valid inside CONST_VECTOR.
9395 We can not construct an initializer. */
9396 return NULL_RTX;
9399 void
9400 aarch64_expand_vector_init (rtx target, rtx vals)
9402 machine_mode mode = GET_MODE (target);
9403 machine_mode inner_mode = GET_MODE_INNER (mode);
9404 int n_elts = GET_MODE_NUNITS (mode);
9405 int n_var = 0;
9406 rtx any_const = NULL_RTX;
9407 bool all_same = true;
9409 for (int i = 0; i < n_elts; ++i)
9411 rtx x = XVECEXP (vals, 0, i);
9412 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
9413 ++n_var;
9414 else
9415 any_const = x;
9417 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
9418 all_same = false;
9421 if (n_var == 0)
9423 rtx constant = aarch64_simd_make_constant (vals);
9424 if (constant != NULL_RTX)
9426 emit_move_insn (target, constant);
9427 return;
9431 /* Splat a single non-constant element if we can. */
9432 if (all_same)
9434 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
9435 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
9436 return;
9439 /* Half the fields (or fewer) are non-constant. Load constant then overwrite
9440 varying fields. Hope that this is more efficient than using the stack. */
9441 if (n_var <= n_elts/2)
9443 rtx copy = copy_rtx (vals);
9445 /* Load constant part of vector. We really don't care what goes into the
9446 parts we will overwrite, but we're more likely to be able to load the
9447 constant efficiently if it has fewer, larger, repeating parts
9448 (see aarch64_simd_valid_immediate). */
9449 for (int i = 0; i < n_elts; i++)
9451 rtx x = XVECEXP (vals, 0, i);
9452 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9453 continue;
9454 rtx subst = any_const;
9455 for (int bit = n_elts / 2; bit > 0; bit /= 2)
9457 /* Look in the copied vector, as more elements are const. */
9458 rtx test = XVECEXP (copy, 0, i ^ bit);
9459 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
9461 subst = test;
9462 break;
9465 XVECEXP (copy, 0, i) = subst;
9467 aarch64_expand_vector_init (target, copy);
9469 /* Insert variables. */
9470 enum insn_code icode = optab_handler (vec_set_optab, mode);
9471 gcc_assert (icode != CODE_FOR_nothing);
9473 for (int i = 0; i < n_elts; i++)
9475 rtx x = XVECEXP (vals, 0, i);
9476 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
9477 continue;
9478 x = copy_to_mode_reg (inner_mode, x);
9479 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
9481 return;
9484 /* Construct the vector in memory one field at a time
9485 and load the whole vector. */
9486 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
9487 for (int i = 0; i < n_elts; i++)
9488 emit_move_insn (adjust_address_nv (mem, inner_mode,
9489 i * GET_MODE_SIZE (inner_mode)),
9490 XVECEXP (vals, 0, i));
9491 emit_move_insn (target, mem);
9495 static unsigned HOST_WIDE_INT
9496 aarch64_shift_truncation_mask (machine_mode mode)
9498 return
9499 (aarch64_vector_mode_supported_p (mode)
9500 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
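/* For scalar modes this gives e.g. 31 for SImode and 63 for DImode,
   matching the variable shift instructions, which use the shift amount
   modulo the register width; vector and vector-structure modes return
   0, i.e. no truncation is assumed.  */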
9503 #ifndef TLS_SECTION_ASM_FLAG
9504 #define TLS_SECTION_ASM_FLAG 'T'
9505 #endif
9507 void
9508 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
9509 tree decl ATTRIBUTE_UNUSED)
9511 char flagchars[10], *f = flagchars;
9513 /* If we have already declared this section, we can use an
9514 abbreviated form to switch back to it -- unless this section is
9515 part of a COMDAT group, in which case GAS requires the full
9516 declaration every time. */
9517 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9518 && (flags & SECTION_DECLARED))
9520 fprintf (asm_out_file, "\t.section\t%s\n", name);
9521 return;
9524 if (!(flags & SECTION_DEBUG))
9525 *f++ = 'a';
9526 if (flags & SECTION_WRITE)
9527 *f++ = 'w';
9528 if (flags & SECTION_CODE)
9529 *f++ = 'x';
9530 if (flags & SECTION_SMALL)
9531 *f++ = 's';
9532 if (flags & SECTION_MERGE)
9533 *f++ = 'M';
9534 if (flags & SECTION_STRINGS)
9535 *f++ = 'S';
9536 if (flags & SECTION_TLS)
9537 *f++ = TLS_SECTION_ASM_FLAG;
9538 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9539 *f++ = 'G';
9540 *f = '\0';
9542 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
9544 if (!(flags & SECTION_NOTYPE))
9546 const char *type;
9547 const char *format;
9549 if (flags & SECTION_BSS)
9550 type = "nobits";
9551 else
9552 type = "progbits";
9554 #ifdef TYPE_OPERAND_FMT
9555 format = "," TYPE_OPERAND_FMT;
9556 #else
9557 format = ",@%s";
9558 #endif
9560 fprintf (asm_out_file, format, type);
9562 if (flags & SECTION_ENTSIZE)
9563 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
9564 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
9566 if (TREE_CODE (decl) == IDENTIFIER_NODE)
9567 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
9568 else
9569 fprintf (asm_out_file, ",%s,comdat",
9570 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
9574 putc ('\n', asm_out_file);
9577 /* Select a format to encode pointers in exception handling data. */
9579 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
9581 int type;
9582 switch (aarch64_cmodel)
9584 case AARCH64_CMODEL_TINY:
9585 case AARCH64_CMODEL_TINY_PIC:
9586 case AARCH64_CMODEL_SMALL:
9587 case AARCH64_CMODEL_SMALL_PIC:
9588 case AARCH64_CMODEL_SMALL_SPIC:
9589 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
9590 for everything. */
9591 type = DW_EH_PE_sdata4;
9592 break;
9593 default:
9594 /* No assumptions here. 8-byte relocs required. */
9595 type = DW_EH_PE_sdata8;
9596 break;
9598 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
9601 /* Emit load exclusive. */
9603 static void
9604 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
9605 rtx mem, rtx model_rtx)
9607 rtx (*gen) (rtx, rtx, rtx);
9609 switch (mode)
9611 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
9612 case HImode: gen = gen_aarch64_load_exclusivehi; break;
9613 case SImode: gen = gen_aarch64_load_exclusivesi; break;
9614 case DImode: gen = gen_aarch64_load_exclusivedi; break;
9615 default:
9616 gcc_unreachable ();
9619 emit_insn (gen (rval, mem, model_rtx));
9622 /* Emit store exclusive. */
9624 static void
9625 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
9626 rtx rval, rtx mem, rtx model_rtx)
9628 rtx (*gen) (rtx, rtx, rtx, rtx);
9630 switch (mode)
9632 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
9633 case HImode: gen = gen_aarch64_store_exclusivehi; break;
9634 case SImode: gen = gen_aarch64_store_exclusivesi; break;
9635 case DImode: gen = gen_aarch64_store_exclusivedi; break;
9636 default:
9637 gcc_unreachable ();
9640 emit_insn (gen (bval, rval, mem, model_rtx));
9643 /* Mark the previous jump instruction as unlikely. */
9645 static void
9646 aarch64_emit_unlikely_jump (rtx insn)
9648 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
9650 insn = emit_jump_insn (insn);
9651 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
9654 /* Expand a compare and swap pattern. */
9656 void
9657 aarch64_expand_compare_and_swap (rtx operands[])
9659 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
9660 machine_mode mode, cmp_mode;
9661 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
9663 bval = operands[0];
9664 rval = operands[1];
9665 mem = operands[2];
9666 oldval = operands[3];
9667 newval = operands[4];
9668 is_weak = operands[5];
9669 mod_s = operands[6];
9670 mod_f = operands[7];
9671 mode = GET_MODE (mem);
9672 cmp_mode = mode;
9674 /* Normally the succ memory model must be stronger than fail, but in the
9675 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
9676 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
9678 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
9679 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
9680 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
9682 switch (mode)
9684 case QImode:
9685 case HImode:
9686 /* For short modes, we're going to perform the comparison in SImode,
9687 so do the zero-extension now. */
9688 cmp_mode = SImode;
9689 rval = gen_reg_rtx (SImode);
9690 oldval = convert_modes (SImode, mode, oldval, true);
9691 /* Fall through. */
9693 case SImode:
9694 case DImode:
9695 /* Force the value into a register if needed. */
9696 if (!aarch64_plus_operand (oldval, mode))
9697 oldval = force_reg (cmp_mode, oldval);
9698 break;
9700 default:
9701 gcc_unreachable ();
9704 switch (mode)
9706 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
9707 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
9708 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
9709 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
9710 default:
9711 gcc_unreachable ();
9714 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
9716 if (mode == QImode || mode == HImode)
9717 emit_move_insn (operands[1], gen_lowpart (mode, rval));
9719 x = gen_rtx_REG (CCmode, CC_REGNUM);
9720 x = gen_rtx_EQ (SImode, x, const0_rtx);
9721 emit_insn (gen_rtx_SET (bval, x));
9724 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
9725 sequence implementing an atomic operation. */
9727 static void
9728 aarch64_emit_post_barrier (enum memmodel model)
9730 const enum memmodel base_model = memmodel_base (model);
9732 if (is_mm_sync (model)
9733 && (base_model == MEMMODEL_ACQUIRE
9734 || base_model == MEMMODEL_ACQ_REL
9735 || base_model == MEMMODEL_SEQ_CST))
9737 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
9741 /* Split a compare and swap pattern. */
9743 void
9744 aarch64_split_compare_and_swap (rtx operands[])
9746 rtx rval, mem, oldval, newval, scratch;
9747 machine_mode mode;
9748 bool is_weak;
9749 rtx_code_label *label1, *label2;
9750 rtx x, cond;
9751 enum memmodel model;
9752 rtx model_rtx;
9754 rval = operands[0];
9755 mem = operands[1];
9756 oldval = operands[2];
9757 newval = operands[3];
9758 is_weak = (operands[4] != const0_rtx);
9759 model_rtx = operands[5];
9760 scratch = operands[7];
9761 mode = GET_MODE (mem);
9762 model = memmodel_from_int (INTVAL (model_rtx));
9764 label1 = NULL;
9765 if (!is_weak)
9767 label1 = gen_label_rtx ();
9768 emit_label (label1);
9770 label2 = gen_label_rtx ();
9772 /* The initial load can be relaxed for a __sync operation since a final
9773 barrier will be emitted to stop code hoisting. */
9774 if (is_mm_sync (model))
9775 aarch64_emit_load_exclusive (mode, rval, mem,
9776 GEN_INT (MEMMODEL_RELAXED));
9777 else
9778 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
9780 cond = aarch64_gen_compare_reg (NE, rval, oldval);
9781 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9782 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9783 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
9784 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9786 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
9788 if (!is_weak)
9790 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
9791 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9792 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
9793 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9795 else
9797 cond = gen_rtx_REG (CCmode, CC_REGNUM);
9798 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
9799 emit_insn (gen_rtx_SET (cond, x));
9802 emit_label (label2);
9804 /* Emit any final barrier needed for a __sync operation. */
9805 if (is_mm_sync (model))
9806 aarch64_emit_post_barrier (model);
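/* The strong form of the sequence above therefore expands to roughly
   (register choices are illustrative):

     .Lretry:
       ldaxr  w0, [x1]        // load-exclusive, acquire if required
       cmp    w0, w2
       bne    .Ldone          // unexpected value: fail
       stlxr  w3, w4, [x1]    // store-exclusive, release if required
       cbnz   w3, .Lretry     // lost the exclusive monitor: retry
     .Ldone:

   The weak form omits the retry loop and simply reports whether the
   store-exclusive succeeded.  */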
9809 /* Split an atomic operation. */
9811 void
9812 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
9813 rtx value, rtx model_rtx, rtx cond)
9815 machine_mode mode = GET_MODE (mem);
9816 machine_mode wmode = (mode == DImode ? DImode : SImode);
9817 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
9818 const bool is_sync = is_mm_sync (model);
9819 rtx_code_label *label;
9820 rtx x;
9822 label = gen_label_rtx ();
9823 emit_label (label);
9825 if (new_out)
9826 new_out = gen_lowpart (wmode, new_out);
9827 if (old_out)
9828 old_out = gen_lowpart (wmode, old_out);
9829 else
9830 old_out = new_out;
9831 value = simplify_gen_subreg (wmode, value, mode, 0);
9833 /* The initial load can be relaxed for a __sync operation since a final
9834 barrier will be emitted to stop code hoisting. */
9835 if (is_sync)
9836 aarch64_emit_load_exclusive (mode, old_out, mem,
9837 GEN_INT (MEMMODEL_RELAXED));
9838 else
9839 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
9841 switch (code)
9843 case SET:
9844 new_out = value;
9845 break;
9847 case NOT:
9848 x = gen_rtx_AND (wmode, old_out, value);
9849 emit_insn (gen_rtx_SET (new_out, x));
9850 x = gen_rtx_NOT (wmode, new_out);
9851 emit_insn (gen_rtx_SET (new_out, x));
9852 break;
9854 case MINUS:
9855 if (CONST_INT_P (value))
9857 value = GEN_INT (-INTVAL (value));
9858 code = PLUS;
9860 /* Fall through. */
9862 default:
9863 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
9864 emit_insn (gen_rtx_SET (new_out, x));
9865 break;
9868 aarch64_emit_store_exclusive (mode, cond, mem,
9869 gen_lowpart (mode, new_out), model_rtx);
9871 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
9872 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9873 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9874 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
9876 /* Emit any final barrier needed for a __sync operation. */
9877 if (is_sync)
9878 aarch64_emit_post_barrier (model);
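/* For a relaxed atomic add on an SImode location, for example, this
   produces a loop of roughly the following shape (register choices are
   illustrative):

     .Lretry:
       ldxr  w0, [x2]
       add   w1, w0, w3
       stxr  w4, w1, [x2]
       cbnz  w4, .Lretry  */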
9881 static void
9882 aarch64_print_extension (void)
9884 const struct aarch64_option_extension *opt = NULL;
9886 for (opt = all_extensions; opt->name != NULL; opt++)
9887 if ((aarch64_isa_flags & opt->flags_on) == opt->flags_on)
9888 asm_fprintf (asm_out_file, "+%s", opt->name);
9890 asm_fprintf (asm_out_file, "\n");
9893 static void
9894 aarch64_start_file (void)
9896 if (selected_arch)
9898 asm_fprintf (asm_out_file, "\t.arch %s", selected_arch->name);
9899 aarch64_print_extension ();
9901 else if (selected_cpu)
9903 const char *truncated_name
9904 = aarch64_rewrite_selected_cpu (selected_cpu->name);
9905 asm_fprintf (asm_out_file, "\t.cpu %s", truncated_name);
9906 aarch64_print_extension ();
9908 default_file_start ();
9911 /* Target hook for c_mode_for_suffix. */
9912 static machine_mode
9913 aarch64_c_mode_for_suffix (char suffix)
9915 if (suffix == 'q')
9916 return TFmode;
9918 return VOIDmode;
9921 /* We can only represent floating point constants which will fit in
9922 "quarter-precision" values. These values are characterised by
9923 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by
9926 (-1)^s * (n/16) * 2^r
9928 Where:
9929 's' is the sign bit.
9930 'n' is an integer in the range 16 <= n <= 31.
9931 'r' is an integer in the range -3 <= r <= 4. */
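/* For example, 0.5 = (-1)^0 * (16/16) * 2^-1 (s = 0, n = 16, r = -1)
   and 31.0 = (-1)^0 * (31/16) * 2^4 (s = 0, n = 31, r = 4) are
   representable, whereas 0.1 has no such decomposition and so cannot
   be used as an FMOV immediate.  */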
9933 /* Return true iff X can be represented by a quarter-precision
9934 floating point immediate operand. Note, we cannot represent 0.0. */
9935 bool
9936 aarch64_float_const_representable_p (rtx x)
9938 /* This represents our current view of how many bits
9939 make up the mantissa. */
9940 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
9941 int exponent;
9942 unsigned HOST_WIDE_INT mantissa, mask;
9943 REAL_VALUE_TYPE r, m;
9944 bool fail;
9946 if (!CONST_DOUBLE_P (x))
9947 return false;
9949 if (GET_MODE (x) == VOIDmode)
9950 return false;
9952 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
9954 /* We cannot represent infinities, NaNs or +/-zero. We won't
9955 know if we have +zero until we analyse the mantissa, but we
9956 can reject the other invalid values. */
9957 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
9958 || REAL_VALUE_MINUS_ZERO (r))
9959 return false;
9961 /* Extract exponent. */
9962 r = real_value_abs (&r);
9963 exponent = REAL_EXP (&r);
9965 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
9966 highest (sign) bit, with a fixed binary point at bit point_pos.
9967 m1 holds the low part of the mantissa, m2 the high part.
9968 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
9969 bits for the mantissa, this can fail (low bits will be lost). */
9970 real_ldexp (&m, &r, point_pos - exponent);
9971 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
9973 /* If the low part of the mantissa has bits set we cannot represent
9974 the value. */
9975 if (w.elt (0) != 0)
9976 return false;
9977 /* We have rejected the lower HOST_WIDE_INT, so update our
9978 understanding of how many bits lie in the mantissa and
9979 look only at the high HOST_WIDE_INT. */
9980 mantissa = w.elt (1);
9981 point_pos -= HOST_BITS_PER_WIDE_INT;
9983 /* We can only represent values with a mantissa of the form 1.xxxx. */
9984 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
9985 if ((mantissa & mask) != 0)
9986 return false;
9988 /* Having filtered unrepresentable values, we may now remove all
9989 but the highest 5 bits. */
9990 mantissa >>= point_pos - 5;
9992 /* We cannot represent the value 0.0, so reject it. This is handled
9993 elsewhere. */
9994 if (mantissa == 0)
9995 return false;
9997 /* Then, as bit 4 is always set, we can mask it off, leaving
9998 the mantissa in the range [0, 15]. */
9999 mantissa &= ~(1 << 4);
10000 gcc_assert (mantissa <= 15);
10002 /* GCC internally does not use IEEE754-like encoding (where normalized
10003 significands are in the range [1, 2)); GCC uses [0.5, 1) (see real.c).
10004 Our mantissa values are shifted 4 places to the left relative to
10005 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
10006 by 5 places to correct for GCC's representation. */
10007 exponent = 5 - exponent;
10009 return (exponent >= 0 && exponent <= 7);
10012 char*
10013 aarch64_output_simd_mov_immediate (rtx const_vector,
10014 machine_mode mode,
10015 unsigned width)
10017 bool is_valid;
10018 static char templ[40];
10019 const char *mnemonic;
10020 const char *shift_op;
10021 unsigned int lane_count = 0;
10022 char element_char;
10024 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
10026 /* This will return true to show const_vector is legal for use as either
10027 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
10028 also update INFO to show how the immediate should be generated. */
10029 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
10030 gcc_assert (is_valid);
10032 element_char = sizetochar (info.element_width);
10033 lane_count = width / info.element_width;
10035 mode = GET_MODE_INNER (mode);
10036 if (mode == SFmode || mode == DFmode)
10038 gcc_assert (info.shift == 0 && ! info.mvn);
10039 if (aarch64_float_const_zero_rtx_p (info.value))
10040 info.value = GEN_INT (0);
10041 else
10043 #define buf_size 20
10044 REAL_VALUE_TYPE r;
10045 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
10046 char float_buf[buf_size] = {'\0'};
10047 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
10048 #undef buf_size
10050 if (lane_count == 1)
10051 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
10052 else
10053 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
10054 lane_count, element_char, float_buf);
10055 return templ;
10059 mnemonic = info.mvn ? "mvni" : "movi";
10060 shift_op = info.msl ? "msl" : "lsl";
10062 if (lane_count == 1)
10063 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
10064 mnemonic, UINTVAL (info.value));
10065 else if (info.shift)
10066 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
10067 ", %s %d", mnemonic, lane_count, element_char,
10068 UINTVAL (info.value), shift_op, info.shift);
10069 else
10070 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
10071 mnemonic, lane_count, element_char, UINTVAL (info.value));
10072 return templ;
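/* Typical results are strings along the lines of "movi v0.4s, 0x1"
   for a replicated 32-bit 1, "mvni v0.4s, 0xfe" for a replicated
   0xffffff01, or "fmov v0.2d, 1.0e+0" for a vector of doubles set to
   1.0 (the v0 operand shown here stands for the %0 placeholder in the
   returned template).  */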
10075 char*
10076 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
10077 machine_mode mode)
10079 machine_mode vmode;
10081 gcc_assert (!VECTOR_MODE_P (mode));
10082 vmode = aarch64_simd_container_mode (mode, 64);
10083 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
10084 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
10087 /* Split operands into moves from op[1] + op[2] into op[0]. */
10089 void
10090 aarch64_split_combinev16qi (rtx operands[3])
10092 unsigned int dest = REGNO (operands[0]);
10093 unsigned int src1 = REGNO (operands[1]);
10094 unsigned int src2 = REGNO (operands[2]);
10095 machine_mode halfmode = GET_MODE (operands[1]);
10096 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
10097 rtx destlo, desthi;
10099 gcc_assert (halfmode == V16QImode);
10101 if (src1 == dest && src2 == dest + halfregs)
10103 /* No-op move. Can't split to nothing; emit something. */
10104 emit_note (NOTE_INSN_DELETED);
10105 return;
10108 /* Preserve register attributes for variable tracking. */
10109 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
10110 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
10111 GET_MODE_SIZE (halfmode));
10113 /* Special case of reversed high/low parts. */
10114 if (reg_overlap_mentioned_p (operands[2], destlo)
10115 && reg_overlap_mentioned_p (operands[1], desthi))
10117 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
10118 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
10119 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
10121 else if (!reg_overlap_mentioned_p (operands[2], destlo))
10123 /* Try to avoid unnecessary moves if part of the result
10124 is in the right place already. */
10125 if (src1 != dest)
10126 emit_move_insn (destlo, operands[1]);
10127 if (src2 != dest + halfregs)
10128 emit_move_insn (desthi, operands[2]);
10130 else
10132 if (src2 != dest + halfregs)
10133 emit_move_insn (desthi, operands[2]);
10134 if (src1 != dest)
10135 emit_move_insn (destlo, operands[1]);
10139 /* vec_perm support. */
10141 #define MAX_VECT_LEN 16
10143 struct expand_vec_perm_d
10145 rtx target, op0, op1;
10146 unsigned char perm[MAX_VECT_LEN];
10147 machine_mode vmode;
10148 unsigned char nelt;
10149 bool one_vector_p;
10150 bool testing_p;
10153 /* Generate a variable permutation. */
10155 static void
10156 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
10158 machine_mode vmode = GET_MODE (target);
10159 bool one_vector_p = rtx_equal_p (op0, op1);
10161 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
10162 gcc_checking_assert (GET_MODE (op0) == vmode);
10163 gcc_checking_assert (GET_MODE (op1) == vmode);
10164 gcc_checking_assert (GET_MODE (sel) == vmode);
10165 gcc_checking_assert (TARGET_SIMD);
10167 if (one_vector_p)
10169 if (vmode == V8QImode)
10171 /* Expand the argument to a V16QI mode by duplicating it. */
10172 rtx pair = gen_reg_rtx (V16QImode);
10173 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
10174 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
10176 else
10178 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
10181 else
10183 rtx pair;
10185 if (vmode == V8QImode)
10187 pair = gen_reg_rtx (V16QImode);
10188 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
10189 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
10191 else
10193 pair = gen_reg_rtx (OImode);
10194 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
10195 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
10200 void
10201 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
10203 machine_mode vmode = GET_MODE (target);
10204 unsigned int nelt = GET_MODE_NUNITS (vmode);
10205 bool one_vector_p = rtx_equal_p (op0, op1);
10206 rtx mask;
10208 /* The TBL instruction does not use a modulo index, so we must take care
10209 of that ourselves. */
10210 mask = aarch64_simd_gen_const_vector_dup (vmode,
10211 one_vector_p ? nelt - 1 : 2 * nelt - 1);
10212 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
10214 /* For big-endian, we also need to reverse the index within the vector
10215 (but not which vector). */
10216 if (BYTES_BIG_ENDIAN)
10218 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
10219 if (!one_vector_p)
10220 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
10221 sel = expand_simple_binop (vmode, XOR, sel, mask,
10222 NULL, 0, OPTAB_LIB_WIDEN);
10224 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
10227 /* Recognize patterns suitable for the TRN instructions. */
10228 static bool
10229 aarch64_evpc_trn (struct expand_vec_perm_d *d)
10231 unsigned int i, odd, mask, nelt = d->nelt;
10232 rtx out, in0, in1, x;
10233 rtx (*gen) (rtx, rtx, rtx);
10234 machine_mode vmode = d->vmode;
10236 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10237 return false;
10239 /* Note that these are little-endian tests.
10240 We correct for big-endian later. */
10241 if (d->perm[0] == 0)
10242 odd = 0;
10243 else if (d->perm[0] == 1)
10244 odd = 1;
10245 else
10246 return false;
10247 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10249 for (i = 0; i < nelt; i += 2)
10251 if (d->perm[i] != i + odd)
10252 return false;
10253 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
10254 return false;
10257 /* Success! */
10258 if (d->testing_p)
10259 return true;
10261 in0 = d->op0;
10262 in1 = d->op1;
10263 if (BYTES_BIG_ENDIAN)
10265 x = in0, in0 = in1, in1 = x;
10266 odd = !odd;
10268 out = d->target;
10270 if (odd)
10272 switch (vmode)
10274 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
10275 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
10276 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
10277 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
10278 case V4SImode: gen = gen_aarch64_trn2v4si; break;
10279 case V2SImode: gen = gen_aarch64_trn2v2si; break;
10280 case V2DImode: gen = gen_aarch64_trn2v2di; break;
10281 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
10282 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
10283 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
10284 default:
10285 return false;
10288 else
10290 switch (vmode)
10292 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
10293 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
10294 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
10295 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
10296 case V4SImode: gen = gen_aarch64_trn1v4si; break;
10297 case V2SImode: gen = gen_aarch64_trn1v2si; break;
10298 case V2DImode: gen = gen_aarch64_trn1v2di; break;
10299 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
10300 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
10301 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
10302 default:
10303 return false;
10307 emit_insn (gen (out, in0, in1));
10308 return true;
10311 /* Recognize patterns suitable for the UZP instructions. */
10312 static bool
10313 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
10315 unsigned int i, odd, mask, nelt = d->nelt;
10316 rtx out, in0, in1, x;
10317 rtx (*gen) (rtx, rtx, rtx);
10318 machine_mode vmode = d->vmode;
10320 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10321 return false;
10323 /* Note that these are little-endian tests.
10324 We correct for big-endian later. */
10325 if (d->perm[0] == 0)
10326 odd = 0;
10327 else if (d->perm[0] == 1)
10328 odd = 1;
10329 else
10330 return false;
10331 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10333 for (i = 0; i < nelt; i++)
10335 unsigned elt = (i * 2 + odd) & mask;
10336 if (d->perm[i] != elt)
10337 return false;
10340 /* Success! */
10341 if (d->testing_p)
10342 return true;
10344 in0 = d->op0;
10345 in1 = d->op1;
10346 if (BYTES_BIG_ENDIAN)
10348 x = in0, in0 = in1, in1 = x;
10349 odd = !odd;
10351 out = d->target;
10353 if (odd)
10355 switch (vmode)
10357 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
10358 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
10359 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
10360 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
10361 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
10362 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
10363 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
10364 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
10365 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
10366 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
10367 default:
10368 return false;
10371 else
10373 switch (vmode)
10375 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
10376 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
10377 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
10378 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
10379 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
10380 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
10381 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
10382 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
10383 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
10384 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
10385 default:
10386 return false;
10390 emit_insn (gen (out, in0, in1));
10391 return true;
10394 /* Recognize patterns suitable for the ZIP instructions. */
10395 static bool
10396 aarch64_evpc_zip (struct expand_vec_perm_d *d)
10398 unsigned int i, high, mask, nelt = d->nelt;
10399 rtx out, in0, in1, x;
10400 rtx (*gen) (rtx, rtx, rtx);
10401 machine_mode vmode = d->vmode;
10403 if (GET_MODE_UNIT_SIZE (vmode) > 8)
10404 return false;
10406 /* Note that these are little-endian tests.
10407 We correct for big-endian later. */
10408 high = nelt / 2;
10409 if (d->perm[0] == high)
10410 /* Do Nothing. */
10412 else if (d->perm[0] == 0)
10413 high = 0;
10414 else
10415 return false;
10416 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
10418 for (i = 0; i < nelt / 2; i++)
10420 unsigned elt = (i + high) & mask;
10421 if (d->perm[i * 2] != elt)
10422 return false;
10423 elt = (elt + nelt) & mask;
10424 if (d->perm[i * 2 + 1] != elt)
10425 return false;
10428 /* Success! */
10429 if (d->testing_p)
10430 return true;
10432 in0 = d->op0;
10433 in1 = d->op1;
10434 if (BYTES_BIG_ENDIAN)
10436 x = in0, in0 = in1, in1 = x;
10437 high = !high;
10439 out = d->target;
10441 if (high)
10443 switch (vmode)
10445 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
10446 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
10447 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
10448 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
10449 case V4SImode: gen = gen_aarch64_zip2v4si; break;
10450 case V2SImode: gen = gen_aarch64_zip2v2si; break;
10451 case V2DImode: gen = gen_aarch64_zip2v2di; break;
10452 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
10453 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
10454 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
10455 default:
10456 return false;
10459 else
10461 switch (vmode)
10463 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
10464 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
10465 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
10466 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
10467 case V4SImode: gen = gen_aarch64_zip1v4si; break;
10468 case V2SImode: gen = gen_aarch64_zip1v2si; break;
10469 case V2DImode: gen = gen_aarch64_zip1v2di; break;
10470 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
10471 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
10472 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
10473 default:
10474 return false;
10478 emit_insn (gen (out, in0, in1));
10479 return true;
10482 /* Recognize patterns for the EXT insn. */
10484 static bool
10485 aarch64_evpc_ext (struct expand_vec_perm_d *d)
10487 unsigned int i, nelt = d->nelt;
10488 rtx (*gen) (rtx, rtx, rtx, rtx);
10489 rtx offset;
10491 unsigned int location = d->perm[0]; /* Always < nelt. */
10493 /* Check if the extracted indices are increasing by one. */
10494 for (i = 1; i < nelt; i++)
10496 unsigned int required = location + i;
10497 if (d->one_vector_p)
10499 /* We'll pass the same vector in twice, so allow indices to wrap. */
10500 required &= (nelt - 1);
10502 if (d->perm[i] != required)
10503 return false;
10506 switch (d->vmode)
10508 case V16QImode: gen = gen_aarch64_extv16qi; break;
10509 case V8QImode: gen = gen_aarch64_extv8qi; break;
10510 case V4HImode: gen = gen_aarch64_extv4hi; break;
10511 case V8HImode: gen = gen_aarch64_extv8hi; break;
10512 case V2SImode: gen = gen_aarch64_extv2si; break;
10513 case V4SImode: gen = gen_aarch64_extv4si; break;
10514 case V2SFmode: gen = gen_aarch64_extv2sf; break;
10515 case V4SFmode: gen = gen_aarch64_extv4sf; break;
10516 case V2DImode: gen = gen_aarch64_extv2di; break;
10517 case V2DFmode: gen = gen_aarch64_extv2df; break;
10518 default:
10519 return false;
10522 /* Success! */
10523 if (d->testing_p)
10524 return true;
10526 /* The case where (location == 0) is a no-op for both big- and little-endian,
10527 and is removed by the mid-end at optimization levels -O1 and higher. */
10529 if (BYTES_BIG_ENDIAN && (location != 0))
10531 /* After setup, we want the high elements of the first vector (stored
10532 at the LSB end of the register), and the low elements of the second
10533 vector (stored at the MSB end of the register). So swap. */
10534 std::swap (d->op0, d->op1);
10535 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
10536 location = nelt - location;
10539 offset = GEN_INT (location);
10540 emit_insn (gen (d->target, d->op0, d->op1, offset));
10541 return true;
10544 /* Recognize patterns for the REV insns. */
10546 static bool
10547 aarch64_evpc_rev (struct expand_vec_perm_d *d)
10549 unsigned int i, j, diff, nelt = d->nelt;
10550 rtx (*gen) (rtx, rtx);
10552 if (!d->one_vector_p)
10553 return false;
10555 diff = d->perm[0];
10556 switch (diff)
10558 case 7:
10559 switch (d->vmode)
10561 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
10562 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
10563 default:
10564 return false;
10566 break;
10567 case 3:
10568 switch (d->vmode)
10570 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
10571 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
10572 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
10573 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
10574 default:
10575 return false;
10577 break;
10578 case 1:
10579 switch (d->vmode)
10581 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
10582 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
10583 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
10584 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
10585 case V4SImode: gen = gen_aarch64_rev64v4si; break;
10586 case V2SImode: gen = gen_aarch64_rev64v2si; break;
10587 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
10588 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
10589 default:
10590 return false;
10592 break;
10593 default:
10594 return false;
10597 for (i = 0; i < nelt ; i += diff + 1)
10598 for (j = 0; j <= diff; j += 1)
10600 /* This is guaranteed to be true, as the value of diff
10601 is 7, 3 or 1 and we should have enough elements in the
10602 queue to generate this. Getting a vector mask with a
10603 value of diff other than these values implies that
10604 something has gone wrong by the time we get here. */
10605 gcc_assert (i + j < nelt);
10606 if (d->perm[i + j] != i + diff - j)
10607 return false;
10610 /* Success! */
10611 if (d->testing_p)
10612 return true;
10614 emit_insn (gen (d->target, d->op0));
10615 return true;
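/* Recognize a lane broadcast, i.e. a permutation in which every index selects
   the same element, and expand it with a DUP-lane instruction. */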
10618 static bool
10619 aarch64_evpc_dup (struct expand_vec_perm_d *d)
10621 rtx (*gen) (rtx, rtx, rtx);
10622 rtx out = d->target;
10623 rtx in0;
10624 machine_mode vmode = d->vmode;
10625 unsigned int i, elt, nelt = d->nelt;
10626 rtx lane;
10628 elt = d->perm[0];
10629 for (i = 1; i < nelt; i++)
10631 if (elt != d->perm[i])
10632 return false;
10635 /* The generic preparation in aarch64_expand_vec_perm_const_1
10636 swaps the operand order and the permute indices if it finds
10637 d->perm[0] to be in the second operand. Thus, we can always
10638 use d->op0 and need not do any extra arithmetic to get the
10639 correct lane number. */
10640 in0 = d->op0;
10641 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
10643 switch (vmode)
10645 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
10646 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
10647 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
10648 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
10649 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
10650 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
10651 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
10652 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
10653 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
10654 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
10655 default:
10656 return false;
10659 emit_insn (gen (out, in0, lane));
10660 return true;
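/* Fall back to a full table lookup using the TBL instruction. Only QImode
   element vectors are handled here; for other modes the generic code retries
   with the indices lowered to QImode, as noted below. */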
10663 static bool
10664 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
10666 rtx rperm[MAX_VECT_LEN], sel;
10667 machine_mode vmode = d->vmode;
10668 unsigned int i, nelt = d->nelt;
10670 if (d->testing_p)
10671 return true;
10673 /* Generic code will try constant permutation twice. Once with the
10674 original mode and again with the elements lowered to QImode.
10675 So wait and don't do the selector expansion ourselves. */
10676 if (vmode != V8QImode && vmode != V16QImode)
10677 return false;
10679 for (i = 0; i < nelt; ++i)
10681 int nunits = GET_MODE_NUNITS (vmode);
10683 /* If big-endian and two vectors we end up with a weird mixed-endian
10684 mode on NEON. Reverse the index within each word but not the word
10685 itself. */
10686 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
10687 : d->perm[i]);
10689 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
10690 sel = force_reg (vmode, sel);
10692 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
10693 return true;
10696 static bool
10697 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
10699 /* The pattern matching functions above are written to look for a small
10700 number to begin the sequence (0, 1, N/2). If we begin with an index
10701 from the second operand, we can swap the operands. */
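/* For example, with nelt == 4 a permutation {5, 6, 7, 4} becomes {1, 2, 3, 0}
   applied to the swapped operands: each index is XORed with nelt, which the
   assertion below requires to be a power of two. */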
10702 if (d->perm[0] >= d->nelt)
10704 unsigned i, nelt = d->nelt;
10706 gcc_assert (nelt == (nelt & -nelt));
10707 for (i = 0; i < nelt; ++i)
10708 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
10710 std::swap (d->op0, d->op1);
10713 if (TARGET_SIMD)
10715 if (aarch64_evpc_rev (d))
10716 return true;
10717 else if (aarch64_evpc_ext (d))
10718 return true;
10719 else if (aarch64_evpc_dup (d))
10720 return true;
10721 else if (aarch64_evpc_zip (d))
10722 return true;
10723 else if (aarch64_evpc_uzp (d))
10724 return true;
10725 else if (aarch64_evpc_trn (d))
10726 return true;
10727 return aarch64_evpc_tbl (d);
10729 return false;
10732 /* Expand a vec_perm_const pattern. */
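/* TARGET is the destination, OP0/OP1 the two input vectors, and SEL a
   CONST_VECTOR of indices in the range [0, 2 * nelt). The WHICH bitmask
   computed below records whether the indices refer only to OP0 (1), only to
   OP1 (2), or to both (3), so single-input cases can be simplified first. */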
10734 bool
10735 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
10737 struct expand_vec_perm_d d;
10738 int i, nelt, which;
10740 d.target = target;
10741 d.op0 = op0;
10742 d.op1 = op1;
10744 d.vmode = GET_MODE (target);
10745 gcc_assert (VECTOR_MODE_P (d.vmode));
10746 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10747 d.testing_p = false;
10749 for (i = which = 0; i < nelt; ++i)
10751 rtx e = XVECEXP (sel, 0, i);
10752 int ei = INTVAL (e) & (2 * nelt - 1);
10753 which |= (ei < nelt ? 1 : 2);
10754 d.perm[i] = ei;
10757 switch (which)
10759 default:
10760 gcc_unreachable ();
10762 case 3:
10763 d.one_vector_p = false;
10764 if (!rtx_equal_p (op0, op1))
10765 break;
10767 /* The elements of PERM do not suggest that only the first operand
10768 is used, but both operands are identical. Allow easier matching
10769 of the permutation by folding the permutation into the single
10770 input vector. */
10771 /* Fall Through. */
10772 case 2:
10773 for (i = 0; i < nelt; ++i)
10774 d.perm[i] &= nelt - 1;
10775 d.op0 = op1;
10776 d.one_vector_p = true;
10777 break;
10779 case 1:
10780 d.op1 = op0;
10781 d.one_vector_p = true;
10782 break;
10785 return aarch64_expand_vec_perm_const_1 (&d);
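/* Implement TARGET_VECTORIZE_VEC_PERM_CONST_OK. Check, without emitting any
   insns (d.testing_p is set and the generated sequence is discarded), whether
   the permutation described by the NELT indices in SEL can be expanded for
   mode VMODE. */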
10788 static bool
10789 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
10790 const unsigned char *sel)
10792 struct expand_vec_perm_d d;
10793 unsigned int i, nelt, which;
10794 bool ret;
10796 d.vmode = vmode;
10797 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
10798 d.testing_p = true;
10799 memcpy (d.perm, sel, nelt);
10801 /* Calculate whether all elements are in one vector. */
10802 for (i = which = 0; i < nelt; ++i)
10804 unsigned char e = d.perm[i];
10805 gcc_assert (e < 2 * nelt);
10806 which |= (e < nelt ? 1 : 2);
10809 /* If all elements are from the second vector, reindex as if from the
10810 first vector. */
10811 if (which == 2)
10812 for (i = 0; i < nelt; ++i)
10813 d.perm[i] -= nelt;
10815 /* Check whether the mask can be applied to a single vector. */
10816 d.one_vector_p = (which != 3);
10818 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
10819 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
10820 if (!d.one_vector_p)
10821 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
10823 start_sequence ();
10824 ret = aarch64_expand_vec_perm_const_1 (&d);
10825 end_sequence ();
10827 return ret;
10830 rtx
10831 aarch64_reverse_mask (enum machine_mode mode)
10833 /* We have to reverse each vector because we don't have
10834 a permuted load that can reverse-load according to ABI rules. */
10835 rtx mask;
10836 rtvec v = rtvec_alloc (16);
10837 int i, j;
10838 int nunits = GET_MODE_NUNITS (mode);
10839 int usize = GET_MODE_UNIT_SIZE (mode);
10841 gcc_assert (BYTES_BIG_ENDIAN);
10842 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
10844 for (i = 0; i < nunits; i++)
10845 for (j = 0; j < usize; j++)
10846 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
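/* For example, for V4SImode (nunits == 4, usize == 4) the mask bytes are
   3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12: each element has its bytes
   reversed in place. */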
10847 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
10848 return force_reg (V16QImode, mask);
10851 /* Implement MODES_TIEABLE_P. */
10853 bool
10854 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
10856 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
10857 return true;
10859 /* We specifically want to allow elements of "structure" modes to
10860 be tieable to the structure. This more general condition allows
10861 other rarer situations too. */
10862 if (TARGET_SIMD
10863 && aarch64_vector_mode_p (mode1)
10864 && aarch64_vector_mode_p (mode2))
10865 return true;
10867 return false;
10870 /* Return a new RTX holding the result of moving POINTER forward by
10871 AMOUNT bytes. */
10873 static rtx
10874 aarch64_move_pointer (rtx pointer, int amount)
10876 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
10878 return adjust_automodify_address (pointer, GET_MODE (pointer),
10879 next, amount);
10882 /* Return a new RTX holding the result of moving POINTER forward by the
10883 size of the mode it points to. */
10885 static rtx
10886 aarch64_progress_pointer (rtx pointer)
10888 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
10890 return aarch64_move_pointer (pointer, amount);
10893 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
10894 MODE bytes. */
10896 static void
10897 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
10898 machine_mode mode)
10900 rtx reg = gen_reg_rtx (mode);
10902 /* "Cast" the pointers to the correct mode. */
10903 *src = adjust_address (*src, mode, 0);
10904 *dst = adjust_address (*dst, mode, 0);
10905 /* Emit the memcpy. */
10906 emit_move_insn (reg, *src);
10907 emit_move_insn (*dst, reg);
10908 /* Move the pointers forward. */
10909 *src = aarch64_progress_pointer (*src);
10910 *dst = aarch64_progress_pointer (*dst);
10913 /* Expand movmem, as if from a __builtin_memcpy. Return true if
10914 we succeed, otherwise return false. */
10916 bool
10917 aarch64_expand_movmem (rtx *operands)
10919 unsigned int n;
10920 rtx dst = operands[0];
10921 rtx src = operands[1];
10922 rtx base;
10923 bool speed_p = !optimize_function_for_size_p (cfun);
10925 /* When optimizing for size, give a better estimate of the length of a
10926 memcpy call, but use the default otherwise. */
10927 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
10929 /* We can't do anything smart if the amount to copy is not constant. */
10930 if (!CONST_INT_P (operands[2]))
10931 return false;
10933 n = UINTVAL (operands[2]);
10935 /* Try to keep the number of instructions low. For cases below 16 bytes we
10936 need to make at most two moves. For cases above 16 bytes it will be one
10937 move for each 16-byte chunk, then at most two additional moves. */
10938 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
10939 return false;
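/* For example, n == 35 is estimated as 35/16 + 2 == 4 instructions, which is
   within the speed limit of 15/2 == 7, so the expansion proceeds. */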
10941 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
10942 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
10944 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
10945 src = adjust_automodify_address (src, VOIDmode, base, 0);
10947 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
10948 1-byte chunk. */
10949 if (n < 4)
10951 if (n >= 2)
10953 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
10954 n -= 2;
10957 if (n == 1)
10958 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
10960 return true;
10963 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
10964 4-byte chunk, partially overlapping with the previously copied chunk. */
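/* For example, n == 7 copies bytes 0-3, leaving 3 bytes; both pointers are
   then moved back by one byte so that a second 4-byte copy covers bytes
   3-6. */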
10965 if (n < 8)
10967 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10968 n -= 4;
10969 if (n > 0)
10971 int move = n - 4;
10973 src = aarch64_move_pointer (src, move);
10974 dst = aarch64_move_pointer (dst, move);
10975 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
10977 return true;
10980 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
10981 them, then (if applicable) an 8-byte chunk. */
10982 while (n >= 8)
10984 if (n / 16)
10986 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
10987 n -= 16;
10989 else
10991 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
10992 n -= 8;
10996 /* Finish the final bytes of the copy. We can always do this in one
10997 instruction. We either copy the exact amount we need, or partially
10998 overlap with the previous chunk we copied and copy 8 bytes. */
10999 if (n == 0)
11000 return true;
11001 else if (n == 1)
11002 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
11003 else if (n == 2)
11004 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
11005 else if (n == 4)
11006 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
11007 else
11009 if (n == 3)
11011 src = aarch64_move_pointer (src, -1);
11012 dst = aarch64_move_pointer (dst, -1);
11013 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
11015 else
11017 int move = n - 8;
11019 src = aarch64_move_pointer (src, move);
11020 dst = aarch64_move_pointer (dst, move);
11021 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
11025 return true;
11028 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
11030 static unsigned HOST_WIDE_INT
11031 aarch64_asan_shadow_offset (void)
11033 return (HOST_WIDE_INT_1 << 36);
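/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P. */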
11036 static bool
11037 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
11038 unsigned int align,
11039 enum by_pieces_operation op,
11040 bool speed_p)
11042 /* STORE_BY_PIECES can be used when copying a constant string, but
11043 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
11044 For now we always fail this and let the move_by_pieces code copy
11045 the string from read-only memory. */
11046 if (op == STORE_BY_PIECES)
11047 return false;
11049 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
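/* Map the comparison code CODE to the CC_D* condition mode used by the
   conditional-compare (CCMP) expansion below, or return CCmode if CODE is not
   supported. */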
11052 static enum machine_mode
11053 aarch64_code_to_ccmode (enum rtx_code code)
11055 switch (code)
11057 case NE:
11058 return CC_DNEmode;
11060 case EQ:
11061 return CC_DEQmode;
11063 case LE:
11064 return CC_DLEmode;
11066 case LT:
11067 return CC_DLTmode;
11069 case GE:
11070 return CC_DGEmode;
11072 case GT:
11073 return CC_DGTmode;
11075 case LEU:
11076 return CC_DLEUmode;
11078 case LTU:
11079 return CC_DLTUmode;
11081 case GEU:
11082 return CC_DGEUmode;
11084 case GTU:
11085 return CC_DGTUmode;
11087 default:
11088 return CCmode;
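/* Implement TARGET_GEN_CCMP_FIRST. Expand the first comparison of a
   conditional-compare chain: compare TREEOP0 against TREEOP1 with CODE,
   storing the operand-preparation insns in *PREP_SEQ and the compare itself
   in *GEN_SEQ. Return the CC register in the selected CC_D* mode, or
   NULL_RTX if the comparison cannot be handled. */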
11092 static rtx
11093 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
11094 int code, tree treeop0, tree treeop1)
11096 enum machine_mode op_mode, cmp_mode, cc_mode;
11097 rtx op0, op1, cmp, target;
11098 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
11099 enum insn_code icode;
11100 struct expand_operand ops[4];
11102 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
11103 if (cc_mode == CCmode)
11104 return NULL_RTX;
11106 start_sequence ();
11107 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
11109 op_mode = GET_MODE (op0);
11110 if (op_mode == VOIDmode)
11111 op_mode = GET_MODE (op1);
11113 switch (op_mode)
11115 case QImode:
11116 case HImode:
11117 case SImode:
11118 cmp_mode = SImode;
11119 icode = CODE_FOR_cmpsi;
11120 break;
11122 case DImode:
11123 cmp_mode = DImode;
11124 icode = CODE_FOR_cmpdi;
11125 break;
11127 default:
11128 end_sequence ();
11129 return NULL_RTX;
11132 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
11133 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
11134 if (!op0 || !op1)
11136 end_sequence ();
11137 return NULL_RTX;
11139 *prep_seq = get_insns ();
11140 end_sequence ();
11142 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
11143 target = gen_rtx_REG (CCmode, CC_REGNUM);
11145 create_output_operand (&ops[0], target, CCmode);
11146 create_fixed_operand (&ops[1], cmp);
11147 create_fixed_operand (&ops[2], op0);
11148 create_fixed_operand (&ops[3], op1);
11150 start_sequence ();
11151 if (!maybe_expand_insn (icode, 4, ops))
11153 end_sequence ();
11154 return NULL_RTX;
11156 *gen_seq = get_insns ();
11157 end_sequence ();
11159 return gen_rtx_REG (cc_mode, CC_REGNUM);
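/* Implement TARGET_GEN_CCMP_NEXT. Expand a subsequent comparison of the
   chain, combining the new comparison CMP_CODE of TREEOP0 and TREEOP1 with
   the previous CC result PREV under BIT_CODE (AND or IOR), appending to
   *PREP_SEQ and *GEN_SEQ. Return the CC register, or NULL_RTX on failure. */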
11162 static rtx
11163 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
11164 tree treeop0, tree treeop1, int bit_code)
11166 rtx op0, op1, cmp0, cmp1, target;
11167 enum machine_mode op_mode, cmp_mode, cc_mode;
11168 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
11169 enum insn_code icode = CODE_FOR_ccmp_andsi;
11170 struct expand_operand ops[6];
11172 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
11173 if (cc_mode == CCmode)
11174 return NULL_RTX;
11176 push_to_sequence ((rtx_insn*) *prep_seq);
11177 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
11179 op_mode = GET_MODE (op0);
11180 if (op_mode == VOIDmode)
11181 op_mode = GET_MODE (op1);
11183 switch (op_mode)
11185 case QImode:
11186 case HImode:
11187 case SImode:
11188 cmp_mode = SImode;
11189 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
11190 : CODE_FOR_ccmp_iorsi;
11191 break;
11193 case DImode:
11194 cmp_mode = DImode;
11195 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
11196 : CODE_FOR_ccmp_iordi;
11197 break;
11199 default:
11200 end_sequence ();
11201 return NULL_RTX;
11204 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
11205 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
11206 if (!op0 || !op1)
11208 end_sequence ();
11209 return NULL_RTX;
11211 *prep_seq = get_insns ();
11212 end_sequence ();
11214 target = gen_rtx_REG (cc_mode, CC_REGNUM);
11215 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
11216 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
11218 create_fixed_operand (&ops[0], prev);
11219 create_fixed_operand (&ops[1], target);
11220 create_fixed_operand (&ops[2], op0);
11221 create_fixed_operand (&ops[3], op1);
11222 create_fixed_operand (&ops[4], cmp0);
11223 create_fixed_operand (&ops[5], cmp1);
11225 push_to_sequence ((rtx_insn*) *gen_seq);
11226 if (!maybe_expand_insn (icode, 6, ops))
11228 end_sequence ();
11229 return NULL_RTX;
11232 *gen_seq = get_insns ();
11233 end_sequence ();
11235 return target;
11238 #undef TARGET_GEN_CCMP_FIRST
11239 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
11241 #undef TARGET_GEN_CCMP_NEXT
11242 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
11244 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if the target
11245 supports instruction fusion of some sort. */
11247 static bool
11248 aarch64_macro_fusion_p (void)
11250 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
11254 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
11255 should be kept together during scheduling. */
11257 static bool
11258 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
11260 rtx set_dest;
11261 rtx prev_set = single_set (prev);
11262 rtx curr_set = single_set (curr);
11263 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
11264 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
11266 if (!aarch64_macro_fusion_p ())
11267 return false;
11269 if (simple_sets_p
11270 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
11272 /* We are trying to match:
11273 prev (mov) == (set (reg r0) (const_int imm16))
11274 curr (movk) == (set (zero_extract (reg r0)
11275 (const_int 16)
11276 (const_int 16))
11277 (const_int imm16_1)) */
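/* In assembly terms this is a pair such as:
   mov w0, #imm16 followed by movk w0, #imm16_1, lsl #16. */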
11279 set_dest = SET_DEST (curr_set);
11281 if (GET_CODE (set_dest) == ZERO_EXTRACT
11282 && CONST_INT_P (SET_SRC (curr_set))
11283 && CONST_INT_P (SET_SRC (prev_set))
11284 && CONST_INT_P (XEXP (set_dest, 2))
11285 && INTVAL (XEXP (set_dest, 2)) == 16
11286 && REG_P (XEXP (set_dest, 0))
11287 && REG_P (SET_DEST (prev_set))
11288 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
11290 return true;
11294 if (simple_sets_p
11295 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
11298 /* We're trying to match:
11299 prev (adrp) == (set (reg r1)
11300 (high (symbol_ref ("SYM"))))
11301 curr (add) == (set (reg r0)
11302 (lo_sum (reg r1)
11303 (symbol_ref ("SYM"))))
11304 Note that r0 need not necessarily be the same as r1, especially
11305 during pre-regalloc scheduling. */
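/* In assembly terms: adrp x1, SYM followed by add x0, x1, :lo12:SYM. */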
11307 if (satisfies_constraint_Ush (SET_SRC (prev_set))
11308 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11310 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
11311 && REG_P (XEXP (SET_SRC (curr_set), 0))
11312 && REGNO (XEXP (SET_SRC (curr_set), 0))
11313 == REGNO (SET_DEST (prev_set))
11314 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
11315 XEXP (SET_SRC (curr_set), 1)))
11316 return true;
11320 if (simple_sets_p
11321 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
11324 /* We're trying to match:
11325 prev (movk) == (set (zero_extract (reg r0)
11326 (const_int 16)
11327 (const_int 32))
11328 (const_int imm16_1))
11329 curr (movk) == (set (zero_extract (reg r0)
11330 (const_int 16)
11331 (const_int 48))
11332 (const_int imm16_2)) */
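/* In assembly terms: movk x0, #imm16_1, lsl #32 followed by
   movk x0, #imm16_2, lsl #48. */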
11334 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
11335 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
11336 && REG_P (XEXP (SET_DEST (prev_set), 0))
11337 && REG_P (XEXP (SET_DEST (curr_set), 0))
11338 && REGNO (XEXP (SET_DEST (prev_set), 0))
11339 == REGNO (XEXP (SET_DEST (curr_set), 0))
11340 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
11341 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
11342 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
11343 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
11344 && CONST_INT_P (SET_SRC (prev_set))
11345 && CONST_INT_P (SET_SRC (curr_set)))
11346 return true;
11349 if (simple_sets_p
11350 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
11352 /* We're trying to match:
11353 prev (adrp) == (set (reg r0)
11354 (high (symbol_ref ("SYM"))))
11355 curr (ldr) == (set (reg r1)
11356 (mem (lo_sum (reg r0)
11357 (symbol_ref ("SYM")))))
11359 curr (ldr) == (set (reg r1)
11360 (zero_extend (mem
11361 (lo_sum (reg r0)
11362 (symbol_ref ("SYM")))))) */
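/* In assembly terms: adrp x0, SYM followed by ldr w1, [x0, :lo12:SYM],
   possibly in a zero-extending (ldrb/ldrh) form. */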
11363 if (satisfies_constraint_Ush (SET_SRC (prev_set))
11364 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
11366 rtx curr_src = SET_SRC (curr_set);
11368 if (GET_CODE (curr_src) == ZERO_EXTEND)
11369 curr_src = XEXP (curr_src, 0);
11371 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
11372 && REG_P (XEXP (XEXP (curr_src, 0), 0))
11373 && REGNO (XEXP (XEXP (curr_src, 0), 0))
11374 == REGNO (SET_DEST (prev_set))
11375 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
11376 XEXP (SET_SRC (prev_set), 0)))
11377 return true;
11381 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
11382 && any_condjump_p (curr))
11384 enum attr_type prev_type = get_attr_type (prev);
11386 /* FIXME: this misses some cases that are considered simple arithmetic
11387 instructions for ThunderX. Simple shifts are missed here. */
11388 if (prev_type == TYPE_ALUS_SREG
11389 || prev_type == TYPE_ALUS_IMM
11390 || prev_type == TYPE_LOGICS_REG
11391 || prev_type == TYPE_LOGICS_IMM)
11392 return true;
11395 return false;
11398 /* If MEM is in the form of [base+offset], extract the two parts
11399 of the address into BASE and OFFSET, otherwise return false
11400 after clearing BASE and OFFSET. */
11402 bool
11403 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
11405 rtx addr;
11407 gcc_assert (MEM_P (mem));
11409 addr = XEXP (mem, 0);
11411 if (REG_P (addr))
11413 *base = addr;
11414 *offset = const0_rtx;
11415 return true;
11418 if (GET_CODE (addr) == PLUS
11419 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
11421 *base = XEXP (addr, 0);
11422 *offset = XEXP (addr, 1);
11423 return true;
11426 *base = NULL_RTX;
11427 *offset = NULL_RTX;
11429 return false;
11432 /* Types for scheduling fusion. */
11433 enum sched_fusion_type
11435 SCHED_FUSION_NONE = 0,
11436 SCHED_FUSION_LD_SIGN_EXTEND,
11437 SCHED_FUSION_LD_ZERO_EXTEND,
11438 SCHED_FUSION_LD,
11439 SCHED_FUSION_ST,
11440 SCHED_FUSION_NUM
11443 /* If INSN is a load or store whose address is in the form of [base+offset],
11444 extract the two parts into BASE and OFFSET. Return the scheduling
11445 fusion type of this INSN. */
11447 static enum sched_fusion_type
11448 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
11450 rtx x, dest, src;
11451 enum sched_fusion_type fusion = SCHED_FUSION_LD;
11453 gcc_assert (INSN_P (insn));
11454 x = PATTERN (insn);
11455 if (GET_CODE (x) != SET)
11456 return SCHED_FUSION_NONE;
11458 src = SET_SRC (x);
11459 dest = SET_DEST (x);
11461 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
11462 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
11463 return SCHED_FUSION_NONE;
11465 if (GET_CODE (src) == SIGN_EXTEND)
11467 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
11468 src = XEXP (src, 0);
11469 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11470 return SCHED_FUSION_NONE;
11472 else if (GET_CODE (src) == ZERO_EXTEND)
11474 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
11475 src = XEXP (src, 0);
11476 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
11477 return SCHED_FUSION_NONE;
11480 if (GET_CODE (src) == MEM && REG_P (dest))
11481 extract_base_offset_in_addr (src, base, offset);
11482 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
11484 fusion = SCHED_FUSION_ST;
11485 extract_base_offset_in_addr (dest, base, offset);
11487 else
11488 return SCHED_FUSION_NONE;
11490 if (*base == NULL_RTX || *offset == NULL_RTX)
11491 fusion = SCHED_FUSION_NONE;
11493 return fusion;
11496 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
11498 Currently we only support fusing ldr and str instructions, so FUSION_PRI
11499 and PRI are only calculated for these instructions. For other instructions,
11500 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
11501 types of instruction fusion can be added by returning different priorities.
11503 It's important that irrelevant instructions get the largest FUSION_PRI. */
11505 static void
11506 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
11507 int *fusion_pri, int *pri)
11509 int tmp, off_val;
11510 rtx base, offset;
11511 enum sched_fusion_type fusion;
11513 gcc_assert (INSN_P (insn));
11515 tmp = max_pri - 1;
11516 fusion = fusion_load_store (insn, &base, &offset);
11517 if (fusion == SCHED_FUSION_NONE)
11519 *pri = tmp;
11520 *fusion_pri = tmp;
11521 return;
11524 /* Set FUSION_PRI according to fusion type and base register. */
11525 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
11527 /* Calculate PRI. */
11528 tmp /= 2;
11530 /* The INSN with the smaller offset goes first. */
11531 off_val = (int)(INTVAL (offset));
11532 if (off_val >= 0)
11533 tmp -= (off_val & 0xfffff);
11534 else
11535 tmp += ((- off_val) & 0xfffff);
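/* For example, two SImode loads from the same base with offsets 4 and 8
   receive the same FUSION_PRI, while the offset-4 load gets the larger PRI
   and is therefore scheduled first. */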
11537 *pri = tmp;
11538 return;
11541 /* Given OPERANDS of consecutive load/store instructions, check if we can
11542 merge them into an ldp/stp. LOAD is true if they are load instructions.
11543 MODE is the mode of the memory operands. */
11545 bool
11546 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
11547 enum machine_mode mode)
11549 HOST_WIDE_INT offval_1, offval_2, msize;
11550 enum reg_class rclass_1, rclass_2;
11551 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
11553 if (load)
11555 mem_1 = operands[1];
11556 mem_2 = operands[3];
11557 reg_1 = operands[0];
11558 reg_2 = operands[2];
11559 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
11560 if (REGNO (reg_1) == REGNO (reg_2))
11561 return false;
11563 else
11565 mem_1 = operands[0];
11566 mem_2 = operands[2];
11567 reg_1 = operands[1];
11568 reg_2 = operands[3];
11571 /* The mems cannot be volatile. */
11572 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
11573 return false;
11575 /* Check if the addresses are in the form of [base+offset]. */
11576 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11577 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11578 return false;
11579 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11580 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11581 return false;
11583 /* Check if the bases are the same. */
11584 if (!rtx_equal_p (base_1, base_2))
11585 return false;
11587 offval_1 = INTVAL (offset_1);
11588 offval_2 = INTVAL (offset_2);
11589 msize = GET_MODE_SIZE (mode);
11590 /* Check if the offsets are consecutive. */
11591 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
11592 return false;
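/* For example, for SImode (msize == 4) the offset pairs 0x20/0x24 and
   0x24/0x20 are consecutive, while 0x20/0x28 is not. */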
11594 /* Check if the addresses are clobbered by load. */
11595 if (load)
11597 if (reg_mentioned_p (reg_1, mem_1))
11598 return false;
11600 /* In increasing order, the last load can clobber the address. */
11601 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
11602 return false;
11605 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11606 rclass_1 = FP_REGS;
11607 else
11608 rclass_1 = GENERAL_REGS;
11610 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11611 rclass_2 = FP_REGS;
11612 else
11613 rclass_2 = GENERAL_REGS;
11615 /* Check if the registers are of the same class. */
11616 if (rclass_1 != rclass_2)
11617 return false;
11619 return true;
11622 /* Given OPERANDS of consecutive load/store, check if we can merge
11623 them into ldp/stp by adjusting the offset. LOAD is true if they
11624 are load instructions. MODE is the mode of the memory operands.
11626 Given consecutive stores such as:
11628 str w1, [xb, 0x100]
11629 str w1, [xb, 0x104]
11630 str w1, [xb, 0x108]
11631 str w1, [xb, 0x10c]
11633 Though the offsets are out of the range supported by stp, we can
11634 still pair them after adjusting the offset, like:
11636 add scratch, xb, 0x100
11637 stp w1, w1, [scratch]
11638 stp w1, w1, [scratch, 0x8]
11640 The peephole patterns detecting this opportunity should guarantee
11641 the scratch register is available. */
11643 bool
11644 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
11645 enum machine_mode mode)
11647 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
11648 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
11649 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
11650 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
11652 if (load)
11654 reg_1 = operands[0];
11655 mem_1 = operands[1];
11656 reg_2 = operands[2];
11657 mem_2 = operands[3];
11658 reg_3 = operands[4];
11659 mem_3 = operands[5];
11660 reg_4 = operands[6];
11661 mem_4 = operands[7];
11662 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
11663 && REG_P (reg_3) && REG_P (reg_4));
11664 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
11665 return false;
11667 else
11669 mem_1 = operands[0];
11670 reg_1 = operands[1];
11671 mem_2 = operands[2];
11672 reg_2 = operands[3];
11673 mem_3 = operands[4];
11674 reg_3 = operands[5];
11675 mem_4 = operands[6];
11676 reg_4 = operands[7];
11678 /* Skip if the memory operand is by itself valid for ldp/stp. */
11679 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
11680 return false;
11682 /* The mems cannot be volatile. */
11683 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
11684 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
11685 return false;
11687 /* Check if the addresses are in the form of [base+offset]. */
11688 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
11689 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
11690 return false;
11691 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
11692 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
11693 return false;
11694 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
11695 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
11696 return false;
11697 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
11698 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
11699 return false;
11701 /* Check if the bases are the same. */
11702 if (!rtx_equal_p (base_1, base_2)
11703 || !rtx_equal_p (base_2, base_3)
11704 || !rtx_equal_p (base_3, base_4))
11705 return false;
11707 offval_1 = INTVAL (offset_1);
11708 offval_2 = INTVAL (offset_2);
11709 offval_3 = INTVAL (offset_3);
11710 offval_4 = INTVAL (offset_4);
11711 msize = GET_MODE_SIZE (mode);
11712 /* Check if the offsets are consecutive. */
11713 if ((offval_1 != (offval_2 + msize)
11714 || offval_1 != (offval_3 + msize * 2)
11715 || offval_1 != (offval_4 + msize * 3))
11716 && (offval_4 != (offval_3 + msize)
11717 || offval_4 != (offval_2 + msize * 2)
11718 || offval_4 != (offval_1 + msize * 3)))
11719 return false;
11721 /* Check if the addresses are clobbered by load. */
11722 if (load)
11724 if (reg_mentioned_p (reg_1, mem_1)
11725 || reg_mentioned_p (reg_2, mem_2)
11726 || reg_mentioned_p (reg_3, mem_3))
11727 return false;
11729 /* In increasing order, the last load can clobber the address. */
11730 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
11731 return false;
11734 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
11735 rclass_1 = FP_REGS;
11736 else
11737 rclass_1 = GENERAL_REGS;
11739 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
11740 rclass_2 = FP_REGS;
11741 else
11742 rclass_2 = GENERAL_REGS;
11744 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
11745 rclass_3 = FP_REGS;
11746 else
11747 rclass_3 = GENERAL_REGS;
11749 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
11750 rclass_4 = FP_REGS;
11751 else
11752 rclass_4 = GENERAL_REGS;
11754 /* Check if the registers are of the same class. */
11755 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
11756 return false;
11758 return true;
11761 /* Given OPERANDS of consecutive load/store, this function pairs them
11762 into ldp/stp after adjusting the offset. It depends on the fact
11763 that addresses of load/store instructions are in increasing order.
11764 MODE is the mode of memory operands. CODE is the rtl operator
11765 which should be applied to all memory operands, it's SIGN_EXTEND,
11766 ZERO_EXTEND or UNKNOWN. */
11768 bool
11769 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
11770 enum machine_mode mode, RTX_CODE code)
11772 rtx base, offset, t1, t2;
11773 rtx mem_1, mem_2, mem_3, mem_4;
11774 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
11776 if (load)
11778 mem_1 = operands[1];
11779 mem_2 = operands[3];
11780 mem_3 = operands[5];
11781 mem_4 = operands[7];
11783 else
11785 mem_1 = operands[0];
11786 mem_2 = operands[2];
11787 mem_3 = operands[4];
11788 mem_4 = operands[6];
11789 gcc_assert (code == UNKNOWN);
11792 extract_base_offset_in_addr (mem_1, &base, &offset);
11793 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
11795 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
11796 msize = GET_MODE_SIZE (mode);
11797 stp_off_limit = msize * 0x40;
11798 off_val = INTVAL (offset);
11799 abs_off = (off_val < 0) ? -off_val : off_val;
11800 new_off = abs_off % stp_off_limit;
11801 adj_off = abs_off - new_off;
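/* For the SImode stores shown in the comment before
   aarch64_operands_adjust_ok_for_ldpstp (msize == 4, stp_off_limit == 0x100,
   off_val == 0x100) this gives new_off == 0 and adj_off == 0x100, producing
   the add/stp sequence shown there. */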
11803 /* Further adjust to make sure all offsets are OK. */
11804 if ((new_off + msize * 2) >= stp_off_limit)
11806 adj_off += stp_off_limit;
11807 new_off -= stp_off_limit;
11810 /* Make sure the adjustment can be done with ADD/SUB instructions. */
11811 if (adj_off >= 0x1000)
11812 return false;
11814 if (off_val < 0)
11816 adj_off = -adj_off;
11817 new_off = -new_off;
11820 /* Create new memory references. */
11821 mem_1 = change_address (mem_1, VOIDmode,
11822 plus_constant (DImode, operands[8], new_off));
11824 /* Check if the adjusted address is OK for ldp/stp. */
11825 if (!aarch64_mem_pair_operand (mem_1, mode))
11826 return false;
11828 msize = GET_MODE_SIZE (mode);
11829 mem_2 = change_address (mem_2, VOIDmode,
11830 plus_constant (DImode,
11831 operands[8],
11832 new_off + msize));
11833 mem_3 = change_address (mem_3, VOIDmode,
11834 plus_constant (DImode,
11835 operands[8],
11836 new_off + msize * 2));
11837 mem_4 = change_address (mem_4, VOIDmode,
11838 plus_constant (DImode,
11839 operands[8],
11840 new_off + msize * 3));
11842 if (code == ZERO_EXTEND)
11844 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
11845 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
11846 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
11847 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
11849 else if (code == SIGN_EXTEND)
11851 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
11852 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
11853 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
11854 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
11857 if (load)
11859 operands[1] = mem_1;
11860 operands[3] = mem_2;
11861 operands[5] = mem_3;
11862 operands[7] = mem_4;
11864 else
11866 operands[0] = mem_1;
11867 operands[2] = mem_2;
11868 operands[4] = mem_3;
11869 operands[6] = mem_4;
11872 /* Emit adjusting instruction. */
11873 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
11874 /* Emit ldp/stp instructions. */
11875 t1 = gen_rtx_SET (operands[0], operands[1]);
11876 t2 = gen_rtx_SET (operands[2], operands[3]);
11877 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11878 t1 = gen_rtx_SET (operands[4], operands[5]);
11879 t2 = gen_rtx_SET (operands[6], operands[7]);
11880 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
11881 return true;
11884 /* Return true if a pseudo register should be created and used to hold
11885 the GOT address for PIC code. */
11887 bool
11888 aarch64_use_pseudo_pic_reg (void)
11890 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
11893 #undef TARGET_ADDRESS_COST
11894 #define TARGET_ADDRESS_COST aarch64_address_cost
11896 /* This hook determines whether unnamed bitfields affect the alignment
11897 of the containing structure. The hook returns true if the structure
11898 should inherit the alignment requirements of an unnamed bitfield's
11899 type. */
11900 #undef TARGET_ALIGN_ANON_BITFIELD
11901 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
11903 #undef TARGET_ASM_ALIGNED_DI_OP
11904 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
11906 #undef TARGET_ASM_ALIGNED_HI_OP
11907 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
11909 #undef TARGET_ASM_ALIGNED_SI_OP
11910 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
11912 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
11913 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
11914 hook_bool_const_tree_hwi_hwi_const_tree_true
11916 #undef TARGET_ASM_FILE_START
11917 #define TARGET_ASM_FILE_START aarch64_start_file
11919 #undef TARGET_ASM_OUTPUT_MI_THUNK
11920 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
11922 #undef TARGET_ASM_SELECT_RTX_SECTION
11923 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
11925 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
11926 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
11928 #undef TARGET_BUILD_BUILTIN_VA_LIST
11929 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
11931 #undef TARGET_CALLEE_COPIES
11932 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
11934 #undef TARGET_CAN_ELIMINATE
11935 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
11937 #undef TARGET_CANNOT_FORCE_CONST_MEM
11938 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
11940 #undef TARGET_CONDITIONAL_REGISTER_USAGE
11941 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
11943 /* Only the least significant bit is used for initialization guard
11944 variables. */
11945 #undef TARGET_CXX_GUARD_MASK_BIT
11946 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
11948 #undef TARGET_C_MODE_FOR_SUFFIX
11949 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
11951 #ifdef TARGET_BIG_ENDIAN_DEFAULT
11952 #undef TARGET_DEFAULT_TARGET_FLAGS
11953 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
11954 #endif
11956 #undef TARGET_CLASS_MAX_NREGS
11957 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
11959 #undef TARGET_BUILTIN_DECL
11960 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
11962 #undef TARGET_EXPAND_BUILTIN
11963 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
11965 #undef TARGET_EXPAND_BUILTIN_VA_START
11966 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
11968 #undef TARGET_FOLD_BUILTIN
11969 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
11971 #undef TARGET_FUNCTION_ARG
11972 #define TARGET_FUNCTION_ARG aarch64_function_arg
11974 #undef TARGET_FUNCTION_ARG_ADVANCE
11975 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
11977 #undef TARGET_FUNCTION_ARG_BOUNDARY
11978 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
11980 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
11981 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
11983 #undef TARGET_FUNCTION_VALUE
11984 #define TARGET_FUNCTION_VALUE aarch64_function_value
11986 #undef TARGET_FUNCTION_VALUE_REGNO_P
11987 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
11989 #undef TARGET_FRAME_POINTER_REQUIRED
11990 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
11992 #undef TARGET_GIMPLE_FOLD_BUILTIN
11993 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
11995 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
11996 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
11998 #undef TARGET_INIT_BUILTINS
11999 #define TARGET_INIT_BUILTINS aarch64_init_builtins
12001 #undef TARGET_LEGITIMATE_ADDRESS_P
12002 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
12004 #undef TARGET_LEGITIMATE_CONSTANT_P
12005 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
12007 #undef TARGET_LIBGCC_CMP_RETURN_MODE
12008 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
12010 #undef TARGET_LRA_P
12011 #define TARGET_LRA_P hook_bool_void_true
12013 #undef TARGET_MANGLE_TYPE
12014 #define TARGET_MANGLE_TYPE aarch64_mangle_type
12016 #undef TARGET_MEMORY_MOVE_COST
12017 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
12019 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
12020 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
12022 #undef TARGET_MUST_PASS_IN_STACK
12023 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
12025 /* This target hook should return true if accesses to volatile bitfields
12026 should use the narrowest mode possible. It should return false if these
12027 accesses should use the bitfield container type. */
12028 #undef TARGET_NARROW_VOLATILE_BITFIELD
12029 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
12031 #undef TARGET_OPTION_OVERRIDE
12032 #define TARGET_OPTION_OVERRIDE aarch64_override_options
12034 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
12035 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
12036 aarch64_override_options_after_change
12038 #undef TARGET_PASS_BY_REFERENCE
12039 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
12041 #undef TARGET_PREFERRED_RELOAD_CLASS
12042 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
12044 #undef TARGET_SCHED_REASSOCIATION_WIDTH
12045 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
12047 #undef TARGET_SECONDARY_RELOAD
12048 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
12050 #undef TARGET_SHIFT_TRUNCATION_MASK
12051 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
12053 #undef TARGET_SETUP_INCOMING_VARARGS
12054 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
12056 #undef TARGET_STRUCT_VALUE_RTX
12057 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
12059 #undef TARGET_REGISTER_MOVE_COST
12060 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
12062 #undef TARGET_RETURN_IN_MEMORY
12063 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
12065 #undef TARGET_RETURN_IN_MSB
12066 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
12068 #undef TARGET_RTX_COSTS
12069 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
12071 #undef TARGET_SCHED_ISSUE_RATE
12072 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
12074 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
12075 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
12076 aarch64_sched_first_cycle_multipass_dfa_lookahead
12078 #undef TARGET_TRAMPOLINE_INIT
12079 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
12081 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
12082 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
12084 #undef TARGET_VECTOR_MODE_SUPPORTED_P
12085 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
12087 #undef TARGET_ARRAY_MODE_SUPPORTED_P
12088 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
12090 #undef TARGET_VECTORIZE_ADD_STMT_COST
12091 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
12093 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
12094 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
12095 aarch64_builtin_vectorization_cost
12097 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
12098 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
12100 #undef TARGET_VECTORIZE_BUILTINS
12101 #define TARGET_VECTORIZE_BUILTINS
12103 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
12104 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
12105 aarch64_builtin_vectorized_function
12107 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
12108 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
12109 aarch64_autovectorize_vector_sizes
12111 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
12112 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
12113 aarch64_atomic_assign_expand_fenv
12115 /* Section anchor support. */
12117 #undef TARGET_MIN_ANCHOR_OFFSET
12118 #define TARGET_MIN_ANCHOR_OFFSET -256
12120 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
12121 byte offset; we can do much more for larger data types, but have no way
12122 to determine the size of the access. We assume accesses are aligned. */
12123 #undef TARGET_MAX_ANCHOR_OFFSET
12124 #define TARGET_MAX_ANCHOR_OFFSET 4095
12126 #undef TARGET_VECTOR_ALIGNMENT
12127 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
12129 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
12130 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
12131 aarch64_simd_vector_alignment_reachable
12133 /* vec_perm support. */
12135 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
12136 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
12137 aarch64_vectorize_vec_perm_const_ok
12140 #undef TARGET_FIXED_CONDITION_CODE_REGS
12141 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
12143 #undef TARGET_FLAGS_REGNUM
12144 #define TARGET_FLAGS_REGNUM CC_REGNUM
12146 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
12147 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
12149 #undef TARGET_ASAN_SHADOW_OFFSET
12150 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
12152 #undef TARGET_LEGITIMIZE_ADDRESS
12153 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
12155 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
12156 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
12157 aarch64_use_by_pieces_infrastructure_p
12159 #undef TARGET_CAN_USE_DOLOOP_P
12160 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
12162 #undef TARGET_SCHED_MACRO_FUSION_P
12163 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
12165 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
12166 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
12168 #undef TARGET_SCHED_FUSION_PRIORITY
12169 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
12171 #undef TARGET_USE_PSEUDO_PIC_REG
12172 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
12174 struct gcc_target targetm = TARGET_INITIALIZER;
12176 #include "gt-aarch64.h"