1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "cfghooks.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "rtl.h"
29 #include "df.h"
30 #include "insn-codes.h"
31 #include "insn-attr.h"
32 #include "alias.h"
33 #include "fold-const.h"
34 #include "stringpool.h"
35 #include "stor-layout.h"
36 #include "calls.h"
37 #include "varasm.h"
38 #include "regs.h"
39 #include "cfgrtl.h"
40 #include "cfganal.h"
41 #include "lcm.h"
42 #include "cfgbuild.h"
43 #include "cfgcleanup.h"
44 #include "output.h"
45 #include "flags.h"
46 #include "insn-config.h"
47 #include "expmed.h"
48 #include "dojump.h"
49 #include "explow.h"
50 #include "emit-rtl.h"
51 #include "stmt.h"
52 #include "expr.h"
53 #include "reload.h"
54 #include "toplev.h"
55 #include "target.h"
56 #include "targhooks.h"
57 #include "tm_p.h"
58 #include "recog.h"
59 #include "langhooks.h"
60 #include "opts.h"
61 #include "diagnostic.h"
62 #include "diagnostic-core.h"
63 #include "internal-fn.h"
64 #include "gimple-fold.h"
65 #include "tree-eh.h"
66 #include "gimplify.h"
67 #include "optabs.h"
68 #include "dwarf2.h"
69 #include "cfgloop.h"
70 #include "tree-vectorizer.h"
71 #include "aarch64-cost-tables.h"
72 #include "dumpfile.h"
73 #include "builtins.h"
74 #include "rtl-iter.h"
75 #include "tm-constrs.h"
76 #include "sched-int.h"
77 #include "cortex-a57-fma-steering.h"
78 #include "target-globals.h"
80 /* This file should be included last. */
81 #include "target-def.h"
83 /* Defined for convenience. */
84 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
86 /* Classifies an address.
88 ADDRESS_REG_IMM
89 A simple base register plus immediate offset.
91 ADDRESS_REG_WB
92 A base register indexed by immediate offset with writeback.
94 ADDRESS_REG_REG
95 A base register indexed by (optionally scaled) register.
97 ADDRESS_REG_UXTW
98 A base register indexed by (optionally scaled) zero-extended register.
100 ADDRESS_REG_SXTW
101 A base register indexed by (optionally scaled) sign-extended register.
103 ADDRESS_LO_SUM
104 A LO_SUM rtx with a base register and "LO12" symbol relocation.
106 ADDRESS_SYMBOLIC:
107 A constant symbolic address, in pc-relative literal pool. */
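/* Roughly, these classes correspond to the following assembly forms:
     ADDRESS_REG_IMM    [x0, #16]
     ADDRESS_REG_WB     [x0, #16]! or [x0], #16
     ADDRESS_REG_REG    [x0, x1, lsl #3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw #2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw #2]
     ADDRESS_LO_SUM     [x0, #:lo12:symbol]
     ADDRESS_SYMBOLIC   a pc-relative literal-pool reference, e.g. ldr x0, .LC0.  */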
109 enum aarch64_address_type {
110 ADDRESS_REG_IMM,
111 ADDRESS_REG_WB,
112 ADDRESS_REG_REG,
113 ADDRESS_REG_UXTW,
114 ADDRESS_REG_SXTW,
115 ADDRESS_LO_SUM,
116 ADDRESS_SYMBOLIC
119 struct aarch64_address_info {
120 enum aarch64_address_type type;
121 rtx base;
122 rtx offset;
123 int shift;
124 enum aarch64_symbol_type symbol_type;
127 struct simd_immediate_info
129 rtx value;
130 int shift;
131 int element_width;
132 bool mvn;
133 bool msl;
136 /* The current code model. */
137 enum aarch64_code_model aarch64_cmodel;
139 #ifdef HAVE_AS_TLS
140 #undef TARGET_HAVE_TLS
141 #define TARGET_HAVE_TLS 1
142 #endif
144 static bool aarch64_composite_type_p (const_tree, machine_mode);
145 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
146 const_tree,
147 machine_mode *, int *,
148 bool *);
149 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
150 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
151 static void aarch64_override_options_after_change (void);
152 static bool aarch64_vector_mode_supported_p (machine_mode);
153 static unsigned bit_count (unsigned HOST_WIDE_INT);
154 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
155 const unsigned char *sel);
156 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
158 /* Major revision number of the ARM Architecture implemented by the target. */
159 unsigned aarch64_architecture_version;
161 /* The processor for which instructions should be scheduled. */
162 enum aarch64_processor aarch64_tune = cortexa53;
164 /* Mask to specify which instruction scheduling options should be used. */
165 unsigned long aarch64_tune_flags = 0;
167 /* Support for command line parsing of boolean flags in the tuning
168 structures. */
169 struct aarch64_flag_desc
171 const char* name;
172 unsigned int flag;
175 #define AARCH64_FUSION_PAIR(name, internal_name, y) \
176 { name, AARCH64_FUSE_##internal_name },
177 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
179 { "none", AARCH64_FUSE_NOTHING },
180 #include "aarch64-fusion-pairs.def"
181 { "all", AARCH64_FUSE_ALL },
182 { NULL, AARCH64_FUSE_NOTHING }
 184 #undef AARCH64_FUSION_PAIR
186 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name, y) \
187 { name, AARCH64_EXTRA_TUNE_##internal_name },
188 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
190 { "none", AARCH64_EXTRA_TUNE_NONE },
191 #include "aarch64-tuning-flags.def"
192 { "all", AARCH64_EXTRA_TUNE_ALL },
193 { NULL, AARCH64_EXTRA_TUNE_NONE }
195 #undef AARCH64_EXTRA_TUNING_OPTION
197 /* Tuning parameters. */
199 static const struct cpu_addrcost_table generic_addrcost_table =
202 0, /* hi */
203 0, /* si */
204 0, /* di */
205 0, /* ti */
207 0, /* pre_modify */
208 0, /* post_modify */
209 0, /* register_offset */
210 0, /* register_extend */
211 0 /* imm_offset */
214 static const struct cpu_addrcost_table cortexa57_addrcost_table =
217 1, /* hi */
218 0, /* si */
219 0, /* di */
220 1, /* ti */
222 0, /* pre_modify */
223 0, /* post_modify */
224 0, /* register_offset */
225 0, /* register_extend */
226 0, /* imm_offset */
229 static const struct cpu_addrcost_table xgene1_addrcost_table =
232 1, /* hi */
233 0, /* si */
234 0, /* di */
235 1, /* ti */
237 1, /* pre_modify */
238 0, /* post_modify */
239 0, /* register_offset */
240 1, /* register_extend */
241 0, /* imm_offset */
244 static const struct cpu_regmove_cost generic_regmove_cost =
246 1, /* GP2GP */
247 /* Avoid the use of slow int<->fp moves for spilling by setting
248 their cost higher than memmov_cost. */
249 5, /* GP2FP */
250 5, /* FP2GP */
251 2 /* FP2FP */
254 static const struct cpu_regmove_cost cortexa57_regmove_cost =
256 1, /* GP2GP */
257 /* Avoid the use of slow int<->fp moves for spilling by setting
258 their cost higher than memmov_cost. */
259 5, /* GP2FP */
260 5, /* FP2GP */
261 2 /* FP2FP */
264 static const struct cpu_regmove_cost cortexa53_regmove_cost =
266 1, /* GP2GP */
267 /* Avoid the use of slow int<->fp moves for spilling by setting
268 their cost higher than memmov_cost. */
269 5, /* GP2FP */
270 5, /* FP2GP */
271 2 /* FP2FP */
274 static const struct cpu_regmove_cost thunderx_regmove_cost =
276 2, /* GP2GP */
277 2, /* GP2FP */
278 6, /* FP2GP */
279 4 /* FP2FP */
282 static const struct cpu_regmove_cost xgene1_regmove_cost =
284 1, /* GP2GP */
285 /* Avoid the use of slow int<->fp moves for spilling by setting
286 their cost higher than memmov_cost. */
287 8, /* GP2FP */
288 8, /* FP2GP */
289 2 /* FP2FP */
292 /* Generic costs for vector insn classes. */
293 static const struct cpu_vector_cost generic_vector_cost =
295 1, /* scalar_stmt_cost */
296 1, /* scalar_load_cost */
297 1, /* scalar_store_cost */
298 1, /* vec_stmt_cost */
299 1, /* vec_to_scalar_cost */
300 1, /* scalar_to_vec_cost */
301 1, /* vec_align_load_cost */
302 1, /* vec_unalign_load_cost */
303 1, /* vec_unalign_store_cost */
304 1, /* vec_store_cost */
305 3, /* cond_taken_branch_cost */
306 1 /* cond_not_taken_branch_cost */
 309 /* Costs for vector insn classes for Cortex-A57.  */
310 static const struct cpu_vector_cost cortexa57_vector_cost =
312 1, /* scalar_stmt_cost */
313 4, /* scalar_load_cost */
314 1, /* scalar_store_cost */
315 3, /* vec_stmt_cost */
316 8, /* vec_to_scalar_cost */
317 8, /* scalar_to_vec_cost */
318 5, /* vec_align_load_cost */
319 5, /* vec_unalign_load_cost */
320 1, /* vec_unalign_store_cost */
321 1, /* vec_store_cost */
322 1, /* cond_taken_branch_cost */
323 1 /* cond_not_taken_branch_cost */
 326 /* Costs for vector insn classes for X-Gene 1.  */
327 static const struct cpu_vector_cost xgene1_vector_cost =
329 1, /* scalar_stmt_cost */
330 5, /* scalar_load_cost */
331 1, /* scalar_store_cost */
332 2, /* vec_stmt_cost */
333 4, /* vec_to_scalar_cost */
334 4, /* scalar_to_vec_cost */
335 10, /* vec_align_load_cost */
336 10, /* vec_unalign_load_cost */
337 2, /* vec_unalign_store_cost */
338 2, /* vec_store_cost */
339 2, /* cond_taken_branch_cost */
340 1 /* cond_not_taken_branch_cost */
343 /* Generic costs for branch instructions. */
344 static const struct cpu_branch_cost generic_branch_cost =
346 2, /* Predictable. */
347 2 /* Unpredictable. */
350 static const struct tune_params generic_tunings =
352 &cortexa57_extra_costs,
353 &generic_addrcost_table,
354 &generic_regmove_cost,
355 &generic_vector_cost,
356 &generic_branch_cost,
357 4, /* memmov_cost */
358 2, /* issue_rate */
359 AARCH64_FUSE_NOTHING, /* fusible_ops */
360 8, /* function_align. */
361 8, /* jump_align. */
362 4, /* loop_align. */
363 2, /* int_reassoc_width. */
364 4, /* fp_reassoc_width. */
365 1, /* vec_reassoc_width. */
366 2, /* min_div_recip_mul_sf. */
367 2, /* min_div_recip_mul_df. */
368 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
371 static const struct tune_params cortexa53_tunings =
373 &cortexa53_extra_costs,
374 &generic_addrcost_table,
375 &cortexa53_regmove_cost,
376 &generic_vector_cost,
377 &generic_branch_cost,
378 4, /* memmov_cost */
379 2, /* issue_rate */
380 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
381 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
382 8, /* function_align. */
383 8, /* jump_align. */
384 4, /* loop_align. */
385 2, /* int_reassoc_width. */
386 4, /* fp_reassoc_width. */
387 1, /* vec_reassoc_width. */
388 2, /* min_div_recip_mul_sf. */
389 2, /* min_div_recip_mul_df. */
390 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
393 static const struct tune_params cortexa57_tunings =
395 &cortexa57_extra_costs,
396 &cortexa57_addrcost_table,
397 &cortexa57_regmove_cost,
398 &cortexa57_vector_cost,
399 &generic_branch_cost,
400 4, /* memmov_cost */
401 3, /* issue_rate */
402 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
403 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
404 16, /* function_align. */
405 8, /* jump_align. */
406 4, /* loop_align. */
407 2, /* int_reassoc_width. */
408 4, /* fp_reassoc_width. */
409 1, /* vec_reassoc_width. */
410 2, /* min_div_recip_mul_sf. */
411 2, /* min_div_recip_mul_df. */
412 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
415 static const struct tune_params cortexa72_tunings =
417 &cortexa57_extra_costs,
418 &cortexa57_addrcost_table,
419 &cortexa57_regmove_cost,
420 &cortexa57_vector_cost,
421 &generic_branch_cost,
422 4, /* memmov_cost */
423 3, /* issue_rate */
424 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
425 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
426 16, /* function_align. */
427 8, /* jump_align. */
428 4, /* loop_align. */
429 2, /* int_reassoc_width. */
430 4, /* fp_reassoc_width. */
431 1, /* vec_reassoc_width. */
432 2, /* min_div_recip_mul_sf. */
433 2, /* min_div_recip_mul_df. */
434 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
437 static const struct tune_params thunderx_tunings =
439 &thunderx_extra_costs,
440 &generic_addrcost_table,
441 &thunderx_regmove_cost,
442 &generic_vector_cost,
443 &generic_branch_cost,
444 6, /* memmov_cost */
445 2, /* issue_rate */
446 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
447 8, /* function_align. */
448 8, /* jump_align. */
449 8, /* loop_align. */
450 2, /* int_reassoc_width. */
451 4, /* fp_reassoc_width. */
452 1, /* vec_reassoc_width. */
453 2, /* min_div_recip_mul_sf. */
454 2, /* min_div_recip_mul_df. */
455 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
458 static const struct tune_params xgene1_tunings =
460 &xgene1_extra_costs,
461 &xgene1_addrcost_table,
462 &xgene1_regmove_cost,
463 &xgene1_vector_cost,
464 &generic_branch_cost,
465 6, /* memmov_cost */
466 4, /* issue_rate */
467 AARCH64_FUSE_NOTHING, /* fusible_ops */
468 16, /* function_align. */
469 8, /* jump_align. */
470 16, /* loop_align. */
471 2, /* int_reassoc_width. */
472 4, /* fp_reassoc_width. */
473 1, /* vec_reassoc_width. */
474 2, /* min_div_recip_mul_sf. */
475 2, /* min_div_recip_mul_df. */
476 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
479 /* Support for fine-grained override of the tuning structures. */
480 struct aarch64_tuning_override_function
482 const char* name;
483 void (*parse_override)(const char*, struct tune_params*);
486 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
487 static void aarch64_parse_tune_string (const char*, struct tune_params*);
489 static const struct aarch64_tuning_override_function
490 aarch64_tuning_override_functions[] =
492 { "fuse", aarch64_parse_fuse_string },
493 { "tune", aarch64_parse_tune_string },
494 { NULL, NULL }
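/* The tables above and these parse functions back the -moverride option,
   which lets the individual fusion pairs and extra tuning flags listed in
   aarch64-fusion-pairs.def and aarch64-tuning-flags.def be overridden by
   name on the command line.  */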
497 /* A processor implementing AArch64. */
498 struct processor
500 const char *const name;
501 enum aarch64_processor ident;
502 enum aarch64_processor sched_core;
503 enum aarch64_arch arch;
504 unsigned architecture_version;
505 const unsigned long flags;
506 const struct tune_params *const tune;
509 /* Architectures implementing AArch64. */
510 static const struct processor all_architectures[] =
512 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
513 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
514 #include "aarch64-arches.def"
515 #undef AARCH64_ARCH
516 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
519 /* Processor cores implementing AArch64. */
520 static const struct processor all_cores[] =
522 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
523 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
524 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
525 FLAGS, &COSTS##_tunings},
526 #include "aarch64-cores.def"
527 #undef AARCH64_CORE
528 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
529 AARCH64_FL_FOR_ARCH8, &generic_tunings},
530 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
534 /* Target specification. These are populated by the -march, -mtune, -mcpu
535 handling code or by target attributes. */
536 static const struct processor *selected_arch;
537 static const struct processor *selected_cpu;
538 static const struct processor *selected_tune;
540 /* The current tuning set. */
541 struct tune_params aarch64_tune_params = generic_tunings;
543 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
545 /* An ISA extension in the co-processor and main instruction set space. */
546 struct aarch64_option_extension
548 const char *const name;
549 const unsigned long flags_on;
550 const unsigned long flags_off;
553 /* ISA extensions in AArch64. */
554 static const struct aarch64_option_extension all_extensions[] =
556 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
557 {NAME, FLAGS_ON, FLAGS_OFF},
558 #include "aarch64-option-extensions.def"
559 #undef AARCH64_OPT_EXTENSION
560 {NULL, 0, 0}
563 /* Used to track the size of an address when generating a pre/post
564 increment address. */
565 static machine_mode aarch64_memory_reference_mode;
567 /* A table of valid AArch64 "bitmask immediate" values for
568 logical instructions. */
570 #define AARCH64_NUM_BITMASKS 5334
571 static unsigned HOST_WIDE_INT aarch64_bitmasks[AARCH64_NUM_BITMASKS];
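/* A bitmask immediate is built from a 2, 4, 8, 16, 32 or 64-bit element
   containing a rotated contiguous run of set bits, replicated to fill
   64 bits; 5334 is the number of distinct such values (0 and ~0 are not
   representable, 0x00ff00ff00ff00ff is).  */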
573 typedef enum aarch64_cond_code
575 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
576 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
577 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
579 aarch64_cc;
581 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
583 /* The condition codes of the processor, and the inverse function. */
584 static const char * const aarch64_condition_codes[] =
586 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
587 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
590 void
591 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
593 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
594 if (TARGET_GENERAL_REGS_ONLY)
595 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
596 else
597 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
600 static unsigned int
601 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
603 if (GET_MODE_UNIT_SIZE (mode) == 4)
604 return aarch64_tune_params.min_div_recip_mul_sf;
605 return aarch64_tune_params.min_div_recip_mul_df;
608 static int
609 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
610 enum machine_mode mode)
612 if (VECTOR_MODE_P (mode))
613 return aarch64_tune_params.vec_reassoc_width;
614 if (INTEGRAL_MODE_P (mode))
615 return aarch64_tune_params.int_reassoc_width;
616 if (FLOAT_MODE_P (mode))
617 return aarch64_tune_params.fp_reassoc_width;
618 return 1;
621 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
622 unsigned
623 aarch64_dbx_register_number (unsigned regno)
625 if (GP_REGNUM_P (regno))
626 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
627 else if (regno == SP_REGNUM)
628 return AARCH64_DWARF_SP;
629 else if (FP_REGNUM_P (regno))
630 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
632 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
633 equivalent DWARF register. */
634 return DWARF_FRAME_REGISTERS;
637 /* Return TRUE if MODE is any of the large INT modes. */
638 static bool
639 aarch64_vect_struct_mode_p (machine_mode mode)
641 return mode == OImode || mode == CImode || mode == XImode;
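/* OImode, CImode and XImode are the 256-, 384- and 512-bit opaque integer
   modes used for the 2-, 3- and 4-register tuples of the Advanced SIMD
   structure load/store instructions (LD2/ST2, LD3/ST3, LD4/ST4).  */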
644 /* Return TRUE if MODE is any of the vector modes. */
645 static bool
646 aarch64_vector_mode_p (machine_mode mode)
648 return aarch64_vector_mode_supported_p (mode)
649 || aarch64_vect_struct_mode_p (mode);
652 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
653 static bool
654 aarch64_array_mode_supported_p (machine_mode mode,
655 unsigned HOST_WIDE_INT nelems)
657 if (TARGET_SIMD
658 && AARCH64_VALID_SIMD_QREG_MODE (mode)
659 && (nelems >= 2 && nelems <= 4))
660 return true;
662 return false;
665 /* Implement HARD_REGNO_NREGS. */
668 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
670 switch (aarch64_regno_regclass (regno))
672 case FP_REGS:
673 case FP_LO_REGS:
674 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
675 default:
676 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
678 gcc_unreachable ();
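/* For example, a 128-bit TImode value needs one 128-bit vector register
   but two 64-bit general registers, hence the split between
   UNITS_PER_VREG (16) for FP_REGS/FP_LO_REGS and UNITS_PER_WORD (8) for
   everything else.  */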
681 /* Implement HARD_REGNO_MODE_OK. */
684 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
686 if (GET_MODE_CLASS (mode) == MODE_CC)
687 return regno == CC_REGNUM;
689 if (regno == SP_REGNUM)
690 /* The purpose of comparing with ptr_mode is to support the
691 global register variable associated with the stack pointer
692 register via the syntax of asm ("wsp") in ILP32. */
693 return mode == Pmode || mode == ptr_mode;
695 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
696 return mode == Pmode;
698 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
699 return 1;
701 if (FP_REGNUM_P (regno))
703 if (aarch64_vect_struct_mode_p (mode))
704 return
705 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
706 else
707 return 1;
710 return 0;
713 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
714 machine_mode
715 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
716 machine_mode mode)
718 /* Handle modes that fit within single registers. */
719 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
721 if (GET_MODE_SIZE (mode) >= 4)
722 return mode;
723 else
724 return SImode;
726 /* Fall back to generic for multi-reg and very large modes. */
727 else
728 return choose_hard_reg_mode (regno, nregs, false);
731 /* Return true if calls to DECL should be treated as
 732 long-calls (i.e. called via a register).  */
733 static bool
734 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
736 return false;
739 /* Return true if calls to symbol-ref SYM should be treated as
 740 long-calls (i.e. called via a register).  */
741 bool
742 aarch64_is_long_call_p (rtx sym)
744 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
747 /* Return true if the offsets to a zero/sign-extract operation
748 represent an expression that matches an extend operation. The
 749 operands represent the parameters from
751 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
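/* For example, in DImode, MULT_IMM == 4 and EXTRACT_IMM == 34 satisfy the
   checks below: taking the low 34 bits of (reg * 4) yields the low 32 bits
   of REG shifted left by 2, i.e. a 32-bit value extended and scaled by 4.  */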
752 bool
753 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
754 rtx extract_imm)
756 HOST_WIDE_INT mult_val, extract_val;
758 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
759 return false;
761 mult_val = INTVAL (mult_imm);
762 extract_val = INTVAL (extract_imm);
764 if (extract_val > 8
765 && extract_val < GET_MODE_BITSIZE (mode)
766 && exact_log2 (extract_val & ~7) > 0
767 && (extract_val & 7) <= 4
768 && mult_val == (1 << (extract_val & 7)))
769 return true;
771 return false;
774 /* Emit an insn that's a simple single-set. Both the operands must be
775 known to be valid. */
776 inline static rtx
777 emit_set_insn (rtx x, rtx y)
779 return emit_insn (gen_rtx_SET (x, y));
782 /* X and Y are two things to compare using CODE. Emit the compare insn and
783 return the rtx for register 0 in the proper mode. */
785 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
787 machine_mode mode = SELECT_CC_MODE (code, x, y);
788 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
790 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
791 return cc_reg;
794 /* Build the SYMBOL_REF for __tls_get_addr. */
796 static GTY(()) rtx tls_get_addr_libfunc;
799 aarch64_tls_get_addr (void)
801 if (!tls_get_addr_libfunc)
802 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
803 return tls_get_addr_libfunc;
806 /* Return the TLS model to use for ADDR. */
808 static enum tls_model
809 tls_symbolic_operand_type (rtx addr)
811 enum tls_model tls_kind = TLS_MODEL_NONE;
812 rtx sym, addend;
814 if (GET_CODE (addr) == CONST)
816 split_const (addr, &sym, &addend);
817 if (GET_CODE (sym) == SYMBOL_REF)
818 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
820 else if (GET_CODE (addr) == SYMBOL_REF)
821 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
823 return tls_kind;
 826 /* We allow LO_SUMs in our legitimate addresses so that combine
 827 can take care of merging addresses where necessary, but for
 828 code-generation purposes we generate the address
829 as :
830 RTL Absolute
831 tmp = hi (symbol_ref); adrp x1, foo
832 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
835 PIC TLS
836 adrp x1, :got:foo adrp tmp, :tlsgd:foo
837 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
838 bl __tls_get_addr
841 Load TLS symbol, depending on TLS mechanism and TLS access model.
843 Global Dynamic - Traditional TLS:
844 adrp tmp, :tlsgd:imm
845 add dest, tmp, #:tlsgd_lo12:imm
846 bl __tls_get_addr
848 Global Dynamic - TLS Descriptors:
849 adrp dest, :tlsdesc:imm
850 ldr tmp, [dest, #:tlsdesc_lo12:imm]
851 add dest, dest, #:tlsdesc_lo12:imm
852 blr tmp
853 mrs tp, tpidr_el0
854 add dest, dest, tp
856 Initial Exec:
857 mrs tp, tpidr_el0
858 adrp tmp, :gottprel:imm
859 ldr dest, [tmp, #:gottprel_lo12:imm]
860 add dest, dest, tp
862 Local Exec:
863 mrs tp, tpidr_el0
864 add t0, tp, #:tprel_hi12:imm, lsl #12
865 add t0, t0, #:tprel_lo12_nc:imm
868 static void
869 aarch64_load_symref_appropriately (rtx dest, rtx imm,
870 enum aarch64_symbol_type type)
872 switch (type)
874 case SYMBOL_SMALL_ABSOLUTE:
876 /* In ILP32, the mode of dest can be either SImode or DImode. */
877 rtx tmp_reg = dest;
878 machine_mode mode = GET_MODE (dest);
880 gcc_assert (mode == Pmode || mode == ptr_mode);
882 if (can_create_pseudo_p ())
883 tmp_reg = gen_reg_rtx (mode);
885 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
886 emit_insn (gen_add_losym (dest, tmp_reg, imm));
887 return;
890 case SYMBOL_TINY_ABSOLUTE:
891 emit_insn (gen_rtx_SET (dest, imm));
892 return;
894 case SYMBOL_SMALL_GOT_28K:
896 machine_mode mode = GET_MODE (dest);
897 rtx gp_rtx = pic_offset_table_rtx;
898 rtx insn;
899 rtx mem;
901 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
902 here before rtl expand. Tree IVOPT will generate rtl pattern to
903 decide rtx costs, in which case pic_offset_table_rtx is not
904 initialized. For that case no need to generate the first adrp
905 instruction as the final cost for global variable access is
906 one instruction. */
907 if (gp_rtx != NULL)
 909 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since
 910 we use the page base as the GOT base, the first page may be wasted;
 911 in the worst case only 28K of space is left for the GOT).
 913 The generated instruction sequence for accessing a global variable is:
916 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
 918 Only one instruction is needed, but we must initialize
 919 pic_offset_table_rtx properly. We generate an initialization insn
 920 for every global access and rely on CSE to remove the redundant ones.
 922 The final instruction sequence will look like the following
 923 for multiple global variable accesses.
925 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
927 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
928 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
929 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
930 ... */
932 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
933 crtl->uses_pic_offset_table = 1;
934 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
936 if (mode != GET_MODE (gp_rtx))
937 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
940 if (mode == ptr_mode)
942 if (mode == DImode)
943 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
944 else
945 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
947 mem = XVECEXP (SET_SRC (insn), 0, 0);
949 else
951 gcc_assert (mode == Pmode);
953 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
954 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
 957 /* The operand is expected to be a MEM.  Whenever the related insn
 958 pattern changes, the code above that computes MEM must be
 959 updated as well.  */
960 gcc_assert (GET_CODE (mem) == MEM);
961 MEM_READONLY_P (mem) = 1;
962 MEM_NOTRAP_P (mem) = 1;
963 emit_insn (insn);
964 return;
967 case SYMBOL_SMALL_GOT_4G:
969 /* In ILP32, the mode of dest can be either SImode or DImode,
970 while the got entry is always of SImode size. The mode of
971 dest depends on how dest is used: if dest is assigned to a
972 pointer (e.g. in the memory), it has SImode; it may have
 973 DImode if dest is dereferenced to access the memory.
974 This is why we have to handle three different ldr_got_small
975 patterns here (two patterns for ILP32). */
977 rtx insn;
978 rtx mem;
979 rtx tmp_reg = dest;
980 machine_mode mode = GET_MODE (dest);
982 if (can_create_pseudo_p ())
983 tmp_reg = gen_reg_rtx (mode);
985 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
986 if (mode == ptr_mode)
988 if (mode == DImode)
989 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
990 else
991 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
993 mem = XVECEXP (SET_SRC (insn), 0, 0);
995 else
997 gcc_assert (mode == Pmode);
999 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1000 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1003 gcc_assert (GET_CODE (mem) == MEM);
1004 MEM_READONLY_P (mem) = 1;
1005 MEM_NOTRAP_P (mem) = 1;
1006 emit_insn (insn);
1007 return;
1010 case SYMBOL_SMALL_TLSGD:
1012 rtx_insn *insns;
1013 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1015 start_sequence ();
1016 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1017 insns = get_insns ();
1018 end_sequence ();
1020 RTL_CONST_CALL_P (insns) = 1;
1021 emit_libcall_block (insns, dest, result, imm);
1022 return;
1025 case SYMBOL_SMALL_TLSDESC:
1027 machine_mode mode = GET_MODE (dest);
1028 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1029 rtx tp;
1031 gcc_assert (mode == Pmode || mode == ptr_mode);
1033 /* In ILP32, the got entry is always of SImode size. Unlike
1034 small GOT, the dest is fixed at reg 0. */
1035 if (TARGET_ILP32)
1036 emit_insn (gen_tlsdesc_small_si (imm));
1037 else
1038 emit_insn (gen_tlsdesc_small_di (imm));
1039 tp = aarch64_load_tp (NULL);
1041 if (mode != Pmode)
1042 tp = gen_lowpart (mode, tp);
1044 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1045 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1046 return;
1049 case SYMBOL_SMALL_GOTTPREL:
1051 /* In ILP32, the mode of dest can be either SImode or DImode,
1052 while the got entry is always of SImode size. The mode of
1053 dest depends on how dest is used: if dest is assigned to a
1054 pointer (e.g. in the memory), it has SImode; it may have
 1055 DImode if dest is dereferenced to access the memory.
1056 This is why we have to handle three different tlsie_small
1057 patterns here (two patterns for ILP32). */
1058 machine_mode mode = GET_MODE (dest);
1059 rtx tmp_reg = gen_reg_rtx (mode);
1060 rtx tp = aarch64_load_tp (NULL);
1062 if (mode == ptr_mode)
1064 if (mode == DImode)
1065 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1066 else
1068 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1069 tp = gen_lowpart (mode, tp);
1072 else
1074 gcc_assert (mode == Pmode);
1075 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1078 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1079 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1080 return;
1083 case SYMBOL_TLSLE:
1085 rtx tp = aarch64_load_tp (NULL);
1087 if (GET_MODE (dest) != Pmode)
1088 tp = gen_lowpart (GET_MODE (dest), tp);
1090 emit_insn (gen_tlsle (dest, tp, imm));
1091 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1092 return;
1095 case SYMBOL_TINY_GOT:
1096 emit_insn (gen_ldr_got_tiny (dest, imm));
1097 return;
1099 default:
1100 gcc_unreachable ();
1104 /* Emit a move from SRC to DEST. Assume that the move expanders can
1105 handle all moves if !can_create_pseudo_p (). The distinction is
1106 important because, unlike emit_move_insn, the move expanders know
1107 how to force Pmode objects into the constant pool even when the
1108 constant pool address is not itself legitimate. */
1109 static rtx
1110 aarch64_emit_move (rtx dest, rtx src)
1112 return (can_create_pseudo_p ()
1113 ? emit_move_insn (dest, src)
1114 : emit_move_insn_1 (dest, src));
1117 /* Split a 128-bit move operation into two 64-bit move operations,
1118 taking care to handle partial overlap of register to register
1119 copies. Special cases are needed when moving between GP regs and
1120 FP regs. SRC can be a register, constant or memory; DST a register
1121 or memory. If either operand is memory it must not have any side
1122 effects. */
1123 void
1124 aarch64_split_128bit_move (rtx dst, rtx src)
1126 rtx dst_lo, dst_hi;
1127 rtx src_lo, src_hi;
1129 machine_mode mode = GET_MODE (dst);
1131 gcc_assert (mode == TImode || mode == TFmode);
1132 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1133 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1135 if (REG_P (dst) && REG_P (src))
1137 int src_regno = REGNO (src);
1138 int dst_regno = REGNO (dst);
1140 /* Handle FP <-> GP regs. */
1141 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1143 src_lo = gen_lowpart (word_mode, src);
1144 src_hi = gen_highpart (word_mode, src);
1146 if (mode == TImode)
1148 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1149 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1151 else
1153 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1154 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1156 return;
1158 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1160 dst_lo = gen_lowpart (word_mode, dst);
1161 dst_hi = gen_highpart (word_mode, dst);
1163 if (mode == TImode)
1165 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1166 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1168 else
1170 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1171 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1173 return;
1177 dst_lo = gen_lowpart (word_mode, dst);
1178 dst_hi = gen_highpart (word_mode, dst);
1179 src_lo = gen_lowpart (word_mode, src);
1180 src_hi = gen_highpart_mode (word_mode, mode, src);
1182 /* At most one pairing may overlap. */
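/* E.g. if DST occupies x1/x2 and SRC occupies x0/x1, the high halves must
   be copied first (x2 <- x1 before x1 <- x0) so that the overlapping
   register is read before it is overwritten.  */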
1183 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1185 aarch64_emit_move (dst_hi, src_hi);
1186 aarch64_emit_move (dst_lo, src_lo);
1188 else
1190 aarch64_emit_move (dst_lo, src_lo);
1191 aarch64_emit_move (dst_hi, src_hi);
1195 bool
1196 aarch64_split_128bit_move_p (rtx dst, rtx src)
1198 return (! REG_P (src)
1199 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1202 /* Split a complex SIMD combine. */
1204 void
1205 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1207 machine_mode src_mode = GET_MODE (src1);
1208 machine_mode dst_mode = GET_MODE (dst);
1210 gcc_assert (VECTOR_MODE_P (dst_mode));
1212 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1214 rtx (*gen) (rtx, rtx, rtx);
1216 switch (src_mode)
1218 case V8QImode:
1219 gen = gen_aarch64_simd_combinev8qi;
1220 break;
1221 case V4HImode:
1222 gen = gen_aarch64_simd_combinev4hi;
1223 break;
1224 case V2SImode:
1225 gen = gen_aarch64_simd_combinev2si;
1226 break;
1227 case V2SFmode:
1228 gen = gen_aarch64_simd_combinev2sf;
1229 break;
1230 case DImode:
1231 gen = gen_aarch64_simd_combinedi;
1232 break;
1233 case DFmode:
1234 gen = gen_aarch64_simd_combinedf;
1235 break;
1236 default:
1237 gcc_unreachable ();
1240 emit_insn (gen (dst, src1, src2));
1241 return;
1245 /* Split a complex SIMD move. */
1247 void
1248 aarch64_split_simd_move (rtx dst, rtx src)
1250 machine_mode src_mode = GET_MODE (src);
1251 machine_mode dst_mode = GET_MODE (dst);
1253 gcc_assert (VECTOR_MODE_P (dst_mode));
1255 if (REG_P (dst) && REG_P (src))
1257 rtx (*gen) (rtx, rtx);
1259 gcc_assert (VECTOR_MODE_P (src_mode));
1261 switch (src_mode)
1263 case V16QImode:
1264 gen = gen_aarch64_split_simd_movv16qi;
1265 break;
1266 case V8HImode:
1267 gen = gen_aarch64_split_simd_movv8hi;
1268 break;
1269 case V4SImode:
1270 gen = gen_aarch64_split_simd_movv4si;
1271 break;
1272 case V2DImode:
1273 gen = gen_aarch64_split_simd_movv2di;
1274 break;
1275 case V4SFmode:
1276 gen = gen_aarch64_split_simd_movv4sf;
1277 break;
1278 case V2DFmode:
1279 gen = gen_aarch64_split_simd_movv2df;
1280 break;
1281 default:
1282 gcc_unreachable ();
1285 emit_insn (gen (dst, src));
1286 return;
1290 static rtx
1291 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1293 if (can_create_pseudo_p ())
1294 return force_reg (mode, value);
1295 else
1297 x = aarch64_emit_move (x, value);
1298 return x;
1303 static rtx
1304 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1306 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1308 rtx high;
1309 /* Load the full offset into a register. This
1310 might be improvable in the future. */
1311 high = GEN_INT (offset);
1312 offset = 0;
1313 high = aarch64_force_temporary (mode, temp, high);
1314 reg = aarch64_force_temporary (mode, temp,
1315 gen_rtx_PLUS (mode, high, reg));
1317 return plus_constant (mode, reg, offset);
1320 static int
1321 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1322 machine_mode mode)
1324 unsigned HOST_WIDE_INT mask;
1325 int i;
1326 bool first;
1327 unsigned HOST_WIDE_INT val;
1328 bool subtargets;
1329 rtx subtarget;
1330 int one_match, zero_match, first_not_ffff_match;
1331 int num_insns = 0;
1333 if (CONST_INT_P (imm) && aarch64_move_imm (INTVAL (imm), mode))
1335 if (generate)
1336 emit_insn (gen_rtx_SET (dest, imm));
1337 num_insns++;
1338 return num_insns;
1341 if (mode == SImode)
1343 /* We know we can't do this in 1 insn, and we must be able to do it
1344 in two; so don't mess around looking for sequences that don't buy
1345 us anything. */
1346 if (generate)
1348 emit_insn (gen_rtx_SET (dest, GEN_INT (INTVAL (imm) & 0xffff)));
1349 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1350 GEN_INT ((INTVAL (imm) >> 16) & 0xffff)));
1352 num_insns += 2;
1353 return num_insns;
1356 /* Remaining cases are all for DImode. */
1358 val = INTVAL (imm);
1359 subtargets = optimize && can_create_pseudo_p ();
1361 one_match = 0;
1362 zero_match = 0;
1363 mask = 0xffff;
1364 first_not_ffff_match = -1;
1366 for (i = 0; i < 64; i += 16, mask <<= 16)
1368 if ((val & mask) == mask)
1369 one_match++;
1370 else
1372 if (first_not_ffff_match < 0)
1373 first_not_ffff_match = i;
1374 if ((val & mask) == 0)
1375 zero_match++;
1379 if (one_match == 2)
1381 /* Set one of the quarters and then insert back into result. */
1382 mask = 0xffffll << first_not_ffff_match;
1383 if (generate)
1385 emit_insn (gen_rtx_SET (dest, GEN_INT (val | mask)));
1386 emit_insn (gen_insv_immdi (dest, GEN_INT (first_not_ffff_match),
1387 GEN_INT ((val >> first_not_ffff_match)
1388 & 0xffff)));
1390 num_insns += 2;
1391 return num_insns;
1394 if (zero_match == 2)
1395 goto simple_sequence;
1397 mask = 0x0ffff0000UL;
1398 for (i = 16; i < 64; i += 16, mask <<= 16)
1400 HOST_WIDE_INT comp = mask & ~(mask - 1);
1402 if (aarch64_uimm12_shift (val - (val & mask)))
1404 if (generate)
1406 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1407 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val & mask)));
1408 emit_insn (gen_adddi3 (dest, subtarget,
1409 GEN_INT (val - (val & mask))));
1411 num_insns += 2;
1412 return num_insns;
1414 else if (aarch64_uimm12_shift (-(val - ((val + comp) & mask))))
1416 if (generate)
1418 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1419 emit_insn (gen_rtx_SET (subtarget,
1420 GEN_INT ((val + comp) & mask)));
1421 emit_insn (gen_adddi3 (dest, subtarget,
1422 GEN_INT (val - ((val + comp) & mask))));
1424 num_insns += 2;
1425 return num_insns;
1427 else if (aarch64_uimm12_shift (val - ((val - comp) | ~mask)))
1429 if (generate)
1431 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1432 emit_insn (gen_rtx_SET (subtarget,
1433 GEN_INT ((val - comp) | ~mask)));
1434 emit_insn (gen_adddi3 (dest, subtarget,
1435 GEN_INT (val - ((val - comp) | ~mask))));
1437 num_insns += 2;
1438 return num_insns;
1440 else if (aarch64_uimm12_shift (-(val - (val | ~mask))))
1442 if (generate)
1444 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1445 emit_insn (gen_rtx_SET (subtarget, GEN_INT (val | ~mask)));
1446 emit_insn (gen_adddi3 (dest, subtarget,
1447 GEN_INT (val - (val | ~mask))));
1449 num_insns += 2;
1450 return num_insns;
1454 /* See if we can do it by arithmetically combining two
1455 immediates. */
1456 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1458 int j;
1459 mask = 0xffff;
1461 if (aarch64_uimm12_shift (val - aarch64_bitmasks[i])
1462 || aarch64_uimm12_shift (-val + aarch64_bitmasks[i]))
1464 if (generate)
1466 subtarget = subtargets ? gen_reg_rtx (DImode) : dest;
1467 emit_insn (gen_rtx_SET (subtarget,
1468 GEN_INT (aarch64_bitmasks[i])));
1469 emit_insn (gen_adddi3 (dest, subtarget,
1470 GEN_INT (val - aarch64_bitmasks[i])));
1472 num_insns += 2;
1473 return num_insns;
1476 for (j = 0; j < 64; j += 16, mask <<= 16)
1478 if ((aarch64_bitmasks[i] & ~mask) == (val & ~mask))
1480 if (generate)
1482 emit_insn (gen_rtx_SET (dest,
1483 GEN_INT (aarch64_bitmasks[i])));
1484 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
1485 GEN_INT ((val >> j) & 0xffff)));
1487 num_insns += 2;
1488 return num_insns;
1493 /* See if we can do it by logically combining two immediates. */
1494 for (i = 0; i < AARCH64_NUM_BITMASKS; i++)
1496 if ((aarch64_bitmasks[i] & val) == aarch64_bitmasks[i])
1498 int j;
1500 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1501 if (val == (aarch64_bitmasks[i] | aarch64_bitmasks[j]))
1503 if (generate)
1505 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1506 emit_insn (gen_rtx_SET (subtarget,
1507 GEN_INT (aarch64_bitmasks[i])));
1508 emit_insn (gen_iordi3 (dest, subtarget,
1509 GEN_INT (aarch64_bitmasks[j])));
1511 num_insns += 2;
1512 return num_insns;
1515 else if ((val & aarch64_bitmasks[i]) == val)
1517 int j;
1519 for (j = i + 1; j < AARCH64_NUM_BITMASKS; j++)
1520 if (val == (aarch64_bitmasks[j] & aarch64_bitmasks[i]))
1522 if (generate)
1524 subtarget = subtargets ? gen_reg_rtx (mode) : dest;
1525 emit_insn (gen_rtx_SET (subtarget,
1526 GEN_INT (aarch64_bitmasks[j])));
1527 emit_insn (gen_anddi3 (dest, subtarget,
1528 GEN_INT (aarch64_bitmasks[i])));
1530 num_insns += 2;
1531 return num_insns;
1536 if (one_match > zero_match)
1538 /* Set either first three quarters or all but the third. */
1539 mask = 0xffffll << (16 - first_not_ffff_match);
1540 if (generate)
1541 emit_insn (gen_rtx_SET (dest,
1542 GEN_INT (val | mask | 0xffffffff00000000ull)));
1543 num_insns ++;
 1545 /* Now insert the other two quarters.  */
1546 for (i = first_not_ffff_match + 16, mask <<= (first_not_ffff_match << 1);
1547 i < 64; i += 16, mask <<= 16)
1549 if ((val & mask) != mask)
1551 if (generate)
1552 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1553 GEN_INT ((val >> i) & 0xffff)));
1554 num_insns ++;
1557 return num_insns;
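/* Fall back to building the value 16 bits at a time: a move-wide for the
   first non-zero 16-bit chunk (the SET below) and an insv_immdi (MOVK-style
   insertion) for each remaining non-zero chunk, i.e. at most four
   instructions for a DImode constant.  */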
1560 simple_sequence:
1561 first = true;
1562 mask = 0xffff;
1563 for (i = 0; i < 64; i += 16, mask <<= 16)
1565 if ((val & mask) != 0)
1567 if (first)
1569 if (generate)
1570 emit_insn (gen_rtx_SET (dest, GEN_INT (val & mask)));
1571 num_insns ++;
1572 first = false;
1574 else
1576 if (generate)
1577 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1578 GEN_INT ((val >> i) & 0xffff)));
1579 num_insns ++;
1584 return num_insns;
1588 void
1589 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1591 machine_mode mode = GET_MODE (dest);
1593 gcc_assert (mode == SImode || mode == DImode);
1595 /* Check on what type of symbol it is. */
1596 if (GET_CODE (imm) == SYMBOL_REF
1597 || GET_CODE (imm) == LABEL_REF
1598 || GET_CODE (imm) == CONST)
1600 rtx mem, base, offset;
1601 enum aarch64_symbol_type sty;
1603 /* If we have (const (plus symbol offset)), separate out the offset
1604 before we start classifying the symbol. */
1605 split_const (imm, &base, &offset);
1607 sty = aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR);
1608 switch (sty)
1610 case SYMBOL_FORCE_TO_MEM:
1611 if (offset != const0_rtx
1612 && targetm.cannot_force_const_mem (mode, imm))
1614 gcc_assert (can_create_pseudo_p ());
1615 base = aarch64_force_temporary (mode, dest, base);
1616 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1617 aarch64_emit_move (dest, base);
1618 return;
1620 mem = force_const_mem (ptr_mode, imm);
1621 gcc_assert (mem);
1622 if (mode != ptr_mode)
1623 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1624 emit_insn (gen_rtx_SET (dest, mem));
1625 return;
1627 case SYMBOL_SMALL_TLSGD:
1628 case SYMBOL_SMALL_TLSDESC:
1629 case SYMBOL_SMALL_GOTTPREL:
1630 case SYMBOL_SMALL_GOT_28K:
1631 case SYMBOL_SMALL_GOT_4G:
1632 case SYMBOL_TINY_GOT:
1633 if (offset != const0_rtx)
 1635 gcc_assert (can_create_pseudo_p ());
1636 base = aarch64_force_temporary (mode, dest, base);
1637 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1638 aarch64_emit_move (dest, base);
1639 return;
1641 /* FALLTHRU */
1643 case SYMBOL_SMALL_ABSOLUTE:
1644 case SYMBOL_TINY_ABSOLUTE:
1645 case SYMBOL_TLSLE:
1646 aarch64_load_symref_appropriately (dest, imm, sty);
1647 return;
1649 default:
1650 gcc_unreachable ();
1654 if (!CONST_INT_P (imm))
1656 if (GET_CODE (imm) == HIGH)
1657 emit_insn (gen_rtx_SET (dest, imm));
1658 else
1660 rtx mem = force_const_mem (mode, imm);
1661 gcc_assert (mem);
1662 emit_insn (gen_rtx_SET (dest, mem));
1665 return;
1668 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1671 static bool
1672 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1673 tree exp ATTRIBUTE_UNUSED)
1675 /* Currently, always true. */
1676 return true;
1679 /* Implement TARGET_PASS_BY_REFERENCE. */
1681 static bool
1682 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1683 machine_mode mode,
1684 const_tree type,
1685 bool named ATTRIBUTE_UNUSED)
1687 HOST_WIDE_INT size;
1688 machine_mode dummymode;
1689 int nregs;
1691 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1692 size = (mode == BLKmode && type)
1693 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1695 /* Aggregates are passed by reference based on their size. */
1696 if (type && AGGREGATE_TYPE_P (type))
1698 size = int_size_in_bytes (type);
1701 /* Variable sized arguments are always returned by reference. */
1702 if (size < 0)
1703 return true;
1705 /* Can this be a candidate to be passed in fp/simd register(s)? */
1706 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1707 &dummymode, &nregs,
1708 NULL))
1709 return false;
1711 /* Arguments which are variable sized or larger than 2 registers are
 1712 passed by reference unless they are a homogeneous floating-point
1713 aggregate. */
1714 return size > 2 * UNITS_PER_WORD;
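/* So, for example, a 24-byte plain struct is passed by reference, while a
   struct of three doubles (an HFA, caught by the candidate check above) or
   any aggregate of at most 16 bytes is passed by value.  */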
1717 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1718 static bool
1719 aarch64_return_in_msb (const_tree valtype)
1721 machine_mode dummy_mode;
1722 int dummy_int;
1724 /* Never happens in little-endian mode. */
1725 if (!BYTES_BIG_ENDIAN)
1726 return false;
1728 /* Only composite types smaller than or equal to 16 bytes can
1729 be potentially returned in registers. */
1730 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1731 || int_size_in_bytes (valtype) <= 0
1732 || int_size_in_bytes (valtype) > 16)
1733 return false;
1735 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1736 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1737 is always passed/returned in the least significant bits of fp/simd
1738 register(s). */
1739 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1740 &dummy_mode, &dummy_int, NULL))
1741 return false;
1743 return true;
1746 /* Implement TARGET_FUNCTION_VALUE.
1747 Define how to find the value returned by a function. */
1749 static rtx
1750 aarch64_function_value (const_tree type, const_tree func,
1751 bool outgoing ATTRIBUTE_UNUSED)
1753 machine_mode mode;
1754 int unsignedp;
1755 int count;
1756 machine_mode ag_mode;
1758 mode = TYPE_MODE (type);
1759 if (INTEGRAL_TYPE_P (type))
1760 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1762 if (aarch64_return_in_msb (type))
1764 HOST_WIDE_INT size = int_size_in_bytes (type);
1766 if (size % UNITS_PER_WORD != 0)
1768 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1769 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1773 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1774 &ag_mode, &count, NULL))
1776 if (!aarch64_composite_type_p (type, mode))
1778 gcc_assert (count == 1 && mode == ag_mode);
1779 return gen_rtx_REG (mode, V0_REGNUM);
1781 else
1783 int i;
1784 rtx par;
1786 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1787 for (i = 0; i < count; i++)
1789 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1790 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1791 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1792 XVECEXP (par, 0, i) = tmp;
1794 return par;
1797 else
1798 return gen_rtx_REG (mode, R0_REGNUM);
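/* For example, an HFA of four floats is returned in s0-s3 via the PARALLEL
   built above, a 16-byte non-HFA structure is returned in x0/x1, and a
   plain double comes back in d0 (V0_REGNUM).  */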
1801 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1802 Return true if REGNO is the number of a hard register in which the values
1803 of called function may come back. */
1805 static bool
1806 aarch64_function_value_regno_p (const unsigned int regno)
1808 /* Maximum of 16 bytes can be returned in the general registers. Examples
1809 of 16-byte return values are: 128-bit integers and 16-byte small
1810 structures (excluding homogeneous floating-point aggregates). */
1811 if (regno == R0_REGNUM || regno == R1_REGNUM)
1812 return true;
1814 /* Up to four fp/simd registers can return a function value, e.g. a
1815 homogeneous floating-point aggregate having four members. */
1816 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1817 return TARGET_FLOAT;
1819 return false;
1822 /* Implement TARGET_RETURN_IN_MEMORY.
1824 If the type T of the result of a function is such that
1825 void func (T arg)
1826 would require that arg be passed as a value in a register (or set of
1827 registers) according to the parameter passing rules, then the result
1828 is returned in the same registers as would be used for such an
1829 argument. */
1831 static bool
1832 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1834 HOST_WIDE_INT size;
1835 machine_mode ag_mode;
1836 int count;
1838 if (!AGGREGATE_TYPE_P (type)
1839 && TREE_CODE (type) != COMPLEX_TYPE
1840 && TREE_CODE (type) != VECTOR_TYPE)
1841 /* Simple scalar types always returned in registers. */
1842 return false;
1844 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1845 type,
1846 &ag_mode,
1847 &count,
1848 NULL))
1849 return false;
1851 /* Types larger than 2 registers returned in memory. */
1852 size = int_size_in_bytes (type);
1853 return (size < 0 || size > 2 * UNITS_PER_WORD);
1856 static bool
1857 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1858 const_tree type, int *nregs)
1860 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1861 return aarch64_vfp_is_call_or_return_candidate (mode,
1862 type,
1863 &pcum->aapcs_vfp_rmode,
1864 nregs,
1865 NULL);
1868 /* Given MODE and TYPE of a function argument, return the alignment in
1869 bits. The idea is to suppress any stronger alignment requested by
1870 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1871 This is a helper function for local use only. */
1873 static unsigned int
1874 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1876 unsigned int alignment;
1878 if (type)
1880 if (!integer_zerop (TYPE_SIZE (type)))
1882 if (TYPE_MODE (type) == mode)
1883 alignment = TYPE_ALIGN (type);
1884 else
1885 alignment = GET_MODE_ALIGNMENT (mode);
1887 else
1888 alignment = 0;
1890 else
1891 alignment = GET_MODE_ALIGNMENT (mode);
1893 return alignment;
1896 /* Layout a function argument according to the AAPCS64 rules. The rule
1897 numbers refer to the rule numbers in the AAPCS64. */
1899 static void
1900 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1901 const_tree type,
1902 bool named ATTRIBUTE_UNUSED)
1904 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1905 int ncrn, nvrn, nregs;
1906 bool allocate_ncrn, allocate_nvrn;
1907 HOST_WIDE_INT size;
1909 /* We need to do this once per argument. */
1910 if (pcum->aapcs_arg_processed)
1911 return;
1913 pcum->aapcs_arg_processed = true;
1915 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
1916 size
1917 = AARCH64_ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1918 UNITS_PER_WORD);
1920 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1921 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1922 mode,
1923 type,
1924 &nregs);
 1926 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
1927 The following code thus handles passing by SIMD/FP registers first. */
1929 nvrn = pcum->aapcs_nvrn;
 1931 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates
 1932 (HFA) and homogeneous short-vector aggregates (HVA).  */
1933 if (allocate_nvrn)
1935 if (!TARGET_FLOAT)
1936 aarch64_err_no_fpadvsimd (mode, "argument");
1938 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1940 pcum->aapcs_nextnvrn = nvrn + nregs;
1941 if (!aarch64_composite_type_p (type, mode))
1943 gcc_assert (nregs == 1);
1944 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1946 else
1948 rtx par;
1949 int i;
1950 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1951 for (i = 0; i < nregs; i++)
1953 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1954 V0_REGNUM + nvrn + i);
1955 tmp = gen_rtx_EXPR_LIST
1956 (VOIDmode, tmp,
1957 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1958 XVECEXP (par, 0, i) = tmp;
1960 pcum->aapcs_reg = par;
1962 return;
1964 else
1966 /* C.3 NSRN is set to 8. */
1967 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1968 goto on_stack;
1972 ncrn = pcum->aapcs_ncrn;
1973 nregs = size / UNITS_PER_WORD;
 1975 /* C.6 - C.9, though the sign and zero extension semantics are
 1976 handled elsewhere.  This is the case where the argument fits
 1977 entirely in general registers.  */
1978 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1980 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1982 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1984 /* C.8 if the argument has an alignment of 16 then the NGRN is
1985 rounded up to the next even number. */
1986 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1988 ++ncrn;
1989 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1991 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1992 A reg is still generated for it, but the caller should be smart
1993 enough not to use it. */
1994 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1996 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1998 else
2000 rtx par;
2001 int i;
2003 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2004 for (i = 0; i < nregs; i++)
2006 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2007 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2008 GEN_INT (i * UNITS_PER_WORD));
2009 XVECEXP (par, 0, i) = tmp;
2011 pcum->aapcs_reg = par;
2014 pcum->aapcs_nextncrn = ncrn + nregs;
2015 return;
2018 /* C.11 */
2019 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2021 /* The argument is passed on stack; record the needed number of words for
2022 this argument and align the total size if necessary. */
2023 on_stack:
2024 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2025 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2026 pcum->aapcs_stack_size = AARCH64_ROUND_UP (pcum->aapcs_stack_size,
2027 16 / UNITS_PER_WORD);
2028 return;
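/* For example, with an int already in x0 (NGRN == 1), a following __int128
   argument has 16-byte alignment, so C.8 above rounds NGRN up to 2 and the
   value is passed in x2/x3, leaving x1 unused.  */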
2031 /* Implement TARGET_FUNCTION_ARG. */
2033 static rtx
2034 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2035 const_tree type, bool named)
2037 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2038 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2040 if (mode == VOIDmode)
2041 return NULL_RTX;
2043 aarch64_layout_arg (pcum_v, mode, type, named);
2044 return pcum->aapcs_reg;
2047 void
2048 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2049 const_tree fntype ATTRIBUTE_UNUSED,
2050 rtx libname ATTRIBUTE_UNUSED,
2051 const_tree fndecl ATTRIBUTE_UNUSED,
2052 unsigned n_named ATTRIBUTE_UNUSED)
2054 pcum->aapcs_ncrn = 0;
2055 pcum->aapcs_nvrn = 0;
2056 pcum->aapcs_nextncrn = 0;
2057 pcum->aapcs_nextnvrn = 0;
2058 pcum->pcs_variant = ARM_PCS_AAPCS64;
2059 pcum->aapcs_reg = NULL_RTX;
2060 pcum->aapcs_arg_processed = false;
2061 pcum->aapcs_stack_words = 0;
2062 pcum->aapcs_stack_size = 0;
2064 if (!TARGET_FLOAT
2065 && fndecl && TREE_PUBLIC (fndecl)
2066 && fntype && fntype != error_mark_node)
2068 const_tree type = TREE_TYPE (fntype);
2069 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2070 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2071 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2072 &mode, &nregs, NULL))
2073 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2075 return;
2078 static void
2079 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2080 machine_mode mode,
2081 const_tree type,
2082 bool named)
2084 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2085 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2087 aarch64_layout_arg (pcum_v, mode, type, named);
2088 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2089 != (pcum->aapcs_stack_words != 0));
2090 pcum->aapcs_arg_processed = false;
2091 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2092 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2093 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2094 pcum->aapcs_stack_words = 0;
2095 pcum->aapcs_reg = NULL_RTX;
2099 bool
2100 aarch64_function_arg_regno_p (unsigned regno)
2102 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2103 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2106 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2107 PARM_BOUNDARY bits of alignment, but will be given anything up
2108 to STACK_BOUNDARY bits if the type requires it. This makes sure
2109 that both before and after the layout of each argument, the Next
2110 Stacked Argument Address (NSAA) will have a minimum alignment of
2111 8 bytes. */
2113 static unsigned int
2114 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2116 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2118 if (alignment < PARM_BOUNDARY)
2119 alignment = PARM_BOUNDARY;
2120 if (alignment > STACK_BOUNDARY)
2121 alignment = STACK_BOUNDARY;
2122 return alignment;
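/* As an illustrative example (assuming the usual values for this port,
   PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128):

     int                           natural alignment  32 -> raised to  64
     16-byte aligned struct        natural alignment 128 -> kept at   128
     __attribute__((aligned(32)))  alignment         256 -> capped at 128

   so every stacked parameter starts on at least an 8-byte boundary and
   never demands more than 16-byte alignment.  */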
2125 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2127 Return true if an argument passed on the stack should be padded upwards,
2128 i.e. if the least-significant byte of the stack slot has useful data.
 2130    Small aggregate types are placed at the lowest memory address.
2132 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2134 bool
2135 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2137 /* On little-endian targets, the least significant byte of every stack
2138 argument is passed at the lowest byte address of the stack slot. */
2139 if (!BYTES_BIG_ENDIAN)
2140 return true;
2142 /* Otherwise, integral, floating-point and pointer types are padded downward:
2143 the least significant byte of a stack argument is passed at the highest
2144 byte address of the stack slot. */
2145 if (type
2146 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2147 || POINTER_TYPE_P (type))
2148 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2149 return false;
 2151   /* Everything else is padded upward, i.e. the data is in the first byte of the stack slot.  */
2152 return true;
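/* For example (illustrative only): on a big-endian target an 'int'
   passed on the stack is padded downward, so its four bytes occupy the
   high end of the 8-byte slot, whereas a three-byte
   struct { char a, b, c; } is padded upward and starts at the lowest
   byte address of its slot.  On little-endian targets everything
   starts at the lowest byte address.  */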
2155 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
 2157    It specifies the padding for the last element (which may also be
 2158    the only element) of a block move between registers and memory.
 2159    Assuming the block is in memory, padding upward means that the
 2160    last element is padded after its most significant byte, while with
 2161    downward padding the last element is padded at its least
 2162    significant byte side.
2164 Small aggregates and small complex types are always padded
2165 upwards.
2167 We don't need to worry about homogeneous floating-point or
2168 short-vector aggregates; their move is not affected by the
2169 padding direction determined here. Regardless of endianness,
2170 each element of such an aggregate is put in the least
2171 significant bits of a fp/simd register.
2173 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2174 register has useful data, and return the opposite if the most
2175 significant byte does. */
2177 bool
2178 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2179 bool first ATTRIBUTE_UNUSED)
2182 /* Small composite types are always padded upward. */
2183 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2185 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2186 : GET_MODE_SIZE (mode));
2187 if (size < 2 * UNITS_PER_WORD)
2188 return true;
2191 /* Otherwise, use the default padding. */
2192 return !BYTES_BIG_ENDIAN;
2195 static machine_mode
2196 aarch64_libgcc_cmp_return_mode (void)
2198 return SImode;
2201 static bool
2202 aarch64_frame_pointer_required (void)
2204 /* In aarch64_override_options_after_change
2205 flag_omit_leaf_frame_pointer turns off the frame pointer by
2206 default. Turn it back on now if we've not got a leaf
2207 function. */
2208 if (flag_omit_leaf_frame_pointer
2209 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2210 return true;
2212 return false;
2215 /* Mark the registers that need to be saved by the callee and calculate
2216 the size of the callee-saved registers area and frame record (both FP
2217 and LR may be omitted). */
2218 static void
2219 aarch64_layout_frame (void)
2221 HOST_WIDE_INT offset = 0;
2222 int regno;
2224 if (reload_completed && cfun->machine->frame.laid_out)
2225 return;
2227 #define SLOT_NOT_REQUIRED (-2)
2228 #define SLOT_REQUIRED (-1)
2230 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2231 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2233 /* First mark all the registers that really need to be saved... */
2234 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2235 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2237 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2238 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2240 /* ... that includes the eh data registers (if needed)... */
2241 if (crtl->calls_eh_return)
2242 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2243 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2244 = SLOT_REQUIRED;
2246 /* ... and any callee saved register that dataflow says is live. */
2247 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2248 if (df_regs_ever_live_p (regno)
2249 && (regno == R30_REGNUM
2250 || !call_used_regs[regno]))
2251 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2253 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2254 if (df_regs_ever_live_p (regno)
2255 && !call_used_regs[regno])
2256 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2258 if (frame_pointer_needed)
2260 /* FP and LR are placed in the linkage record. */
2261 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2262 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2263 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2264 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2265 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2266 offset += 2 * UNITS_PER_WORD;
2269 /* Now assign stack slots for them. */
2270 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2271 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2273 cfun->machine->frame.reg_offset[regno] = offset;
2274 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2275 cfun->machine->frame.wb_candidate1 = regno;
2276 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2277 cfun->machine->frame.wb_candidate2 = regno;
2278 offset += UNITS_PER_WORD;
2281 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2282 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2284 cfun->machine->frame.reg_offset[regno] = offset;
2285 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2286 cfun->machine->frame.wb_candidate1 = regno;
2287 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2288 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2289 cfun->machine->frame.wb_candidate2 = regno;
2290 offset += UNITS_PER_WORD;
2293 cfun->machine->frame.padding0 =
2294 (AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2295 offset = AARCH64_ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2297 cfun->machine->frame.saved_regs_size = offset;
2299 cfun->machine->frame.hard_fp_offset
2300 = AARCH64_ROUND_UP (cfun->machine->frame.saved_varargs_size
2301 + get_frame_size ()
2302 + cfun->machine->frame.saved_regs_size,
2303 STACK_BOUNDARY / BITS_PER_UNIT);
2305 cfun->machine->frame.frame_size
2306 = AARCH64_ROUND_UP (cfun->machine->frame.hard_fp_offset
2307 + crtl->outgoing_args_size,
2308 STACK_BOUNDARY / BITS_PER_UNIT);
2310 cfun->machine->frame.laid_out = true;
2313 static bool
2314 aarch64_register_saved_on_entry (int regno)
2316 return cfun->machine->frame.reg_offset[regno] >= 0;
2319 static unsigned
2320 aarch64_next_callee_save (unsigned regno, unsigned limit)
2322 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2323 regno ++;
2324 return regno;
2327 static void
2328 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2329 HOST_WIDE_INT adjustment)
2331 rtx base_rtx = stack_pointer_rtx;
2332 rtx insn, reg, mem;
2334 reg = gen_rtx_REG (mode, regno);
2335 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2336 plus_constant (Pmode, base_rtx, -adjustment));
2337 mem = gen_rtx_MEM (mode, mem);
2339 insn = emit_move_insn (mem, reg);
2340 RTX_FRAME_RELATED_P (insn) = 1;
2343 static rtx
2344 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2345 HOST_WIDE_INT adjustment)
2347 switch (mode)
2349 case DImode:
2350 return gen_storewb_pairdi_di (base, base, reg, reg2,
2351 GEN_INT (-adjustment),
2352 GEN_INT (UNITS_PER_WORD - adjustment));
2353 case DFmode:
2354 return gen_storewb_pairdf_di (base, base, reg, reg2,
2355 GEN_INT (-adjustment),
2356 GEN_INT (UNITS_PER_WORD - adjustment));
2357 default:
2358 gcc_unreachable ();
2362 static void
2363 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2364 unsigned regno2, HOST_WIDE_INT adjustment)
2366 rtx_insn *insn;
2367 rtx reg1 = gen_rtx_REG (mode, regno1);
2368 rtx reg2 = gen_rtx_REG (mode, regno2);
2370 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2371 reg2, adjustment));
2372 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2373 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2374 RTX_FRAME_RELATED_P (insn) = 1;
2377 static rtx
2378 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2379 HOST_WIDE_INT adjustment)
2381 switch (mode)
2383 case DImode:
2384 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2385 GEN_INT (UNITS_PER_WORD));
2386 case DFmode:
2387 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2388 GEN_INT (UNITS_PER_WORD));
2389 default:
2390 gcc_unreachable ();
2394 static rtx
2395 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2396 rtx reg2)
2398 switch (mode)
2400 case DImode:
2401 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2403 case DFmode:
2404 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2406 default:
2407 gcc_unreachable ();
2411 static rtx
2412 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2413 rtx mem2)
2415 switch (mode)
2417 case DImode:
2418 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2420 case DFmode:
2421 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2423 default:
2424 gcc_unreachable ();
2429 static void
2430 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2431 unsigned start, unsigned limit, bool skip_wb)
2433 rtx_insn *insn;
2434 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2435 ? gen_frame_mem : gen_rtx_MEM);
2436 unsigned regno;
2437 unsigned regno2;
2439 for (regno = aarch64_next_callee_save (start, limit);
2440 regno <= limit;
2441 regno = aarch64_next_callee_save (regno + 1, limit))
2443 rtx reg, mem;
2444 HOST_WIDE_INT offset;
2446 if (skip_wb
2447 && (regno == cfun->machine->frame.wb_candidate1
2448 || regno == cfun->machine->frame.wb_candidate2))
2449 continue;
2451 reg = gen_rtx_REG (mode, regno);
2452 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2453 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2454 offset));
2456 regno2 = aarch64_next_callee_save (regno + 1, limit);
2458 if (regno2 <= limit
2459 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2460 == cfun->machine->frame.reg_offset[regno2]))
2463 rtx reg2 = gen_rtx_REG (mode, regno2);
2464 rtx mem2;
2466 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2467 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2468 offset));
2469 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2470 reg2));
2472 /* The first part of a frame-related parallel insn is
2473 always assumed to be relevant to the frame
2474 calculations; subsequent parts, are only
2475 frame-related if explicitly marked. */
2476 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2477 regno = regno2;
2479 else
2480 insn = emit_move_insn (mem, reg);
2482 RTX_FRAME_RELATED_P (insn) = 1;
2486 static void
2487 aarch64_restore_callee_saves (machine_mode mode,
2488 HOST_WIDE_INT start_offset, unsigned start,
2489 unsigned limit, bool skip_wb, rtx *cfi_ops)
2491 rtx base_rtx = stack_pointer_rtx;
2492 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2493 ? gen_frame_mem : gen_rtx_MEM);
2494 unsigned regno;
2495 unsigned regno2;
2496 HOST_WIDE_INT offset;
2498 for (regno = aarch64_next_callee_save (start, limit);
2499 regno <= limit;
2500 regno = aarch64_next_callee_save (regno + 1, limit))
2502 rtx reg, mem;
2504 if (skip_wb
2505 && (regno == cfun->machine->frame.wb_candidate1
2506 || regno == cfun->machine->frame.wb_candidate2))
2507 continue;
2509 reg = gen_rtx_REG (mode, regno);
2510 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2511 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2513 regno2 = aarch64_next_callee_save (regno + 1, limit);
2515 if (regno2 <= limit
2516 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2517 == cfun->machine->frame.reg_offset[regno2]))
2519 rtx reg2 = gen_rtx_REG (mode, regno2);
2520 rtx mem2;
2522 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2523 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2524 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2526 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2527 regno = regno2;
2529 else
2530 emit_move_insn (reg, mem);
2531 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2535 /* AArch64 stack frames generated by this compiler look like:
2537 +-------------------------------+
2539 | incoming stack arguments |
2541 +-------------------------------+
2542 | | <-- incoming stack pointer (aligned)
2543 | callee-allocated save area |
2544 | for register varargs |
2546 +-------------------------------+
2547 | local variables | <-- frame_pointer_rtx
2549 +-------------------------------+
2550 | padding0 | \
2551 +-------------------------------+ |
2552 | callee-saved registers | | frame.saved_regs_size
2553 +-------------------------------+ |
2554 | LR' | |
2555 +-------------------------------+ |
2556 | FP' | / <- hard_frame_pointer_rtx (aligned)
2557 +-------------------------------+
2558 | dynamic allocation |
2559 +-------------------------------+
2560 | padding |
2561 +-------------------------------+
2562 | outgoing stack arguments | <-- arg_pointer
2564 +-------------------------------+
2565 | | <-- stack_pointer_rtx (aligned)
2567 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2568 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2569 unchanged. */
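/* As a worked example (purely illustrative), consider a function that
   needs a frame pointer, has 16 bytes of local variables, no varargs
   save area, no outgoing stack arguments, and must save x19, x20 and
   d8 in addition to the frame record:

     reg_offset[x29] = 0,  reg_offset[x30] = 8       (frame record)
     reg_offset[x19] = 16, reg_offset[x20] = 24, reg_offset[d8] = 32
     padding0 = 8, saved_regs_size = 48
     hard_fp_offset = ROUND_UP (0 + 16 + 48, 16) = 64
     frame_size     = ROUND_UP (64 + 0, 16)      = 64  */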
2571 /* Generate the prologue instructions for entry into a function.
2572 Establish the stack frame by decreasing the stack pointer with a
2573 properly calculated size and, if necessary, create a frame record
2574 filled with the values of LR and previous frame pointer. The
2575 current FP is also set up if it is in use. */
2577 void
2578 aarch64_expand_prologue (void)
2580 /* sub sp, sp, #<frame_size>
2581 stp {fp, lr}, [sp, #<frame_size> - 16]
2582 add fp, sp, #<frame_size> - hardfp_offset
2583 stp {cs_reg}, [fp, #-16] etc.
2585 sub sp, sp, <final_adjustment_if_any>
2587 HOST_WIDE_INT frame_size, offset;
2588 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2589 HOST_WIDE_INT hard_fp_offset;
2590 rtx_insn *insn;
2592 aarch64_layout_frame ();
2594 offset = frame_size = cfun->machine->frame.frame_size;
2595 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2596 fp_offset = frame_size - hard_fp_offset;
2598 if (flag_stack_usage_info)
2599 current_function_static_stack_size = frame_size;
 2601   /* Store pair and load pair instructions have an offset range of only -512 to 504.  */
2602 if (offset >= 512)
2604 /* When the frame has a large size, an initial decrease is done on
2605 the stack pointer to jump over the callee-allocated save area for
2606 register varargs, the local variable area and/or the callee-saved
2607 register area. This will allow the pre-index write-back
2608 store pair instructions to be used for setting up the stack frame
2609 efficiently. */
2610 offset = hard_fp_offset;
2611 if (offset >= 512)
2612 offset = cfun->machine->frame.saved_regs_size;
2614 frame_size -= (offset + crtl->outgoing_args_size);
2615 fp_offset = 0;
2617 if (frame_size >= 0x1000000)
2619 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2620 emit_move_insn (op0, GEN_INT (-frame_size));
2621 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2623 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2624 gen_rtx_SET (stack_pointer_rtx,
2625 plus_constant (Pmode, stack_pointer_rtx,
2626 -frame_size)));
2627 RTX_FRAME_RELATED_P (insn) = 1;
2629 else if (frame_size > 0)
2631 int hi_ofs = frame_size & 0xfff000;
2632 int lo_ofs = frame_size & 0x000fff;
2634 if (hi_ofs)
2636 insn = emit_insn (gen_add2_insn
2637 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2638 RTX_FRAME_RELATED_P (insn) = 1;
2640 if (lo_ofs)
2642 insn = emit_insn (gen_add2_insn
2643 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2644 RTX_FRAME_RELATED_P (insn) = 1;
2648 else
2649 frame_size = -1;
2651 if (offset > 0)
2653 bool skip_wb = false;
2655 if (frame_pointer_needed)
2657 skip_wb = true;
2659 if (fp_offset)
2661 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2662 GEN_INT (-offset)));
2663 RTX_FRAME_RELATED_P (insn) = 1;
2665 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2666 R30_REGNUM, false);
2668 else
2669 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2671 /* Set up frame pointer to point to the location of the
2672 previous frame pointer on the stack. */
2673 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2674 stack_pointer_rtx,
2675 GEN_INT (fp_offset)));
2676 RTX_FRAME_RELATED_P (insn) = 1;
2677 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2679 else
2681 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2682 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2684 if (fp_offset
2685 || reg1 == FIRST_PSEUDO_REGISTER
2686 || (reg2 == FIRST_PSEUDO_REGISTER
2687 && offset >= 256))
2689 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2690 GEN_INT (-offset)));
2691 RTX_FRAME_RELATED_P (insn) = 1;
2693 else
2695 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2697 skip_wb = true;
2699 if (reg2 == FIRST_PSEUDO_REGISTER)
2700 aarch64_pushwb_single_reg (mode1, reg1, offset);
2701 else
2702 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2706 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2707 skip_wb);
2708 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2709 skip_wb);
 2712   /* When offset >= 512,
2713 sub sp, sp, #<outgoing_args_size> */
2714 if (frame_size > -1)
2716 if (crtl->outgoing_args_size > 0)
2718 insn = emit_insn (gen_add2_insn
2719 (stack_pointer_rtx,
2720 GEN_INT (- crtl->outgoing_args_size)));
2721 RTX_FRAME_RELATED_P (insn) = 1;
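/* For the worked example above (offset < 512, frame pointer needed,
   fp_offset == 0) the emitted sequence is roughly:

     stp  x29, x30, [sp, #-64]!    // allocate frame, save frame record
     add  x29, sp, #0              // establish the frame pointer
     stp  x19, x20, [sp, #16]      // paired callee-saved GP registers
     str  d8, [sp, #32]            // callee-saved FP/SIMD register

   A trailing  sub sp, sp, #<outgoing_args_size>  is only emitted when
   the frame was too large to allocate in a single adjustment.  */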
2726 /* Return TRUE if we can use a simple_return insn.
 2728    This function checks whether the callee-saved stack is empty, which
 2729    means no restore actions are needed.  The pro_and_epilogue pass uses
 2730    this to check whether the shrink-wrapping optimization is feasible.  */
2732 bool
2733 aarch64_use_return_insn_p (void)
2735 if (!reload_completed)
2736 return false;
2738 if (crtl->profile)
2739 return false;
2741 aarch64_layout_frame ();
2743 return cfun->machine->frame.frame_size == 0;
2746 /* Generate the epilogue instructions for returning from a function. */
2747 void
2748 aarch64_expand_epilogue (bool for_sibcall)
2750 HOST_WIDE_INT frame_size, offset;
2751 HOST_WIDE_INT fp_offset;
2752 HOST_WIDE_INT hard_fp_offset;
2753 rtx_insn *insn;
 2754   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
2755 bool need_barrier_p = (get_frame_size () != 0
2756 || cfun->machine->frame.saved_varargs_size);
2758 aarch64_layout_frame ();
2760 offset = frame_size = cfun->machine->frame.frame_size;
2761 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2762 fp_offset = frame_size - hard_fp_offset;
 2764   /* Store pair and load pair instructions have an offset range of only -512 to 504.  */
2765 if (offset >= 512)
2767 offset = hard_fp_offset;
2768 if (offset >= 512)
2769 offset = cfun->machine->frame.saved_regs_size;
2771 frame_size -= (offset + crtl->outgoing_args_size);
2772 fp_offset = 0;
2773 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2775 insn = emit_insn (gen_add2_insn
2776 (stack_pointer_rtx,
2777 GEN_INT (crtl->outgoing_args_size)));
2778 RTX_FRAME_RELATED_P (insn) = 1;
2781 else
2782 frame_size = -1;
2784 /* If there were outgoing arguments or we've done dynamic stack
2785 allocation, then restore the stack pointer from the frame
2786 pointer. This is at most one insn and more efficient than using
2787 GCC's internal mechanism. */
2788 if (frame_pointer_needed
2789 && (crtl->outgoing_args_size || cfun->calls_alloca))
2791 if (cfun->calls_alloca)
2792 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2794 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2795 hard_frame_pointer_rtx,
2796 GEN_INT (0)));
2797 offset = offset - fp_offset;
2800 if (offset > 0)
2802 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2803 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2804 bool skip_wb = true;
2805 rtx cfi_ops = NULL;
2807 if (frame_pointer_needed)
2808 fp_offset = 0;
2809 else if (fp_offset
2810 || reg1 == FIRST_PSEUDO_REGISTER
2811 || (reg2 == FIRST_PSEUDO_REGISTER
2812 && offset >= 256))
2813 skip_wb = false;
2815 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2816 skip_wb, &cfi_ops);
2817 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2818 skip_wb, &cfi_ops);
2820 if (need_barrier_p)
2821 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2823 if (skip_wb)
2825 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2826 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2828 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2829 if (reg2 == FIRST_PSEUDO_REGISTER)
2831 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2832 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2833 mem = gen_rtx_MEM (mode1, mem);
2834 insn = emit_move_insn (rreg1, mem);
2836 else
2838 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2840 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2841 insn = emit_insn (aarch64_gen_loadwb_pair
2842 (mode1, stack_pointer_rtx, rreg1,
2843 rreg2, offset));
2846 else
2848 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2849 GEN_INT (offset)));
2852 /* Reset the CFA to be SP + FRAME_SIZE. */
2853 rtx new_cfa = stack_pointer_rtx;
2854 if (frame_size > 0)
2855 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2856 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2857 REG_NOTES (insn) = cfi_ops;
2858 RTX_FRAME_RELATED_P (insn) = 1;
2861 if (frame_size > 0)
2863 if (need_barrier_p)
2864 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2866 if (frame_size >= 0x1000000)
2868 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2869 emit_move_insn (op0, GEN_INT (frame_size));
2870 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2872 else
2874 int hi_ofs = frame_size & 0xfff000;
2875 int lo_ofs = frame_size & 0x000fff;
2877 if (hi_ofs && lo_ofs)
2879 insn = emit_insn (gen_add2_insn
2880 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2881 RTX_FRAME_RELATED_P (insn) = 1;
2882 frame_size = lo_ofs;
2884 insn = emit_insn (gen_add2_insn
2885 (stack_pointer_rtx, GEN_INT (frame_size)));
2888 /* Reset the CFA to be SP + 0. */
2889 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2890 RTX_FRAME_RELATED_P (insn) = 1;
2893 /* Stack adjustment for exception handler. */
2894 if (crtl->calls_eh_return)
2896 /* We need to unwind the stack by the offset computed by
2897 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2898 to be SP; letting the CFA move during this adjustment
2899 is just as correct as retaining the CFA from the body
2900 of the function. Therefore, do nothing special. */
2901 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2904 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2905 if (!for_sibcall)
2906 emit_jump_insn (ret_rtx);
2909 /* Return the place to copy the exception unwinding return address to.
 2910    This will probably be a stack slot, but could (in theory) be the
 2911    return register.  */
 2912 rtx
 2913 aarch64_final_eh_return_addr (void)
2915 HOST_WIDE_INT fp_offset;
2917 aarch64_layout_frame ();
2919 fp_offset = cfun->machine->frame.frame_size
2920 - cfun->machine->frame.hard_fp_offset;
2922 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2923 return gen_rtx_REG (DImode, LR_REGNUM);
2925 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2926 result in a store to save LR introduced by builtin_eh_return () being
2927 incorrectly deleted because the alias is not detected.
2928 So in the calculation of the address to copy the exception unwinding
2929 return address to, we note 2 cases.
2930 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2931 we return a SP-relative location since all the addresses are SP-relative
2932 in this case. This prevents the store from being optimized away.
2933 If the fp_offset is not 0, then the addresses will be FP-relative and
2934 therefore we return a FP-relative location. */
2936 if (frame_pointer_needed)
2938 if (fp_offset)
2939 return gen_frame_mem (DImode,
2940 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2941 else
2942 return gen_frame_mem (DImode,
2943 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2946 /* If FP is not needed, we calculate the location of LR, which would be
2947 at the top of the saved registers block. */
2949 return gen_frame_mem (DImode,
2950 plus_constant (Pmode,
2951 stack_pointer_rtx,
2952 fp_offset
2953 + cfun->machine->frame.saved_regs_size
2954 - 2 * UNITS_PER_WORD));
2957 /* Possibly output code to build up a constant in a register. For
2958 the benefit of the costs infrastructure, returns the number of
2959 instructions which would be emitted. GENERATE inhibits or
2960 enables code generation. */
2962 static int
2963 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2965 int insns = 0;
2967 if (aarch64_bitmask_imm (val, DImode))
2969 if (generate)
2970 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2971 insns = 1;
2973 else
2975 int i;
2976 int ncount = 0;
2977 int zcount = 0;
2978 HOST_WIDE_INT valp = val >> 16;
2979 HOST_WIDE_INT valm;
2980 HOST_WIDE_INT tval;
2982 for (i = 16; i < 64; i += 16)
2984 valm = (valp & 0xffff);
2986 if (valm != 0)
2987 ++ zcount;
2989 if (valm != 0xffff)
2990 ++ ncount;
2992 valp >>= 16;
2995 /* zcount contains the number of additional MOVK instructions
2996 required if the constant is built up with an initial MOVZ instruction,
2997 while ncount is the number of MOVK instructions required if starting
 2998        with a MOVN instruction.  Choose the sequence that yields the fewer
 2999        instructions, preferring MOVZ instructions when the two counts are
 3000        equal.  */
3001 if (ncount < zcount)
3003 if (generate)
3004 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3005 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
3006 tval = 0xffff;
3007 insns++;
3009 else
3011 if (generate)
3012 emit_move_insn (gen_rtx_REG (Pmode, regnum),
3013 GEN_INT (val & 0xffff));
3014 tval = 0;
3015 insns++;
3018 val >>= 16;
3020 for (i = 16; i < 64; i += 16)
3022 if ((val & 0xffff) != tval)
3024 if (generate)
3025 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
3026 GEN_INT (i),
3027 GEN_INT (val & 0xffff)));
3028 insns++;
3030 val >>= 16;
3033 return insns;
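/* A worked example (illustrative): for VAL == 0x000000ab0000cdef the
   upper 16-bit chunks are 0x0000, 0x00ab and 0x0000, so zcount == 1
   and ncount == 3; the MOVZ sequence wins and two instructions are
   emitted (using x0 for REGNUM):

     movz x0, #0xcdef
     movk x0, #0xab, lsl #32

   Conversely, 0xffffffff1234ffff gives zcount == 3 and ncount == 1,
   so it is built from an all-ones value (movn x0, #0x0) followed by
   movk x0, #0x1234, lsl #16.  */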
3036 static void
3037 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
3039 HOST_WIDE_INT mdelta = delta;
3040 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
3041 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
3043 if (mdelta < 0)
3044 mdelta = -mdelta;
3046 if (mdelta >= 4096 * 4096)
3048 (void) aarch64_build_constant (scratchreg, delta, true);
3049 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
3051 else if (mdelta > 0)
3053 if (mdelta >= 4096)
3055 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
3056 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
3057 if (delta < 0)
3058 emit_insn (gen_rtx_SET (this_rtx,
3059 gen_rtx_MINUS (Pmode, this_rtx, shift)));
3060 else
3061 emit_insn (gen_rtx_SET (this_rtx,
3062 gen_rtx_PLUS (Pmode, this_rtx, shift)));
3064 if (mdelta % 4096 != 0)
3066 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3067 emit_insn (gen_rtx_SET (this_rtx,
3068 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
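/* For instance (illustrative), DELTA == 10000 is split into a shifted
   quotient and a remainder, giving roughly:

     mov  x<scratch>, #2                         // 10000 / 4096
     add  x<this>, x<this>, x<scratch>, lsl #12
     add  x<this>, x<this>, #1808                // 10000 % 4096

   A delta whose absolute value is below 4096 is added directly, and a
   delta of 4096 * 4096 or more is materialized with
   aarch64_build_constant and then added.  */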
3073 /* Output code to add DELTA to the first argument, and then jump
3074 to FUNCTION. Used for C++ multiple inheritance. */
3075 static void
3076 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3077 HOST_WIDE_INT delta,
3078 HOST_WIDE_INT vcall_offset,
3079 tree function)
3081 /* The this pointer is always in x0. Note that this differs from
 3082      Arm, where the this pointer may be bumped to r1 if r0 is required
3083 to return a pointer to an aggregate. On AArch64 a result value
3084 pointer will be in x8. */
3085 int this_regno = R0_REGNUM;
3086 rtx this_rtx, temp0, temp1, addr, funexp;
3087 rtx_insn *insn;
3089 reload_completed = 1;
3090 emit_note (NOTE_INSN_PROLOGUE_END);
3092 if (vcall_offset == 0)
3093 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3094 else
3096 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3098 this_rtx = gen_rtx_REG (Pmode, this_regno);
3099 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3100 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3102 addr = this_rtx;
3103 if (delta != 0)
3105 if (delta >= -256 && delta < 256)
3106 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3107 plus_constant (Pmode, this_rtx, delta));
3108 else
3109 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3112 if (Pmode == ptr_mode)
3113 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3114 else
3115 aarch64_emit_move (temp0,
3116 gen_rtx_ZERO_EXTEND (Pmode,
3117 gen_rtx_MEM (ptr_mode, addr)));
3119 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3120 addr = plus_constant (Pmode, temp0, vcall_offset);
3121 else
3123 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3124 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3127 if (Pmode == ptr_mode)
 3128 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3129 else
3130 aarch64_emit_move (temp1,
3131 gen_rtx_SIGN_EXTEND (Pmode,
3132 gen_rtx_MEM (ptr_mode, addr)));
3134 emit_insn (gen_add2_insn (this_rtx, temp1));
3137 /* Generate a tail call to the target function. */
3138 if (!TREE_USED (function))
3140 assemble_external (function);
3141 TREE_USED (function) = 1;
3143 funexp = XEXP (DECL_RTL (function), 0);
3144 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3145 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3146 SIBLING_CALL_P (insn) = 1;
3148 insn = get_insns ();
3149 shorten_branches (insn);
3150 final_start_function (insn, file, 1);
3151 final (insn, file, 1);
3152 final_end_function ();
3154 /* Stop pretending to be a post-reload pass. */
3155 reload_completed = 0;
3158 static bool
3159 aarch64_tls_referenced_p (rtx x)
3161 if (!TARGET_HAVE_TLS)
3162 return false;
3163 subrtx_iterator::array_type array;
3164 FOR_EACH_SUBRTX (iter, array, x, ALL)
3166 const_rtx x = *iter;
3167 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3168 return true;
3169 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3170 TLS offsets, not real symbol references. */
3171 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3172 iter.skip_subrtxes ();
3174 return false;
3178 static int
3179 aarch64_bitmasks_cmp (const void *i1, const void *i2)
3181 const unsigned HOST_WIDE_INT *imm1 = (const unsigned HOST_WIDE_INT *) i1;
3182 const unsigned HOST_WIDE_INT *imm2 = (const unsigned HOST_WIDE_INT *) i2;
3184 if (*imm1 < *imm2)
3185 return -1;
3186 if (*imm1 > *imm2)
3187 return +1;
3188 return 0;
3192 static void
3193 aarch64_build_bitmask_table (void)
3195 unsigned HOST_WIDE_INT mask, imm;
3196 unsigned int log_e, e, s, r;
3197 unsigned int nimms = 0;
3199 for (log_e = 1; log_e <= 6; log_e++)
3201 e = 1 << log_e;
3202 if (e == 64)
3203 mask = ~(HOST_WIDE_INT) 0;
3204 else
3205 mask = ((HOST_WIDE_INT) 1 << e) - 1;
3206 for (s = 1; s < e; s++)
3208 for (r = 0; r < e; r++)
3210 /* set s consecutive bits to 1 (s < 64) */
3211 imm = ((unsigned HOST_WIDE_INT)1 << s) - 1;
3212 /* rotate right by r */
3213 if (r != 0)
3214 imm = ((imm >> r) | (imm << (e - r))) & mask;
3215 /* replicate the constant depending on SIMD size */
3216 switch (log_e) {
3217 case 1: imm |= (imm << 2);
3218 case 2: imm |= (imm << 4);
3219 case 3: imm |= (imm << 8);
3220 case 4: imm |= (imm << 16);
3221 case 5: imm |= (imm << 32);
3222 case 6:
3223 break;
3224 default:
3225 gcc_unreachable ();
3227 gcc_assert (nimms < AARCH64_NUM_BITMASKS);
3228 aarch64_bitmasks[nimms++] = imm;
3233 gcc_assert (nimms == AARCH64_NUM_BITMASKS);
3234 qsort (aarch64_bitmasks, nimms, sizeof (aarch64_bitmasks[0]),
3235 aarch64_bitmasks_cmp);
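/* For example (illustrative): with element size e == 16, a run of
   s == 8 ones is 0x00ff; rotated right by r == 8 it becomes 0xff00,
   and replication across the 64-bit register yields the table entry
   0xff00ff00ff00ff00.  A value such as 0x1234 is not a rotated run of
   ones at any element size and therefore never appears in the table.  */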
3239 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3240 a left shift of 0 or 12 bits. */
3241 bool
3242 aarch64_uimm12_shift (HOST_WIDE_INT val)
3244 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3245 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
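/* Examples (illustrative): 0xabc and 0xabc000 are accepted, since they
   fit entirely within bits [11:0] or bits [23:12] respectively, while
   0xabc001 and 0x1000000 are rejected because they straddle or exceed
   both 12-bit windows and need more than one add/sub instruction.  */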
3250 /* Return true if val is an immediate that can be loaded into a
3251 register by a MOVZ instruction. */
3252 static bool
3253 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3255 if (GET_MODE_SIZE (mode) > 4)
3257 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3258 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3259 return 1;
3261 else
3263 /* Ignore sign extension. */
3264 val &= (HOST_WIDE_INT) 0xffffffff;
3266 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3267 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3271 /* Return true if val is a valid bitmask immediate. */
3272 bool
3273 aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode mode)
3275 if (GET_MODE_SIZE (mode) < 8)
3277 /* Replicate bit pattern. */
3278 val &= (HOST_WIDE_INT) 0xffffffff;
3279 val |= val << 32;
3281 return bsearch (&val, aarch64_bitmasks, AARCH64_NUM_BITMASKS,
3282 sizeof (aarch64_bitmasks[0]), aarch64_bitmasks_cmp) != NULL;
3286 /* Return true if val is an immediate that can be loaded into a
3287 register in a single instruction. */
3288 bool
3289 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3291 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3292 return 1;
3293 return aarch64_bitmask_imm (val, mode);
3296 static bool
3297 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3299 rtx base, offset;
3301 if (GET_CODE (x) == HIGH)
3302 return true;
3304 split_const (x, &base, &offset);
3305 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3307 if (aarch64_classify_symbol (base, offset, SYMBOL_CONTEXT_ADR)
3308 != SYMBOL_FORCE_TO_MEM)
3309 return true;
3310 else
3311 /* Avoid generating a 64-bit relocation in ILP32; leave
3312 to aarch64_expand_mov_immediate to handle it properly. */
3313 return mode != ptr_mode;
3316 return aarch64_tls_referenced_p (x);
3319 /* Return true if register REGNO is a valid index register.
3320 STRICT_P is true if REG_OK_STRICT is in effect. */
3322 bool
3323 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3325 if (!HARD_REGISTER_NUM_P (regno))
3327 if (!strict_p)
3328 return true;
3330 if (!reg_renumber)
3331 return false;
3333 regno = reg_renumber[regno];
3335 return GP_REGNUM_P (regno);
3338 /* Return true if register REGNO is a valid base register for mode MODE.
3339 STRICT_P is true if REG_OK_STRICT is in effect. */
3341 bool
3342 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3344 if (!HARD_REGISTER_NUM_P (regno))
3346 if (!strict_p)
3347 return true;
3349 if (!reg_renumber)
3350 return false;
3352 regno = reg_renumber[regno];
3355 /* The fake registers will be eliminated to either the stack or
3356 hard frame pointer, both of which are usually valid base registers.
3357 Reload deals with the cases where the eliminated form isn't valid. */
3358 return (GP_REGNUM_P (regno)
3359 || regno == SP_REGNUM
3360 || regno == FRAME_POINTER_REGNUM
3361 || regno == ARG_POINTER_REGNUM);
3364 /* Return true if X is a valid base register for mode MODE.
3365 STRICT_P is true if REG_OK_STRICT is in effect. */
3367 static bool
3368 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3370 if (!strict_p && GET_CODE (x) == SUBREG)
3371 x = SUBREG_REG (x);
3373 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3376 /* Return true if address offset is a valid index. If it is, fill in INFO
3377 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3379 static bool
3380 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3381 machine_mode mode, bool strict_p)
3383 enum aarch64_address_type type;
3384 rtx index;
3385 int shift;
3387 /* (reg:P) */
3388 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3389 && GET_MODE (x) == Pmode)
3391 type = ADDRESS_REG_REG;
3392 index = x;
3393 shift = 0;
3395 /* (sign_extend:DI (reg:SI)) */
3396 else if ((GET_CODE (x) == SIGN_EXTEND
3397 || GET_CODE (x) == ZERO_EXTEND)
3398 && GET_MODE (x) == DImode
3399 && GET_MODE (XEXP (x, 0)) == SImode)
3401 type = (GET_CODE (x) == SIGN_EXTEND)
3402 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3403 index = XEXP (x, 0);
3404 shift = 0;
3406 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3407 else if (GET_CODE (x) == MULT
3408 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3409 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3410 && GET_MODE (XEXP (x, 0)) == DImode
3411 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3412 && CONST_INT_P (XEXP (x, 1)))
3414 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3415 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3416 index = XEXP (XEXP (x, 0), 0);
3417 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3419 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3420 else if (GET_CODE (x) == ASHIFT
3421 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3422 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3423 && GET_MODE (XEXP (x, 0)) == DImode
3424 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3425 && CONST_INT_P (XEXP (x, 1)))
3427 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3428 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3429 index = XEXP (XEXP (x, 0), 0);
3430 shift = INTVAL (XEXP (x, 1));
3432 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3433 else if ((GET_CODE (x) == SIGN_EXTRACT
3434 || GET_CODE (x) == ZERO_EXTRACT)
3435 && GET_MODE (x) == DImode
3436 && GET_CODE (XEXP (x, 0)) == MULT
3437 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3438 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3440 type = (GET_CODE (x) == SIGN_EXTRACT)
3441 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3442 index = XEXP (XEXP (x, 0), 0);
3443 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3444 if (INTVAL (XEXP (x, 1)) != 32 + shift
3445 || INTVAL (XEXP (x, 2)) != 0)
3446 shift = -1;
3448 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3449 (const_int 0xffffffff<<shift)) */
3450 else if (GET_CODE (x) == AND
3451 && GET_MODE (x) == DImode
3452 && GET_CODE (XEXP (x, 0)) == MULT
3453 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3454 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3455 && CONST_INT_P (XEXP (x, 1)))
3457 type = ADDRESS_REG_UXTW;
3458 index = XEXP (XEXP (x, 0), 0);
3459 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3460 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3461 shift = -1;
3463 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3464 else if ((GET_CODE (x) == SIGN_EXTRACT
3465 || GET_CODE (x) == ZERO_EXTRACT)
3466 && GET_MODE (x) == DImode
3467 && GET_CODE (XEXP (x, 0)) == ASHIFT
3468 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3469 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3471 type = (GET_CODE (x) == SIGN_EXTRACT)
3472 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3473 index = XEXP (XEXP (x, 0), 0);
3474 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3475 if (INTVAL (XEXP (x, 1)) != 32 + shift
3476 || INTVAL (XEXP (x, 2)) != 0)
3477 shift = -1;
3479 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3480 (const_int 0xffffffff<<shift)) */
3481 else if (GET_CODE (x) == AND
3482 && GET_MODE (x) == DImode
3483 && GET_CODE (XEXP (x, 0)) == ASHIFT
3484 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3485 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3486 && CONST_INT_P (XEXP (x, 1)))
3488 type = ADDRESS_REG_UXTW;
3489 index = XEXP (XEXP (x, 0), 0);
3490 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3491 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3492 shift = -1;
3494 /* (mult:P (reg:P) (const_int scale)) */
3495 else if (GET_CODE (x) == MULT
3496 && GET_MODE (x) == Pmode
3497 && GET_MODE (XEXP (x, 0)) == Pmode
3498 && CONST_INT_P (XEXP (x, 1)))
3500 type = ADDRESS_REG_REG;
3501 index = XEXP (x, 0);
3502 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3504 /* (ashift:P (reg:P) (const_int shift)) */
3505 else if (GET_CODE (x) == ASHIFT
3506 && GET_MODE (x) == Pmode
3507 && GET_MODE (XEXP (x, 0)) == Pmode
3508 && CONST_INT_P (XEXP (x, 1)))
3510 type = ADDRESS_REG_REG;
3511 index = XEXP (x, 0);
3512 shift = INTVAL (XEXP (x, 1));
3514 else
3515 return false;
3517 if (GET_CODE (index) == SUBREG)
3518 index = SUBREG_REG (index);
3520 if ((shift == 0 ||
3521 (shift > 0 && shift <= 3
3522 && (1 << shift) == GET_MODE_SIZE (mode)))
3523 && REG_P (index)
3524 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3526 info->type = type;
3527 info->offset = index;
3528 info->shift = shift;
3529 return true;
3532 return false;
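/* Some index forms this accepts, written as assembly (illustrative):

     [x0, x1]             index register, shift == 0
     [x0, x1, lsl #3]     scaled index, valid for an 8-byte access
     [x0, w1, sxtw #2]    sign-extended 32-bit index, 4-byte access
     [x0, w1, uxtw]       zero-extended 32-bit index, unscaled

   A non-zero shift is only accepted when 1 << shift equals the access
   size, so [x0, x1, lsl #3] would be rejected for an SImode load.  */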
3535 bool
3536 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3538 return (offset >= -64 * GET_MODE_SIZE (mode)
3539 && offset < 64 * GET_MODE_SIZE (mode)
3540 && offset % GET_MODE_SIZE (mode) == 0);
3543 static inline bool
3544 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3545 HOST_WIDE_INT offset)
3547 return offset >= -256 && offset < 256;
3550 static inline bool
3551 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3553 return (offset >= 0
3554 && offset < 4096 * GET_MODE_SIZE (mode)
3555 && offset % GET_MODE_SIZE (mode) == 0);
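/* Concretely (illustrative): for DImode the 7-bit signed scaled range
   used by ldp/stp is -512 .. 504 in steps of 8, the 9-bit signed
   unscaled range used by ldur/stur is -256 .. 255, and the 12-bit
   unsigned scaled range used by plain ldr/str is 0 .. 32760 in steps
   of 8 (0 .. 16380 in steps of 4 for SImode).  */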
3558 /* Return true if X is a valid address for machine mode MODE. If it is,
3559 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3560 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3562 static bool
3563 aarch64_classify_address (struct aarch64_address_info *info,
3564 rtx x, machine_mode mode,
3565 RTX_CODE outer_code, bool strict_p)
3567 enum rtx_code code = GET_CODE (x);
3568 rtx op0, op1;
3570 /* On BE, we use load/store pair for all large int mode load/stores. */
3571 bool load_store_pair_p = (outer_code == PARALLEL
3572 || (BYTES_BIG_ENDIAN
3573 && aarch64_vect_struct_mode_p (mode)));
3575 bool allow_reg_index_p =
3576 !load_store_pair_p
3577 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3578 && !aarch64_vect_struct_mode_p (mode);
3580 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3581 REG addressing. */
3582 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3583 && (code != POST_INC && code != REG))
3584 return false;
3586 switch (code)
3588 case REG:
3589 case SUBREG:
3590 info->type = ADDRESS_REG_IMM;
3591 info->base = x;
3592 info->offset = const0_rtx;
3593 return aarch64_base_register_rtx_p (x, strict_p);
3595 case PLUS:
3596 op0 = XEXP (x, 0);
3597 op1 = XEXP (x, 1);
3599 if (! strict_p
3600 && REG_P (op0)
3601 && (op0 == virtual_stack_vars_rtx
3602 || op0 == frame_pointer_rtx
3603 || op0 == arg_pointer_rtx)
3604 && CONST_INT_P (op1))
3606 info->type = ADDRESS_REG_IMM;
3607 info->base = op0;
3608 info->offset = op1;
3610 return true;
3613 if (GET_MODE_SIZE (mode) != 0
3614 && CONST_INT_P (op1)
3615 && aarch64_base_register_rtx_p (op0, strict_p))
3617 HOST_WIDE_INT offset = INTVAL (op1);
3619 info->type = ADDRESS_REG_IMM;
3620 info->base = op0;
3621 info->offset = op1;
3623 /* TImode and TFmode values are allowed in both pairs of X
3624 registers and individual Q registers. The available
3625 address modes are:
3626 X,X: 7-bit signed scaled offset
3627 Q: 9-bit signed offset
 3628 	     We conservatively require an offset representable in either mode.  */
3630 if (mode == TImode || mode == TFmode)
3631 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3632 && offset_9bit_signed_unscaled_p (mode, offset));
 3634 	  /* A 7-bit offset check because OImode will emit an ldp/stp
 3635 	     instruction (only big endian will get here).
3636 For ldp/stp instructions, the offset is scaled for the size of a
3637 single element of the pair. */
3638 if (mode == OImode)
3639 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
 3641 	  /* Three 9/12-bit offset checks because CImode will emit three
3642 ldr/str instructions (only big endian will get here). */
3643 if (mode == CImode)
3644 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3645 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3646 || offset_12bit_unsigned_scaled_p (V16QImode,
3647 offset + 32)));
 3649 	  /* Two 7-bit offset checks because XImode will emit two ldp/stp
3650 instructions (only big endian will get here). */
3651 if (mode == XImode)
3652 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3653 && aarch64_offset_7bit_signed_scaled_p (TImode,
3654 offset + 32));
3656 if (load_store_pair_p)
3657 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3658 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3659 else
3660 return (offset_9bit_signed_unscaled_p (mode, offset)
3661 || offset_12bit_unsigned_scaled_p (mode, offset));
3664 if (allow_reg_index_p)
3666 /* Look for base + (scaled/extended) index register. */
3667 if (aarch64_base_register_rtx_p (op0, strict_p)
3668 && aarch64_classify_index (info, op1, mode, strict_p))
3670 info->base = op0;
3671 return true;
3673 if (aarch64_base_register_rtx_p (op1, strict_p)
3674 && aarch64_classify_index (info, op0, mode, strict_p))
3676 info->base = op1;
3677 return true;
3681 return false;
3683 case POST_INC:
3684 case POST_DEC:
3685 case PRE_INC:
3686 case PRE_DEC:
3687 info->type = ADDRESS_REG_WB;
3688 info->base = XEXP (x, 0);
3689 info->offset = NULL_RTX;
3690 return aarch64_base_register_rtx_p (info->base, strict_p);
3692 case POST_MODIFY:
3693 case PRE_MODIFY:
3694 info->type = ADDRESS_REG_WB;
3695 info->base = XEXP (x, 0);
3696 if (GET_CODE (XEXP (x, 1)) == PLUS
3697 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3698 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3699 && aarch64_base_register_rtx_p (info->base, strict_p))
3701 HOST_WIDE_INT offset;
3702 info->offset = XEXP (XEXP (x, 1), 1);
3703 offset = INTVAL (info->offset);
3705 /* TImode and TFmode values are allowed in both pairs of X
3706 registers and individual Q registers. The available
3707 address modes are:
3708 X,X: 7-bit signed scaled offset
3709 Q: 9-bit signed offset
 3710 	     We conservatively require an offset representable in either mode.  */
3712 if (mode == TImode || mode == TFmode)
3713 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3714 && offset_9bit_signed_unscaled_p (mode, offset));
3716 if (load_store_pair_p)
3717 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3718 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3719 else
3720 return offset_9bit_signed_unscaled_p (mode, offset);
3722 return false;
3724 case CONST:
3725 case SYMBOL_REF:
3726 case LABEL_REF:
3727 /* load literal: pc-relative constant pool entry. Only supported
3728 for SI mode or larger. */
3729 info->type = ADDRESS_SYMBOLIC;
3731 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3733 rtx sym, addend;
3735 split_const (x, &sym, &addend);
3736 return (GET_CODE (sym) == LABEL_REF
3737 || (GET_CODE (sym) == SYMBOL_REF
3738 && CONSTANT_POOL_ADDRESS_P (sym)));
3740 return false;
3742 case LO_SUM:
3743 info->type = ADDRESS_LO_SUM;
3744 info->base = XEXP (x, 0);
3745 info->offset = XEXP (x, 1);
3746 if (allow_reg_index_p
3747 && aarch64_base_register_rtx_p (info->base, strict_p))
3749 rtx sym, offs;
3750 split_const (info->offset, &sym, &offs);
3751 if (GET_CODE (sym) == SYMBOL_REF
3752 && (aarch64_classify_symbol (sym, offs, SYMBOL_CONTEXT_MEM)
3753 == SYMBOL_SMALL_ABSOLUTE))
3755 /* The symbol and offset must be aligned to the access size. */
3756 unsigned int align;
3757 unsigned int ref_size;
3759 if (CONSTANT_POOL_ADDRESS_P (sym))
3760 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3761 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3763 tree exp = SYMBOL_REF_DECL (sym);
3764 align = TYPE_ALIGN (TREE_TYPE (exp));
3765 align = CONSTANT_ALIGNMENT (exp, align);
3767 else if (SYMBOL_REF_DECL (sym))
3768 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3769 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3770 && SYMBOL_REF_BLOCK (sym) != NULL)
3771 align = SYMBOL_REF_BLOCK (sym)->alignment;
3772 else
3773 align = BITS_PER_UNIT;
3775 ref_size = GET_MODE_SIZE (mode);
3776 if (ref_size == 0)
3777 ref_size = GET_MODE_SIZE (DImode);
3779 return ((INTVAL (offs) & (ref_size - 1)) == 0
3780 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3783 return false;
3785 default:
3786 return false;
3790 bool
3791 aarch64_symbolic_address_p (rtx x)
3793 rtx offset;
3795 split_const (x, &x, &offset);
3796 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3799 /* Classify the base of symbolic expression X, given that X appears in
3800 context CONTEXT. */
3802 enum aarch64_symbol_type
3803 aarch64_classify_symbolic_expression (rtx x,
3804 enum aarch64_symbol_context context)
3806 rtx offset;
3808 split_const (x, &x, &offset);
3809 return aarch64_classify_symbol (x, offset, context);
3813 /* Return TRUE if X is a legitimate address for accessing memory in
3814 mode MODE. */
3815 static bool
3816 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3818 struct aarch64_address_info addr;
3820 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3823 /* Return TRUE if X is a legitimate address for accessing memory in
3824 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3825 pair operation. */
3826 bool
3827 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3828 RTX_CODE outer_code, bool strict_p)
3830 struct aarch64_address_info addr;
3832 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3835 /* Return TRUE if rtx X is immediate constant 0.0 */
3836 bool
3837 aarch64_float_const_zero_rtx_p (rtx x)
3839 REAL_VALUE_TYPE r;
3841 if (GET_MODE (x) == VOIDmode)
3842 return false;
3844 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
3845 if (REAL_VALUE_MINUS_ZERO (r))
3846 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3847 return REAL_VALUES_EQUAL (r, dconst0);
3850 /* Return the fixed registers used for condition codes. */
3852 static bool
3853 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3855 *p1 = CC_REGNUM;
3856 *p2 = INVALID_REGNUM;
3857 return true;
3860 /* Emit call insn with PAT and do aarch64-specific handling. */
3862 void
3863 aarch64_emit_call_insn (rtx pat)
3865 rtx insn = emit_call_insn (pat);
3867 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3868 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3869 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3872 machine_mode
3873 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3875 /* All floating point compares return CCFP if it is an equality
3876 comparison, and CCFPE otherwise. */
3877 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3879 switch (code)
3881 case EQ:
3882 case NE:
3883 case UNORDERED:
3884 case ORDERED:
3885 case UNLT:
3886 case UNLE:
3887 case UNGT:
3888 case UNGE:
3889 case UNEQ:
3890 case LTGT:
3891 return CCFPmode;
3893 case LT:
3894 case LE:
3895 case GT:
3896 case GE:
3897 return CCFPEmode;
3899 default:
3900 gcc_unreachable ();
3904 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3905 && y == const0_rtx
3906 && (code == EQ || code == NE || code == LT || code == GE)
3907 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3908 || GET_CODE (x) == NEG))
3909 return CC_NZmode;
3911 /* A compare with a shifted operand. Because of canonicalization,
3912 the comparison will have to be swapped when we emit the assembly
3913 code. */
3914 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3915 && (REG_P (y) || GET_CODE (y) == SUBREG)
3916 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3917 || GET_CODE (x) == LSHIFTRT
3918 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3919 return CC_SWPmode;
3921 /* Similarly for a negated operand, but we can only do this for
3922 equalities. */
3923 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3924 && (REG_P (y) || GET_CODE (y) == SUBREG)
3925 && (code == EQ || code == NE)
3926 && GET_CODE (x) == NEG)
3927 return CC_Zmode;
3929 /* A compare of a mode narrower than SI mode against zero can be done
3930 by extending the value in the comparison. */
3931 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3932 && y == const0_rtx)
3933 /* Only use sign-extension if we really need it. */
3934 return ((code == GT || code == GE || code == LE || code == LT)
3935 ? CC_SESWPmode : CC_ZESWPmode);
3937 /* For everything else, return CCmode. */
3938 return CCmode;
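/* An illustrative case: for  if ((x & y) < 0)  the compare of an AND
   against zero with LT selects CC_NZmode, so the test can be emitted
   as ands/tst and, per aarch64_get_condition_code_1 below, GE then
   branches on PL and LT on MI.  */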
3941 static int
3942 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
 3944 int
 3945 aarch64_get_condition_code (rtx x)
3947 machine_mode mode = GET_MODE (XEXP (x, 0));
3948 enum rtx_code comp_code = GET_CODE (x);
3950 if (GET_MODE_CLASS (mode) != MODE_CC)
3951 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3952 return aarch64_get_condition_code_1 (mode, comp_code);
3955 static int
3956 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3958 int ne = -1, eq = -1;
3959 switch (mode)
3961 case CCFPmode:
3962 case CCFPEmode:
3963 switch (comp_code)
3965 case GE: return AARCH64_GE;
3966 case GT: return AARCH64_GT;
3967 case LE: return AARCH64_LS;
3968 case LT: return AARCH64_MI;
3969 case NE: return AARCH64_NE;
3970 case EQ: return AARCH64_EQ;
3971 case ORDERED: return AARCH64_VC;
3972 case UNORDERED: return AARCH64_VS;
3973 case UNLT: return AARCH64_LT;
3974 case UNLE: return AARCH64_LE;
3975 case UNGT: return AARCH64_HI;
3976 case UNGE: return AARCH64_PL;
3977 default: return -1;
3979 break;
3981 case CC_DNEmode:
3982 ne = AARCH64_NE;
3983 eq = AARCH64_EQ;
3984 break;
3986 case CC_DEQmode:
3987 ne = AARCH64_EQ;
3988 eq = AARCH64_NE;
3989 break;
3991 case CC_DGEmode:
3992 ne = AARCH64_GE;
3993 eq = AARCH64_LT;
3994 break;
3996 case CC_DLTmode:
3997 ne = AARCH64_LT;
3998 eq = AARCH64_GE;
3999 break;
4001 case CC_DGTmode:
4002 ne = AARCH64_GT;
4003 eq = AARCH64_LE;
4004 break;
4006 case CC_DLEmode:
4007 ne = AARCH64_LE;
4008 eq = AARCH64_GT;
4009 break;
4011 case CC_DGEUmode:
4012 ne = AARCH64_CS;
4013 eq = AARCH64_CC;
4014 break;
4016 case CC_DLTUmode:
4017 ne = AARCH64_CC;
4018 eq = AARCH64_CS;
4019 break;
4021 case CC_DGTUmode:
4022 ne = AARCH64_HI;
4023 eq = AARCH64_LS;
4024 break;
4026 case CC_DLEUmode:
4027 ne = AARCH64_LS;
4028 eq = AARCH64_HI;
4029 break;
4031 case CCmode:
4032 switch (comp_code)
4034 case NE: return AARCH64_NE;
4035 case EQ: return AARCH64_EQ;
4036 case GE: return AARCH64_GE;
4037 case GT: return AARCH64_GT;
4038 case LE: return AARCH64_LE;
4039 case LT: return AARCH64_LT;
4040 case GEU: return AARCH64_CS;
4041 case GTU: return AARCH64_HI;
4042 case LEU: return AARCH64_LS;
4043 case LTU: return AARCH64_CC;
4044 default: return -1;
4046 break;
4048 case CC_SWPmode:
4049 case CC_ZESWPmode:
4050 case CC_SESWPmode:
4051 switch (comp_code)
4053 case NE: return AARCH64_NE;
4054 case EQ: return AARCH64_EQ;
4055 case GE: return AARCH64_LE;
4056 case GT: return AARCH64_LT;
4057 case LE: return AARCH64_GE;
4058 case LT: return AARCH64_GT;
4059 case GEU: return AARCH64_LS;
4060 case GTU: return AARCH64_CC;
4061 case LEU: return AARCH64_CS;
4062 case LTU: return AARCH64_HI;
4063 default: return -1;
4065 break;
4067 case CC_NZmode:
4068 switch (comp_code)
4070 case NE: return AARCH64_NE;
4071 case EQ: return AARCH64_EQ;
4072 case GE: return AARCH64_PL;
4073 case LT: return AARCH64_MI;
4074 default: return -1;
4076 break;
4078 case CC_Zmode:
4079 switch (comp_code)
4081 case NE: return AARCH64_NE;
4082 case EQ: return AARCH64_EQ;
4083 default: return -1;
4085 break;
4087 default:
4088 return -1;
4089 break;
4092 if (comp_code == NE)
4093 return ne;
4095 if (comp_code == EQ)
4096 return eq;
4098 return -1;
4101 bool
4102 aarch64_const_vec_all_same_in_range_p (rtx x,
4103 HOST_WIDE_INT minval,
4104 HOST_WIDE_INT maxval)
4106 HOST_WIDE_INT firstval;
4107 int count, i;
4109 if (GET_CODE (x) != CONST_VECTOR
4110 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4111 return false;
4113 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4114 if (firstval < minval || firstval > maxval)
4115 return false;
4117 count = CONST_VECTOR_NUNITS (x);
4118 for (i = 1; i < count; i++)
4119 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4120 return false;
4122 return true;
4125 bool
4126 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4128 return aarch64_const_vec_all_same_in_range_p (x, val, val);
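/* Return the number of set bits in VALUE.  The loop below clears the
   lowest set bit on each iteration (value &= value - 1), e.g.
   0xc -> 0x8 -> 0x0, giving a count of 2.  */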
4131 static unsigned
4132 bit_count (unsigned HOST_WIDE_INT value)
4134 unsigned count = 0;
4136 while (value)
4138 count++;
4139 value &= value - 1;
4142 return count;
4145 /* N Z C V. */
4146 #define AARCH64_CC_V 1
4147 #define AARCH64_CC_C (1 << 1)
4148 #define AARCH64_CC_Z (1 << 2)
4149 #define AARCH64_CC_N (1 << 3)
4151 /* N Z C V flags for ccmp. The first code is for AND op and the other
4152 is for IOR op. Indexed by the AARCH64_* condition code values. */
4153 static const int aarch64_nzcv_codes[][2] =
4155 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4156 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4157 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4158 {0, AARCH64_CC_C}, /* CC, C == 0. */
4159 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4160 {0, AARCH64_CC_N}, /* PL, N == 0. */
4161 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4162 {0, AARCH64_CC_V}, /* VC, V == 0. */
4163 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4164 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4165 {0, AARCH64_CC_V}, /* GE, N == V. */
4166 {AARCH64_CC_V, 0}, /* LT, N != V. */
4167 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4168 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4169 {0, 0}, /* AL, Any. */
4170 {0, 0}, /* NV, Any. */
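/* For example, an EQ test uses the AARCH64_EQ row: column 0 is
   AARCH64_CC_Z (4) and column 1 is 0.  The 'K' and 'k' operand modifiers
   handled in aarch64_print_operand below print column 0 and column 1
   respectively, producing the #nzcv immediate used by the ccmp patterns.  */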
4174 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4176 switch (mode)
4178 case CC_DNEmode:
4179 return NE;
4181 case CC_DEQmode:
4182 return EQ;
4184 case CC_DLEmode:
4185 return LE;
4187 case CC_DGTmode:
4188 return GT;
4190 case CC_DLTmode:
4191 return LT;
4193 case CC_DGEmode:
4194 return GE;
4196 case CC_DLEUmode:
4197 return LEU;
4199 case CC_DGTUmode:
4200 return GTU;
4202 case CC_DLTUmode:
4203 return LTU;
4205 case CC_DGEUmode:
4206 return GEU;
4208 default:
4209 gcc_unreachable ();
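/* Print operand X in assembler syntax to file F.  CODE is a
   single-character operand modifier; the cases below document the
   modifiers that are supported.  */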
4214 void
4215 aarch64_print_operand (FILE *f, rtx x, char code)
4217 switch (code)
4219 /* An integer or symbol address without a preceding # sign. */
4220 case 'c':
4221 switch (GET_CODE (x))
4223 case CONST_INT:
4224 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4225 break;
4227 case SYMBOL_REF:
4228 output_addr_const (f, x);
4229 break;
4231 case CONST:
4232 if (GET_CODE (XEXP (x, 0)) == PLUS
4233 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4235 output_addr_const (f, x);
4236 break;
4238 /* Fall through. */
4240 default:
4241 output_operand_lossage ("Unsupported operand for code '%c'", code);
4243 break;
4245 case 'e':
4246 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4248 int n;
4250 if (!CONST_INT_P (x)
4251 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4253 output_operand_lossage ("invalid operand for '%%%c'", code);
4254 return;
4257 switch (n)
4259 case 3:
4260 fputc ('b', f);
4261 break;
4262 case 4:
4263 fputc ('h', f);
4264 break;
4265 case 5:
4266 fputc ('w', f);
4267 break;
4268 default:
4269 output_operand_lossage ("invalid operand for '%%%c'", code);
4270 return;
4273 break;
4275 case 'p':
4277 int n;
4279 /* Print N such that 2^N == X. */
4280 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4282 output_operand_lossage ("invalid operand for '%%%c'", code);
4283 return;
4286 asm_fprintf (f, "%d", n);
4288 break;
4290 case 'P':
4291 /* Print the number of non-zero bits in X (a const_int). */
4292 if (!CONST_INT_P (x))
4294 output_operand_lossage ("invalid operand for '%%%c'", code);
4295 return;
4298 asm_fprintf (f, "%u", bit_count (INTVAL (x)));
4299 break;
4301 case 'H':
4302 /* Print the higher numbered register of a pair (TImode) of regs. */
4303 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4305 output_operand_lossage ("invalid operand for '%%%c'", code);
4306 return;
4309 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4310 break;
4312 case 'm':
4314 int cond_code;
4315 /* Print a condition (eq, ne, etc). */
4317 /* CONST_TRUE_RTX means always -- that's the default. */
4318 if (x == const_true_rtx)
4319 return;
4321 if (!COMPARISON_P (x))
4323 output_operand_lossage ("invalid operand for '%%%c'", code);
4324 return;
4327 cond_code = aarch64_get_condition_code (x);
4328 gcc_assert (cond_code >= 0);
4329 fputs (aarch64_condition_codes[cond_code], f);
4331 break;
4333 case 'M':
4335 int cond_code;
4336 /* Print the inverse of a condition (eq <-> ne, etc). */
4338 /* CONST_TRUE_RTX means never -- that's the default. */
4339 if (x == const_true_rtx)
4341 fputs ("nv", f);
4342 return;
4345 if (!COMPARISON_P (x))
4347 output_operand_lossage ("invalid operand for '%%%c'", code);
4348 return;
4350 cond_code = aarch64_get_condition_code (x);
4351 gcc_assert (cond_code >= 0);
4352 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4353 (cond_code)], f);
4355 break;
4357 case 'b':
4358 case 'h':
4359 case 's':
4360 case 'd':
4361 case 'q':
4362 /* Print a scalar FP/SIMD register name. */
4363 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4365 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4366 return;
4368 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4369 break;
4371 case 'S':
4372 case 'T':
4373 case 'U':
4374 case 'V':
4375 /* Print the first FP/SIMD register name in a list. */
4376 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4378 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4379 return;
4381 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4382 break;
4384 case 'R':
4385 /* Print a scalar FP/SIMD register name + 1. */
4386 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4388 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4389 return;
4391 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4392 break;
4394 case 'X':
4395 /* Print bottom 16 bits of integer constant in hex. */
4396 if (!CONST_INT_P (x))
4398 output_operand_lossage ("invalid operand for '%%%c'", code);
4399 return;
4401 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4402 break;
4404 case 'w':
4405 case 'x':
4406 /* Print a general register name or the zero register (32-bit or
4407 64-bit). */
4408 if (x == const0_rtx
4409 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4411 asm_fprintf (f, "%czr", code);
4412 break;
4415 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4417 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4418 break;
4421 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4423 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4424 break;
4427 /* Fall through. */
4429 case 0:
4430 /* Print a normal operand. If it's a general register, then we
4431 assume DImode. */
4432 if (x == NULL)
4434 output_operand_lossage ("missing operand");
4435 return;
4438 switch (GET_CODE (x))
4440 case REG:
4441 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4442 break;
4444 case MEM:
4445 aarch64_memory_reference_mode = GET_MODE (x);
4446 output_address (XEXP (x, 0));
4447 break;
4449 case LABEL_REF:
4450 case SYMBOL_REF:
4451 output_addr_const (asm_out_file, x);
4452 break;
4454 case CONST_INT:
4455 asm_fprintf (f, "%wd", INTVAL (x));
4456 break;
4458 case CONST_VECTOR:
4459 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4461 gcc_assert (
4462 aarch64_const_vec_all_same_in_range_p (x,
4463 HOST_WIDE_INT_MIN,
4464 HOST_WIDE_INT_MAX));
4465 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4467 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4469 fputc ('0', f);
4471 else
4472 gcc_unreachable ();
4473 break;
4475 case CONST_DOUBLE:
4476 /* CONST_DOUBLE can represent a double-width integer.
4477 In this case, the mode of x is VOIDmode. */
4478 if (GET_MODE (x) == VOIDmode)
4479 ; /* Do Nothing. */
4480 else if (aarch64_float_const_zero_rtx_p (x))
4482 fputc ('0', f);
4483 break;
4485 else if (aarch64_float_const_representable_p (x))
4487 #define buf_size 20
4488 char float_buf[buf_size] = {'\0'};
4489 REAL_VALUE_TYPE r;
4490 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
4491 real_to_decimal_for_mode (float_buf, &r,
4492 buf_size, buf_size,
4493 1, GET_MODE (x));
4494 asm_fprintf (asm_out_file, "%s", float_buf);
4495 break;
4496 #undef buf_size
4498 output_operand_lossage ("invalid constant");
4499 return;
4500 default:
4501 output_operand_lossage ("invalid operand");
4502 return;
4504 break;
4506 case 'A':
4507 if (GET_CODE (x) == HIGH)
4508 x = XEXP (x, 0);
4510 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4512 case SYMBOL_SMALL_GOT_4G:
4513 asm_fprintf (asm_out_file, ":got:");
4514 break;
4516 case SYMBOL_SMALL_TLSGD:
4517 asm_fprintf (asm_out_file, ":tlsgd:");
4518 break;
4520 case SYMBOL_SMALL_TLSDESC:
4521 asm_fprintf (asm_out_file, ":tlsdesc:");
4522 break;
4524 case SYMBOL_SMALL_GOTTPREL:
4525 asm_fprintf (asm_out_file, ":gottprel:");
4526 break;
4528 case SYMBOL_TLSLE:
4529 asm_fprintf (asm_out_file, ":tprel:");
4530 break;
4532 case SYMBOL_TINY_GOT:
4533 gcc_unreachable ();
4534 break;
4536 default:
4537 break;
4539 output_addr_const (asm_out_file, x);
4540 break;
4542 case 'L':
4543 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4545 case SYMBOL_SMALL_GOT_4G:
4546 asm_fprintf (asm_out_file, ":lo12:");
4547 break;
4549 case SYMBOL_SMALL_TLSGD:
4550 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4551 break;
4553 case SYMBOL_SMALL_TLSDESC:
4554 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4555 break;
4557 case SYMBOL_SMALL_GOTTPREL:
4558 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4559 break;
4561 case SYMBOL_TLSLE:
4562 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4563 break;
4565 case SYMBOL_TINY_GOT:
4566 asm_fprintf (asm_out_file, ":got:");
4567 break;
4569 default:
4570 break;
4572 output_addr_const (asm_out_file, x);
4573 break;
4575 case 'G':
4577 switch (aarch64_classify_symbolic_expression (x, SYMBOL_CONTEXT_ADR))
4579 case SYMBOL_TLSLE:
4580 asm_fprintf (asm_out_file, ":tprel_hi12:");
4581 break;
4582 default:
4583 break;
4585 output_addr_const (asm_out_file, x);
4586 break;
4588 case 'K':
4590 int cond_code;
4591 /* Print nzcv. */
4593 if (!COMPARISON_P (x))
4595 output_operand_lossage ("invalid operand for '%%%c'", code);
4596 return;
4599 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4600 gcc_assert (cond_code >= 0);
4601 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4603 break;
4605 case 'k':
4607 int cond_code;
4608 /* Print nzcv. */
4610 if (!COMPARISON_P (x))
4612 output_operand_lossage ("invalid operand for '%%%c'", code);
4613 return;
4616 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4617 gcc_assert (cond_code >= 0);
4618 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4620 break;
4622 default:
4623 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4624 return;
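/* Print memory address X in assembler syntax to file F.  The mode of the
   enclosing MEM was recorded in aarch64_memory_reference_mode by the MEM
   case above and is used to print the writeback offsets.  */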
4628 void
4629 aarch64_print_operand_address (FILE *f, rtx x)
4631 struct aarch64_address_info addr;
4633 if (aarch64_classify_address (&addr, x, aarch64_memory_reference_mode,
4634 MEM, true))
4635 switch (addr.type)
4637 case ADDRESS_REG_IMM:
4638 if (addr.offset == const0_rtx)
4639 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4640 else
4641 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4642 INTVAL (addr.offset));
4643 return;
4645 case ADDRESS_REG_REG:
4646 if (addr.shift == 0)
4647 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4648 reg_names [REGNO (addr.offset)]);
4649 else
4650 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4651 reg_names [REGNO (addr.offset)], addr.shift);
4652 return;
4654 case ADDRESS_REG_UXTW:
4655 if (addr.shift == 0)
4656 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4657 REGNO (addr.offset) - R0_REGNUM);
4658 else
4659 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4660 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4661 return;
4663 case ADDRESS_REG_SXTW:
4664 if (addr.shift == 0)
4665 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4666 REGNO (addr.offset) - R0_REGNUM);
4667 else
4668 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4669 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4670 return;
4672 case ADDRESS_REG_WB:
4673 switch (GET_CODE (x))
4675 case PRE_INC:
4676 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4677 GET_MODE_SIZE (aarch64_memory_reference_mode));
4678 return;
4679 case POST_INC:
4680 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4681 GET_MODE_SIZE (aarch64_memory_reference_mode));
4682 return;
4683 case PRE_DEC:
4684 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4685 GET_MODE_SIZE (aarch64_memory_reference_mode));
4686 return;
4687 case POST_DEC:
4688 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4689 GET_MODE_SIZE (aarch64_memory_reference_mode));
4690 return;
4691 case PRE_MODIFY:
4692 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4693 INTVAL (addr.offset));
4694 return;
4695 case POST_MODIFY:
4696 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4697 INTVAL (addr.offset));
4698 return;
4699 default:
4700 break;
4702 break;
4704 case ADDRESS_LO_SUM:
4705 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4706 output_addr_const (f, addr.offset);
4707 asm_fprintf (f, "]");
4708 return;
4710 case ADDRESS_SYMBOLIC:
4711 break;
4714 output_addr_const (f, x);
4717 bool
4718 aarch64_label_mentioned_p (rtx x)
4720 const char *fmt;
4721 int i;
4723 if (GET_CODE (x) == LABEL_REF)
4724 return true;
4726 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4727 referencing instruction, but they are constant offsets, not
4728 symbols. */
4729 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4730 return false;
4732 fmt = GET_RTX_FORMAT (GET_CODE (x));
4733 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4735 if (fmt[i] == 'E')
4737 int j;
4739 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4740 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4741 return 1;
4743 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4744 return 1;
4747 return 0;
4750 /* Implement REGNO_REG_CLASS. */
4752 enum reg_class
4753 aarch64_regno_regclass (unsigned regno)
4755 if (GP_REGNUM_P (regno))
4756 return GENERAL_REGS;
4758 if (regno == SP_REGNUM)
4759 return STACK_REG;
4761 if (regno == FRAME_POINTER_REGNUM
4762 || regno == ARG_POINTER_REGNUM)
4763 return POINTER_REGS;
4765 if (FP_REGNUM_P (regno))
4766 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4768 return NO_REGS;
4771 static rtx
4772 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4774 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4775 where mask is selected by alignment and size of the offset.
4776 We try to pick as large a range for the offset as possible to
4777 maximize the chance of a CSE. However, for aligned addresses
4778 we limit the range to 4k so that structures with different sized
4779 elements are likely to use the same base. */
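/* For example, an SImode access at x + 0x12344 takes the aligned case
   below: base_offset becomes 0x12000, so we emit tmp = x + 0x12000 and
   rewrite the address as tmp + 0x344; a neighbouring access such as
   x + 0x12348 can then CSE the same tmp.  */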
4781 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4783 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4784 HOST_WIDE_INT base_offset;
4786 /* Does it look like we'll need a load/store-pair operation? */
4787 if (GET_MODE_SIZE (mode) > 16
4788 || mode == TImode)
4789 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4790 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4791 /* For offsets that aren't a multiple of the access size, the limit is
4792 -256...255. */
4793 else if (offset & (GET_MODE_SIZE (mode) - 1))
4794 base_offset = (offset + 0x100) & ~0x1ff;
4795 else
4796 base_offset = offset & ~0xfff;
4798 if (base_offset == 0)
4799 return x;
4801 offset -= base_offset;
4802 rtx base_reg = gen_reg_rtx (Pmode);
4803 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4804 NULL_RTX);
4805 emit_move_insn (base_reg, val);
4806 x = plus_constant (Pmode, base_reg, offset);
4809 return x;
4812 /* Try a machine-dependent way of reloading an illegitimate address
4813 operand. If we find one, push the reload and return the new rtx. */
4816 aarch64_legitimize_reload_address (rtx *x_p,
4817 machine_mode mode,
4818 int opnum, int type,
4819 int ind_levels ATTRIBUTE_UNUSED)
4821 rtx x = *x_p;
4823 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4824 if (aarch64_vect_struct_mode_p (mode)
4825 && GET_CODE (x) == PLUS
4826 && REG_P (XEXP (x, 0))
4827 && CONST_INT_P (XEXP (x, 1)))
4829 rtx orig_rtx = x;
4830 x = copy_rtx (x);
4831 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4832 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4833 opnum, (enum reload_type) type);
4834 return x;
4837 /* We must recognize output that we have already generated ourselves. */
4838 if (GET_CODE (x) == PLUS
4839 && GET_CODE (XEXP (x, 0)) == PLUS
4840 && REG_P (XEXP (XEXP (x, 0), 0))
4841 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4842 && CONST_INT_P (XEXP (x, 1)))
4844 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4845 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4846 opnum, (enum reload_type) type);
4847 return x;
4850 /* We wish to handle large displacements off a base register by splitting
4851 the addend across an add and the mem insn. This can cut the number of
4852 extra insns needed from 3 to 1. It is only useful for load/store of a
4853 single register with 12 bit offset field. */
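/* For example, an SImode load at base + 0x12344 is split into
   high = 0x12000 (a 12-bit immediate shifted by 12, reloaded into the
   base register) and low = 0x344, which stays in the load's offset
   field.  */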
4854 if (GET_CODE (x) == PLUS
4855 && REG_P (XEXP (x, 0))
4856 && CONST_INT_P (XEXP (x, 1))
4857 && HARD_REGISTER_P (XEXP (x, 0))
4858 && mode != TImode
4859 && mode != TFmode
4860 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4862 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4863 HOST_WIDE_INT low = val & 0xfff;
4864 HOST_WIDE_INT high = val - low;
4865 HOST_WIDE_INT offs;
4866 rtx cst;
4867 machine_mode xmode = GET_MODE (x);
4869 /* In ILP32, xmode can be either DImode or SImode. */
4870 gcc_assert (xmode == DImode || xmode == SImode);
4872 /* Punt on BLKmode offsets (mode size 0 here): we cannot ascertain
4873 BLKmode alignment, so let the normal reload machinery handle them. */
4874 if (GET_MODE_SIZE (mode) == 0)
4875 return NULL_RTX;
4877 offs = low % GET_MODE_SIZE (mode);
4879 /* Align misaligned offset by adjusting high part to compensate. */
4880 if (offs != 0)
4882 if (aarch64_uimm12_shift (high + offs))
4884 /* Align down. */
4885 low = low - offs;
4886 high = high + offs;
4888 else
4890 /* Align up. */
4891 offs = GET_MODE_SIZE (mode) - offs;
4892 low = low + offs;
4893 high = high + (low & 0x1000) - offs;
4894 low &= 0xfff;
4898 /* Check for overflow. */
4899 if (high + low != val)
4900 return NULL_RTX;
4902 cst = GEN_INT (high);
4903 if (!aarch64_uimm12_shift (high))
4904 cst = force_const_mem (xmode, cst);
4906 /* Reload high part into base reg, leaving the low part
4907 in the mem instruction.
4908 Note that replacing this gen_rtx_PLUS with plus_constant is
4909 wrong in this case because we rely on the
4910 (plus (plus reg c1) c2) structure being preserved so that
4911 XEXP (*p, 0) in push_reload below uses the correct term. */
4912 x = gen_rtx_PLUS (xmode,
4913 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4914 GEN_INT (low));
4916 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4917 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4918 opnum, (enum reload_type) type);
4919 return x;
4922 return NULL_RTX;
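/* Implement TARGET_SECONDARY_RELOAD; see the comments on the individual
   cases below.  */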
4926 static reg_class_t
4927 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4928 reg_class_t rclass,
4929 machine_mode mode,
4930 secondary_reload_info *sri)
4932 /* Without the TARGET_SIMD instructions we cannot move a Q register
4933 to a Q register directly. We need a scratch. */
4934 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4935 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4936 && reg_class_subset_p (rclass, FP_REGS))
4938 if (mode == TFmode)
4939 sri->icode = CODE_FOR_aarch64_reload_movtf;
4940 else if (mode == TImode)
4941 sri->icode = CODE_FOR_aarch64_reload_movti;
4942 return NO_REGS;
4945 /* A TFmode or TImode memory access should be handled via an FP_REG
4946 because AArch64 has richer addressing modes for LDR/STR instructions
4947 than for LDP/STP instructions. */
4948 if (TARGET_FLOAT && rclass == GENERAL_REGS
4949 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4950 return FP_REGS;
4952 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
4953 return GENERAL_REGS;
4955 return NO_REGS;
4958 static bool
4959 aarch64_can_eliminate (const int from, const int to)
4961 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4962 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4964 if (frame_pointer_needed)
4966 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4967 return true;
4968 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4969 return false;
4970 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4971 && !cfun->calls_alloca)
4972 return true;
4973 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4974 return true;
4976 return false;
4978 else
4980 /* If we decided that we didn't need a leaf frame pointer but then used
4981 LR in the function, then we'll want a frame pointer after all, so
4982 prevent this elimination to ensure a frame pointer is used. */
4983 if (to == STACK_POINTER_REGNUM
4984 && flag_omit_leaf_frame_pointer
4985 && df_regs_ever_live_p (LR_REGNUM))
4986 return false;
4989 return true;
4992 HOST_WIDE_INT
4993 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4995 aarch64_layout_frame ();
4997 if (to == HARD_FRAME_POINTER_REGNUM)
4999 if (from == ARG_POINTER_REGNUM)
5000 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
5002 if (from == FRAME_POINTER_REGNUM)
5003 return (cfun->machine->frame.hard_fp_offset
5004 - cfun->machine->frame.saved_varargs_size);
5007 if (to == STACK_POINTER_REGNUM)
5009 if (from == FRAME_POINTER_REGNUM)
5010 return (cfun->machine->frame.frame_size
5011 - cfun->machine->frame.saved_varargs_size);
5014 return cfun->machine->frame.frame_size;
5017 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5018 previous frame. */
5021 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5023 if (count != 0)
5024 return const0_rtx;
5025 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5029 static void
5030 aarch64_asm_trampoline_template (FILE *f)
5032 if (TARGET_ILP32)
5034 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5035 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5037 else
5039 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5040 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5042 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5043 assemble_aligned_integer (4, const0_rtx);
5044 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5045 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
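/* Roughly, the template emitted above lays out the trampoline as:
	ldr	x17, .+16	// target function address
	ldr	x18, .+20	// static chain value
	br	x17
	<4 bytes of padding>
	<pointer-sized target address>	// written by aarch64_trampoline_init
	<pointer-sized static chain>
   where x17/x18 assume the usual IP1/static-chain register numbering
   (LP64 shown; the ILP32 variant loads w registers from 32-bit slots).  */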
5048 static void
5049 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5051 rtx fnaddr, mem, a_tramp;
5052 const int tramp_code_sz = 16;
5054 /* Don't need to copy the trailing D-words; we fill those in below. */
5055 emit_block_move (m_tramp, assemble_trampoline_template (),
5056 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5057 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5058 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5059 if (GET_MODE (fnaddr) != ptr_mode)
5060 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5061 emit_move_insn (mem, fnaddr);
5063 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5064 emit_move_insn (mem, chain_value);
5066 /* XXX We should really define a "clear_cache" pattern and use
5067 gen_clear_cache(). */
5068 a_tramp = XEXP (m_tramp, 0);
5069 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5070 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5071 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5072 ptr_mode);
5075 static unsigned char
5076 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5078 switch (regclass)
5080 case CALLER_SAVE_REGS:
5081 case POINTER_REGS:
5082 case GENERAL_REGS:
5083 case ALL_REGS:
5084 case FP_REGS:
5085 case FP_LO_REGS:
5086 return
5087 aarch64_vector_mode_p (mode)
5088 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5089 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5090 case STACK_REG:
5091 return 1;
5093 case NO_REGS:
5094 return 0;
5096 default:
5097 break;
5099 gcc_unreachable ();
5102 static reg_class_t
5103 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5105 if (regclass == POINTER_REGS)
5106 return GENERAL_REGS;
5108 if (regclass == STACK_REG)
5110 if (REG_P(x)
5111 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5112 return regclass;
5114 return NO_REGS;
5117 /* If it's an integer immediate that MOVI can't handle, then
5118 FP_REGS is not an option, so we return NO_REGS instead. */
5119 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5120 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5121 return NO_REGS;
5123 /* Register elimination can result in a request for
5124 SP+constant->FP_REGS. We cannot support such operations, which
5125 use SP as the source and an FP_REG as the destination, so reject
5126 them right now. */
5127 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5129 rtx lhs = XEXP (x, 0);
5131 /* Look through a possible SUBREG introduced by ILP32. */
5132 if (GET_CODE (lhs) == SUBREG)
5133 lhs = SUBREG_REG (lhs);
5135 gcc_assert (REG_P (lhs));
5136 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5137 POINTER_REGS));
5138 return NO_REGS;
5141 return regclass;
5144 void
5145 aarch64_asm_output_labelref (FILE* f, const char *name)
5147 asm_fprintf (f, "%U%s", name);
5150 static void
5151 aarch64_elf_asm_constructor (rtx symbol, int priority)
5153 if (priority == DEFAULT_INIT_PRIORITY)
5154 default_ctor_section_asm_out_constructor (symbol, priority);
5155 else
5157 section *s;
5158 char buf[18];
5159 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5160 s = get_section (buf, SECTION_WRITE, NULL);
5161 switch_to_section (s);
5162 assemble_align (POINTER_SIZE);
5163 assemble_aligned_integer (POINTER_BYTES, symbol);
5167 static void
5168 aarch64_elf_asm_destructor (rtx symbol, int priority)
5170 if (priority == DEFAULT_INIT_PRIORITY)
5171 default_dtor_section_asm_out_destructor (symbol, priority);
5172 else
5174 section *s;
5175 char buf[18];
5176 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5177 s = get_section (buf, SECTION_WRITE, NULL);
5178 switch_to_section (s);
5179 assemble_align (POINTER_SIZE);
5180 assemble_aligned_integer (POINTER_BYTES, symbol);
5184 const char*
5185 aarch64_output_casesi (rtx *operands)
5187 char buf[100];
5188 char label[100];
5189 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5190 int index;
5191 static const char *const patterns[4][2] =
5194 "ldrb\t%w3, [%0,%w1,uxtw]",
5195 "add\t%3, %4, %w3, sxtb #2"
5198 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5199 "add\t%3, %4, %w3, sxth #2"
5202 "ldr\t%w3, [%0,%w1,uxtw #2]",
5203 "add\t%3, %4, %w3, sxtw #2"
5205 /* We assume that DImode is only generated when not optimizing and
5206 that we don't really need 64-bit address offsets. That would
5207 imply an object file with 8GB of code in a single function! */
5209 "ldr\t%w3, [%0,%w1,uxtw #2]",
5210 "add\t%3, %4, %w3, sxtw #2"
5214 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5216 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5218 gcc_assert (index >= 0 && index <= 3);
5220 /* Need to implement table size reduction, by changing the code below. */
5221 output_asm_insn (patterns[index][0], operands);
5222 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5223 snprintf (buf, sizeof (buf),
5224 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5225 output_asm_insn (buf, operands);
5226 output_asm_insn (patterns[index][1], operands);
5227 output_asm_insn ("br\t%3", operands);
5228 assemble_label (asm_out_file, label);
5229 return "";
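/* For a HImode dispatch table the sequence emitted above is:
	ldrh	%w3, [%0, %w1, uxtw #1]
	adr	%4, .Lrtx<n>
	add	%3, %4, %w3, sxth #2
	br	%3
   i.e. the loaded 16-bit entry is sign-extended, shifted left by 2 and
   added to the .Lrtx<n> anchor label emitted just after the br.  */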
5233 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5234 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5235 operator. */
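/* For example, a mask of 0x3fc with a shift of 2 is 0xff << 2, so we
   return 8 (a UXTB); masks that do not cover exactly 8, 16 or 32 bits
   return 0.  */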
5238 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5240 if (shift >= 0 && shift <= 3)
5242 int size;
5243 for (size = 8; size <= 32; size *= 2)
5245 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5246 if (mask == bits << shift)
5247 return size;
5250 return 0;
5253 static bool
5254 aarch64_use_blocks_for_constant_p (machine_mode mode ATTRIBUTE_UNUSED,
5255 const_rtx x ATTRIBUTE_UNUSED)
5257 /* We can't use blocks for constants when we're using a per-function
5258 constant pool. */
5259 return false;
5262 static section *
5263 aarch64_select_rtx_section (machine_mode mode ATTRIBUTE_UNUSED,
5264 rtx x ATTRIBUTE_UNUSED,
5265 unsigned HOST_WIDE_INT align ATTRIBUTE_UNUSED)
5267 /* Force all constant pool entries into the current function section. */
5268 return function_section (current_function_decl);
5272 /* Costs. */
5274 /* Helper function for rtx cost calculation. Strip a shift expression
5275 from X. Returns the inner operand if successful, or the original
5276 expression on failure. */
5277 static rtx
5278 aarch64_strip_shift (rtx x)
5280 rtx op = x;
5282 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5283 we can convert both to ROR during final output. */
5284 if ((GET_CODE (op) == ASHIFT
5285 || GET_CODE (op) == ASHIFTRT
5286 || GET_CODE (op) == LSHIFTRT
5287 || GET_CODE (op) == ROTATERT
5288 || GET_CODE (op) == ROTATE)
5289 && CONST_INT_P (XEXP (op, 1)))
5290 return XEXP (op, 0);
5292 if (GET_CODE (op) == MULT
5293 && CONST_INT_P (XEXP (op, 1))
5294 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5295 return XEXP (op, 0);
5297 return x;
5300 /* Helper function for rtx cost calculation. Strip an extend
5301 expression from X. Returns the inner operand if successful, or the
5302 original expression on failure. We deal with a number of possible
5303 canonicalization variations here. */
5304 static rtx
5305 aarch64_strip_extend (rtx x)
5307 rtx op = x;
5309 /* Zero and sign extraction of a widened value. */
5310 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5311 && XEXP (op, 2) == const0_rtx
5312 && GET_CODE (XEXP (op, 0)) == MULT
5313 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5314 XEXP (op, 1)))
5315 return XEXP (XEXP (op, 0), 0);
5317 /* It can also be represented (for zero-extend) as an AND with an
5318 immediate. */
5319 if (GET_CODE (op) == AND
5320 && GET_CODE (XEXP (op, 0)) == MULT
5321 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5322 && CONST_INT_P (XEXP (op, 1))
5323 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5324 INTVAL (XEXP (op, 1))) != 0)
5325 return XEXP (XEXP (op, 0), 0);
5327 /* Now handle extended register, as this may also have an optional
5328 left shift by 1..4. */
5329 if (GET_CODE (op) == ASHIFT
5330 && CONST_INT_P (XEXP (op, 1))
5331 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5332 op = XEXP (op, 0);
5334 if (GET_CODE (op) == ZERO_EXTEND
5335 || GET_CODE (op) == SIGN_EXTEND)
5336 op = XEXP (op, 0);
5338 if (op != x)
5339 return op;
5341 return x;
5344 /* Return true iff CODE is a shift supported in combination
5345 with arithmetic instructions. */
5347 static bool
5348 aarch64_shift_p (enum rtx_code code)
5350 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5353 /* Helper function for rtx cost calculation. Calculate the cost of
5354 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5355 Return the calculated cost of the expression, recursing manually in to
5356 operands where needed. */
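/* For instance, when optimizing for speed, (plus (mult x 4) y) is costed
   as an ADD with a shifted operand: the MULT by 4 is canonically a shift
   by 2, so we add extra_cost->alu.arith_shift plus the cost of X rather
   than any multiply cost.  */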
5358 static int
5359 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5361 rtx op0, op1;
5362 const struct cpu_cost_table *extra_cost
5363 = aarch64_tune_params.insn_extra_cost;
5364 int cost = 0;
5365 bool compound_p = (outer == PLUS || outer == MINUS);
5366 machine_mode mode = GET_MODE (x);
5368 gcc_checking_assert (code == MULT);
5370 op0 = XEXP (x, 0);
5371 op1 = XEXP (x, 1);
5373 if (VECTOR_MODE_P (mode))
5374 mode = GET_MODE_INNER (mode);
5376 /* Integer multiply/fma. */
5377 if (GET_MODE_CLASS (mode) == MODE_INT)
5379 /* The multiply will be canonicalized as a shift, cost it as such. */
5380 if (aarch64_shift_p (GET_CODE (x))
5381 || (CONST_INT_P (op1)
5382 && exact_log2 (INTVAL (op1)) > 0))
5384 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5385 || GET_CODE (op0) == SIGN_EXTEND;
5386 if (speed)
5388 if (compound_p)
5390 if (REG_P (op1))
5391 /* ARITH + shift-by-register. */
5392 cost += extra_cost->alu.arith_shift_reg;
5393 else if (is_extend)
5394 /* ARITH + extended register. We don't have a cost field
5395 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5396 cost += extra_cost->alu.extend_arith;
5397 else
5398 /* ARITH + shift-by-immediate. */
5399 cost += extra_cost->alu.arith_shift;
5401 else
5402 /* LSL (immediate). */
5403 cost += extra_cost->alu.shift;
5406 /* Strip extends as we will have costed them in the case above. */
5407 if (is_extend)
5408 op0 = aarch64_strip_extend (op0);
5410 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5412 return cost;
5415 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5416 compound and let the below cases handle it. After all, MNEG is a
5417 special-case alias of MSUB. */
5418 if (GET_CODE (op0) == NEG)
5420 op0 = XEXP (op0, 0);
5421 compound_p = true;
5424 /* Integer multiplies or FMAs have zero/sign extending variants. */
5425 if ((GET_CODE (op0) == ZERO_EXTEND
5426 && GET_CODE (op1) == ZERO_EXTEND)
5427 || (GET_CODE (op0) == SIGN_EXTEND
5428 && GET_CODE (op1) == SIGN_EXTEND))
5430 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5431 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5433 if (speed)
5435 if (compound_p)
5436 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5437 cost += extra_cost->mult[0].extend_add;
5438 else
5439 /* MUL/SMULL/UMULL. */
5440 cost += extra_cost->mult[0].extend;
5443 return cost;
5446 /* This is either an integer multiply or a MADD. In both cases
5447 we want to recurse and cost the operands. */
5448 cost += rtx_cost (op0, mode, MULT, 0, speed);
5449 cost += rtx_cost (op1, mode, MULT, 1, speed);
5451 if (speed)
5453 if (compound_p)
5454 /* MADD/MSUB. */
5455 cost += extra_cost->mult[mode == DImode].add;
5456 else
5457 /* MUL. */
5458 cost += extra_cost->mult[mode == DImode].simple;
5461 return cost;
5463 else
5465 if (speed)
5467 /* Floating-point FMA/FMUL can also support negations of the
5468 operands. */
5469 if (GET_CODE (op0) == NEG)
5470 op0 = XEXP (op0, 0);
5471 if (GET_CODE (op1) == NEG)
5472 op1 = XEXP (op1, 0);
5474 if (compound_p)
5475 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5476 cost += extra_cost->fp[mode == DFmode].fma;
5477 else
5478 /* FMUL/FNMUL. */
5479 cost += extra_cost->fp[mode == DFmode].mult;
5482 cost += rtx_cost (op0, mode, MULT, 0, speed);
5483 cost += rtx_cost (op1, mode, MULT, 1, speed);
5484 return cost;
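/* Implement TARGET_ADDRESS_COST: classify the address with
   aarch64_classify_address and cost it from the tuning target's addr_cost
   tables, adding a scaling cost for shifted index registers.  */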
5488 static int
5489 aarch64_address_cost (rtx x,
5490 machine_mode mode,
5491 addr_space_t as ATTRIBUTE_UNUSED,
5492 bool speed)
5494 enum rtx_code c = GET_CODE (x);
5495 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5496 struct aarch64_address_info info;
5497 int cost = 0;
5498 info.shift = 0;
5500 if (!aarch64_classify_address (&info, x, mode, c, false))
5502 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5504 /* This is a CONST or SYMBOL ref which will be split
5505 in a different way depending on the code model in use.
5506 Cost it through the generic infrastructure. */
5507 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5508 /* Divide through by the cost of one instruction to
5509 bring it to the same units as the address costs. */
5510 cost_symbol_ref /= COSTS_N_INSNS (1);
5511 /* The cost is then the cost of preparing the address,
5512 followed by an immediate (possibly 0) offset. */
5513 return cost_symbol_ref + addr_cost->imm_offset;
5515 else
5517 /* This is most likely a jump table from a case
5518 statement. */
5519 return addr_cost->register_offset;
5523 switch (info.type)
5525 case ADDRESS_LO_SUM:
5526 case ADDRESS_SYMBOLIC:
5527 case ADDRESS_REG_IMM:
5528 cost += addr_cost->imm_offset;
5529 break;
5531 case ADDRESS_REG_WB:
5532 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5533 cost += addr_cost->pre_modify;
5534 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5535 cost += addr_cost->post_modify;
5536 else
5537 gcc_unreachable ();
5539 break;
5541 case ADDRESS_REG_REG:
5542 cost += addr_cost->register_offset;
5543 break;
5545 case ADDRESS_REG_UXTW:
5546 case ADDRESS_REG_SXTW:
5547 cost += addr_cost->register_extend;
5548 break;
5550 default:
5551 gcc_unreachable ();
5555 if (info.shift > 0)
5557 /* For the sake of calculating the cost of the shifted register
5558 component, we can treat same sized modes in the same way. */
5559 switch (GET_MODE_BITSIZE (mode))
5561 case 16:
5562 cost += addr_cost->addr_scale_costs.hi;
5563 break;
5565 case 32:
5566 cost += addr_cost->addr_scale_costs.si;
5567 break;
5569 case 64:
5570 cost += addr_cost->addr_scale_costs.di;
5571 break;
5573 /* We can't tell, or this is a 128-bit vector. */
5574 default:
5575 cost += addr_cost->addr_scale_costs.ti;
5576 break;
5580 return cost;
5583 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5584 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5585 to be taken. */
5588 aarch64_branch_cost (bool speed_p, bool predictable_p)
5590 /* When optimizing for speed, an unpredictable branch is charged at the higher unpredictable cost. */
5591 const struct cpu_branch_cost *branch_costs =
5592 aarch64_tune_params.branch_costs;
5594 if (!speed_p || predictable_p)
5595 return branch_costs->predictable;
5596 else
5597 return branch_costs->unpredictable;
5600 /* Return true if the RTX X in mode MODE is a zero or sign extract
5601 usable in an ADD or SUB (extended register) instruction. */
5602 static bool
5603 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5605 /* Catch add with a sign extract.
5606 This is add_<optab><mode>_multp2. */
5607 if (GET_CODE (x) == SIGN_EXTRACT
5608 || GET_CODE (x) == ZERO_EXTRACT)
5610 rtx op0 = XEXP (x, 0);
5611 rtx op1 = XEXP (x, 1);
5612 rtx op2 = XEXP (x, 2);
5614 if (GET_CODE (op0) == MULT
5615 && CONST_INT_P (op1)
5616 && op2 == const0_rtx
5617 && CONST_INT_P (XEXP (op0, 1))
5618 && aarch64_is_extend_from_extract (mode,
5619 XEXP (op0, 1),
5620 op1))
5622 return true;
5625 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5626 No shift. */
5627 else if (GET_CODE (x) == SIGN_EXTEND
5628 || GET_CODE (x) == ZERO_EXTEND)
5629 return REG_P (XEXP (x, 0));
5631 return false;
5634 static bool
5635 aarch64_frint_unspec_p (unsigned int u)
5637 switch (u)
5639 case UNSPEC_FRINTZ:
5640 case UNSPEC_FRINTP:
5641 case UNSPEC_FRINTM:
5642 case UNSPEC_FRINTA:
5643 case UNSPEC_FRINTN:
5644 case UNSPEC_FRINTX:
5645 case UNSPEC_FRINTI:
5646 return true;
5648 default:
5649 return false;
5653 /* Return true iff X is an rtx that will match an extr instruction
5654 i.e. as described in the *extr<mode>5_insn family of patterns.
5655 OP0 and OP1 will be set to the operands of the shifts involved
5656 on success and will be NULL_RTX otherwise. */
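/* For example, in DImode (ior (ashift x 48) (lshiftrt y 16)) passes the
   check below (48 + 16 == 64) and corresponds to an EXTR with an
   immediate of 16, extracting a 64-bit field from the x:y register
   pair.  */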
5658 static bool
5659 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5661 rtx op0, op1;
5662 machine_mode mode = GET_MODE (x);
5664 *res_op0 = NULL_RTX;
5665 *res_op1 = NULL_RTX;
5667 if (GET_CODE (x) != IOR)
5668 return false;
5670 op0 = XEXP (x, 0);
5671 op1 = XEXP (x, 1);
5673 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5674 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5676 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5677 if (GET_CODE (op1) == ASHIFT)
5678 std::swap (op0, op1);
5680 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5681 return false;
5683 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5684 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5686 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5687 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5689 *res_op0 = XEXP (op0, 0);
5690 *res_op1 = XEXP (op1, 0);
5691 return true;
5695 return false;
5698 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5699 storing it in *COST. Result is true if the total cost of the operation
5700 has now been calculated. */
5701 static bool
5702 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5704 rtx inner;
5705 rtx comparator;
5706 enum rtx_code cmpcode;
5708 if (COMPARISON_P (op0))
5710 inner = XEXP (op0, 0);
5711 comparator = XEXP (op0, 1);
5712 cmpcode = GET_CODE (op0);
5714 else
5716 inner = op0;
5717 comparator = const0_rtx;
5718 cmpcode = NE;
5721 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5723 /* Conditional branch. */
5724 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5725 return true;
5726 else
5728 if (cmpcode == NE || cmpcode == EQ)
5730 if (comparator == const0_rtx)
5732 /* TBZ/TBNZ/CBZ/CBNZ. */
5733 if (GET_CODE (inner) == ZERO_EXTRACT)
5734 /* TBZ/TBNZ. */
5735 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
5736 ZERO_EXTRACT, 0, speed);
5737 else
5738 /* CBZ/CBNZ. */
5739 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
5741 return true;
5744 else if (cmpcode == LT || cmpcode == GE)
5746 /* TBZ/TBNZ. */
5747 if (comparator == const0_rtx)
5748 return true;
5752 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5754 /* It's a conditional operation based on the status flags,
5755 so it must be some flavor of CSEL. */
5757 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5758 if (GET_CODE (op1) == NEG
5759 || GET_CODE (op1) == NOT
5760 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5761 op1 = XEXP (op1, 0);
5763 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
5764 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
5765 return true;
5768 /* We don't know what this is, cost all operands. */
5769 return false;
5772 /* Calculate the cost of calculating X, storing it in *COST. Result
5773 is true if the total cost of the operation has now been calculated. */
5774 static bool
5775 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
5776 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5778 rtx op0, op1, op2;
5779 const struct cpu_cost_table *extra_cost
5780 = aarch64_tune_params.insn_extra_cost;
5781 int code = GET_CODE (x);
5783 /* By default, assume that everything has equivalent cost to the
5784 cheapest instruction. Any additional costs are applied as a delta
5785 above this default. */
5786 *cost = COSTS_N_INSNS (1);
5788 switch (code)
5790 case SET:
5791 /* The cost depends entirely on the operands to SET. */
5792 *cost = 0;
5793 op0 = SET_DEST (x);
5794 op1 = SET_SRC (x);
5796 switch (GET_CODE (op0))
5798 case MEM:
5799 if (speed)
5801 rtx address = XEXP (op0, 0);
5802 if (VECTOR_MODE_P (mode))
5803 *cost += extra_cost->ldst.storev;
5804 else if (GET_MODE_CLASS (mode) == MODE_INT)
5805 *cost += extra_cost->ldst.store;
5806 else if (mode == SFmode)
5807 *cost += extra_cost->ldst.storef;
5808 else if (mode == DFmode)
5809 *cost += extra_cost->ldst.stored;
5811 *cost +=
5812 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5813 0, speed));
5816 *cost += rtx_cost (op1, mode, SET, 1, speed);
5817 return true;
5819 case SUBREG:
5820 if (! REG_P (SUBREG_REG (op0)))
5821 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
5823 /* Fall through. */
5824 case REG:
5825 /* The cost is one per vector-register copied. */
5826 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5828 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5829 / GET_MODE_SIZE (V4SImode);
5830 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5832 /* const0_rtx is in general free, but we will use an
5833 instruction to set a register to 0. */
5834 else if (REG_P (op1) || op1 == const0_rtx)
5836 /* The cost is 1 per register copied. */
5837 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5838 / UNITS_PER_WORD;
5839 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5841 else
5842 /* Cost is just the cost of the RHS of the set. */
5843 *cost += rtx_cost (op1, mode, SET, 1, speed);
5844 return true;
5846 case ZERO_EXTRACT:
5847 case SIGN_EXTRACT:
5848 /* Bit-field insertion. Strip any redundant widening of
5849 the RHS to meet the width of the target. */
5850 if (GET_CODE (op1) == SUBREG)
5851 op1 = SUBREG_REG (op1);
5852 if ((GET_CODE (op1) == ZERO_EXTEND
5853 || GET_CODE (op1) == SIGN_EXTEND)
5854 && CONST_INT_P (XEXP (op0, 1))
5855 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5856 >= INTVAL (XEXP (op0, 1))))
5857 op1 = XEXP (op1, 0);
5859 if (CONST_INT_P (op1))
5861 /* MOV immediate is assumed to always be cheap. */
5862 *cost = COSTS_N_INSNS (1);
5864 else
5866 /* BFM. */
5867 if (speed)
5868 *cost += extra_cost->alu.bfi;
5869 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
5872 return true;
5874 default:
5875 /* We can't make sense of this, assume default cost. */
5876 *cost = COSTS_N_INSNS (1);
5877 return false;
5879 return false;
5881 case CONST_INT:
5882 /* If an instruction can incorporate a constant within the
5883 instruction, the instruction's expression avoids calling
5884 rtx_cost() on the constant. If rtx_cost() is called on a
5885 constant, then it is usually because the constant must be
5886 moved into a register by one or more instructions.
5888 The exception is constant 0, which can be expressed
5889 as XZR/WZR and is therefore free. The caveat is that if we
5890 have (set (reg) (const0_rtx)) we must still cost the move;
5891 however, we catch that when we cost the SET, so we don't need
5892 to consider it here. */
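/* For instance, a constant such as 0x123456789 is typically built with
   one MOVZ and two MOVKs, so aarch64_internal_mov_immediate reports 3
   and the constant is costed as three instructions.  */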
5893 if (x == const0_rtx)
5894 *cost = 0;
5895 else
5897 /* To an approximation, the cost of building any other constant
5898 is proportional to the number of instructions required to
5899 build that constant. This is true whether we are compiling
5900 for SPEED or otherwise. */
5901 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5902 (NULL_RTX, x, false, mode));
5904 return true;
5906 case CONST_DOUBLE:
5907 if (speed)
5909 /* mov[df,sf]_aarch64. */
5910 if (aarch64_float_const_representable_p (x))
5911 /* FMOV (scalar immediate). */
5912 *cost += extra_cost->fp[mode == DFmode].fpconst;
5913 else if (!aarch64_float_const_zero_rtx_p (x))
5915 /* This will be a load from memory. */
5916 if (mode == DFmode)
5917 *cost += extra_cost->ldst.loadd;
5918 else
5919 *cost += extra_cost->ldst.loadf;
5921 else
5922 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5923 or MOV v0.s[0], wzr - neither of which are modeled by the
5924 cost tables. Just use the default cost. */
5929 return true;
5931 case MEM:
5932 if (speed)
5934 /* For loads we want the base cost of a load, plus an
5935 approximation for the additional cost of the addressing
5936 mode. */
5937 rtx address = XEXP (x, 0);
5938 if (VECTOR_MODE_P (mode))
5939 *cost += extra_cost->ldst.loadv;
5940 else if (GET_MODE_CLASS (mode) == MODE_INT)
5941 *cost += extra_cost->ldst.load;
5942 else if (mode == SFmode)
5943 *cost += extra_cost->ldst.loadf;
5944 else if (mode == DFmode)
5945 *cost += extra_cost->ldst.loadd;
5947 *cost +=
5948 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5949 0, speed));
5952 return true;
5954 case NEG:
5955 op0 = XEXP (x, 0);
5957 if (VECTOR_MODE_P (mode))
5959 if (speed)
5961 /* FNEG. */
5962 *cost += extra_cost->vect.alu;
5964 return false;
5967 if (GET_MODE_CLASS (mode) == MODE_INT)
5969 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5970 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5972 /* CSETM. */
5973 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
5974 return true;
5977 /* Cost this as SUB wzr, X. */
5978 op0 = CONST0_RTX (mode);
5979 op1 = XEXP (x, 0);
5980 goto cost_minus;
5983 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5985 /* Support (neg(fma...)) as a single instruction only if
5986 sign of zeros is unimportant. This matches the decision
5987 making in aarch64.md. */
5988 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5990 /* FNMADD. */
5991 *cost = rtx_cost (op0, mode, NEG, 0, speed);
5992 return true;
5994 if (speed)
5995 /* FNEG. */
5996 *cost += extra_cost->fp[mode == DFmode].neg;
5997 return false;
6000 return false;
6002 case CLRSB:
6003 case CLZ:
6004 if (speed)
6006 if (VECTOR_MODE_P (mode))
6007 *cost += extra_cost->vect.alu;
6008 else
6009 *cost += extra_cost->alu.clz;
6012 return false;
6014 case COMPARE:
6015 op0 = XEXP (x, 0);
6016 op1 = XEXP (x, 1);
6018 if (op1 == const0_rtx
6019 && GET_CODE (op0) == AND)
6021 x = op0;
6022 mode = GET_MODE (op0);
6023 goto cost_logic;
6026 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6028 /* TODO: A write to the CC flags possibly costs extra; this
6029 needs encoding in the cost tables. */
6031 /* CC_ZESWPmode supports zero extend for free. */
6032 if (mode == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
6033 op0 = XEXP (op0, 0);
6035 mode = GET_MODE (op0);
6036 /* ANDS. */
6037 if (GET_CODE (op0) == AND)
6039 x = op0;
6040 goto cost_logic;
6043 if (GET_CODE (op0) == PLUS)
6045 /* ADDS (and CMN alias). */
6046 x = op0;
6047 goto cost_plus;
6050 if (GET_CODE (op0) == MINUS)
6052 /* SUBS. */
6053 x = op0;
6054 goto cost_minus;
6057 if (GET_CODE (op1) == NEG)
6059 /* CMN. */
6060 if (speed)
6061 *cost += extra_cost->alu.arith;
6063 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6064 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6065 return true;
6068 /* CMP.
6070 Compare can freely swap the order of operands, and
6071 canonicalization puts the more complex operation first.
6072 But the integer MINUS logic expects the shift/extend
6073 operation in op1. */
6074 if (! (REG_P (op0)
6075 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6077 op0 = XEXP (x, 1);
6078 op1 = XEXP (x, 0);
6080 goto cost_minus;
6083 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6085 /* FCMP. */
6086 if (speed)
6087 *cost += extra_cost->fp[mode == DFmode].compare;
6089 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6091 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6092 /* FCMP supports constant 0.0 for no extra cost. */
6093 return true;
6095 return false;
6098 if (VECTOR_MODE_P (mode))
6100 /* Vector compare. */
6101 if (speed)
6102 *cost += extra_cost->vect.alu;
6104 if (aarch64_float_const_zero_rtx_p (op1))
6106 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6107 cost. */
6108 return true;
6110 return false;
6112 return false;
6114 case MINUS:
6116 op0 = XEXP (x, 0);
6117 op1 = XEXP (x, 1);
6119 cost_minus:
6120 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6122 /* Detect valid immediates. */
6123 if ((GET_MODE_CLASS (mode) == MODE_INT
6124 || (GET_MODE_CLASS (mode) == MODE_CC
6125 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6126 && CONST_INT_P (op1)
6127 && aarch64_uimm12_shift (INTVAL (op1)))
6129 if (speed)
6130 /* SUB(S) (immediate). */
6131 *cost += extra_cost->alu.arith;
6132 return true;
6135 /* Look for SUB (extended register). */
6136 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6138 if (speed)
6139 *cost += extra_cost->alu.extend_arith;
6141 op1 = aarch64_strip_extend (op1);
6142 *cost += rtx_cost (op1, VOIDmode,
6143 (enum rtx_code) GET_CODE (op1), 0, speed);
6144 return true;
6147 rtx new_op1 = aarch64_strip_extend (op1);
6149 /* Cost this as an FMA-alike operation. */
6150 if ((GET_CODE (new_op1) == MULT
6151 || aarch64_shift_p (GET_CODE (new_op1)))
6152 && code != COMPARE)
6154 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6155 (enum rtx_code) code,
6156 speed);
6157 return true;
6160 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6162 if (speed)
6164 if (VECTOR_MODE_P (mode))
6166 /* Vector SUB. */
6167 *cost += extra_cost->vect.alu;
6169 else if (GET_MODE_CLASS (mode) == MODE_INT)
6171 /* SUB(S). */
6172 *cost += extra_cost->alu.arith;
6174 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6176 /* FSUB. */
6177 *cost += extra_cost->fp[mode == DFmode].addsub;
6180 return true;
6183 case PLUS:
6185 rtx new_op0;
6187 op0 = XEXP (x, 0);
6188 op1 = XEXP (x, 1);
6190 cost_plus:
6191 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6192 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6194 /* CSINC. */
6195 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6196 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6197 return true;
6200 if (GET_MODE_CLASS (mode) == MODE_INT
6201 && CONST_INT_P (op1)
6202 && aarch64_uimm12_shift (INTVAL (op1)))
6204 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6206 if (speed)
6207 /* ADD (immediate). */
6208 *cost += extra_cost->alu.arith;
6209 return true;
6212 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6214 /* Look for ADD (extended register). */
6215 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6217 if (speed)
6218 *cost += extra_cost->alu.extend_arith;
6220 op0 = aarch64_strip_extend (op0);
6221 *cost += rtx_cost (op0, VOIDmode,
6222 (enum rtx_code) GET_CODE (op0), 0, speed);
6223 return true;
6226 /* Strip any extend, leave shifts behind as we will
6227 cost them through mult_cost. */
6228 new_op0 = aarch64_strip_extend (op0);
6230 if (GET_CODE (new_op0) == MULT
6231 || aarch64_shift_p (GET_CODE (new_op0)))
6233 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6234 speed);
6235 return true;
6238 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6240 if (speed)
6242 if (VECTOR_MODE_P (mode))
6244 /* Vector ADD. */
6245 *cost += extra_cost->vect.alu;
6247 else if (GET_MODE_CLASS (mode) == MODE_INT)
6249 /* ADD. */
6250 *cost += extra_cost->alu.arith;
6252 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6254 /* FADD. */
6255 *cost += extra_cost->fp[mode == DFmode].addsub;
6258 return true;
6261 case BSWAP:
6262 *cost = COSTS_N_INSNS (1);
6264 if (speed)
6266 if (VECTOR_MODE_P (mode))
6267 *cost += extra_cost->vect.alu;
6268 else
6269 *cost += extra_cost->alu.rev;
6271 return false;
6273 case IOR:
6274 if (aarch_rev16_p (x))
6276 *cost = COSTS_N_INSNS (1);
6278 if (speed)
6280 if (VECTOR_MODE_P (mode))
6281 *cost += extra_cost->vect.alu;
6282 else
6283 *cost += extra_cost->alu.rev;
6285 return true;
6288 if (aarch64_extr_rtx_p (x, &op0, &op1))
6290 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6291 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6292 if (speed)
6293 *cost += extra_cost->alu.shift;
6295 return true;
6297 /* Fall through. */
6298 case XOR:
6299 case AND:
6300 cost_logic:
6301 op0 = XEXP (x, 0);
6302 op1 = XEXP (x, 1);
6304 if (VECTOR_MODE_P (mode))
6306 if (speed)
6307 *cost += extra_cost->vect.alu;
6308 return true;
6311 if (code == AND
6312 && GET_CODE (op0) == MULT
6313 && CONST_INT_P (XEXP (op0, 1))
6314 && CONST_INT_P (op1)
6315 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6316 INTVAL (op1)) != 0)
6318 /* This is a UBFM/SBFM. */
6319 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6320 if (speed)
6321 *cost += extra_cost->alu.bfx;
6322 return true;
6325 if (GET_MODE_CLASS (mode) == MODE_INT)
6327 /* We possibly get the immediate for free; this is not
6328 modelled. */
6329 if (CONST_INT_P (op1)
6330 && aarch64_bitmask_imm (INTVAL (op1), mode))
6332 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6334 if (speed)
6335 *cost += extra_cost->alu.logical;
6337 return true;
6339 else
6341 rtx new_op0 = op0;
6343 /* Handle ORN, EON, or BIC. */
6344 if (GET_CODE (op0) == NOT)
6345 op0 = XEXP (op0, 0);
6347 new_op0 = aarch64_strip_shift (op0);
6349 /* If we had a shift on op0 then this is a logical-shift-
6350 by-register/immediate operation. Otherwise, this is just
6351 a logical operation. */
6352 if (speed)
6354 if (new_op0 != op0)
6356 /* Shift by immediate. */
6357 if (CONST_INT_P (XEXP (op0, 1)))
6358 *cost += extra_cost->alu.log_shift;
6359 else
6360 *cost += extra_cost->alu.log_shift_reg;
6362 else
6363 *cost += extra_cost->alu.logical;
6366 /* In both cases we want to cost both operands. */
6367 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6368 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6370 return true;
6373 return false;
6375 case NOT:
6376 x = XEXP (x, 0);
6377 op0 = aarch64_strip_shift (x);
6379 if (VECTOR_MODE_P (mode))
6381 /* Vector NOT. */
6382 *cost += extra_cost->vect.alu;
6383 return false;
6386 /* MVN-shifted-reg. */
6387 if (op0 != x)
6389 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6391 if (speed)
6392 *cost += extra_cost->alu.log_shift;
6394 return true;
6396 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6397 Handle the second form here taking care that 'a' in the above can
6398 be a shift. */
6399 else if (GET_CODE (op0) == XOR)
6401 rtx newop0 = XEXP (op0, 0);
6402 rtx newop1 = XEXP (op0, 1);
6403 rtx op0_stripped = aarch64_strip_shift (newop0);
6405 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6406 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6408 if (speed)
6410 if (op0_stripped != newop0)
6411 *cost += extra_cost->alu.log_shift;
6412 else
6413 *cost += extra_cost->alu.logical;
6416 return true;
6418 /* MVN. */
6419 if (speed)
6420 *cost += extra_cost->alu.logical;
6422 return false;
6424 case ZERO_EXTEND:
6426 op0 = XEXP (x, 0);
6427 /* If a value is written in SI mode, then zero extended to DI
6428 mode, the operation will in general be free as a write to
6429 a 'w' register implicitly zeroes the upper bits of an 'x'
6430 register. However, if this is
6432 (set (reg) (zero_extend (reg)))
6434 we must cost the explicit register move. */
6435 if (mode == DImode
6436 && GET_MODE (op0) == SImode
6437 && outer == SET)
6439 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6441 if (!op_cost && speed)
6442 /* MOV. */
6443 *cost += extra_cost->alu.extend;
6444 else
6445 /* Free, the cost is that of the SI mode operation. */
6446 *cost = op_cost;
6448 return true;
6450 else if (MEM_P (op0))
6452 /* All loads can zero extend to any size for free. */
6453 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6454 return true;
6457 if (speed)
6459 if (VECTOR_MODE_P (mode))
6461 /* UMOV. */
6462 *cost += extra_cost->vect.alu;
6464 else
6466 /* UXTB/UXTH. */
6467 *cost += extra_cost->alu.extend;
6470 return false;
6472 case SIGN_EXTEND:
6473 if (MEM_P (XEXP (x, 0)))
6475 /* LDRSH. */
6476 if (speed)
6478 rtx address = XEXP (XEXP (x, 0), 0);
6479 *cost += extra_cost->ldst.load_sign_extend;
6481 *cost +=
6482 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6483 0, speed));
6485 return true;
6488 if (speed)
6490 if (VECTOR_MODE_P (mode))
6491 *cost += extra_cost->vect.alu;
6492 else
6493 *cost += extra_cost->alu.extend;
6495 return false;
6497 case ASHIFT:
6498 op0 = XEXP (x, 0);
6499 op1 = XEXP (x, 1);
6501 if (CONST_INT_P (op1))
6503 if (speed)
6505 if (VECTOR_MODE_P (mode))
6507 /* Vector shift (immediate). */
6508 *cost += extra_cost->vect.alu;
6510 else
6512 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6513 aliases. */
6514 *cost += extra_cost->alu.shift;
6518 /* We can incorporate zero/sign extend for free. */
6519 if (GET_CODE (op0) == ZERO_EXTEND
6520 || GET_CODE (op0) == SIGN_EXTEND)
6521 op0 = XEXP (op0, 0);
6523 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
6524 return true;
6526 else
6528 if (speed)
6530 if (VECTOR_MODE_P (mode))
6532 /* Vector shift (register). */
6533 *cost += extra_cost->vect.alu;
6535 else
6537 /* LSLV. */
6538 *cost += extra_cost->alu.shift_reg;
6541 return false; /* All arguments need to be in registers. */
6544 case ROTATE:
6545 case ROTATERT:
6546 case LSHIFTRT:
6547 case ASHIFTRT:
6548 op0 = XEXP (x, 0);
6549 op1 = XEXP (x, 1);
6551 if (CONST_INT_P (op1))
6553 /* ASR (immediate) and friends. */
6554 if (speed)
6556 if (VECTOR_MODE_P (mode))
6557 *cost += extra_cost->vect.alu;
6558 else
6559 *cost += extra_cost->alu.shift;
6562 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6563 return true;
6565 else
6568 /* ASR (register) and friends. */
6569 if (speed)
6571 if (VECTOR_MODE_P (mode))
6572 *cost += extra_cost->vect.alu;
6573 else
6574 *cost += extra_cost->alu.shift_reg;
6576 return false; /* All arguments need to be in registers. */
6579 case SYMBOL_REF:
6581 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6582 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6584 /* LDR. */
6585 if (speed)
6586 *cost += extra_cost->ldst.load;
6588 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6589 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6591 /* ADRP, followed by ADD. */
6592 *cost += COSTS_N_INSNS (1);
6593 if (speed)
6594 *cost += 2 * extra_cost->alu.arith;
6596 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6597 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6599 /* ADR. */
6600 if (speed)
6601 *cost += extra_cost->alu.arith;
6604 if (flag_pic)
6606 /* One extra load instruction, after accessing the GOT. */
6607 *cost += COSTS_N_INSNS (1);
6608 if (speed)
6609 *cost += extra_cost->ldst.load;
6611 return true;
6613 case HIGH:
6614 case LO_SUM:
6615 /* ADRP/ADD (immediate). */
6616 if (speed)
6617 *cost += extra_cost->alu.arith;
6618 return true;
6620 case ZERO_EXTRACT:
6621 case SIGN_EXTRACT:
6622 /* UBFX/SBFX. */
6623 if (speed)
6625 if (VECTOR_MODE_P (mode))
6626 *cost += extra_cost->vect.alu;
6627 else
6628 *cost += extra_cost->alu.bfx;
6631 /* We can trust that the immediates used will be correct (there
6632 are no by-register forms), so we need only cost op0. */
6633 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
6634 return true;
6636 case MULT:
6637 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6638 /* aarch64_rtx_mult_cost always handles recursion to its
6639 operands. */
6640 return true;
6642 case MOD:
6643 case UMOD:
6644 if (speed)
6646 if (VECTOR_MODE_P (mode))
6647 *cost += extra_cost->vect.alu;
6648 else if (GET_MODE_CLASS (mode) == MODE_INT)
6649 *cost += (extra_cost->mult[mode == DImode].add
6650 + extra_cost->mult[mode == DImode].idiv);
6651 else if (mode == DFmode)
6652 *cost += (extra_cost->fp[1].mult
6653 + extra_cost->fp[1].div);
6654 else if (mode == SFmode)
6655 *cost += (extra_cost->fp[0].mult
6656 + extra_cost->fp[0].div);
6658 return false; /* All arguments need to be in registers. */
6660 case DIV:
6661 case UDIV:
6662 case SQRT:
6663 if (speed)
6665 if (VECTOR_MODE_P (mode))
6666 *cost += extra_cost->vect.alu;
6667 else if (GET_MODE_CLASS (mode) == MODE_INT)
6668 /* There is no integer SQRT, so only DIV and UDIV can get
6669 here. */
6670 *cost += extra_cost->mult[mode == DImode].idiv;
6671 else
6672 *cost += extra_cost->fp[mode == DFmode].div;
6674 return false; /* All arguments need to be in registers. */
6676 case IF_THEN_ELSE:
6677 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6678 XEXP (x, 2), cost, speed);
6680 case EQ:
6681 case NE:
6682 case GT:
6683 case GTU:
6684 case LT:
6685 case LTU:
6686 case GE:
6687 case GEU:
6688 case LE:
6689 case LEU:
6691 return false; /* All arguments must be in registers. */
6693 case FMA:
6694 op0 = XEXP (x, 0);
6695 op1 = XEXP (x, 1);
6696 op2 = XEXP (x, 2);
6698 if (speed)
6700 if (VECTOR_MODE_P (mode))
6701 *cost += extra_cost->vect.alu;
6702 else
6703 *cost += extra_cost->fp[mode == DFmode].fma;
6706 /* FMSUB, FNMADD, and FNMSUB are free. */
6707 if (GET_CODE (op0) == NEG)
6708 op0 = XEXP (op0, 0);
6710 if (GET_CODE (op2) == NEG)
6711 op2 = XEXP (op2, 0);
6713 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6714 and the by-element operand as operand 0. */
6715 if (GET_CODE (op1) == NEG)
6716 op1 = XEXP (op1, 0);
6718 /* Catch vector-by-element operations. The by-element operand can
6719 either be (vec_duplicate (vec_select (x))) or just
6720 (vec_select (x)), depending on whether we are multiplying by
6721 a vector or a scalar.
6723 Canonicalization is not very good in these cases: FMA4 will put the
6724 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
6725 if (GET_CODE (op0) == VEC_DUPLICATE)
6726 op0 = XEXP (op0, 0);
6727 else if (GET_CODE (op1) == VEC_DUPLICATE)
6728 op1 = XEXP (op1, 0);
6730 if (GET_CODE (op0) == VEC_SELECT)
6731 op0 = XEXP (op0, 0);
6732 else if (GET_CODE (op1) == VEC_SELECT)
6733 op1 = XEXP (op1, 0);
6735 /* If the remaining parameters are not registers,
6736 get the cost to put them into registers. */
6737 *cost += rtx_cost (op0, mode, FMA, 0, speed);
6738 *cost += rtx_cost (op1, mode, FMA, 1, speed);
6739 *cost += rtx_cost (op2, mode, FMA, 2, speed);
6740 return true;
6742 case FLOAT:
6743 case UNSIGNED_FLOAT:
6744 if (speed)
6745 *cost += extra_cost->fp[mode == DFmode].fromint;
6746 return false;
6748 case FLOAT_EXTEND:
6749 if (speed)
6751 if (VECTOR_MODE_P (mode))
6753 /* Vector conversion. */
6754 *cost += extra_cost->vect.alu;
6756 else
6757 *cost += extra_cost->fp[mode == DFmode].widen;
6759 return false;
6761 case FLOAT_TRUNCATE:
6762 if (speed)
6764 if (VECTOR_MODE_P (mode))
6766 /* Vector conversion. */
6767 *cost += extra_cost->vect.alu;
6769 else
6770 *cost += extra_cost->fp[mode == DFmode].narrow;
6772 return false;
6774 case FIX:
6775 case UNSIGNED_FIX:
6776 x = XEXP (x, 0);
6777 /* Strip the rounding part. They will all be implemented
6778 by the fcvt* family of instructions anyway. */
6779 if (GET_CODE (x) == UNSPEC)
6781 unsigned int uns_code = XINT (x, 1);
6783 if (uns_code == UNSPEC_FRINTA
6784 || uns_code == UNSPEC_FRINTM
6785 || uns_code == UNSPEC_FRINTN
6786 || uns_code == UNSPEC_FRINTP
6787 || uns_code == UNSPEC_FRINTZ)
6788 x = XVECEXP (x, 0, 0);
6791 if (speed)
6793 if (VECTOR_MODE_P (mode))
6794 *cost += extra_cost->vect.alu;
6795 else
6796 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6798 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
6799 return true;
6801 case ABS:
6802 if (VECTOR_MODE_P (mode))
6804 /* ABS (vector). */
6805 if (speed)
6806 *cost += extra_cost->vect.alu;
6808 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6810 op0 = XEXP (x, 0);
6812 /* FABD, which is analogous to FADD. */
6813 if (GET_CODE (op0) == MINUS)
6815 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
6816 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
6817 if (speed)
6818 *cost += extra_cost->fp[mode == DFmode].addsub;
6820 return true;
6822 /* Simple FABS is analogous to FNEG. */
6823 if (speed)
6824 *cost += extra_cost->fp[mode == DFmode].neg;
6826 else
6828 /* Integer ABS will either be split into
6829 two arithmetic instructions, or will be an ABS
6830 (scalar), which we don't model. */
6831 *cost = COSTS_N_INSNS (2);
6832 if (speed)
6833 *cost += 2 * extra_cost->alu.arith;
6835 return false;
6837 case SMAX:
6838 case SMIN:
6839 if (speed)
6841 if (VECTOR_MODE_P (mode))
6842 *cost += extra_cost->vect.alu;
6843 else
6845 /* FMAXNM/FMINNM/FMAX/FMIN.
6846 TODO: This may not be accurate for all implementations, but
6847 we do not model this in the cost tables. */
6848 *cost += extra_cost->fp[mode == DFmode].addsub;
6851 return false;
6853 case UNSPEC:
6854 /* The floating point round to integer frint* instructions. */
6855 if (aarch64_frint_unspec_p (XINT (x, 1)))
6857 if (speed)
6858 *cost += extra_cost->fp[mode == DFmode].roundint;
6860 return false;
6863 if (XINT (x, 1) == UNSPEC_RBIT)
6865 if (speed)
6866 *cost += extra_cost->alu.rev;
6868 return false;
6870 break;
6872 case TRUNCATE:
6874 /* Decompose <su>muldi3_highpart. */
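/* That is (illustrative reconstruction of the checks below), we match
   (truncate:DI (lshiftrt:TI (mult:TI (ANY_EXTEND:TI (reg:DI))
                                      (ANY_EXTEND:TI (reg:DI)))
                             (const_int 64)))
   with both extends of the same signedness.  */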
6875 if (/* (truncate:DI */
6876 mode == DImode
6877 /* (lshiftrt:TI */
6878 && GET_MODE (XEXP (x, 0)) == TImode
6879 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6880 /* (mult:TI */
6881 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6882 /* (ANY_EXTEND:TI (reg:DI))
6883 (ANY_EXTEND:TI (reg:DI))) */
6884 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6885 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6886 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6887 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6888 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6889 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6890 /* (const_int 64) */
6891 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6892 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6894 /* UMULH/SMULH. */
6895 if (speed)
6896 *cost += extra_cost->mult[mode == DImode].extend;
6897 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6898 mode, MULT, 0, speed);
6899 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6900 mode, MULT, 1, speed);
6901 return true;
6904 /* Fall through. */
6905 default:
6906 break;
6909 if (dump_file && (dump_flags & TDF_DETAILS))
6910 fprintf (dump_file,
6911 "\nFailed to cost RTX. Assuming default cost.\n");
6913 return true;
6916 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
6917 calculated for X. This cost is stored in *COST. Returns true
6918 if the total cost of X was calculated. */
6919 static bool
6920 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
6921 int param, int *cost, bool speed)
6923 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
6925 if (dump_file && (dump_flags & TDF_DETAILS))
6927 print_rtl_single (dump_file, x);
6928 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6929 speed ? "Hot" : "Cold",
6930 *cost, result ? "final" : "partial");
6933 return result;
6936 static int
6937 aarch64_register_move_cost (machine_mode mode,
6938 reg_class_t from_i, reg_class_t to_i)
6940 enum reg_class from = (enum reg_class) from_i;
6941 enum reg_class to = (enum reg_class) to_i;
6942 const struct cpu_regmove_cost *regmove_cost
6943 = aarch64_tune_params.regmove_cost;
6945 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6946 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6947 to = GENERAL_REGS;
6949 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6950 from = GENERAL_REGS;
6952 /* Moving between a GPR and the stack register costs the same as GP2GP. */
6953 if ((from == GENERAL_REGS && to == STACK_REG)
6954 || (to == GENERAL_REGS && from == STACK_REG))
6955 return regmove_cost->GP2GP;
6957 /* To/From the stack register, we move via the gprs. */
6958 if (to == STACK_REG || from == STACK_REG)
6959 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
6960 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
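/* For example (illustrative, non-128-bit modes): an FP_REGS -> STACK_REG
   move decomposes into FP_REGS -> GENERAL_REGS plus GENERAL_REGS -> STACK_REG,
   i.e. FP2GP + GP2GP.  */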
6962 if (GET_MODE_SIZE (mode) == 16)
6964 /* 128-bit operations on general registers require 2 instructions. */
6965 if (from == GENERAL_REGS && to == GENERAL_REGS)
6966 return regmove_cost->GP2GP * 2;
6967 else if (from == GENERAL_REGS)
6968 return regmove_cost->GP2FP * 2;
6969 else if (to == GENERAL_REGS)
6970 return regmove_cost->FP2GP * 2;
6972 /* When AdvSIMD instructions are disabled it is not possible to move
6973 a 128-bit value directly between Q registers. This is handled in
6974 secondary reload. A general register is used as a scratch to move
6975 the upper DI value and the lower DI value is moved directly,
6976 hence the cost is the sum of three moves. */
6977 if (! TARGET_SIMD)
6978 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
6980 return regmove_cost->FP2FP;
6983 if (from == GENERAL_REGS && to == GENERAL_REGS)
6984 return regmove_cost->GP2GP;
6985 else if (from == GENERAL_REGS)
6986 return regmove_cost->GP2FP;
6987 else if (to == GENERAL_REGS)
6988 return regmove_cost->FP2GP;
6990 return regmove_cost->FP2FP;
6993 static int
6994 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
6995 reg_class_t rclass ATTRIBUTE_UNUSED,
6996 bool in ATTRIBUTE_UNUSED)
6998 return aarch64_tune_params.memmov_cost;
7001 /* Return the number of instructions that can be issued per cycle. */
7002 static int
7003 aarch64_sched_issue_rate (void)
7005 return aarch64_tune_params.issue_rate;
7008 static int
7009 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7011 int issue_rate = aarch64_sched_issue_rate ();
7013 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7016 /* Vectorizer cost model target hooks. */
7018 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7019 static int
7020 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7021 tree vectype,
7022 int misalign ATTRIBUTE_UNUSED)
7024 unsigned elements;
7026 switch (type_of_cost)
7028 case scalar_stmt:
7029 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7031 case scalar_load:
7032 return aarch64_tune_params.vec_costs->scalar_load_cost;
7034 case scalar_store:
7035 return aarch64_tune_params.vec_costs->scalar_store_cost;
7037 case vector_stmt:
7038 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7040 case vector_load:
7041 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7043 case vector_store:
7044 return aarch64_tune_params.vec_costs->vec_store_cost;
7046 case vec_to_scalar:
7047 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7049 case scalar_to_vec:
7050 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7052 case unaligned_load:
7053 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7055 case unaligned_store:
7056 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7058 case cond_branch_taken:
7059 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7061 case cond_branch_not_taken:
7062 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7064 case vec_perm:
7065 case vec_promote_demote:
7066 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7068 case vec_construct:
7069 elements = TYPE_VECTOR_SUBPARTS (vectype);
7070 return elements / 2 + 1;
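/* E.g. (illustrative) constructing a four-element vector is costed as
   4 / 2 + 1 == 3 by the vec_construct formula above.  */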
7072 default:
7073 gcc_unreachable ();
7077 /* Implement targetm.vectorize.add_stmt_cost. */
7078 static unsigned
7079 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7080 struct _stmt_vec_info *stmt_info, int misalign,
7081 enum vect_cost_model_location where)
7083 unsigned *cost = (unsigned *) data;
7084 unsigned retval = 0;
7086 if (flag_vect_cost_model)
7088 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7089 int stmt_cost =
7090 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7092 /* Statements in an inner loop relative to the loop being
7093 vectorized are weighted more heavily. The value here is
7094 a function (linear for now) of the loop nest level. */
7095 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7097 loop_vec_info loop_info = STMT_VINFO_LOOP_VINFO (stmt_info);
7098 struct loop *loop = LOOP_VINFO_LOOP (loop_info);
7099 unsigned nest_level = loop_depth (loop);
7101 count *= nest_level;
7104 retval = (unsigned) (count * stmt_cost);
7105 cost[where] += retval;
7108 return retval;
7111 static void initialize_aarch64_code_model (struct gcc_options *);
7113 /* Enum describing the various ways that the
7114 aarch64_parse_{arch,tune,cpu,extension} functions can fail.
7115 This way their callers can choose what kind of error to give. */
7117 enum aarch64_parse_opt_result
7119 AARCH64_PARSE_OK, /* Parsing was successful. */
7120 AARCH64_PARSE_MISSING_ARG, /* Missing argument. */
7121 AARCH64_PARSE_INVALID_FEATURE, /* Invalid feature modifier. */
7122 AARCH64_PARSE_INVALID_ARG /* Invalid arch, tune, cpu arg. */
7125 /* Parse the architecture extension string STR and update ISA_FLAGS
7126 with the architecture features turned on or off. Return a
7127 aarch64_parse_opt_result describing the result. */
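/* For example (illustrative): parsing "+crc+nofp" first turns on the CRC
   feature flags and then turns off FP together with every feature the
   extension table records as depending on it.  */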
7129 static enum aarch64_parse_opt_result
7130 aarch64_parse_extension (char *str, unsigned long *isa_flags)
7132 /* The extension string is parsed left to right. */
7133 const struct aarch64_option_extension *opt = NULL;
7135 /* Flag to say whether we are adding or removing an extension. */
7136 int adding_ext = -1;
7138 while (str != NULL && *str != 0)
7140 char *ext;
7141 size_t len;
7143 str++;
7144 ext = strchr (str, '+');
7146 if (ext != NULL)
7147 len = ext - str;
7148 else
7149 len = strlen (str);
7151 if (len >= 2 && strncmp (str, "no", 2) == 0)
7153 adding_ext = 0;
7154 len -= 2;
7155 str += 2;
7157 else if (len > 0)
7158 adding_ext = 1;
7160 if (len == 0)
7161 return AARCH64_PARSE_MISSING_ARG;
7164 /* Scan over the extensions table trying to find an exact match. */
7165 for (opt = all_extensions; opt->name != NULL; opt++)
7167 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
7169 /* Add or remove the extension. */
7170 if (adding_ext)
7171 *isa_flags |= opt->flags_on;
7172 else
7173 *isa_flags &= ~(opt->flags_off);
7174 break;
7178 if (opt->name == NULL)
7180 /* Extension not found in list. */
7181 return AARCH64_PARSE_INVALID_FEATURE;
7184 str = ext;
7187 return AARCH64_PARSE_OK;
7190 /* Parse the TO_PARSE string and put the architecture struct that it
7191 selects into RES and the architectural features into ISA_FLAGS.
7192 Return an aarch64_parse_opt_result describing the parse result.
7193 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
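/* E.g. (illustrative) TO_PARSE == "armv8-a+crc" selects the armv8-a entry
   in all_architectures and hands the "+crc" suffix to
   aarch64_parse_extension.  */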
7195 static enum aarch64_parse_opt_result
7196 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7197 unsigned long *isa_flags)
7199 char *ext;
7200 const struct processor *arch;
7201 char *str = (char *) alloca (strlen (to_parse) + 1);
7202 size_t len;
7204 strcpy (str, to_parse);
7206 ext = strchr (str, '+');
7208 if (ext != NULL)
7209 len = ext - str;
7210 else
7211 len = strlen (str);
7213 if (len == 0)
7214 return AARCH64_PARSE_MISSING_ARG;
7217 /* Loop through the list of supported ARCHes to find a match. */
7218 for (arch = all_architectures; arch->name != NULL; arch++)
7220 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7222 unsigned long isa_temp = arch->flags;
7224 if (ext != NULL)
7226 /* TO_PARSE string contains at least one extension. */
7227 enum aarch64_parse_opt_result ext_res
7228 = aarch64_parse_extension (ext, &isa_temp);
7230 if (ext_res != AARCH64_PARSE_OK)
7231 return ext_res;
7233 /* Extension parsing was successful. Confirm the result
7234 arch and ISA flags. */
7235 *res = arch;
7236 *isa_flags = isa_temp;
7237 return AARCH64_PARSE_OK;
7241 /* ARCH name not found in list. */
7242 return AARCH64_PARSE_INVALID_ARG;
7245 /* Parse the TO_PARSE string and put the result tuning in RES and the
7246 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
7247 describing the parse result. If there is an error parsing, RES and
7248 ISA_FLAGS are left unchanged. */
7250 static enum aarch64_parse_opt_result
7251 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7252 unsigned long *isa_flags)
7254 char *ext;
7255 const struct processor *cpu;
7256 char *str = (char *) alloca (strlen (to_parse) + 1);
7257 size_t len;
7259 strcpy (str, to_parse);
7261 ext = strchr (str, '+');
7263 if (ext != NULL)
7264 len = ext - str;
7265 else
7266 len = strlen (str);
7268 if (len == 0)
7269 return AARCH64_PARSE_MISSING_ARG;
7272 /* Loop through the list of supported CPUs to find a match. */
7273 for (cpu = all_cores; cpu->name != NULL; cpu++)
7275 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7277 unsigned long isa_temp = cpu->flags;
7280 if (ext != NULL)
7282 /* TO_PARSE string contains at least one extension. */
7283 enum aarch64_parse_opt_result ext_res
7284 = aarch64_parse_extension (ext, &isa_temp);
7286 if (ext_res != AARCH64_PARSE_OK)
7287 return ext_res;
7289 /* Extension parsing was successful. Confirm the result
7290 cpu and ISA flags. */
7291 *res = cpu;
7292 *isa_flags = isa_temp;
7293 return AARCH64_PARSE_OK;
7297 /* CPU name not found in list. */
7298 return AARCH64_PARSE_INVALID_ARG;
7301 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7302 Return an aarch64_parse_opt_result describing the parse result.
7303 If the parsing fails the RES does not change. */
7305 static enum aarch64_parse_opt_result
7306 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7308 const struct processor *cpu;
7309 char *str = (char *) alloca (strlen (to_parse) + 1);
7311 strcpy (str, to_parse);
7313 /* Loop through the list of supported CPUs to find a match. */
7314 for (cpu = all_cores; cpu->name != NULL; cpu++)
7316 if (strcmp (cpu->name, str) == 0)
7318 *res = cpu;
7319 return AARCH64_PARSE_OK;
7323 /* CPU name not found in list. */
7324 return AARCH64_PARSE_INVALID_ARG;
7327 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7328 described in FLAG. If it is, return the index bit for that fusion type.
7329 If not, report an error (printing OPTION_NAME) and return zero. */
7331 static unsigned int
7332 aarch64_parse_one_option_token (const char *token,
7333 size_t length,
7334 const struct aarch64_flag_desc *flag,
7335 const char *option_name)
7337 for (; flag->name != NULL; flag++)
7339 if (length == strlen (flag->name)
7340 && !strncmp (flag->name, token, length))
7341 return flag->flag;
7344 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7345 return 0;
7348 /* Parse OPTION which is a comma-separated list of flags to enable.
7349 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7350 default state we inherit from the CPU tuning structures. OPTION_NAME
7351 gives the top-level option we are parsing in the -moverride string,
7352 for use in error messages. */
7354 static unsigned int
7355 aarch64_parse_boolean_options (const char *option,
7356 const struct aarch64_flag_desc *flags,
7357 unsigned int initial_state,
7358 const char *option_name)
7360 const char separator = '.';
7361 const char* specs = option;
7362 const char* ntoken = option;
7363 unsigned int found_flags = initial_state;
7365 while ((ntoken = strchr (specs, separator)))
7367 size_t token_length = ntoken - specs;
7368 unsigned token_ops = aarch64_parse_one_option_token (specs,
7369 token_length,
7370 flags,
7371 option_name);
7372 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7373 in the token stream, reset the supported operations. So:
7375 adrp+add.cmp+branch.none.adrp+add
7377 would have the result of turning on only adrp+add fusion. */
7378 if (!token_ops)
7379 found_flags = 0;
7381 found_flags |= token_ops;
7382 specs = ++ntoken;
7385 /* The string ended with a trailing separator; report an error. */
7386 if (!(*specs))
7388 error ("%s string ill-formed\n", option_name);
7389 return 0;
7392 /* We still have one more token to parse. */
7393 size_t token_length = strlen (specs);
7394 unsigned token_ops = aarch64_parse_one_option_token (specs,
7395 token_length,
7396 flags,
7397 option_name);
7398 if (!token_ops)
7399 found_flags = 0;
7401 found_flags |= token_ops;
7402 return found_flags;
7405 /* Support for overriding instruction fusion. */
7407 static void
7408 aarch64_parse_fuse_string (const char *fuse_string,
7409 struct tune_params *tune)
7411 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7412 aarch64_fusible_pairs,
7413 tune->fusible_ops,
7414 "fuse=");
7417 /* Support for overriding other tuning flags. */
7419 static void
7420 aarch64_parse_tune_string (const char *tune_string,
7421 struct tune_params *tune)
7423 tune->extra_tuning_flags
7424 = aarch64_parse_boolean_options (tune_string,
7425 aarch64_tuning_flags,
7426 tune->extra_tuning_flags,
7427 "tune=");
7430 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
7431 we understand. If it is, extract the option string and hand it off to
7432 the appropriate function. */
7434 void
7435 aarch64_parse_one_override_token (const char* token,
7436 size_t length,
7437 struct tune_params *tune)
7439 const struct aarch64_tuning_override_function *fn
7440 = aarch64_tuning_override_functions;
7442 const char *option_part = strchr (token, '=');
7443 if (!option_part)
7445 error ("tuning string missing in option (%s)", token);
7446 return;
7449 /* Get the length of the option name. */
7450 length = option_part - token;
7451 /* Skip the '=' to get to the option string. */
7452 option_part++;
7454 for (; fn->name != NULL; fn++)
7456 if (!strncmp (fn->name, token, length))
7458 fn->parse_override (option_part, tune);
7459 return;
7463 error ("unknown tuning option (%s)",token);
7464 return;
7467 /* Parse STRING looking for options in the format:
7468 string :: option:string
7469 option :: name=substring
7470 name :: {a-z}
7471 substring :: defined by option. */
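/* An illustrative -moverride string such as
   "fuse=adrp+add.cmp+branch:tune=..." is split on ':' into option tokens,
   each of which is then split on '=' and dispatched to the matching entry
   in aarch64_tuning_override_functions.  */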
7473 static void
7474 aarch64_parse_override_string (const char* input_string,
7475 struct tune_params* tune)
7477 const char separator = ':';
7478 size_t string_length = strlen (input_string) + 1;
7479 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
7480 char *string = string_root;
7481 strncpy (string, input_string, string_length);
7482 string[string_length - 1] = '\0';
7484 char* ntoken = string;
7486 while ((ntoken = strchr (string, separator)))
7488 size_t token_length = ntoken - string;
7489 /* Make this substring look like a string. */
7490 *ntoken = '\0';
7491 aarch64_parse_one_override_token (string, token_length, tune);
7492 string = ++ntoken;
7495 /* One last option to parse. */
7496 aarch64_parse_one_override_token (string, strlen (string), tune);
7497 free (string_root);
7501 static void
7502 aarch64_override_options_after_change_1 (struct gcc_options *opts)
7504 if (opts->x_flag_omit_frame_pointer)
7505 opts->x_flag_omit_leaf_frame_pointer = false;
7506 else if (opts->x_flag_omit_leaf_frame_pointer)
7507 opts->x_flag_omit_frame_pointer = true;
7509 /* If not optimizing for size, set the default
7510 alignment to what the target wants. */
7511 if (!opts->x_optimize_size)
7513 if (opts->x_align_loops <= 0)
7514 opts->x_align_loops = aarch64_tune_params.loop_align;
7515 if (opts->x_align_jumps <= 0)
7516 opts->x_align_jumps = aarch64_tune_params.jump_align;
7517 if (opts->x_align_functions <= 0)
7518 opts->x_align_functions = aarch64_tune_params.function_align;
7522 /* 'Unpack' the internal tuning structs and update the options
7523 in OPTS. The caller must have set up selected_tune and selected_arch
7524 as all the other target-specific codegen decisions are
7525 derived from them. */
7527 void
7528 aarch64_override_options_internal (struct gcc_options *opts)
7530 aarch64_tune_flags = selected_tune->flags;
7531 aarch64_tune = selected_tune->sched_core;
7532 /* Make a copy of the tuning parameters attached to the core, which
7533 we may later overwrite. */
7534 aarch64_tune_params = *(selected_tune->tune);
7535 aarch64_architecture_version = selected_arch->architecture_version;
7537 if (opts->x_aarch64_override_tune_string)
7538 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
7539 &aarch64_tune_params);
7541 /* This target defaults to strict volatile bitfields. */
7542 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7543 opts->x_flag_strict_volatile_bitfields = 1;
7545 /* -mgeneral-regs-only sets a mask in target_flags, make sure that
7546 aarch64_isa_flags does not contain the FP/SIMD/Crypto feature flags
7547 in case some code tries reading aarch64_isa_flags directly to check if
7548 FP is available. Reuse the aarch64_parse_extension machinery since it
7549 knows how to disable any other flags that fp implies. */
7550 if (TARGET_GENERAL_REGS_ONLY_P (opts->x_target_flags))
7552 /* aarch64_parse_extension takes char* rather than const char* because
7553 it is usually called from within other parsing functions. */
7554 char tmp_str[] = "+nofp";
7555 aarch64_parse_extension (tmp_str, &opts->x_aarch64_isa_flags);
7558 initialize_aarch64_code_model (opts);
7560 aarch64_override_options_after_change_1 (opts);
7563 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
7564 specified in STR and throw errors if appropriate. Put the results, if
7565 they are valid, in RES and ISA_FLAGS. Return whether the option is
7566 valid. */
7568 static bool
7569 aarch64_validate_mcpu (const char *str, const struct processor **res,
7570 unsigned long *isa_flags)
7572 enum aarch64_parse_opt_result parse_res
7573 = aarch64_parse_cpu (str, res, isa_flags);
7575 if (parse_res == AARCH64_PARSE_OK)
7576 return true;
7578 switch (parse_res)
7580 case AARCH64_PARSE_MISSING_ARG:
7581 error ("missing cpu name in -mcpu=%qs", str);
7582 break;
7583 case AARCH64_PARSE_INVALID_ARG:
7584 error ("unknown value %qs for -mcpu", str);
7585 break;
7586 case AARCH64_PARSE_INVALID_FEATURE:
7587 error ("invalid feature modifier in -mcpu=%qs", str);
7588 break;
7589 default:
7590 gcc_unreachable ();
7593 return false;
7596 /* Validate a command-line -march option. Parse the arch and extensions
7597 (if any) specified in STR and throw errors if appropriate. Put the
7598 results, if they are valid, in RES and ISA_FLAGS. Return whether the
7599 option is valid. */
7601 static bool
7602 aarch64_validate_march (const char *str, const struct processor **res,
7603 unsigned long *isa_flags)
7605 enum aarch64_parse_opt_result parse_res
7606 = aarch64_parse_arch (str, res, isa_flags);
7608 if (parse_res == AARCH64_PARSE_OK)
7609 return true;
7611 switch (parse_res)
7613 case AARCH64_PARSE_MISSING_ARG:
7614 error ("missing arch name in -march=%qs", str);
7615 break;
7616 case AARCH64_PARSE_INVALID_ARG:
7617 error ("unknown value %qs for -march", str);
7618 break;
7619 case AARCH64_PARSE_INVALID_FEATURE:
7620 error ("invalid feature modifier in -march=%qs", str);
7621 break;
7622 default:
7623 gcc_unreachable ();
7626 return false;
7629 /* Validate a command-line -mtune option. Parse the cpu
7630 specified in STR and throw errors if appropriate. Put the
7631 result, if it is valid, in RES. Return whether the option is
7632 valid. */
7634 static bool
7635 aarch64_validate_mtune (const char *str, const struct processor **res)
7637 enum aarch64_parse_opt_result parse_res
7638 = aarch64_parse_tune (str, res);
7640 if (parse_res == AARCH64_PARSE_OK)
7641 return true;
7643 switch (parse_res)
7645 case AARCH64_PARSE_MISSING_ARG:
7646 error ("missing cpu name in -mtune=%qs", str);
7647 break;
7648 case AARCH64_PARSE_INVALID_ARG:
7649 error ("unknown value %qs for -mtune", str);
7650 break;
7651 default:
7652 gcc_unreachable ();
7654 return false;
7657 /* Return the CPU corresponding to the enum CPU.
7658 If it doesn't specify a cpu, return the default. */
7660 static const struct processor *
7661 aarch64_get_tune_cpu (enum aarch64_processor cpu)
7663 if (cpu != aarch64_none)
7664 return &all_cores[cpu];
7666 /* The & 0x3f is to extract the bottom 6 bits that encode the
7667 default cpu as selected by the --with-cpu GCC configure option
7668 in config.gcc.
7669 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
7670 flags mechanism should be reworked to make it more sane. */
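/* The packing assumed here and in aarch64_override_options below is that
   the low 6 bits of TARGET_CPU_DEFAULT select the default core while the
   remaining bits (extracted there with >> 6) hold its default ISA flags.  */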
7671 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7674 /* Return the architecture corresponding to the enum ARCH.
7675 If it doesn't specify a valid architecture, return the default. */
7677 static const struct processor *
7678 aarch64_get_arch (enum aarch64_arch arch)
7680 if (arch != aarch64_no_arch)
7681 return &all_architectures[arch];
7683 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7685 return &all_architectures[cpu->arch];
7688 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
7689 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
7690 tuning structs. In particular it must set selected_tune and
7691 aarch64_isa_flags that define the available ISA features and tuning
7692 decisions. It must also set selected_arch as this will be used to
7693 output the .arch asm tags for each function. */
7695 static void
7696 aarch64_override_options (void)
7698 unsigned long cpu_isa = 0;
7699 unsigned long arch_isa = 0;
7700 aarch64_isa_flags = 0;
7702 bool valid_cpu = true;
7703 bool valid_tune = true;
7704 bool valid_arch = true;
7706 selected_cpu = NULL;
7707 selected_arch = NULL;
7708 selected_tune = NULL;
7710 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7711 If either of -march or -mtune is given, they override their
7712 respective component of -mcpu. */
7713 if (aarch64_cpu_string)
7714 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
7715 &cpu_isa);
7717 if (aarch64_arch_string)
7718 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
7719 &arch_isa);
7721 if (aarch64_tune_string)
7722 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
7724 /* If the user did not specify a processor, choose the default
7725 one for them. This will be the CPU set during configuration using
7726 --with-cpu, otherwise it is "generic". */
7727 if (!selected_cpu)
7729 if (selected_arch)
7731 selected_cpu = &all_cores[selected_arch->ident];
7732 aarch64_isa_flags = arch_isa;
7733 explicit_arch = selected_arch->arch;
7735 else
7737 /* Get default configure-time CPU. */
7738 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
7739 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7742 if (selected_tune)
7743 explicit_tune_core = selected_tune->ident;
7745 /* If both -mcpu and -march are specified check that they are architecturally
7746 compatible, warn if they're not and prefer the -march ISA flags. */
7747 else if (selected_arch)
7749 if (selected_arch->arch != selected_cpu->arch)
7751 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7752 all_architectures[selected_cpu->arch].name,
7753 selected_arch->name);
7755 aarch64_isa_flags = arch_isa;
7756 explicit_arch = selected_arch->arch;
7757 explicit_tune_core = selected_tune ? selected_tune->ident
7758 : selected_cpu->ident;
7760 else
7762 /* -mcpu but no -march. */
7763 aarch64_isa_flags = cpu_isa;
7764 explicit_tune_core = selected_tune ? selected_tune->ident
7765 : selected_cpu->ident;
7766 gcc_assert (selected_cpu);
7767 selected_arch = &all_architectures[selected_cpu->arch];
7768 explicit_arch = selected_arch->arch;
7771 /* Set the arch as well, as we will need it when outputting
7772 the .arch directive in assembly. */
7773 if (!selected_arch)
7775 gcc_assert (selected_cpu);
7776 selected_arch = &all_architectures[selected_cpu->arch];
7779 if (!selected_tune)
7780 selected_tune = selected_cpu;
7782 #ifndef HAVE_AS_MABI_OPTION
7783 /* The compiler may have been configured with 2.23.* binutils, which does
7784 not have support for ILP32. */
7785 if (TARGET_ILP32)
7786 error ("Assembler does not support -mabi=ilp32");
7787 #endif
7789 /* Make sure we properly set up the explicit options. */
7790 if ((aarch64_cpu_string && valid_cpu)
7791 || (aarch64_tune_string && valid_tune))
7792 gcc_assert (explicit_tune_core != aarch64_none);
7794 if ((aarch64_cpu_string && valid_cpu)
7795 || (aarch64_arch_string && valid_arch))
7796 gcc_assert (explicit_arch != aarch64_no_arch);
7798 aarch64_build_bitmask_table ();
7800 aarch64_override_options_internal (&global_options);
7802 /* Save these options as the default ones in case we push and pop them later
7803 while processing functions with potential target attributes. */
7804 target_option_default_node = target_option_current_node
7805 = build_target_option_node (&global_options);
7807 aarch64_register_fma_steering ();
7811 /* Implement targetm.override_options_after_change. */
7813 static void
7814 aarch64_override_options_after_change (void)
7816 aarch64_override_options_after_change_1 (&global_options);
7819 static struct machine_function *
7820 aarch64_init_machine_status (void)
7822 struct machine_function *machine;
7823 machine = ggc_cleared_alloc<machine_function> ();
7824 return machine;
7827 void
7828 aarch64_init_expanders (void)
7830 init_machine_status = aarch64_init_machine_status;
7833 /* Select the code model to use, taking the PIC-related options in OPTS into account. */
7834 static void
7835 initialize_aarch64_code_model (struct gcc_options *opts)
7837 if (opts->x_flag_pic)
7839 switch (opts->x_aarch64_cmodel_var)
7841 case AARCH64_CMODEL_TINY:
7842 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
7843 break;
7844 case AARCH64_CMODEL_SMALL:
7845 #ifdef HAVE_AS_SMALL_PIC_RELOCS
7846 aarch64_cmodel = (flag_pic == 2
7847 ? AARCH64_CMODEL_SMALL_PIC
7848 : AARCH64_CMODEL_SMALL_SPIC);
7849 #else
7850 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
7851 #endif
7852 break;
7853 case AARCH64_CMODEL_LARGE:
7854 sorry ("code model %qs with -f%s", "large",
7855 opts->x_flag_pic > 1 ? "PIC" : "pic");
break;
7856 default:
7857 gcc_unreachable ();
7860 else
7861 aarch64_cmodel = opts->x_aarch64_cmodel_var;
7864 /* Print to F the architecture features specified by ISA_FLAGS. */
7866 static void
7867 aarch64_print_extension (FILE *f, unsigned long isa_flags)
7869 const struct aarch64_option_extension *opt = NULL;
7871 for (opt = all_extensions; opt->name != NULL; opt++)
7872 if ((isa_flags & opt->flags_on) == opt->flags_on)
7873 asm_fprintf (f, "+%s", opt->name);
7875 asm_fprintf (f, "\n");
7878 /* Implement TARGET_OPTION_SAVE. */
7880 static void
7881 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
7883 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
7886 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
7887 using the information saved in PTR. */
7889 static void
7890 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
7892 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
7893 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
7894 opts->x_explicit_arch = ptr->x_explicit_arch;
7895 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
7896 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
7898 aarch64_override_options_internal (opts);
7901 /* Implement TARGET_OPTION_PRINT. */
7903 static void
7904 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
7906 const struct processor *cpu
7907 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
7908 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
7909 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
7911 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
7912 fprintf (file, "%*sselected arch = %s", indent, "", arch->name);
7913 aarch64_print_extension (file, isa_flags);
7916 static GTY(()) tree aarch64_previous_fndecl;
7918 void
7919 aarch64_reset_previous_fndecl (void)
7921 aarch64_previous_fndecl = NULL;
7924 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
7925 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
7926 of the function, if such exists. This function may be called multiple
7927 times on a single function so use aarch64_previous_fndecl to avoid
7928 setting up identical state. */
7930 static void
7931 aarch64_set_current_function (tree fndecl)
7933 tree old_tree = (aarch64_previous_fndecl
7934 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
7935 : NULL_TREE);
7937 tree new_tree = (fndecl
7938 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
7939 : NULL_TREE);
7942 if (fndecl && fndecl != aarch64_previous_fndecl)
7944 aarch64_previous_fndecl = fndecl;
7945 if (old_tree == new_tree)
7948 else if (new_tree && new_tree != target_option_default_node)
7950 cl_target_option_restore (&global_options,
7951 TREE_TARGET_OPTION (new_tree));
7952 if (TREE_TARGET_GLOBALS (new_tree))
7953 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7954 else
7955 TREE_TARGET_GLOBALS (new_tree)
7956 = save_target_globals_default_opts ();
7959 else if (old_tree && old_tree != target_option_default_node)
7961 new_tree = target_option_current_node;
7962 cl_target_option_restore (&global_options,
7963 TREE_TARGET_OPTION (new_tree));
7964 if (TREE_TARGET_GLOBALS (new_tree))
7965 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
7966 else if (new_tree == target_option_default_node)
7967 restore_target_globals (&default_target_globals);
7968 else
7969 TREE_TARGET_GLOBALS (new_tree)
7970 = save_target_globals_default_opts ();
7975 /* Enum describing the various ways we can handle attributes.
7976 In many cases we can reuse the generic option handling machinery. */
7978 enum aarch64_attr_opt_type
7980 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
7981 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
7982 aarch64_attr_enum, /* Attribute sets an enum variable. */
7983 aarch64_attr_custom /* Attribute requires a custom handling function. */
7986 /* All the information needed to handle a target attribute.
7987 NAME is the name of the attribute.
7988 ATTR_TYPE specifies the type of behaviour of the attribute as described
7989 in the definition of enum aarch64_attr_opt_type.
7990 ALLOW_NEG is true if the attribute supports a "no-" form.
7991 HANDLER is the function that takes the attribute string and whether
7992 it is a pragma or attribute and handles the option. It is needed only
7993 when the ATTR_TYPE is aarch64_attr_custom.
7994 OPT_NUM is the enum specifying the option that the attribute modifies.
7995 This is needed for attributes that mirror the behaviour of a command-line
7996 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
7997 aarch64_attr_enum. */
7999 struct aarch64_attribute_info
8001 const char *name;
8002 enum aarch64_attr_opt_type attr_type;
8003 bool allow_neg;
8004 bool (*handler) (const char *, const char *);
8005 enum opt_code opt_num;
8008 /* Handle the STR argument to the arch= target attribute.
8009 PRAGMA_OR_ATTR is used in potential error messages. */
8011 static bool
8012 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8014 const struct processor *tmp_arch = NULL;
8015 enum aarch64_parse_opt_result parse_res
8016 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8018 if (parse_res == AARCH64_PARSE_OK)
8020 gcc_assert (tmp_arch);
8021 selected_arch = tmp_arch;
8022 explicit_arch = selected_arch->arch;
8023 return true;
8026 switch (parse_res)
8028 case AARCH64_PARSE_MISSING_ARG:
8029 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8030 break;
8031 case AARCH64_PARSE_INVALID_ARG:
8032 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8033 break;
8034 case AARCH64_PARSE_INVALID_FEATURE:
8035 error ("invalid feature modifier %qs for 'arch' target %s",
8036 str, pragma_or_attr);
8037 break;
8038 default:
8039 gcc_unreachable ();
8042 return false;
8045 /* Handle the argument STR to the cpu= target attribute.
8046 PRAGMA_OR_ATTR is used in potential error messages. */
8048 static bool
8049 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8051 const struct processor *tmp_cpu = NULL;
8052 enum aarch64_parse_opt_result parse_res
8053 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8055 if (parse_res == AARCH64_PARSE_OK)
8057 gcc_assert (tmp_cpu);
8058 selected_tune = tmp_cpu;
8059 explicit_tune_core = selected_tune->ident;
8061 selected_arch = &all_architectures[tmp_cpu->arch];
8062 explicit_arch = selected_arch->arch;
8063 return true;
8066 switch (parse_res)
8068 case AARCH64_PARSE_MISSING_ARG:
8069 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8070 break;
8071 case AARCH64_PARSE_INVALID_ARG:
8072 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8073 break;
8074 case AARCH64_PARSE_INVALID_FEATURE:
8075 error ("invalid feature modifier %qs for 'cpu' target %s",
8076 str, pragma_or_attr);
8077 break;
8078 default:
8079 gcc_unreachable ();
8082 return false;
8085 /* Handle the argument STR to the tune= target attribute.
8086 PRAGMA_OR_ATTR is used in potential error messages. */
8088 static bool
8089 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8091 const struct processor *tmp_tune = NULL;
8092 enum aarch64_parse_opt_result parse_res
8093 = aarch64_parse_tune (str, &tmp_tune);
8095 if (parse_res == AARCH64_PARSE_OK)
8097 gcc_assert (tmp_tune);
8098 selected_tune = tmp_tune;
8099 explicit_tune_core = selected_tune->ident;
8100 return true;
8103 switch (parse_res)
8105 case AARCH64_PARSE_INVALID_ARG:
8106 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8107 break;
8108 default:
8109 gcc_unreachable ();
8112 return false;
8115 /* Parse an architecture extensions target attribute string specified in STR.
8116 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8117 if successful. Update aarch64_isa_flags to reflect the ISA features
8118 modified.
8119 PRAGMA_OR_ATTR is used in potential error messages. */
8121 static bool
8122 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8124 enum aarch64_parse_opt_result parse_res;
8125 unsigned long isa_flags = aarch64_isa_flags;
8127 /* We allow "+nothing" in the beginning to clear out all architectural
8128 features if the user wants to handpick specific features. */
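/* E.g. (illustrative) "+nothing+fp" discards the current ISA flags and
   then enables only what "+fp" turns on.  */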
8129 if (strncmp ("+nothing", str, 8) == 0)
8131 isa_flags = 0;
8132 str += 8;
8135 parse_res = aarch64_parse_extension (str, &isa_flags);
8137 if (parse_res == AARCH64_PARSE_OK)
8139 aarch64_isa_flags = isa_flags;
8140 return true;
8143 switch (parse_res)
8145 case AARCH64_PARSE_MISSING_ARG:
8146 error ("missing feature modifier in target %s %qs",
8147 pragma_or_attr, str);
8148 break;
8150 case AARCH64_PARSE_INVALID_FEATURE:
8151 error ("invalid feature modifier in target %s %qs",
8152 pragma_or_attr, str);
8153 break;
8155 default:
8156 gcc_unreachable ();
8159 return false;
8162 /* The target attributes that we support. On top of these we also support just
8163 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8164 handled explicitly in aarch64_process_one_target_attr. */
8166 static const struct aarch64_attribute_info aarch64_attributes[] =
8168 { "general-regs-only", aarch64_attr_mask, false, NULL,
8169 OPT_mgeneral_regs_only },
8170 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8171 OPT_mfix_cortex_a53_835769 },
8172 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8173 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8174 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8175 OPT_momit_leaf_frame_pointer },
8176 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8177 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8178 OPT_march_ },
8179 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8180 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8181 OPT_mtune_ },
8182 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
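/* Illustrative forms these entries accept, for example:
     __attribute__ ((target ("arch=armv8-a+crc")))
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))
     __attribute__ ((target ("cpu=cortex-a57,tune=cortex-a53")))  */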
8185 /* Parse ARG_STR which contains the definition of one target attribute.
8186 Show appropriate errors if any or return true if the attribute is valid.
8187 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8188 we're processing a target attribute or pragma. */
8190 static bool
8191 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8193 bool invert = false;
8195 size_t len = strlen (arg_str);
8197 if (len == 0)
8199 error ("malformed target %s", pragma_or_attr);
8200 return false;
8203 char *str_to_check = (char *) alloca (len + 1);
8204 strcpy (str_to_check, arg_str);
8206 /* Skip leading whitespace. */
8207 while (*str_to_check == ' ' || *str_to_check == '\t')
8208 str_to_check++;
8210 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8211 It is easier to detect and handle it explicitly here rather than going
8212 through the machinery for the rest of the target attributes in this
8213 function. */
8214 if (*str_to_check == '+')
8215 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8217 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8219 invert = true;
8220 str_to_check += 3;
8222 char *arg = strchr (str_to_check, '=');
8224 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8225 and point ARG to "foo". */
8226 if (arg)
8228 *arg = '\0';
8229 arg++;
8231 const struct aarch64_attribute_info *p_attr;
8232 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8234 /* If the names don't match up, or the user has given an argument
8235 to an attribute that doesn't accept one, or didn't give an argument
8236 to an attribute that expects one, fail to match. */
8237 if (strcmp (str_to_check, p_attr->name) != 0)
8238 continue;
8240 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8241 || p_attr->attr_type == aarch64_attr_enum;
8243 if (attr_need_arg_p ^ (arg != NULL))
8245 error ("target %s %qs does not accept an argument",
8246 pragma_or_attr, str_to_check);
8247 return false;
8250 /* If the name matches but the attribute does not allow "no-" versions
8251 then we can't match. */
8252 if (invert && !p_attr->allow_neg)
8254 error ("target %s %qs does not allow a negated form",
8255 pragma_or_attr, str_to_check);
8256 return false;
8259 switch (p_attr->attr_type)
8261 /* Has a custom handler registered.
8262 For example, cpu=, arch=, tune=. */
8263 case aarch64_attr_custom:
8264 gcc_assert (p_attr->handler);
8265 if (!p_attr->handler (arg, pragma_or_attr))
8266 return false;
8267 break;
8269 /* Either set or unset a boolean option. */
8270 case aarch64_attr_bool:
8272 struct cl_decoded_option decoded;
8274 generate_option (p_attr->opt_num, NULL, !invert,
8275 CL_TARGET, &decoded);
8276 aarch64_handle_option (&global_options, &global_options_set,
8277 &decoded, input_location);
8278 break;
8280 /* Set or unset a bit in the target_flags. aarch64_handle_option
8281 should know what mask to apply given the option number. */
8282 case aarch64_attr_mask:
8284 struct cl_decoded_option decoded;
8285 /* We only need to specify the option number.
8286 aarch64_handle_option will know which mask to apply. */
8287 decoded.opt_index = p_attr->opt_num;
8288 decoded.value = !invert;
8289 aarch64_handle_option (&global_options, &global_options_set,
8290 &decoded, input_location);
8291 break;
8293 /* Use the option setting machinery to set an option to an enum. */
8294 case aarch64_attr_enum:
8296 gcc_assert (arg);
8297 bool valid;
8298 int value;
8299 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8300 &value, CL_TARGET);
8301 if (valid)
8303 set_option (&global_options, NULL, p_attr->opt_num, value,
8304 NULL, DK_UNSPECIFIED, input_location,
8305 global_dc);
8307 else
8309 error ("target %s %s=%s is not valid",
8310 pragma_or_attr, str_to_check, arg);
8312 break;
8314 default:
8315 gcc_unreachable ();
8319 return true;
8322 /* Count how many times the character C appears in
8323 NULL-terminated string STR. */
8325 static unsigned int
8326 num_occurences_in_str (char c, char *str)
8328 unsigned int res = 0;
8329 while (*str != '\0')
8331 if (*str == c)
8332 res++;
8334 str++;
8337 return res;
8340 /* Parse the tree in ARGS that contains the target attribute information
8341 and update the global target options space. PRAGMA_OR_ATTR is a string
8342 to be used in error messages, specifying whether this is processing
8343 a target attribute or a target pragma. */
8345 bool
8346 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
8348 if (TREE_CODE (args) == TREE_LIST)
8352 tree head = TREE_VALUE (args);
8353 if (head)
8355 if (!aarch64_process_target_attr (head, pragma_or_attr))
8356 return false;
8358 args = TREE_CHAIN (args);
8359 } while (args);
8361 return true;
8363 /* We expect to find a string to parse. */
8364 gcc_assert (TREE_CODE (args) == STRING_CST);
8366 size_t len = strlen (TREE_STRING_POINTER (args));
8367 char *str_to_check = (char *) alloca (len + 1);
8368 strcpy (str_to_check, TREE_STRING_POINTER (args));
8370 if (len == 0)
8372 error ("malformed target %s value", pragma_or_attr);
8373 return false;
8376 /* Used to catch empty entries between commas, e.g.
8377 attribute ((target ("attr1,,attr2"))). */
8378 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
8380 /* Handle multiple target attributes separated by ','. */
8381 char *token = strtok (str_to_check, ",");
8383 unsigned int num_attrs = 0;
8384 while (token)
8386 num_attrs++;
8387 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
8389 error ("target %s %qs is invalid", pragma_or_attr, token);
8390 return false;
8393 token = strtok (NULL, ",");
8396 if (num_attrs != num_commas + 1)
8398 error ("malformed target %s list %qs",
8399 pragma_or_attr, TREE_STRING_POINTER (args));
8400 return false;
8403 return true;
8406 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
8407 process attribute ((target ("..."))). */
8409 static bool
8410 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
8412 struct cl_target_option cur_target;
8413 bool ret;
8414 tree old_optimize;
8415 tree new_target, new_optimize;
8416 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8418 /* If what we're processing is the current pragma string then the
8419 target option node is already stored in target_option_current_node
8420 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
8421 having to re-parse the string. This is especially useful to keep
8422 arm_neon.h compile times down since that header contains a lot
8423 of intrinsics enclosed in pragmas. */
8424 if (!existing_target && args == current_target_pragma)
8426 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
8427 return true;
8429 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
8431 old_optimize = build_optimization_node (&global_options);
8432 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
8434 /* If the function changed the optimization levels as well as setting
8435 target options, start with the optimizations specified. */
8436 if (func_optimize && func_optimize != old_optimize)
8437 cl_optimization_restore (&global_options,
8438 TREE_OPTIMIZATION (func_optimize));
8440 /* Save the current target options to restore at the end. */
8441 cl_target_option_save (&cur_target, &global_options);
8443 /* If fndecl already has some target attributes applied to it, unpack
8444 them so that we add this attribute on top of them, rather than
8445 overwriting them. */
8446 if (existing_target)
8448 struct cl_target_option *existing_options
8449 = TREE_TARGET_OPTION (existing_target);
8451 if (existing_options)
8452 cl_target_option_restore (&global_options, existing_options);
8454 else
8455 cl_target_option_restore (&global_options,
8456 TREE_TARGET_OPTION (target_option_current_node));
8459 ret = aarch64_process_target_attr (args, "attribute");
8461 /* Set up any additional state. */
8462 if (ret)
8464 aarch64_override_options_internal (&global_options);
8465 /* Initialize SIMD builtins if we haven't already.
8466 Set current_target_pragma to NULL for the duration so that
8467 the builtin initialization code doesn't try to tag the functions
8468 being built with the attributes specified by any current pragma, thus
8469 going into an infinite recursion. */
8470 if (TARGET_SIMD)
8472 tree saved_current_target_pragma = current_target_pragma;
8473 current_target_pragma = NULL;
8474 aarch64_init_simd_builtins ();
8475 current_target_pragma = saved_current_target_pragma;
8477 new_target = build_target_option_node (&global_options);
8479 else
8480 new_target = NULL;
8482 new_optimize = build_optimization_node (&global_options);
8484 if (fndecl && ret)
8486 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
8488 if (old_optimize != new_optimize)
8489 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
8492 cl_target_option_restore (&global_options, &cur_target);
8494 if (old_optimize != new_optimize)
8495 cl_optimization_restore (&global_options,
8496 TREE_OPTIMIZATION (old_optimize));
8497 return ret;
8500 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
8501 tri-bool options (yes, no, don't care) and the default value is
8502 DEF, determine whether to reject inlining. */
8504 static bool
8505 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
8506 int dont_care, int def)
8508 /* If the callee doesn't care, always allow inlining. */
8509 if (callee == dont_care)
8510 return true;
8512 /* If the caller doesn't care, always allow inlining. */
8513 if (caller == dont_care)
8514 return true;
8516 /* Otherwise, allow inlining if either the callee and caller values
8517 agree, or if the callee is using the default value. */
8518 return (callee == caller || callee == def);
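   /* Spelled out, the decision above is (with D the "don't care" value):

        callee == D                  -> allow inlining
        caller == D                  -> allow inlining
        callee == caller             -> allow inlining
        callee == DEF, != caller     -> allow inlining
        anything else                -> reject inlining  */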
8521 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
8522 to inline CALLEE into CALLER based on target-specific info.
8523 Make sure that the caller and callee have compatible architectural
8524 features. Then go through the other possible target attributes
8525 and see if they can block inlining. Try not to reject always_inline
8526 callees unless they are incompatible architecturally. */
8528 static bool
8529 aarch64_can_inline_p (tree caller, tree callee)
8531 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
8532 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
8534 /* If callee has no option attributes, then it is ok to inline. */
8535 if (!callee_tree)
8536 return true;
8538 struct cl_target_option *caller_opts
8539 = TREE_TARGET_OPTION (caller_tree ? caller_tree
8540 : target_option_default_node);
8542 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
8545 /* Callee's ISA flags should be a subset of the caller's. */
8546 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
8547 != callee_opts->x_aarch64_isa_flags)
8548 return false;
 8550   /* Allow non-strict aligned functions to be inlined into strict
 8551      aligned ones.  */
8552 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
8553 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
8554 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
8555 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
8556 return false;
8558 bool always_inline = lookup_attribute ("always_inline",
8559 DECL_ATTRIBUTES (callee));
8561 /* If the architectural features match up and the callee is always_inline
8562 then the other attributes don't matter. */
8563 if (always_inline)
8564 return true;
8566 if (caller_opts->x_aarch64_cmodel_var
8567 != callee_opts->x_aarch64_cmodel_var)
8568 return false;
8570 if (caller_opts->x_aarch64_tls_dialect
8571 != callee_opts->x_aarch64_tls_dialect)
8572 return false;
8574 /* Honour explicit requests to workaround errata. */
8575 if (!aarch64_tribools_ok_for_inlining_p (
8576 caller_opts->x_aarch64_fix_a53_err835769,
8577 callee_opts->x_aarch64_fix_a53_err835769,
8578 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
8579 return false;
8581 /* If the user explicitly specified -momit-leaf-frame-pointer for the
 8582      caller and callee and they don't match up, reject inlining.  */
8583 if (!aarch64_tribools_ok_for_inlining_p (
8584 caller_opts->x_flag_omit_leaf_frame_pointer,
8585 callee_opts->x_flag_omit_leaf_frame_pointer,
8586 2, 1))
8587 return false;
8589 /* If the callee has specific tuning overrides, respect them. */
8590 if (callee_opts->x_aarch64_override_tune_string != NULL
8591 && caller_opts->x_aarch64_override_tune_string == NULL)
8592 return false;
8594 /* If the user specified tuning override strings for the
8595 caller and callee and they don't match up, reject inlining.
8596 We just do a string compare here, we don't analyze the meaning
8597 of the string, as it would be too costly for little gain. */
8598 if (callee_opts->x_aarch64_override_tune_string
8599 && caller_opts->x_aarch64_override_tune_string
8600 && (strcmp (callee_opts->x_aarch64_override_tune_string,
8601 caller_opts->x_aarch64_override_tune_string) != 0))
8602 return false;
8604 return true;
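   /* A hypothetical example of the ISA-subset check above:

        __attribute__ ((target ("+crypto")))
        int callee (void) { return 0; }

        int caller (void) { return callee (); }   built without +crypto

      The callee's ISA flags are not a subset of the caller's, so inlining
      is rejected, even if the callee is marked always_inline.  A callee
      built with no extra ISA features passes the subset test and is then
      checked against the remaining (cmodel, TLS, tuning, ...) options
      above.  */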
8607 /* Return true if SYMBOL_REF X binds locally. */
8609 static bool
8610 aarch64_symbol_binds_local_p (const_rtx x)
8612 return (SYMBOL_REF_DECL (x)
8613 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
8614 : SYMBOL_REF_LOCAL_P (x));
 8617 /* Return true if SYMBOL_REF X is thread local.  */
8618 static bool
8619 aarch64_tls_symbol_p (rtx x)
8621 if (! TARGET_HAVE_TLS)
8622 return false;
8624 if (GET_CODE (x) != SYMBOL_REF)
8625 return false;
8627 return SYMBOL_REF_TLS_MODEL (x) != 0;
8630 /* Classify a TLS symbol into one of the TLS kinds. */
8631 enum aarch64_symbol_type
8632 aarch64_classify_tls_symbol (rtx x)
8634 enum tls_model tls_kind = tls_symbolic_operand_type (x);
8636 switch (tls_kind)
8638 case TLS_MODEL_GLOBAL_DYNAMIC:
8639 case TLS_MODEL_LOCAL_DYNAMIC:
8640 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
8642 case TLS_MODEL_INITIAL_EXEC:
8643 return SYMBOL_SMALL_GOTTPREL;
8645 case TLS_MODEL_LOCAL_EXEC:
8646 return SYMBOL_TLSLE;
8648 case TLS_MODEL_EMULATED:
8649 case TLS_MODEL_NONE:
8650 return SYMBOL_FORCE_TO_MEM;
8652 default:
8653 gcc_unreachable ();
8657 /* Return the method that should be used to access SYMBOL_REF or
8658 LABEL_REF X in context CONTEXT. */
8660 enum aarch64_symbol_type
8661 aarch64_classify_symbol (rtx x, rtx offset,
8662 enum aarch64_symbol_context context ATTRIBUTE_UNUSED)
8664 if (GET_CODE (x) == LABEL_REF)
8666 switch (aarch64_cmodel)
8668 case AARCH64_CMODEL_LARGE:
8669 return SYMBOL_FORCE_TO_MEM;
8671 case AARCH64_CMODEL_TINY_PIC:
8672 case AARCH64_CMODEL_TINY:
8673 return SYMBOL_TINY_ABSOLUTE;
8675 case AARCH64_CMODEL_SMALL_SPIC:
8676 case AARCH64_CMODEL_SMALL_PIC:
8677 case AARCH64_CMODEL_SMALL:
8678 return SYMBOL_SMALL_ABSOLUTE;
8680 default:
8681 gcc_unreachable ();
8685 if (GET_CODE (x) == SYMBOL_REF)
8687 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
8688 return SYMBOL_FORCE_TO_MEM;
8690 if (aarch64_tls_symbol_p (x))
8691 return aarch64_classify_tls_symbol (x);
8693 switch (aarch64_cmodel)
8695 case AARCH64_CMODEL_TINY:
 8696       /* When we retrieve a symbol + offset address, we have to make sure
 8697          the offset does not cause overflow of the final address.  But
 8698          we have no way of knowing the address of the symbol at compile time,
 8699          so we can't accurately say if the distance between the PC and
 8700          symbol + offset is outside the addressable range of +/-1M in the
 8701          TINY code model.  So we rely on images not being greater than
 8702          1M, cap the offset at 1M, and anything beyond 1M will have to
 8703          be loaded using an alternative mechanism.  */
8704 if (SYMBOL_REF_WEAK (x)
8705 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
8706 return SYMBOL_FORCE_TO_MEM;
8707 return SYMBOL_TINY_ABSOLUTE;
8709 case AARCH64_CMODEL_SMALL:
8710 /* Same reasoning as the tiny code model, but the offset cap here is
8711 4G. */
8712 if (SYMBOL_REF_WEAK (x)
8713 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
8714 HOST_WIDE_INT_C (4294967264)))
8715 return SYMBOL_FORCE_TO_MEM;
8716 return SYMBOL_SMALL_ABSOLUTE;
8718 case AARCH64_CMODEL_TINY_PIC:
8719 if (!aarch64_symbol_binds_local_p (x))
8720 return SYMBOL_TINY_GOT;
8721 return SYMBOL_TINY_ABSOLUTE;
8723 case AARCH64_CMODEL_SMALL_SPIC:
8724 case AARCH64_CMODEL_SMALL_PIC:
8725 if (!aarch64_symbol_binds_local_p (x))
8726 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
8727 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
8728 return SYMBOL_SMALL_ABSOLUTE;
8730 default:
8731 gcc_unreachable ();
8735 /* By default push everything into the constant pool. */
8736 return SYMBOL_FORCE_TO_MEM;
8739 bool
8740 aarch64_constant_address_p (rtx x)
8742 return (CONSTANT_P (x) && memory_address_p (DImode, x));
8745 bool
8746 aarch64_legitimate_pic_operand_p (rtx x)
8748 if (GET_CODE (x) == SYMBOL_REF
8749 || (GET_CODE (x) == CONST
8750 && GET_CODE (XEXP (x, 0)) == PLUS
8751 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
8752 return false;
8754 return true;
8757 /* Return true if X holds either a quarter-precision or
8758 floating-point +0.0 constant. */
8759 static bool
8760 aarch64_valid_floating_const (machine_mode mode, rtx x)
8762 if (!CONST_DOUBLE_P (x))
8763 return false;
8765 if (aarch64_float_const_zero_rtx_p (x))
8766 return true;
8768 /* We only handle moving 0.0 to a TFmode register. */
8769 if (!(mode == SFmode || mode == DFmode))
8770 return false;
8772 return aarch64_float_const_representable_p (x);
8775 static bool
8776 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
8778 /* Do not allow vector struct mode constants. We could support
8779 0 and -1 easily, but they need support in aarch64-simd.md. */
8780 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
8781 return false;
8783 /* This could probably go away because
8784 we now decompose CONST_INTs according to expand_mov_immediate. */
8785 if ((GET_CODE (x) == CONST_VECTOR
8786 && aarch64_simd_valid_immediate (x, mode, false, NULL))
8787 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
8788 return !targetm.cannot_force_const_mem (mode, x);
8790 if (GET_CODE (x) == HIGH
8791 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
8792 return true;
8794 return aarch64_constant_address_p (x);
8798 aarch64_load_tp (rtx target)
8800 if (!target
8801 || GET_MODE (target) != Pmode
8802 || !register_operand (target, Pmode))
8803 target = gen_reg_rtx (Pmode);
8805 /* Can return in any reg. */
8806 emit_insn (gen_aarch64_load_tp_hard (target));
8807 return target;
8810 /* On AAPCS systems, this is the "struct __va_list". */
8811 static GTY(()) tree va_list_type;
8813 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
8814 Return the type to use as __builtin_va_list.
8816 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
8818 struct __va_list
8820 void *__stack;
8821 void *__gr_top;
8822 void *__vr_top;
8823 int __gr_offs;
8824 int __vr_offs;
8825 }; */
8827 static tree
8828 aarch64_build_builtin_va_list (void)
8830 tree va_list_name;
8831 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
8833 /* Create the type. */
8834 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
8835 /* Give it the required name. */
8836 va_list_name = build_decl (BUILTINS_LOCATION,
8837 TYPE_DECL,
8838 get_identifier ("__va_list"),
8839 va_list_type);
8840 DECL_ARTIFICIAL (va_list_name) = 1;
8841 TYPE_NAME (va_list_type) = va_list_name;
8842 TYPE_STUB_DECL (va_list_type) = va_list_name;
8844 /* Create the fields. */
8845 f_stack = build_decl (BUILTINS_LOCATION,
8846 FIELD_DECL, get_identifier ("__stack"),
8847 ptr_type_node);
8848 f_grtop = build_decl (BUILTINS_LOCATION,
8849 FIELD_DECL, get_identifier ("__gr_top"),
8850 ptr_type_node);
8851 f_vrtop = build_decl (BUILTINS_LOCATION,
8852 FIELD_DECL, get_identifier ("__vr_top"),
8853 ptr_type_node);
8854 f_groff = build_decl (BUILTINS_LOCATION,
8855 FIELD_DECL, get_identifier ("__gr_offs"),
8856 integer_type_node);
8857 f_vroff = build_decl (BUILTINS_LOCATION,
8858 FIELD_DECL, get_identifier ("__vr_offs"),
8859 integer_type_node);
8861 DECL_ARTIFICIAL (f_stack) = 1;
8862 DECL_ARTIFICIAL (f_grtop) = 1;
8863 DECL_ARTIFICIAL (f_vrtop) = 1;
8864 DECL_ARTIFICIAL (f_groff) = 1;
8865 DECL_ARTIFICIAL (f_vroff) = 1;
8867 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
8868 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
8869 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
8870 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
8871 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
8873 TYPE_FIELDS (va_list_type) = f_stack;
8874 DECL_CHAIN (f_stack) = f_grtop;
8875 DECL_CHAIN (f_grtop) = f_vrtop;
8876 DECL_CHAIN (f_vrtop) = f_groff;
8877 DECL_CHAIN (f_groff) = f_vroff;
8879 /* Compute its layout. */
8880 layout_type (va_list_type);
8882 return va_list_type;
8885 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
8886 static void
8887 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
8889 const CUMULATIVE_ARGS *cum;
8890 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
8891 tree stack, grtop, vrtop, groff, vroff;
8892 tree t;
8893 int gr_save_area_size;
8894 int vr_save_area_size;
8895 int vr_offset;
8897 cum = &crtl->args.info;
8898 gr_save_area_size
8899 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
8900 vr_save_area_size
8901 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
8903 if (!TARGET_FLOAT)
8905 gcc_assert (cum->aapcs_nvrn == 0);
8906 vr_save_area_size = 0;
8909 f_stack = TYPE_FIELDS (va_list_type_node);
8910 f_grtop = DECL_CHAIN (f_stack);
8911 f_vrtop = DECL_CHAIN (f_grtop);
8912 f_groff = DECL_CHAIN (f_vrtop);
8913 f_vroff = DECL_CHAIN (f_groff);
8915 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
8916 NULL_TREE);
8917 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
8918 NULL_TREE);
8919 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
8920 NULL_TREE);
8921 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
8922 NULL_TREE);
8923 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
8924 NULL_TREE);
8926 /* Emit code to initialize STACK, which points to the next varargs stack
8927 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
8928 by named arguments. STACK is 8-byte aligned. */
8929 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
8930 if (cum->aapcs_stack_size > 0)
8931 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
8932 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
8933 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8935 /* Emit code to initialize GRTOP, the top of the GR save area.
8936 virtual_incoming_args_rtx should have been 16 byte aligned. */
8937 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
8938 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
8939 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8941 /* Emit code to initialize VRTOP, the top of the VR save area.
8942 This address is gr_save_area_bytes below GRTOP, rounded
8943 down to the next 16-byte boundary. */
8944 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
8945 vr_offset = AARCH64_ROUND_UP (gr_save_area_size,
8946 STACK_BOUNDARY / BITS_PER_UNIT);
8948 if (vr_offset)
8949 t = fold_build_pointer_plus_hwi (t, -vr_offset);
8950 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
8951 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
8953 /* Emit code to initialize GROFF, the offset from GRTOP of the
8954 next GPR argument. */
8955 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
8956 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
8957 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
 8959   /* Likewise emit code to initialize VROFF, the offset from VRTOP
8960 of the next VR argument. */
8961 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
8962 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
8963 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
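   /* In effect (a rough sketch, not the exact trees built above), va_start
      initializes the fields as:

        ap.__stack   = virtual_incoming_args + aapcs_stack_size * UNITS_PER_WORD;
        ap.__gr_top  = virtual_incoming_args;
        ap.__vr_top  = virtual_incoming_args - ROUND_UP (gr_save_area_size, 16);
        ap.__gr_offs = -gr_save_area_size;
        ap.__vr_offs = -vr_save_area_size;

      where the save areas themselves are laid out by
      aarch64_setup_incoming_varargs below.  */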
8966 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
8968 static tree
8969 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
8970 gimple_seq *post_p ATTRIBUTE_UNUSED)
8972 tree addr;
8973 bool indirect_p;
8974 bool is_ha; /* is HFA or HVA. */
8975 bool dw_align; /* double-word align. */
8976 machine_mode ag_mode = VOIDmode;
8977 int nregs;
8978 machine_mode mode;
8980 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
8981 tree stack, f_top, f_off, off, arg, roundup, on_stack;
8982 HOST_WIDE_INT size, rsize, adjust, align;
8983 tree t, u, cond1, cond2;
8985 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
8986 if (indirect_p)
8987 type = build_pointer_type (type);
8989 mode = TYPE_MODE (type);
8991 f_stack = TYPE_FIELDS (va_list_type_node);
8992 f_grtop = DECL_CHAIN (f_stack);
8993 f_vrtop = DECL_CHAIN (f_grtop);
8994 f_groff = DECL_CHAIN (f_vrtop);
8995 f_vroff = DECL_CHAIN (f_groff);
8997 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
8998 f_stack, NULL_TREE);
8999 size = int_size_in_bytes (type);
9000 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9002 dw_align = false;
9003 adjust = 0;
9004 if (aarch64_vfp_is_call_or_return_candidate (mode,
9005 type,
9006 &ag_mode,
9007 &nregs,
9008 &is_ha))
9010 /* TYPE passed in fp/simd registers. */
9011 if (!TARGET_FLOAT)
9012 aarch64_err_no_fpadvsimd (mode, "varargs");
9014 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9015 unshare_expr (valist), f_vrtop, NULL_TREE);
9016 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9017 unshare_expr (valist), f_vroff, NULL_TREE);
9019 rsize = nregs * UNITS_PER_VREG;
9021 if (is_ha)
9023 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9024 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9026 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9027 && size < UNITS_PER_VREG)
9029 adjust = UNITS_PER_VREG - size;
9032 else
9034 /* TYPE passed in general registers. */
9035 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9036 unshare_expr (valist), f_grtop, NULL_TREE);
9037 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9038 unshare_expr (valist), f_groff, NULL_TREE);
9039 rsize = (size + UNITS_PER_WORD - 1) & -UNITS_PER_WORD;
9040 nregs = rsize / UNITS_PER_WORD;
9042 if (align > 8)
9043 dw_align = true;
9045 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9046 && size < UNITS_PER_WORD)
9048 adjust = UNITS_PER_WORD - size;
9052 /* Get a local temporary for the field value. */
9053 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9055 /* Emit code to branch if off >= 0. */
9056 t = build2 (GE_EXPR, boolean_type_node, off,
9057 build_int_cst (TREE_TYPE (off), 0));
9058 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9060 if (dw_align)
9062 /* Emit: offs = (offs + 15) & -16. */
9063 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9064 build_int_cst (TREE_TYPE (off), 15));
9065 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9066 build_int_cst (TREE_TYPE (off), -16));
9067 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9069 else
9070 roundup = NULL;
9072 /* Update ap.__[g|v]r_offs */
9073 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9074 build_int_cst (TREE_TYPE (off), rsize));
9075 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9077 /* String up. */
9078 if (roundup)
9079 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9081 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9082 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9083 build_int_cst (TREE_TYPE (f_off), 0));
9084 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9086 /* String up: make sure the assignment happens before the use. */
9087 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9088 COND_EXPR_ELSE (cond1) = t;
9090 /* Prepare the trees handling the argument that is passed on the stack;
 9091      the top-level node will be stored in ON_STACK.  */
9092 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9093 if (align > 8)
9095 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9096 t = fold_convert (intDI_type_node, arg);
9097 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9098 build_int_cst (TREE_TYPE (t), 15));
9099 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9100 build_int_cst (TREE_TYPE (t), -16));
9101 t = fold_convert (TREE_TYPE (arg), t);
9102 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9104 else
9105 roundup = NULL;
9106 /* Advance ap.__stack */
9107 t = fold_convert (intDI_type_node, arg);
9108 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9109 build_int_cst (TREE_TYPE (t), size + 7));
9110 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9111 build_int_cst (TREE_TYPE (t), -8));
9112 t = fold_convert (TREE_TYPE (arg), t);
9113 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9114 /* String up roundup and advance. */
9115 if (roundup)
9116 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9117 /* String up with arg */
9118 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9119 /* Big-endianness related address adjustment. */
9120 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9121 && size < UNITS_PER_WORD)
9123 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9124 size_int (UNITS_PER_WORD - size));
9125 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9128 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9129 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9131 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9132 t = off;
9133 if (adjust)
9134 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9135 build_int_cst (TREE_TYPE (off), adjust));
9137 t = fold_convert (sizetype, t);
9138 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9140 if (is_ha)
9142 /* type ha; // treat as "struct {ftype field[n];}"
9143 ... [computing offs]
9144 for (i = 0; i <nregs; ++i, offs += 16)
9145 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9146 return ha; */
9147 int i;
9148 tree tmp_ha, field_t, field_ptr_t;
9150 /* Declare a local variable. */
9151 tmp_ha = create_tmp_var_raw (type, "ha");
9152 gimple_add_tmp_var (tmp_ha);
9154 /* Establish the base type. */
9155 switch (ag_mode)
9157 case SFmode:
9158 field_t = float_type_node;
9159 field_ptr_t = float_ptr_type_node;
9160 break;
9161 case DFmode:
9162 field_t = double_type_node;
9163 field_ptr_t = double_ptr_type_node;
9164 break;
9165 case TFmode:
9166 field_t = long_double_type_node;
9167 field_ptr_t = long_double_ptr_type_node;
9168 break;
9169 /* The half precision and quad precision are not fully supported yet. Enable
9170 the following code after the support is complete. Need to find the correct
9171 type node for __fp16 *. */
9172 #if 0
9173 case HFmode:
9174 field_t = float_type_node;
9175 field_ptr_t = float_ptr_type_node;
9176 break;
9177 #endif
9178 case V2SImode:
9179 case V4SImode:
9181 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9182 field_t = build_vector_type_for_mode (innertype, ag_mode);
9183 field_ptr_t = build_pointer_type (field_t);
9185 break;
9186 default:
9187 gcc_assert (0);
 9190       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area)  */
9191 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9192 addr = t;
9193 t = fold_convert (field_ptr_t, addr);
9194 t = build2 (MODIFY_EXPR, field_t,
9195 build1 (INDIRECT_REF, field_t, tmp_ha),
9196 build1 (INDIRECT_REF, field_t, t));
9198 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9199 for (i = 1; i < nregs; ++i)
9201 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9202 u = fold_convert (field_ptr_t, addr);
9203 u = build2 (MODIFY_EXPR, field_t,
9204 build2 (MEM_REF, field_t, tmp_ha,
9205 build_int_cst (field_ptr_t,
9206 (i *
9207 int_size_in_bytes (field_t)))),
9208 build1 (INDIRECT_REF, field_t, u));
9209 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9212 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9213 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9216 COND_EXPR_ELSE (cond2) = t;
9217 addr = fold_convert (build_pointer_type (type), cond1);
9218 addr = build_va_arg_indirect_ref (addr);
9220 if (indirect_p)
9221 addr = build_va_arg_indirect_ref (addr);
9223 return addr;
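   /* A rough pseudo-C sketch of the expression built above for va_arg (ap, T)
      when T is passed in general registers (the FP/SIMD case is analogous,
      using __vr_top/__vr_offs and 16-byte register slots):

        off = ap.__gr_offs;
        if (off >= 0)
          goto on_stack;                    register save area already used up
        ap.__gr_offs = off + rsize;         claim the registers
        if (ap.__gr_offs > 0)
          goto on_stack;                    T would straddle the area's end
        addr = ap.__gr_top + off;           plus ADJUST for big-endian padding
        ...
      on_stack:
        addr = ap.__stack;                  realigned to 16 if alignof (T) > 8
        ap.__stack = (addr + size + 7) & -8;

      Homogeneous aggregates are additionally copied element by element from
      the VR save area into a local temporary, as the is_ha block shows.  */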
9226 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9228 static void
9229 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9230 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9231 int no_rtl)
9233 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9234 CUMULATIVE_ARGS local_cum;
9235 int gr_saved, vr_saved;
9237 /* The caller has advanced CUM up to, but not beyond, the last named
9238 argument. Advance a local copy of CUM past the last "real" named
9239 argument, to find out how many registers are left over. */
9240 local_cum = *cum;
9241 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
 9243   /* Find out how many registers we need to save.  */
9244 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
9245 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
9247 if (!TARGET_FLOAT)
9249 gcc_assert (local_cum.aapcs_nvrn == 0);
9250 vr_saved = 0;
9253 if (!no_rtl)
9255 if (gr_saved > 0)
9257 rtx ptr, mem;
9259 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9260 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9261 - gr_saved * UNITS_PER_WORD);
9262 mem = gen_frame_mem (BLKmode, ptr);
9263 set_mem_alias_set (mem, get_varargs_alias_set ());
9265 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9266 mem, gr_saved);
9268 if (vr_saved > 0)
9270 /* We can't use move_block_from_reg, because it will use
9271 the wrong mode, storing D regs only. */
9272 machine_mode mode = TImode;
9273 int off, i;
9275 /* Set OFF to the offset from virtual_incoming_args_rtx of
9276 the first vector register. The VR save area lies below
9277 the GR one, and is aligned to 16 bytes. */
9278 off = -AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
9279 STACK_BOUNDARY / BITS_PER_UNIT);
9280 off -= vr_saved * UNITS_PER_VREG;
9282 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
9284 rtx ptr, mem;
9286 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
9287 mem = gen_frame_mem (mode, ptr);
9288 set_mem_alias_set (mem, get_varargs_alias_set ());
9289 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
9290 off += UNITS_PER_VREG;
9295 /* We don't save the size into *PRETEND_SIZE because we want to avoid
9296 any complication of having crtl->args.pretend_args_size changed. */
9297 cfun->machine->frame.saved_varargs_size
9298 = (AARCH64_ROUND_UP (gr_saved * UNITS_PER_WORD,
9299 STACK_BOUNDARY / BITS_PER_UNIT)
9300 + vr_saved * UNITS_PER_VREG);
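   /* Illustrative layout of the resulting save areas, relative to
      virtual_incoming_args_rtx (VIA), growing downwards:

        VIA                                      == ap.__gr_top
          [ GR save area: gr_saved x 8 bytes ]
        VIA - ROUND_UP (gr_saved * 8, 16)        == ap.__vr_top
          [ VR save area: vr_saved x 16 bytes ]

      which matches the offsets installed by aarch64_expand_builtin_va_start
      above.  */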
9303 static void
9304 aarch64_conditional_register_usage (void)
9306 int i;
9307 if (!TARGET_FLOAT)
9309 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
9311 fixed_regs[i] = 1;
9312 call_used_regs[i] = 1;
9317 /* Walk down the type tree of TYPE counting consecutive base elements.
9318 If *MODEP is VOIDmode, then set it to the first valid floating point
9319 type. If a non-floating point type is found, or if a floating point
9320 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
9321 otherwise return the count in the sub-tree. */
9322 static int
9323 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
9325 machine_mode mode;
9326 HOST_WIDE_INT size;
9328 switch (TREE_CODE (type))
9330 case REAL_TYPE:
9331 mode = TYPE_MODE (type);
9332 if (mode != DFmode && mode != SFmode && mode != TFmode)
9333 return -1;
9335 if (*modep == VOIDmode)
9336 *modep = mode;
9338 if (*modep == mode)
9339 return 1;
9341 break;
9343 case COMPLEX_TYPE:
9344 mode = TYPE_MODE (TREE_TYPE (type));
9345 if (mode != DFmode && mode != SFmode && mode != TFmode)
9346 return -1;
9348 if (*modep == VOIDmode)
9349 *modep = mode;
9351 if (*modep == mode)
9352 return 2;
9354 break;
9356 case VECTOR_TYPE:
9357 /* Use V2SImode and V4SImode as representatives of all 64-bit
9358 and 128-bit vector types. */
9359 size = int_size_in_bytes (type);
9360 switch (size)
9362 case 8:
9363 mode = V2SImode;
9364 break;
9365 case 16:
9366 mode = V4SImode;
9367 break;
9368 default:
9369 return -1;
9372 if (*modep == VOIDmode)
9373 *modep = mode;
9375 /* Vector modes are considered to be opaque: two vectors are
9376 equivalent for the purposes of being homogeneous aggregates
9377 if they are the same size. */
9378 if (*modep == mode)
9379 return 1;
9381 break;
9383 case ARRAY_TYPE:
9385 int count;
9386 tree index = TYPE_DOMAIN (type);
9388 /* Can't handle incomplete types nor sizes that are not
9389 fixed. */
9390 if (!COMPLETE_TYPE_P (type)
9391 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9392 return -1;
9394 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
9395 if (count == -1
9396 || !index
9397 || !TYPE_MAX_VALUE (index)
9398 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
9399 || !TYPE_MIN_VALUE (index)
9400 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
9401 || count < 0)
9402 return -1;
9404 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
9405 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
9407 /* There must be no padding. */
9408 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9409 return -1;
9411 return count;
9414 case RECORD_TYPE:
9416 int count = 0;
9417 int sub_count;
9418 tree field;
9420 /* Can't handle incomplete types nor sizes that are not
9421 fixed. */
9422 if (!COMPLETE_TYPE_P (type)
9423 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9424 return -1;
9426 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
9428 if (TREE_CODE (field) != FIELD_DECL)
9429 continue;
9431 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
9432 if (sub_count < 0)
9433 return -1;
9434 count += sub_count;
9437 /* There must be no padding. */
9438 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9439 return -1;
9441 return count;
9444 case UNION_TYPE:
9445 case QUAL_UNION_TYPE:
9447 /* These aren't very interesting except in a degenerate case. */
9448 int count = 0;
9449 int sub_count;
9450 tree field;
9452 /* Can't handle incomplete types nor sizes that are not
9453 fixed. */
9454 if (!COMPLETE_TYPE_P (type)
9455 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9456 return -1;
9458 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
9460 if (TREE_CODE (field) != FIELD_DECL)
9461 continue;
9463 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
9464 if (sub_count < 0)
9465 return -1;
9466 count = count > sub_count ? count : sub_count;
9469 /* There must be no padding. */
9470 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9471 return -1;
9473 return count;
9476 default:
9477 break;
9480 return -1;
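 /* Some illustrative results of the walk above, with *modep starting out
    as VOIDmode (the vector types assume arm_neon.h):

      struct { double x, y, z; }       ->  3, *modep == DFmode    (an HFA)
      _Complex double                  ->  2, *modep == DFmode
      struct { float32x4_t v[2]; }     ->  2, *modep == V4SImode  (an HVA)
      struct { float a; double b; }    -> -1  (mixed element modes)
      struct { double d; long l; }     -> -1  (non-floating-point member)  */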
9483 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
9484 type as described in AAPCS64 \S 4.1.2.
9486 See the comment above aarch64_composite_type_p for the notes on MODE. */
9488 static bool
9489 aarch64_short_vector_p (const_tree type,
9490 machine_mode mode)
9492 HOST_WIDE_INT size = -1;
9494 if (type && TREE_CODE (type) == VECTOR_TYPE)
9495 size = int_size_in_bytes (type);
9496 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
9497 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
9498 size = GET_MODE_SIZE (mode);
9500 return (size == 8 || size == 16);
9503 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
9504 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
9505 array types. The C99 floating-point complex types are also considered
9506 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
9507 types, which are GCC extensions and out of the scope of AAPCS64, are
9508 treated as composite types here as well.
9510 Note that MODE itself is not sufficient in determining whether a type
9511 is such a composite type or not. This is because
9512 stor-layout.c:compute_record_mode may have already changed the MODE
9513 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
9514 structure with only one field may have its MODE set to the mode of the
9515 field. Also an integer mode whose size matches the size of the
9516 RECORD_TYPE type may be used to substitute the original mode
9517 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
9518 solely relied on. */
9520 static bool
9521 aarch64_composite_type_p (const_tree type,
9522 machine_mode mode)
9524 if (aarch64_short_vector_p (type, mode))
9525 return false;
9527 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
9528 return true;
9530 if (mode == BLKmode
9531 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
9532 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
9533 return true;
9535 return false;
9538 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
9539 shall be passed or returned in simd/fp register(s) (providing these
9540 parameter passing registers are available).
9542 Upon successful return, *COUNT returns the number of needed registers,
 9543    *BASE_MODE returns the mode of the individual register and, when IS_HA
9544 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
9545 floating-point aggregate or a homogeneous short-vector aggregate. */
9547 static bool
9548 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
9549 const_tree type,
9550 machine_mode *base_mode,
9551 int *count,
9552 bool *is_ha)
9554 machine_mode new_mode = VOIDmode;
9555 bool composite_p = aarch64_composite_type_p (type, mode);
9557 if (is_ha != NULL) *is_ha = false;
9559 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
9560 || aarch64_short_vector_p (type, mode))
9562 *count = 1;
9563 new_mode = mode;
9565 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
9567 if (is_ha != NULL) *is_ha = true;
9568 *count = 2;
9569 new_mode = GET_MODE_INNER (mode);
9571 else if (type && composite_p)
9573 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
9575 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
9577 if (is_ha != NULL) *is_ha = true;
9578 *count = ag_count;
9580 else
9581 return false;
9583 else
9584 return false;
9586 *base_mode = new_mode;
9587 return true;
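 /* For example (illustrative only): a plain 'double' gives *count == 1,
    *base_mode == DFmode, *is_ha == false; '_Complex float' gives
    *count == 2, *base_mode == SFmode, *is_ha == true; and an HFA such as
    struct { double x, y; } gives *count == 2, *base_mode == DFmode,
    *is_ha == true.  Aggregates with more than HA_MAX_NUM_FLDS elements
    are rejected.  */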
9590 /* Implement TARGET_STRUCT_VALUE_RTX. */
9592 static rtx
9593 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
9594 int incoming ATTRIBUTE_UNUSED)
9596 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
9599 /* Implements target hook vector_mode_supported_p. */
9600 static bool
9601 aarch64_vector_mode_supported_p (machine_mode mode)
9603 if (TARGET_SIMD
9604 && (mode == V4SImode || mode == V8HImode
9605 || mode == V16QImode || mode == V2DImode
9606 || mode == V2SImode || mode == V4HImode
9607 || mode == V8QImode || mode == V2SFmode
9608 || mode == V4SFmode || mode == V2DFmode
9609 || mode == V1DFmode))
9610 return true;
9612 return false;
9615 /* Return appropriate SIMD container
9616 for MODE within a vector of WIDTH bits. */
9617 static machine_mode
9618 aarch64_simd_container_mode (machine_mode mode, unsigned width)
9620 gcc_assert (width == 64 || width == 128);
9621 if (TARGET_SIMD)
9623 if (width == 128)
9624 switch (mode)
9626 case DFmode:
9627 return V2DFmode;
9628 case SFmode:
9629 return V4SFmode;
9630 case SImode:
9631 return V4SImode;
9632 case HImode:
9633 return V8HImode;
9634 case QImode:
9635 return V16QImode;
9636 case DImode:
9637 return V2DImode;
9638 default:
9639 break;
9641 else
9642 switch (mode)
9644 case SFmode:
9645 return V2SFmode;
9646 case SImode:
9647 return V2SImode;
9648 case HImode:
9649 return V4HImode;
9650 case QImode:
9651 return V8QImode;
9652 default:
9653 break;
9656 return word_mode;
9659 /* Return 128-bit container as the preferred SIMD mode for MODE. */
9660 static machine_mode
9661 aarch64_preferred_simd_mode (machine_mode mode)
9663 return aarch64_simd_container_mode (mode, 128);
9666 /* Return the bitmask of possible vector sizes for the vectorizer
9667 to iterate over. */
9668 static unsigned int
9669 aarch64_autovectorize_vector_sizes (void)
9671 return (16 | 8);
9674 /* Implement TARGET_MANGLE_TYPE. */
9676 static const char *
9677 aarch64_mangle_type (const_tree type)
9679 /* The AArch64 ABI documents say that "__va_list" has to be
 9680      mangled as if it is in the "std" namespace.  */
9681 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
9682 return "St9__va_list";
9684 /* Half-precision float. */
9685 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
9686 return "Dh";
9688 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
9689 builtin types. */
9690 if (TYPE_NAME (type) != NULL)
9691 return aarch64_mangle_builtin_type (type);
9693 /* Use the default mangling. */
9694 return NULL;
9698 /* Return true if the rtx_insn contains a MEM RTX somewhere
9699 in it. */
9701 static bool
9702 has_memory_op (rtx_insn *mem_insn)
9704 subrtx_iterator::array_type array;
9705 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
9706 if (MEM_P (*iter))
9707 return true;
9709 return false;
9712 /* Find the first rtx_insn before insn that will generate an assembly
9713 instruction. */
9715 static rtx_insn *
9716 aarch64_prev_real_insn (rtx_insn *insn)
9718 if (!insn)
9719 return NULL;
9723 insn = prev_real_insn (insn);
9725 while (insn && recog_memoized (insn) < 0);
9727 return insn;
9730 static bool
9731 is_madd_op (enum attr_type t1)
9733 unsigned int i;
9734 /* A number of these may be AArch32 only. */
9735 enum attr_type mlatypes[] = {
9736 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
9737 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
9738 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
9741 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
9743 if (t1 == mlatypes[i])
9744 return true;
9747 return false;
9750 /* Check if there is a register dependency between a load and the insn
9751 for which we hold recog_data. */
9753 static bool
9754 dep_between_memop_and_curr (rtx memop)
9756 rtx load_reg;
9757 int opno;
9759 gcc_assert (GET_CODE (memop) == SET);
9761 if (!REG_P (SET_DEST (memop)))
9762 return false;
9764 load_reg = SET_DEST (memop);
9765 for (opno = 1; opno < recog_data.n_operands; opno++)
9767 rtx operand = recog_data.operand[opno];
9768 if (REG_P (operand)
9769 && reg_overlap_mentioned_p (load_reg, operand))
9770 return true;
9773 return false;
9777 /* When working around the Cortex-A53 erratum 835769,
9778 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
9779 instruction and has a preceding memory instruction such that a NOP
9780 should be inserted between them. */
9782 bool
9783 aarch64_madd_needs_nop (rtx_insn* insn)
9785 enum attr_type attr_type;
9786 rtx_insn *prev;
9787 rtx body;
9789 if (!TARGET_FIX_ERR_A53_835769)
9790 return false;
9792 if (recog_memoized (insn) < 0)
9793 return false;
9795 attr_type = get_attr_type (insn);
9796 if (!is_madd_op (attr_type))
9797 return false;
9799 prev = aarch64_prev_real_insn (insn);
9800 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
9801 Restore recog state to INSN to avoid state corruption. */
9802 extract_constrain_insn_cached (insn);
9804 if (!prev || !has_memory_op (prev))
9805 return false;
9807 body = single_set (prev);
9809 /* If the previous insn is a memory op and there is no dependency between
9810 it and the DImode madd, emit a NOP between them. If body is NULL then we
9811 have a complex memory operation, probably a load/store pair.
9812 Be conservative for now and emit a NOP. */
9813 if (GET_MODE (recog_data.operand[0]) == DImode
9814 && (!body || !dep_between_memop_and_curr (body)))
9815 return true;
9817 return false;
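 /* A hypothetical instruction sequence showing when the workaround fires
    (with -mfix-cortex-a53-835769 enabled):

        ldr   x3, [x2]            64-bit memory operation
        madd  x0, x1, x4, x0      DImode multiply-accumulate
                                  -> a NOP is emitted between the two

    If the loaded register feeds an operand of the multiply-accumulate
    (see dep_between_memop_and_curr), no NOP is needed.  */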
9822 /* Implement FINAL_PRESCAN_INSN. */
9824 void
9825 aarch64_final_prescan_insn (rtx_insn *insn)
9827 if (aarch64_madd_needs_nop (insn))
9828 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
9832 /* Return the equivalent letter for size. */
9833 static char
9834 sizetochar (int size)
9836 switch (size)
9838 case 64: return 'd';
9839 case 32: return 's';
9840 case 16: return 'h';
9841 case 8 : return 'b';
9842 default: gcc_unreachable ();
9846 /* Return true iff x is a uniform vector of floating-point
9847 constants, and the constant can be represented in
 9848    quarter-precision form.  Note that, as aarch64_float_const_representable_p
 9849    rejects both +0.0 and -0.0, we will also reject them here.  */
9850 static bool
9851 aarch64_vect_float_const_representable_p (rtx x)
9853 int i = 0;
9854 REAL_VALUE_TYPE r0, ri;
9855 rtx x0, xi;
9857 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
9858 return false;
9860 x0 = CONST_VECTOR_ELT (x, 0);
9861 if (!CONST_DOUBLE_P (x0))
9862 return false;
9864 REAL_VALUE_FROM_CONST_DOUBLE (r0, x0);
9866 for (i = 1; i < CONST_VECTOR_NUNITS (x); i++)
9868 xi = CONST_VECTOR_ELT (x, i);
9869 if (!CONST_DOUBLE_P (xi))
9870 return false;
9872 REAL_VALUE_FROM_CONST_DOUBLE (ri, xi);
9873 if (!REAL_VALUES_EQUAL (r0, ri))
9874 return false;
9877 return aarch64_float_const_representable_p (x0);
9880 /* Return true for valid and false for invalid. */
9881 bool
9882 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
9883 struct simd_immediate_info *info)
9885 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
9886 matches = 1; \
9887 for (i = 0; i < idx; i += (STRIDE)) \
9888 if (!(TEST)) \
9889 matches = 0; \
9890 if (matches) \
9892 immtype = (CLASS); \
9893 elsize = (ELSIZE); \
9894 eshift = (SHIFT); \
9895 emvn = (NEG); \
9896 break; \
9899 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
9900 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
9901 unsigned char bytes[16];
9902 int immtype = -1, matches;
9903 unsigned int invmask = inverse ? 0xff : 0;
9904 int eshift, emvn;
9906 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
9908 if (! (aarch64_simd_imm_zero_p (op, mode)
9909 || aarch64_vect_float_const_representable_p (op)))
9910 return false;
9912 if (info)
9914 info->value = CONST_VECTOR_ELT (op, 0);
9915 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
9916 info->mvn = false;
9917 info->shift = 0;
9920 return true;
9923 /* Splat vector constant out into a byte vector. */
9924 for (i = 0; i < n_elts; i++)
9926 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
9927 it must be laid out in the vector register in reverse order. */
9928 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
9929 unsigned HOST_WIDE_INT elpart;
9930 unsigned int part, parts;
9932 if (CONST_INT_P (el))
9934 elpart = INTVAL (el);
9935 parts = 1;
9937 else if (GET_CODE (el) == CONST_DOUBLE)
9939 elpart = CONST_DOUBLE_LOW (el);
9940 parts = 2;
9942 else
9943 gcc_unreachable ();
9945 for (part = 0; part < parts; part++)
9947 unsigned int byte;
9948 for (byte = 0; byte < innersize; byte++)
9950 bytes[idx++] = (elpart & 0xff) ^ invmask;
9951 elpart >>= BITS_PER_UNIT;
9953 if (GET_CODE (el) == CONST_DOUBLE)
9954 elpart = CONST_DOUBLE_HIGH (el);
9958 /* Sanity check. */
9959 gcc_assert (idx == GET_MODE_SIZE (mode));
9963 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
9964 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
9966 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
9967 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
9969 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
9970 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
9972 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
9973 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
9975 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
9977 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
9979 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
9980 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
9982 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
9983 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
9985 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
9986 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
9988 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
9989 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
9991 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
9993 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
9995 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
9996 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
9998 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
9999 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10001 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10002 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10004 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10005 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10007 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10009 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10010 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10012 while (0);
10014 if (immtype == -1)
10015 return false;
10017 if (info)
10019 info->element_width = elsize;
10020 info->mvn = emvn != 0;
10021 info->shift = eshift;
10023 unsigned HOST_WIDE_INT imm = 0;
10025 if (immtype >= 12 && immtype <= 15)
10026 info->msl = true;
10028 /* Un-invert bytes of recognized vector, if necessary. */
10029 if (invmask != 0)
10030 for (i = 0; i < idx; i++)
10031 bytes[i] ^= invmask;
10033 if (immtype == 17)
10035 /* FIXME: Broken on 32-bit H_W_I hosts. */
10036 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10038 for (i = 0; i < 8; i++)
10039 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10040 << (i * BITS_PER_UNIT);
10043 info->value = GEN_INT (imm);
10045 else
10047 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10048 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10050 /* Construct 'abcdefgh' because the assembler cannot handle
10051 generic constants. */
10052 if (info->mvn)
10053 imm = ~imm;
10054 imm = (imm >> info->shift) & 0xff;
10055 info->value = GEN_INT (imm);
10059 return true;
10060 #undef CHECK
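 /* A few illustrative constants the checks above accept, together with the
    MOVI/MVNI encodings they roughly correspond to (hypothetical examples):

      V4SImode  splat of 0x000000ab  -> immtype 0,  movi v0.4s, #0xab
      V4SImode  splat of 0x00ab0000  -> immtype 2,  movi v0.4s, #0xab, lsl #16
      V8HImode  splat of 0xff45      -> immtype 10, mvni v0.8h, #0xba
      V16QImode splat of 0x3f        -> immtype 16, movi v0.16b, #0x3f

    The single 'abcdefgh' byte finally stored in info->value is the 8-bit
    immediate field of those instructions.  */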
 10063 /* Check if immediate shift constants are within range.  */
10064 bool
10065 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10067 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10068 if (left)
10069 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10070 else
10071 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10074 /* Return true if X is a uniform vector where all elements
10075 are either the floating-point constant 0.0 or the
10076 integer constant 0. */
10077 bool
10078 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10080 return x == CONST0_RTX (mode);
10083 bool
10084 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10086 HOST_WIDE_INT imm = INTVAL (x);
10087 int i;
10089 for (i = 0; i < 8; i++)
10091 unsigned int byte = imm & 0xff;
10092 if (byte != 0xff && byte != 0)
10093 return false;
10094 imm >>= 8;
10097 return true;
10100 bool
10101 aarch64_mov_operand_p (rtx x,
10102 enum aarch64_symbol_context context,
10103 machine_mode mode)
10105 if (GET_CODE (x) == HIGH
10106 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10107 return true;
10109 if (CONST_INT_P (x))
10110 return true;
10112 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10113 return true;
10115 return aarch64_classify_symbolic_expression (x, context)
10116 == SYMBOL_TINY_ABSOLUTE;
10119 /* Return a const_int vector of VAL. */
10121 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10123 int nunits = GET_MODE_NUNITS (mode);
10124 rtvec v = rtvec_alloc (nunits);
10125 int i;
10127 for (i=0; i < nunits; i++)
10128 RTVEC_ELT (v, i) = GEN_INT (val);
10130 return gen_rtx_CONST_VECTOR (mode, v);
10133 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10135 bool
10136 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10138 machine_mode vmode;
10140 gcc_assert (!VECTOR_MODE_P (mode));
10141 vmode = aarch64_preferred_simd_mode (mode);
10142 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10143 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10146 /* Construct and return a PARALLEL RTX vector with elements numbering the
10147 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10148 the vector - from the perspective of the architecture. This does not
10149 line up with GCC's perspective on lane numbers, so we end up with
10150 different masks depending on our target endian-ness. The diagram
10151 below may help. We must draw the distinction when building masks
10152 which select one half of the vector. An instruction selecting
10153 architectural low-lanes for a big-endian target, must be described using
10154 a mask selecting GCC high-lanes.
10156 Big-Endian Little-Endian
10158 GCC 0 1 2 3 3 2 1 0
10159 | x | x | x | x | | x | x | x | x |
10160 Architecture 3 2 1 0 3 2 1 0
10162 Low Mask: { 2, 3 } { 0, 1 }
10163 High Mask: { 0, 1 } { 2, 3 }
10167 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10169 int nunits = GET_MODE_NUNITS (mode);
10170 rtvec v = rtvec_alloc (nunits / 2);
10171 int high_base = nunits / 2;
10172 int low_base = 0;
10173 int base;
10174 rtx t1;
10175 int i;
10177 if (BYTES_BIG_ENDIAN)
10178 base = high ? low_base : high_base;
10179 else
10180 base = high ? high_base : low_base;
10182 for (i = 0; i < nunits / 2; i++)
10183 RTVEC_ELT (v, i) = GEN_INT (base + i);
10185 t1 = gen_rtx_PARALLEL (mode, v);
10186 return t1;
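 /* For instance, aarch64_simd_vect_par_cnst_half (V4SImode, true) returns
    the PARALLEL { 2, 3 } on little-endian and { 0, 1 } on big-endian, in
    both cases selecting the architectural high half, as the diagram above
    describes.  */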
10189 /* Check OP for validity as a PARALLEL RTX vector with elements
10190 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10191 from the perspective of the architecture. See the diagram above
10192 aarch64_simd_vect_par_cnst_half for more details. */
10194 bool
10195 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10196 bool high)
10198 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10199 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10200 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10201 int i = 0;
10203 if (!VECTOR_MODE_P (mode))
10204 return false;
10206 if (count_op != count_ideal)
10207 return false;
10209 for (i = 0; i < count_ideal; i++)
10211 rtx elt_op = XVECEXP (op, 0, i);
10212 rtx elt_ideal = XVECEXP (ideal, 0, i);
10214 if (!CONST_INT_P (elt_op)
10215 || INTVAL (elt_ideal) != INTVAL (elt_op))
10216 return false;
10218 return true;
10221 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10222 HIGH (exclusive). */
10223 void
10224 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10225 const_tree exp)
10227 HOST_WIDE_INT lane;
10228 gcc_assert (CONST_INT_P (operand));
10229 lane = INTVAL (operand);
10231 if (lane < low || lane >= high)
10233 if (exp)
10234 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10235 else
10236 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10240 /* Return TRUE if OP is a valid vector addressing mode. */
10241 bool
10242 aarch64_simd_mem_operand_p (rtx op)
10244 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10245 || REG_P (XEXP (op, 0)));
10248 /* Emit a register copy from operand to operand, taking care not to
10249 early-clobber source registers in the process.
10251 COUNT is the number of components into which the copy needs to be
10252 decomposed. */
10253 void
10254 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10255 unsigned int count)
10257 unsigned int i;
10258 int rdest = REGNO (operands[0]);
10259 int rsrc = REGNO (operands[1]);
10261 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10262 || rdest < rsrc)
10263 for (i = 0; i < count; i++)
10264 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10265 gen_rtx_REG (mode, rsrc + i));
10266 else
10267 for (i = 0; i < count; i++)
10268 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10269 gen_rtx_REG (mode, rsrc + count - i - 1));
10272 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
10273 one of VSTRUCT modes: OI, CI or XI. */
10275 aarch64_simd_attr_length_move (rtx_insn *insn)
10277 machine_mode mode;
10279 extract_insn_cached (insn);
10281 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
10283 mode = GET_MODE (recog_data.operand[0]);
10284 switch (mode)
10286 case OImode:
10287 return 8;
10288 case CImode:
10289 return 12;
10290 case XImode:
10291 return 16;
10292 default:
10293 gcc_unreachable ();
10296 return 4;
10299 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
10300 one of VSTRUCT modes: OI, CI, EI, or XI. */
10302 aarch64_simd_attr_length_rglist (enum machine_mode mode)
10304 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
10307 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
10308 alignment of a vector to 128 bits. */
10309 static HOST_WIDE_INT
10310 aarch64_simd_vector_alignment (const_tree type)
10312 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
10313 return MIN (align, 128);
10316 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
10317 static bool
10318 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
10320 if (is_packed)
10321 return false;
10323 /* We guarantee alignment for vectors up to 128-bits. */
10324 if (tree_int_cst_compare (TYPE_SIZE (type),
10325 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
10326 return false;
10328 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
10329 return true;
10332 /* If VALS is a vector constant that can be loaded into a register
10333 using DUP, generate instructions to do so and return an RTX to
10334 assign to the register. Otherwise return NULL_RTX. */
10335 static rtx
10336 aarch64_simd_dup_constant (rtx vals)
10338 machine_mode mode = GET_MODE (vals);
10339 machine_mode inner_mode = GET_MODE_INNER (mode);
10340 int n_elts = GET_MODE_NUNITS (mode);
10341 bool all_same = true;
10342 rtx x;
10343 int i;
10345 if (GET_CODE (vals) != CONST_VECTOR)
10346 return NULL_RTX;
10348 for (i = 1; i < n_elts; ++i)
10350 x = CONST_VECTOR_ELT (vals, i);
10351 if (!rtx_equal_p (x, CONST_VECTOR_ELT (vals, 0)))
10352 all_same = false;
10355 if (!all_same)
10356 return NULL_RTX;
10358 /* We can load this constant by using DUP and a constant in a
10359 single ARM register. This will be cheaper than a vector
10360 load. */
10361 x = copy_to_mode_reg (inner_mode, CONST_VECTOR_ELT (vals, 0));
10362 return gen_rtx_VEC_DUPLICATE (mode, x);
10366 /* Generate code to load VALS, which is a PARALLEL containing only
10367 constants (for vec_init) or CONST_VECTOR, efficiently into a
10368 register. Returns an RTX to copy into the register, or NULL_RTX
10369 for a PARALLEL that can not be converted into a CONST_VECTOR. */
10370 static rtx
10371 aarch64_simd_make_constant (rtx vals)
10373 machine_mode mode = GET_MODE (vals);
10374 rtx const_dup;
10375 rtx const_vec = NULL_RTX;
10376 int n_elts = GET_MODE_NUNITS (mode);
10377 int n_const = 0;
10378 int i;
10380 if (GET_CODE (vals) == CONST_VECTOR)
10381 const_vec = vals;
10382 else if (GET_CODE (vals) == PARALLEL)
10384 /* A CONST_VECTOR must contain only CONST_INTs and
10385 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
10386 Only store valid constants in a CONST_VECTOR. */
10387 for (i = 0; i < n_elts; ++i)
10389 rtx x = XVECEXP (vals, 0, i);
10390 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10391 n_const++;
10393 if (n_const == n_elts)
10394 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
10396 else
10397 gcc_unreachable ();
10399 if (const_vec != NULL_RTX
10400 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
10401 /* Load using MOVI/MVNI. */
10402 return const_vec;
10403 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
10404 /* Loaded using DUP. */
10405 return const_dup;
10406 else if (const_vec != NULL_RTX)
10407 /* Load from constant pool. We cannot take advantage of single-cycle
10408 LD1 because we need a PC-relative addressing mode. */
10409 return const_vec;
10410 else
10411 /* A PARALLEL containing something not valid inside CONST_VECTOR.
10412 We cannot construct an initializer. */
10413 return NULL_RTX;
10416 void
10417 aarch64_expand_vector_init (rtx target, rtx vals)
10419 machine_mode mode = GET_MODE (target);
10420 machine_mode inner_mode = GET_MODE_INNER (mode);
10421 int n_elts = GET_MODE_NUNITS (mode);
10422 int n_var = 0;
10423 rtx any_const = NULL_RTX;
10424 bool all_same = true;
10426 for (int i = 0; i < n_elts; ++i)
10428 rtx x = XVECEXP (vals, 0, i);
10429 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
10430 ++n_var;
10431 else
10432 any_const = x;
10434 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
10435 all_same = false;
10438 if (n_var == 0)
10440 rtx constant = aarch64_simd_make_constant (vals);
10441 if (constant != NULL_RTX)
10443 emit_move_insn (target, constant);
10444 return;
10448 /* Splat a single non-constant element if we can. */
10449 if (all_same)
10451 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
10452 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
10453 return;
10456 /* Half the fields (or fewer) are non-constant. Load the constant part, then
10457 overwrite the varying fields. Hope that this is more efficient than using the stack. */
10458 if (n_var <= n_elts/2)
10460 rtx copy = copy_rtx (vals);
10462 /* Load constant part of vector. We really don't care what goes into the
10463 parts we will overwrite, but we're more likely to be able to load the
10464 constant efficiently if it has fewer, larger, repeating parts
10465 (see aarch64_simd_valid_immediate). */
10466 for (int i = 0; i < n_elts; i++)
10468 rtx x = XVECEXP (vals, 0, i);
10469 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10470 continue;
10471 rtx subst = any_const;
10472 for (int bit = n_elts / 2; bit > 0; bit /= 2)
10474 /* Look in the copied vector, as more elements are const. */
10475 rtx test = XVECEXP (copy, 0, i ^ bit);
10476 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
10478 subst = test;
10479 break;
10482 XVECEXP (copy, 0, i) = subst;
10484 aarch64_expand_vector_init (target, copy);
10486 /* Insert variables. */
10487 enum insn_code icode = optab_handler (vec_set_optab, mode);
10488 gcc_assert (icode != CODE_FOR_nothing);
10490 for (int i = 0; i < n_elts; i++)
10492 rtx x = XVECEXP (vals, 0, i);
10493 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10494 continue;
10495 x = copy_to_mode_reg (inner_mode, x);
10496 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
10498 return;
10501 /* Construct the vector in memory one field at a time
10502 and load the whole vector. */
10503 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
10504 for (int i = 0; i < n_elts; i++)
10505 emit_move_insn (adjust_address_nv (mem, inner_mode,
10506 i * GET_MODE_SIZE (inner_mode)),
10507 XVECEXP (vals, 0, i));
10508 emit_move_insn (target, mem);
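/* Illustrative sketch (not from the original source): for a hypothetical
   V4SI initializer { x, 1, 2, 3 } only element 0 is variable, so the
   "load constant then overwrite" path above substitutes a nearby constant
   for it (copy[0] = copy[0 ^ 2] = 2), loads the constant vector
   { 2, 1, 2, 3 }, and then emits a single vec_set (typically an INS) of x
   into lane 0.  The substituted value depends on which neighbouring
   elements are constant; this only shows the shape of the expansion.  */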
10512 static unsigned HOST_WIDE_INT
10513 aarch64_shift_truncation_mask (machine_mode mode)
10515 return
10516 (aarch64_vector_mode_supported_p (mode)
10517 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
10520 #ifndef TLS_SECTION_ASM_FLAG
10521 #define TLS_SECTION_ASM_FLAG 'T'
10522 #endif
10524 void
10525 aarch64_elf_asm_named_section (const char *name, unsigned int flags,
10526 tree decl ATTRIBUTE_UNUSED)
10528 char flagchars[10], *f = flagchars;
10530 /* If we have already declared this section, we can use an
10531 abbreviated form to switch back to it -- unless this section is
10532 part of a COMDAT group, in which case GAS requires the full
10533 declaration every time. */
10534 if (!(HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
10535 && (flags & SECTION_DECLARED))
10537 fprintf (asm_out_file, "\t.section\t%s\n", name);
10538 return;
10541 if (!(flags & SECTION_DEBUG))
10542 *f++ = 'a';
10543 if (flags & SECTION_WRITE)
10544 *f++ = 'w';
10545 if (flags & SECTION_CODE)
10546 *f++ = 'x';
10547 if (flags & SECTION_SMALL)
10548 *f++ = 's';
10549 if (flags & SECTION_MERGE)
10550 *f++ = 'M';
10551 if (flags & SECTION_STRINGS)
10552 *f++ = 'S';
10553 if (flags & SECTION_TLS)
10554 *f++ = TLS_SECTION_ASM_FLAG;
10555 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
10556 *f++ = 'G';
10557 *f = '\0';
10559 fprintf (asm_out_file, "\t.section\t%s,\"%s\"", name, flagchars);
10561 if (!(flags & SECTION_NOTYPE))
10563 const char *type;
10564 const char *format;
10566 if (flags & SECTION_BSS)
10567 type = "nobits";
10568 else
10569 type = "progbits";
10571 #ifdef TYPE_OPERAND_FMT
10572 format = "," TYPE_OPERAND_FMT;
10573 #else
10574 format = ",@%s";
10575 #endif
10577 fprintf (asm_out_file, format, type);
10579 if (flags & SECTION_ENTSIZE)
10580 fprintf (asm_out_file, ",%d", flags & SECTION_ENTSIZE);
10581 if (HAVE_COMDAT_GROUP && (flags & SECTION_LINKONCE))
10583 if (TREE_CODE (decl) == IDENTIFIER_NODE)
10584 fprintf (asm_out_file, ",%s,comdat", IDENTIFIER_POINTER (decl));
10585 else
10586 fprintf (asm_out_file, ",%s,comdat",
10587 IDENTIFIER_POINTER (DECL_COMDAT_GROUP (decl)));
10591 putc ('\n', asm_out_file);
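/* For illustration, the directives emitted above look roughly like this
   (exact names, flags and types depend on the section and target
   configuration):

     .section .text.unlikely,"ax",@progbits
     .section .rodata.str1.1,"aMS",@progbits,1
     .section .text._Z3foov,"axG",@progbits,_Z3foov,comdat

   i.e. a flag string drawn from 'a', 'w', 'x', 's', 'M', 'S', the TLS
   flag and 'G', then the progbits/nobits type, the entity size for
   mergeable sections, and the COMDAT group name where applicable.  */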
10594 /* Select a format to encode pointers in exception handling data. */
10595 int
10596 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
10598 int type;
10599 switch (aarch64_cmodel)
10601 case AARCH64_CMODEL_TINY:
10602 case AARCH64_CMODEL_TINY_PIC:
10603 case AARCH64_CMODEL_SMALL:
10604 case AARCH64_CMODEL_SMALL_PIC:
10605 case AARCH64_CMODEL_SMALL_SPIC:
10606 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
10607 for everything. */
10608 type = DW_EH_PE_sdata4;
10609 break;
10610 default:
10611 /* No assumptions here. 8-byte relocs required. */
10612 type = DW_EH_PE_sdata8;
10613 break;
10615 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
10618 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
10619 by the function fndecl. */
10621 void
10622 aarch64_declare_function_name (FILE *stream, const char* name,
10623 tree fndecl)
10625 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10627 struct cl_target_option *targ_options;
10628 if (target_parts)
10629 targ_options = TREE_TARGET_OPTION (target_parts);
10630 else
10631 targ_options = TREE_TARGET_OPTION (target_option_current_node);
10632 gcc_assert (targ_options);
10634 const struct processor *this_arch
10635 = aarch64_get_arch (targ_options->x_explicit_arch);
10637 asm_fprintf (asm_out_file, "\t.arch %s", this_arch->name);
10638 aarch64_print_extension (asm_out_file, targ_options->x_aarch64_isa_flags);
10640 /* Print the cpu name we're tuning for in the comments; this might be
10641 useful to readers of the generated asm. */
10643 const struct processor *this_tune
10644 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
10646 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
10647 this_tune->name);
10649 /* Don't forget the type directive for ELF. */
10650 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
10651 ASM_OUTPUT_LABEL (stream, name);
10654 /* Emit load exclusive. */
10656 static void
10657 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
10658 rtx mem, rtx model_rtx)
10660 rtx (*gen) (rtx, rtx, rtx);
10662 switch (mode)
10664 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
10665 case HImode: gen = gen_aarch64_load_exclusivehi; break;
10666 case SImode: gen = gen_aarch64_load_exclusivesi; break;
10667 case DImode: gen = gen_aarch64_load_exclusivedi; break;
10668 default:
10669 gcc_unreachable ();
10672 emit_insn (gen (rval, mem, model_rtx));
10675 /* Emit store exclusive. */
10677 static void
10678 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
10679 rtx rval, rtx mem, rtx model_rtx)
10681 rtx (*gen) (rtx, rtx, rtx, rtx);
10683 switch (mode)
10685 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
10686 case HImode: gen = gen_aarch64_store_exclusivehi; break;
10687 case SImode: gen = gen_aarch64_store_exclusivesi; break;
10688 case DImode: gen = gen_aarch64_store_exclusivedi; break;
10689 default:
10690 gcc_unreachable ();
10693 emit_insn (gen (bval, rval, mem, model_rtx));
10696 /* Mark the previous jump instruction as unlikely. */
10698 static void
10699 aarch64_emit_unlikely_jump (rtx insn)
10701 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
10703 insn = emit_jump_insn (insn);
10704 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
10707 /* Expand a compare and swap pattern. */
10709 void
10710 aarch64_expand_compare_and_swap (rtx operands[])
10712 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
10713 machine_mode mode, cmp_mode;
10714 rtx (*gen) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
10716 bval = operands[0];
10717 rval = operands[1];
10718 mem = operands[2];
10719 oldval = operands[3];
10720 newval = operands[4];
10721 is_weak = operands[5];
10722 mod_s = operands[6];
10723 mod_f = operands[7];
10724 mode = GET_MODE (mem);
10725 cmp_mode = mode;
10727 /* Normally the succ memory model must be stronger than fail, but in the
10728 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
10729 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
10731 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
10732 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
10733 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
10735 switch (mode)
10737 case QImode:
10738 case HImode:
10739 /* For short modes, we're going to perform the comparison in SImode,
10740 so do the zero-extension now. */
10741 cmp_mode = SImode;
10742 rval = gen_reg_rtx (SImode);
10743 oldval = convert_modes (SImode, mode, oldval, true);
10744 /* Fall through. */
10746 case SImode:
10747 case DImode:
10748 /* Force the value into a register if needed. */
10749 if (!aarch64_plus_operand (oldval, mode))
10750 oldval = force_reg (cmp_mode, oldval);
10751 break;
10753 default:
10754 gcc_unreachable ();
10757 switch (mode)
10759 case QImode: gen = gen_atomic_compare_and_swapqi_1; break;
10760 case HImode: gen = gen_atomic_compare_and_swaphi_1; break;
10761 case SImode: gen = gen_atomic_compare_and_swapsi_1; break;
10762 case DImode: gen = gen_atomic_compare_and_swapdi_1; break;
10763 default:
10764 gcc_unreachable ();
10767 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
10769 if (mode == QImode || mode == HImode)
10770 emit_move_insn (operands[1], gen_lowpart (mode, rval));
10772 x = gen_rtx_REG (CCmode, CC_REGNUM);
10773 x = gen_rtx_EQ (SImode, x, const0_rtx);
10774 emit_insn (gen_rtx_SET (bval, x));
10777 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
10778 sequence implementing an atomic operation. */
10780 static void
10781 aarch64_emit_post_barrier (enum memmodel model)
10783 const enum memmodel base_model = memmodel_base (model);
10785 if (is_mm_sync (model)
10786 && (base_model == MEMMODEL_ACQUIRE
10787 || base_model == MEMMODEL_ACQ_REL
10788 || base_model == MEMMODEL_SEQ_CST))
10790 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
10794 /* Split a compare and swap pattern. */
10796 void
10797 aarch64_split_compare_and_swap (rtx operands[])
10799 rtx rval, mem, oldval, newval, scratch;
10800 machine_mode mode;
10801 bool is_weak;
10802 rtx_code_label *label1, *label2;
10803 rtx x, cond;
10804 enum memmodel model;
10805 rtx model_rtx;
10807 rval = operands[0];
10808 mem = operands[1];
10809 oldval = operands[2];
10810 newval = operands[3];
10811 is_weak = (operands[4] != const0_rtx);
10812 model_rtx = operands[5];
10813 scratch = operands[7];
10814 mode = GET_MODE (mem);
10815 model = memmodel_from_int (INTVAL (model_rtx));
10817 label1 = NULL;
10818 if (!is_weak)
10820 label1 = gen_label_rtx ();
10821 emit_label (label1);
10823 label2 = gen_label_rtx ();
10825 /* The initial load can be relaxed for a __sync operation since a final
10826 barrier will be emitted to stop code hoisting. */
10827 if (is_mm_sync (model))
10828 aarch64_emit_load_exclusive (mode, rval, mem,
10829 GEN_INT (MEMMODEL_RELAXED));
10830 else
10831 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
10833 cond = aarch64_gen_compare_reg (NE, rval, oldval);
10834 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
10835 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
10836 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
10837 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
10839 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
10841 if (!is_weak)
10843 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
10844 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
10845 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
10846 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
10848 else
10850 cond = gen_rtx_REG (CCmode, CC_REGNUM);
10851 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
10852 emit_insn (gen_rtx_SET (cond, x));
10855 emit_label (label2);
10857 /* Emit any final barrier needed for a __sync operation. */
10858 if (is_mm_sync (model))
10859 aarch64_emit_post_barrier (model);
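/* For reference, the strong (non-weak) split above produces a retry loop
   of roughly this shape; the exact load/store variants (LDXR/LDAXR,
   STXR/STLXR) follow the requested memory model and the registers here
   are only illustrative:

     .Lretry:
         ldaxr   w0, [x1]        // load-exclusive the current value
         cmp     w0, w2          // compare against the expected value
         b.ne    .Ldone          // mismatch: fail without storing
         stlxr   w3, w4, [x1]    // try to store the new value
         cbnz    w3, .Lretry     // lost the exclusive monitor: retry
     .Ldone:

   The weak form drops the backward branch and instead reports whether
   the single store-exclusive attempt succeeded.  */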
10862 /* Split an atomic operation. */
10864 void
10865 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
10866 rtx value, rtx model_rtx, rtx cond)
10868 machine_mode mode = GET_MODE (mem);
10869 machine_mode wmode = (mode == DImode ? DImode : SImode);
10870 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
10871 const bool is_sync = is_mm_sync (model);
10872 rtx_code_label *label;
10873 rtx x;
10875 label = gen_label_rtx ();
10876 emit_label (label);
10878 if (new_out)
10879 new_out = gen_lowpart (wmode, new_out);
10880 if (old_out)
10881 old_out = gen_lowpart (wmode, old_out);
10882 else
10883 old_out = new_out;
10884 value = simplify_gen_subreg (wmode, value, mode, 0);
10886 /* The initial load can be relaxed for a __sync operation since a final
10887 barrier will be emitted to stop code hoisting. */
10888 if (is_sync)
10889 aarch64_emit_load_exclusive (mode, old_out, mem,
10890 GEN_INT (MEMMODEL_RELAXED));
10891 else
10892 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
10894 switch (code)
10896 case SET:
10897 new_out = value;
10898 break;
10900 case NOT:
10901 x = gen_rtx_AND (wmode, old_out, value);
10902 emit_insn (gen_rtx_SET (new_out, x));
10903 x = gen_rtx_NOT (wmode, new_out);
10904 emit_insn (gen_rtx_SET (new_out, x));
10905 break;
10907 case MINUS:
10908 if (CONST_INT_P (value))
10910 value = GEN_INT (-INTVAL (value));
10911 code = PLUS;
10913 /* Fall through. */
10915 default:
10916 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
10917 emit_insn (gen_rtx_SET (new_out, x));
10918 break;
10921 aarch64_emit_store_exclusive (mode, cond, mem,
10922 gen_lowpart (mode, new_out), model_rtx);
10924 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
10925 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
10926 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
10927 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
10929 /* Emit any final barrier needed for a __sync operation. */
10930 if (is_sync)
10931 aarch64_emit_post_barrier (model);
10934 static void
10935 aarch64_init_libfuncs (void)
10937 /* Half-precision float operations. The compiler handles all operations
10938 with NULL libfuncs by converting to SFmode. */
10940 /* Conversions. */
10941 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
10942 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
10944 /* Arithmetic. */
10945 set_optab_libfunc (add_optab, HFmode, NULL);
10946 set_optab_libfunc (sdiv_optab, HFmode, NULL);
10947 set_optab_libfunc (smul_optab, HFmode, NULL);
10948 set_optab_libfunc (neg_optab, HFmode, NULL);
10949 set_optab_libfunc (sub_optab, HFmode, NULL);
10951 /* Comparisons. */
10952 set_optab_libfunc (eq_optab, HFmode, NULL);
10953 set_optab_libfunc (ne_optab, HFmode, NULL);
10954 set_optab_libfunc (lt_optab, HFmode, NULL);
10955 set_optab_libfunc (le_optab, HFmode, NULL);
10956 set_optab_libfunc (ge_optab, HFmode, NULL);
10957 set_optab_libfunc (gt_optab, HFmode, NULL);
10958 set_optab_libfunc (unord_optab, HFmode, NULL);
10961 /* Target hook for c_mode_for_suffix. */
10962 static machine_mode
10963 aarch64_c_mode_for_suffix (char suffix)
10965 if (suffix == 'q')
10966 return TFmode;
10968 return VOIDmode;
10971 /* We can only represent floating point constants which will fit in
10972 "quarter-precision" values. These values are characterised by
10973 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
10976 (-1)^s * (n/16) * 2^r
10978 Where:
10979 's' is the sign bit.
10980 'n' is an integer in the range 16 <= n <= 31.
10981 'r' is an integer in the range -3 <= r <= 4. */
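/* A few worked examples of that encoding, for illustration:

     1.0   = (16/16) * 2^0        n = 16, r = 0
     2.5   = (20/16) * 2^1        n = 20, r = 1
     0.125 = (16/16) * 2^-3       the smallest positive magnitude
     31.0  = (31/16) * 2^4        the largest magnitude

   so the representable magnitudes run from 0.125 up to 31.0, matching
   the immediate range of FMOV (immediate).  */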
10983 /* Return true iff X can be represented by a quarter-precision
10984 floating point immediate operand. Note, we cannot represent 0.0. */
10985 bool
10986 aarch64_float_const_representable_p (rtx x)
10988 /* This represents our current view of how many bits
10989 make up the mantissa. */
10990 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
10991 int exponent;
10992 unsigned HOST_WIDE_INT mantissa, mask;
10993 REAL_VALUE_TYPE r, m;
10994 bool fail;
10996 if (!CONST_DOUBLE_P (x))
10997 return false;
10999 /* We don't support HFmode constants yet. */
11000 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11001 return false;
11003 REAL_VALUE_FROM_CONST_DOUBLE (r, x);
11005 /* We cannot represent infinities, NaNs or +/-zero. We won't
11006 know if we have +zero until we analyse the mantissa, but we
11007 can reject the other invalid values. */
11008 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11009 || REAL_VALUE_MINUS_ZERO (r))
11010 return false;
11012 /* Extract exponent. */
11013 r = real_value_abs (&r);
11014 exponent = REAL_EXP (&r);
11016 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11017 highest (sign) bit, with a fixed binary point at bit point_pos.
11018 m1 holds the low part of the mantissa, m2 the high part.
11019 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11020 bits for the mantissa, this can fail (low bits will be lost). */
11021 real_ldexp (&m, &r, point_pos - exponent);
11022 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11024 /* If the low part of the mantissa has bits set we cannot represent
11025 the value. */
11026 if (w.elt (0) != 0)
11027 return false;
11028 /* We have rejected the lower HOST_WIDE_INT, so update our
11029 understanding of how many bits lie in the mantissa and
11030 look only at the high HOST_WIDE_INT. */
11031 mantissa = w.elt (1);
11032 point_pos -= HOST_BITS_PER_WIDE_INT;
11034 /* We can only represent values with a mantissa of the form 1.xxxx. */
11035 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11036 if ((mantissa & mask) != 0)
11037 return false;
11039 /* Having filtered unrepresentable values, we may now remove all
11040 but the highest 5 bits. */
11041 mantissa >>= point_pos - 5;
11043 /* We cannot represent the value 0.0, so reject it. This is handled
11044 elsewhere. */
11045 if (mantissa == 0)
11046 return false;
11048 /* Then, as bit 4 is always set, we can mask it off, leaving
11049 the mantissa in the range [0, 15]. */
11050 mantissa &= ~(1 << 4);
11051 gcc_assert (mantissa <= 15);
11053 /* GCC internally does not use IEEE754-like encoding (where normalized
11054 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
11055 Our mantissa values are shifted 4 places to the left relative to
11056 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
11057 by 5 places to correct for GCC's representation. */
11058 exponent = 5 - exponent;
11060 return (exponent >= 0 && exponent <= 7);
11063 char*
11064 aarch64_output_simd_mov_immediate (rtx const_vector,
11065 machine_mode mode,
11066 unsigned width)
11068 bool is_valid;
11069 static char templ[40];
11070 const char *mnemonic;
11071 const char *shift_op;
11072 unsigned int lane_count = 0;
11073 char element_char;
11075 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
11077 /* This will return true to show const_vector is legal for use as the
11078 immediate of either an AdvSIMD MOVI instruction or, implicitly, MVNI. It will
11079 also update INFO to show how the immediate should be generated. */
11080 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
11081 gcc_assert (is_valid);
11083 element_char = sizetochar (info.element_width);
11084 lane_count = width / info.element_width;
11086 mode = GET_MODE_INNER (mode);
11087 if (mode == SFmode || mode == DFmode)
11089 gcc_assert (info.shift == 0 && ! info.mvn);
11090 if (aarch64_float_const_zero_rtx_p (info.value))
11091 info.value = GEN_INT (0);
11092 else
11094 #define buf_size 20
11095 REAL_VALUE_TYPE r;
11096 REAL_VALUE_FROM_CONST_DOUBLE (r, info.value);
11097 char float_buf[buf_size] = {'\0'};
11098 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size, 1, mode);
11099 #undef buf_size
11101 if (lane_count == 1)
11102 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
11103 else
11104 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
11105 lane_count, element_char, float_buf);
11106 return templ;
11110 mnemonic = info.mvn ? "mvni" : "movi";
11111 shift_op = info.msl ? "msl" : "lsl";
11113 if (lane_count == 1)
11114 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
11115 mnemonic, UINTVAL (info.value));
11116 else if (info.shift)
11117 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
11118 ", %s %d", mnemonic, lane_count, element_char,
11119 UINTVAL (info.value), shift_op, info.shift);
11120 else
11121 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
11122 mnemonic, lane_count, element_char, UINTVAL (info.value));
11123 return templ;
11126 char*
11127 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
11128 machine_mode mode)
11130 machine_mode vmode;
11132 gcc_assert (!VECTOR_MODE_P (mode));
11133 vmode = aarch64_simd_container_mode (mode, 64);
11134 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
11135 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
11138 /* Split operands into moves from op[1] + op[2] into op[0]. */
11140 void
11141 aarch64_split_combinev16qi (rtx operands[3])
11143 unsigned int dest = REGNO (operands[0]);
11144 unsigned int src1 = REGNO (operands[1]);
11145 unsigned int src2 = REGNO (operands[2]);
11146 machine_mode halfmode = GET_MODE (operands[1]);
11147 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
11148 rtx destlo, desthi;
11150 gcc_assert (halfmode == V16QImode);
11152 if (src1 == dest && src2 == dest + halfregs)
11154 /* No-op move. Can't split to nothing; emit something. */
11155 emit_note (NOTE_INSN_DELETED);
11156 return;
11159 /* Preserve register attributes for variable tracking. */
11160 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
11161 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
11162 GET_MODE_SIZE (halfmode));
11164 /* Special case of reversed high/low parts. */
11165 if (reg_overlap_mentioned_p (operands[2], destlo)
11166 && reg_overlap_mentioned_p (operands[1], desthi))
11168 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
11169 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
11170 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
11172 else if (!reg_overlap_mentioned_p (operands[2], destlo))
11174 /* Try to avoid unnecessary moves if part of the result
11175 is in the right place already. */
11176 if (src1 != dest)
11177 emit_move_insn (destlo, operands[1]);
11178 if (src2 != dest + halfregs)
11179 emit_move_insn (desthi, operands[2]);
11181 else
11183 if (src2 != dest + halfregs)
11184 emit_move_insn (desthi, operands[2]);
11185 if (src1 != dest)
11186 emit_move_insn (destlo, operands[1]);
11190 /* vec_perm support. */
11192 #define MAX_VECT_LEN 16
11194 struct expand_vec_perm_d
11196 rtx target, op0, op1;
11197 unsigned char perm[MAX_VECT_LEN];
11198 machine_mode vmode;
11199 unsigned char nelt;
11200 bool one_vector_p;
11201 bool testing_p;
11204 /* Generate a variable permutation. */
11206 static void
11207 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
11209 machine_mode vmode = GET_MODE (target);
11210 bool one_vector_p = rtx_equal_p (op0, op1);
11212 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
11213 gcc_checking_assert (GET_MODE (op0) == vmode);
11214 gcc_checking_assert (GET_MODE (op1) == vmode);
11215 gcc_checking_assert (GET_MODE (sel) == vmode);
11216 gcc_checking_assert (TARGET_SIMD);
11218 if (one_vector_p)
11220 if (vmode == V8QImode)
11222 /* Expand the argument to a V16QI mode by duplicating it. */
11223 rtx pair = gen_reg_rtx (V16QImode);
11224 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
11225 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
11227 else
11229 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
11232 else
11234 rtx pair;
11236 if (vmode == V8QImode)
11238 pair = gen_reg_rtx (V16QImode);
11239 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
11240 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
11242 else
11244 pair = gen_reg_rtx (OImode);
11245 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
11246 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
11251 void
11252 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
11254 machine_mode vmode = GET_MODE (target);
11255 unsigned int nelt = GET_MODE_NUNITS (vmode);
11256 bool one_vector_p = rtx_equal_p (op0, op1);
11257 rtx mask;
11259 /* The TBL instruction does not use a modulo index, so we must take care
11260 of that ourselves. */
11261 mask = aarch64_simd_gen_const_vector_dup (vmode,
11262 one_vector_p ? nelt - 1 : 2 * nelt - 1);
11263 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
11265 /* For big-endian, we also need to reverse the index within the vector
11266 (but not which vector). */
11267 if (BYTES_BIG_ENDIAN)
11269 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
11270 if (!one_vector_p)
11271 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
11272 sel = expand_simple_binop (vmode, XOR, sel, mask,
11273 NULL, 0, OPTAB_LIB_WIDEN);
11275 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
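/* Worked example of the masking above: for a two-vector V16QI permute,
   nelt is 16, so every selector byte is ANDed with 31 and an
   out-of-range index such as 35 selects element 35 & 31 = 3, giving the
   modulo behaviour that TBL itself lacks.  On big-endian the extra XOR
   with nelt - 1 reverses the index within each input vector without
   changing which vector is chosen.  */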
11278 /* Recognize patterns suitable for the TRN instructions. */
11279 static bool
11280 aarch64_evpc_trn (struct expand_vec_perm_d *d)
11282 unsigned int i, odd, mask, nelt = d->nelt;
11283 rtx out, in0, in1, x;
11284 rtx (*gen) (rtx, rtx, rtx);
11285 machine_mode vmode = d->vmode;
11287 if (GET_MODE_UNIT_SIZE (vmode) > 8)
11288 return false;
11290 /* Note that these are little-endian tests.
11291 We correct for big-endian later. */
11292 if (d->perm[0] == 0)
11293 odd = 0;
11294 else if (d->perm[0] == 1)
11295 odd = 1;
11296 else
11297 return false;
11298 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
11300 for (i = 0; i < nelt; i += 2)
11302 if (d->perm[i] != i + odd)
11303 return false;
11304 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
11305 return false;
11308 /* Success! */
11309 if (d->testing_p)
11310 return true;
11312 in0 = d->op0;
11313 in1 = d->op1;
11314 if (BYTES_BIG_ENDIAN)
11316 x = in0, in0 = in1, in1 = x;
11317 odd = !odd;
11319 out = d->target;
11321 if (odd)
11323 switch (vmode)
11325 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
11326 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
11327 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
11328 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
11329 case V4SImode: gen = gen_aarch64_trn2v4si; break;
11330 case V2SImode: gen = gen_aarch64_trn2v2si; break;
11331 case V2DImode: gen = gen_aarch64_trn2v2di; break;
11332 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
11333 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
11334 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
11335 default:
11336 return false;
11339 else
11341 switch (vmode)
11343 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
11344 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
11345 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
11346 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
11347 case V4SImode: gen = gen_aarch64_trn1v4si; break;
11348 case V2SImode: gen = gen_aarch64_trn1v2si; break;
11349 case V2DImode: gen = gen_aarch64_trn1v2di; break;
11350 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
11351 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
11352 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
11353 default:
11354 return false;
11358 emit_insn (gen (out, in0, in1));
11359 return true;
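/* Example: on little-endian V4SI, the selector { 0, 4, 2, 6 } matches
   the TRN1 pattern above and { 1, 5, 3, 7 } matches TRN2.  */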
11362 /* Recognize patterns suitable for the UZP instructions. */
11363 static bool
11364 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
11366 unsigned int i, odd, mask, nelt = d->nelt;
11367 rtx out, in0, in1, x;
11368 rtx (*gen) (rtx, rtx, rtx);
11369 machine_mode vmode = d->vmode;
11371 if (GET_MODE_UNIT_SIZE (vmode) > 8)
11372 return false;
11374 /* Note that these are little-endian tests.
11375 We correct for big-endian later. */
11376 if (d->perm[0] == 0)
11377 odd = 0;
11378 else if (d->perm[0] == 1)
11379 odd = 1;
11380 else
11381 return false;
11382 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
11384 for (i = 0; i < nelt; i++)
11386 unsigned elt = (i * 2 + odd) & mask;
11387 if (d->perm[i] != elt)
11388 return false;
11391 /* Success! */
11392 if (d->testing_p)
11393 return true;
11395 in0 = d->op0;
11396 in1 = d->op1;
11397 if (BYTES_BIG_ENDIAN)
11399 x = in0, in0 = in1, in1 = x;
11400 odd = !odd;
11402 out = d->target;
11404 if (odd)
11406 switch (vmode)
11408 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
11409 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
11410 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
11411 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
11412 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
11413 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
11414 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
11415 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
11416 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
11417 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
11418 default:
11419 return false;
11422 else
11424 switch (vmode)
11426 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
11427 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
11428 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
11429 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
11430 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
11431 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
11432 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
11433 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
11434 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
11435 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
11436 default:
11437 return false;
11441 emit_insn (gen (out, in0, in1));
11442 return true;
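/* Example: on little-endian V4SI, the selector { 0, 2, 4, 6 } matches
   the UZP1 pattern above and { 1, 3, 5, 7 } matches UZP2.  */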
11445 /* Recognize patterns suitable for the ZIP instructions. */
11446 static bool
11447 aarch64_evpc_zip (struct expand_vec_perm_d *d)
11449 unsigned int i, high, mask, nelt = d->nelt;
11450 rtx out, in0, in1, x;
11451 rtx (*gen) (rtx, rtx, rtx);
11452 machine_mode vmode = d->vmode;
11454 if (GET_MODE_UNIT_SIZE (vmode) > 8)
11455 return false;
11457 /* Note that these are little-endian tests.
11458 We correct for big-endian later. */
11459 high = nelt / 2;
11460 if (d->perm[0] == high)
11461 /* Do Nothing. */
11463 else if (d->perm[0] == 0)
11464 high = 0;
11465 else
11466 return false;
11467 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
11469 for (i = 0; i < nelt / 2; i++)
11471 unsigned elt = (i + high) & mask;
11472 if (d->perm[i * 2] != elt)
11473 return false;
11474 elt = (elt + nelt) & mask;
11475 if (d->perm[i * 2 + 1] != elt)
11476 return false;
11479 /* Success! */
11480 if (d->testing_p)
11481 return true;
11483 in0 = d->op0;
11484 in1 = d->op1;
11485 if (BYTES_BIG_ENDIAN)
11487 x = in0, in0 = in1, in1 = x;
11488 high = !high;
11490 out = d->target;
11492 if (high)
11494 switch (vmode)
11496 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
11497 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
11498 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
11499 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
11500 case V4SImode: gen = gen_aarch64_zip2v4si; break;
11501 case V2SImode: gen = gen_aarch64_zip2v2si; break;
11502 case V2DImode: gen = gen_aarch64_zip2v2di; break;
11503 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
11504 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
11505 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
11506 default:
11507 return false;
11510 else
11512 switch (vmode)
11514 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
11515 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
11516 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
11517 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
11518 case V4SImode: gen = gen_aarch64_zip1v4si; break;
11519 case V2SImode: gen = gen_aarch64_zip1v2si; break;
11520 case V2DImode: gen = gen_aarch64_zip1v2di; break;
11521 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
11522 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
11523 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
11524 default:
11525 return false;
11529 emit_insn (gen (out, in0, in1));
11530 return true;
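/* Example: on little-endian V4SI, the selector { 0, 4, 1, 5 } matches
   the ZIP1 pattern above and { 2, 6, 3, 7 } matches ZIP2.  */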
11533 /* Recognize patterns for the EXT insn. */
11535 static bool
11536 aarch64_evpc_ext (struct expand_vec_perm_d *d)
11538 unsigned int i, nelt = d->nelt;
11539 rtx (*gen) (rtx, rtx, rtx, rtx);
11540 rtx offset;
11542 unsigned int location = d->perm[0]; /* Always < nelt. */
11544 /* Check if the extracted indices are increasing by one. */
11545 for (i = 1; i < nelt; i++)
11547 unsigned int required = location + i;
11548 if (d->one_vector_p)
11550 /* We'll pass the same vector in twice, so allow indices to wrap. */
11551 required &= (nelt - 1);
11553 if (d->perm[i] != required)
11554 return false;
11557 switch (d->vmode)
11559 case V16QImode: gen = gen_aarch64_extv16qi; break;
11560 case V8QImode: gen = gen_aarch64_extv8qi; break;
11561 case V4HImode: gen = gen_aarch64_extv4hi; break;
11562 case V8HImode: gen = gen_aarch64_extv8hi; break;
11563 case V2SImode: gen = gen_aarch64_extv2si; break;
11564 case V4SImode: gen = gen_aarch64_extv4si; break;
11565 case V2SFmode: gen = gen_aarch64_extv2sf; break;
11566 case V4SFmode: gen = gen_aarch64_extv4sf; break;
11567 case V2DImode: gen = gen_aarch64_extv2di; break;
11568 case V2DFmode: gen = gen_aarch64_extv2df; break;
11569 default:
11570 return false;
11573 /* Success! */
11574 if (d->testing_p)
11575 return true;
11577 /* The case where (location == 0) is a no-op for both big- and little-endian,
11578 and is removed by the mid-end at optimization levels -O1 and higher. */
11580 if (BYTES_BIG_ENDIAN && (location != 0))
11582 /* After setup, we want the high elements of the first vector (stored
11583 at the LSB end of the register), and the low elements of the second
11584 vector (stored at the MSB end of the register). So swap. */
11585 std::swap (d->op0, d->op1);
11586 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
11587 location = nelt - location;
11590 offset = GEN_INT (location);
11591 emit_insn (gen (d->target, d->op0, d->op1, offset));
11592 return true;
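/* Example: on little-endian, the V4SI selector { 1, 2, 3, 4 } (indices
   increasing by one from 1) is matched here and emitted as a single EXT
   with an offset of one element, joining the tail of the first vector to
   the head of the second.  */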
11595 /* Recognize patterns for the REV insns. */
11597 static bool
11598 aarch64_evpc_rev (struct expand_vec_perm_d *d)
11600 unsigned int i, j, diff, nelt = d->nelt;
11601 rtx (*gen) (rtx, rtx);
11603 if (!d->one_vector_p)
11604 return false;
11606 diff = d->perm[0];
11607 switch (diff)
11609 case 7:
11610 switch (d->vmode)
11612 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
11613 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
11614 default:
11615 return false;
11617 break;
11618 case 3:
11619 switch (d->vmode)
11621 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
11622 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
11623 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
11624 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
11625 default:
11626 return false;
11628 break;
11629 case 1:
11630 switch (d->vmode)
11632 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
11633 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
11634 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
11635 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
11636 case V4SImode: gen = gen_aarch64_rev64v4si; break;
11637 case V2SImode: gen = gen_aarch64_rev64v2si; break;
11638 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
11639 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
11640 default:
11641 return false;
11643 break;
11644 default:
11645 return false;
11648 for (i = 0; i < nelt ; i += diff + 1)
11649 for (j = 0; j <= diff; j += 1)
11651 /* This is guaranteed to be true as the value of diff
11652 is 7, 3 or 1 and we should have enough elements in the
11653 queue to generate this. Getting a vector mask with a
11654 value of diff other than these values implies that
11655 something is wrong by the time we get here. */
11656 gcc_assert (i + j < nelt);
11657 if (d->perm[i + j] != i + diff - j)
11658 return false;
11661 /* Success! */
11662 if (d->testing_p)
11663 return true;
11665 emit_insn (gen (d->target, d->op0));
11666 return true;
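/* Example: the V8QI selector { 3, 2, 1, 0, 7, 6, 5, 4 } has diff == 3
   and is matched as REV32 (reverse the bytes within each 32-bit word);
   the other diff values select the wider or narrower REV variants
   according to the element size.  */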
11669 static bool
11670 aarch64_evpc_dup (struct expand_vec_perm_d *d)
11672 rtx (*gen) (rtx, rtx, rtx);
11673 rtx out = d->target;
11674 rtx in0;
11675 machine_mode vmode = d->vmode;
11676 unsigned int i, elt, nelt = d->nelt;
11677 rtx lane;
11679 elt = d->perm[0];
11680 for (i = 1; i < nelt; i++)
11682 if (elt != d->perm[i])
11683 return false;
11686 /* The generic preparation in aarch64_expand_vec_perm_const_1
11687 swaps the operand order and the permute indices if it finds
11688 d->perm[0] to be in the second operand. Thus, we can always
11689 use d->op0 and need not do any extra arithmetic to get the
11690 correct lane number. */
11691 in0 = d->op0;
11692 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
11694 switch (vmode)
11696 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
11697 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
11698 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
11699 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
11700 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
11701 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
11702 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
11703 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
11704 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
11705 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
11706 default:
11707 return false;
11710 emit_insn (gen (out, in0, lane));
11711 return true;
11714 static bool
11715 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
11717 rtx rperm[MAX_VECT_LEN], sel;
11718 machine_mode vmode = d->vmode;
11719 unsigned int i, nelt = d->nelt;
11721 if (d->testing_p)
11722 return true;
11724 /* Generic code will try constant permutation twice: once with the
11725 original mode and again with the elements lowered to QImode.
11726 So wait and don't do the selector expansion ourselves. */
11727 if (vmode != V8QImode && vmode != V16QImode)
11728 return false;
11730 for (i = 0; i < nelt; ++i)
11732 int nunits = GET_MODE_NUNITS (vmode);
11734 /* If big-endian and two vectors we end up with a weird mixed-endian
11735 mode on NEON. Reverse the index within each word but not the word
11736 itself. */
11737 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
11738 : d->perm[i]);
11740 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
11741 sel = force_reg (vmode, sel);
11743 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
11744 return true;
11747 static bool
11748 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
11750 /* The pattern matching functions above are written to look for a small
11751 number to begin the sequence (0, 1, N/2). If we begin with an index
11752 from the second operand, we can swap the operands. */
11753 if (d->perm[0] >= d->nelt)
11755 unsigned i, nelt = d->nelt;
11757 gcc_assert (nelt == (nelt & -nelt));
11758 for (i = 0; i < nelt; ++i)
11759 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
11761 std::swap (d->op0, d->op1);
11764 if (TARGET_SIMD)
11766 if (aarch64_evpc_rev (d))
11767 return true;
11768 else if (aarch64_evpc_ext (d))
11769 return true;
11770 else if (aarch64_evpc_dup (d))
11771 return true;
11772 else if (aarch64_evpc_zip (d))
11773 return true;
11774 else if (aarch64_evpc_uzp (d))
11775 return true;
11776 else if (aarch64_evpc_trn (d))
11777 return true;
11778 return aarch64_evpc_tbl (d);
11780 return false;
11783 /* Expand a vec_perm_const pattern. */
11785 bool
11786 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
11788 struct expand_vec_perm_d d;
11789 int i, nelt, which;
11791 d.target = target;
11792 d.op0 = op0;
11793 d.op1 = op1;
11795 d.vmode = GET_MODE (target);
11796 gcc_assert (VECTOR_MODE_P (d.vmode));
11797 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
11798 d.testing_p = false;
11800 for (i = which = 0; i < nelt; ++i)
11802 rtx e = XVECEXP (sel, 0, i);
11803 int ei = INTVAL (e) & (2 * nelt - 1);
11804 which |= (ei < nelt ? 1 : 2);
11805 d.perm[i] = ei;
11808 switch (which)
11810 default:
11811 gcc_unreachable ();
11813 case 3:
11814 d.one_vector_p = false;
11815 if (!rtx_equal_p (op0, op1))
11816 break;
11818 /* The elements of PERM do not suggest that only the first operand
11819 is used, but both operands are identical. Allow easier matching
11820 of the permutation by folding the permutation into the single
11821 input vector. */
11822 /* Fall Through. */
11823 case 2:
11824 for (i = 0; i < nelt; ++i)
11825 d.perm[i] &= nelt - 1;
11826 d.op0 = op1;
11827 d.one_vector_p = true;
11828 break;
11830 case 1:
11831 d.op1 = op0;
11832 d.one_vector_p = true;
11833 break;
11836 return aarch64_expand_vec_perm_const_1 (&d);
11839 static bool
11840 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
11841 const unsigned char *sel)
11843 struct expand_vec_perm_d d;
11844 unsigned int i, nelt, which;
11845 bool ret;
11847 d.vmode = vmode;
11848 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
11849 d.testing_p = true;
11850 memcpy (d.perm, sel, nelt);
11852 /* Calculate whether all elements are in one vector. */
11853 for (i = which = 0; i < nelt; ++i)
11855 unsigned char e = d.perm[i];
11856 gcc_assert (e < 2 * nelt);
11857 which |= (e < nelt ? 1 : 2);
11860 /* If all elements are from the second vector, reindex as if from the
11861 first vector. */
11862 if (which == 2)
11863 for (i = 0; i < nelt; ++i)
11864 d.perm[i] -= nelt;
11866 /* Check whether the mask can be applied to a single vector. */
11867 d.one_vector_p = (which != 3);
11869 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
11870 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
11871 if (!d.one_vector_p)
11872 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
11874 start_sequence ();
11875 ret = aarch64_expand_vec_perm_const_1 (&d);
11876 end_sequence ();
11878 return ret;
11881 rtx
11882 aarch64_reverse_mask (enum machine_mode mode)
11884 /* We have to reverse each vector because we don't have
11885 a permuted load that can reverse-load according to ABI rules. */
11886 rtx mask;
11887 rtvec v = rtvec_alloc (16);
11888 int i, j;
11889 int nunits = GET_MODE_NUNITS (mode);
11890 int usize = GET_MODE_UNIT_SIZE (mode);
11892 gcc_assert (BYTES_BIG_ENDIAN);
11893 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
11895 for (i = 0; i < nunits; i++)
11896 for (j = 0; j < usize; j++)
11897 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
11898 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
11899 return force_reg (V16QImode, mask);
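/* Example: for big-endian V4SI the mask built above is the byte vector
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, i.e. each
   4-byte element is byte-reversed in place.  */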
11902 /* Implement MODES_TIEABLE_P. */
11904 bool
11905 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
11907 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
11908 return true;
11910 /* We specifically want to allow elements of "structure" modes to
11911 be tieable to the structure. This more general condition allows
11912 other rarer situations too. */
11913 if (TARGET_SIMD
11914 && aarch64_vector_mode_p (mode1)
11915 && aarch64_vector_mode_p (mode2))
11916 return true;
11918 return false;
11921 /* Return a new RTX holding the result of moving POINTER forward by
11922 AMOUNT bytes. */
11924 static rtx
11925 aarch64_move_pointer (rtx pointer, int amount)
11927 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
11929 return adjust_automodify_address (pointer, GET_MODE (pointer),
11930 next, amount);
11933 /* Return a new RTX holding the result of moving POINTER forward by the
11934 size of the mode it points to. */
11936 static rtx
11937 aarch64_progress_pointer (rtx pointer)
11939 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
11941 return aarch64_move_pointer (pointer, amount);
11944 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
11945 MODE bytes. */
11947 static void
11948 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
11949 machine_mode mode)
11951 rtx reg = gen_reg_rtx (mode);
11953 /* "Cast" the pointers to the correct mode. */
11954 *src = adjust_address (*src, mode, 0);
11955 *dst = adjust_address (*dst, mode, 0);
11956 /* Emit the memcpy. */
11957 emit_move_insn (reg, *src);
11958 emit_move_insn (*dst, reg);
11959 /* Move the pointers forward. */
11960 *src = aarch64_progress_pointer (*src);
11961 *dst = aarch64_progress_pointer (*dst);
11964 /* Expand movmem, as if from a __builtin_memcpy. Return true if
11965 we succeed, otherwise return false. */
11967 bool
11968 aarch64_expand_movmem (rtx *operands)
11970 unsigned int n;
11971 rtx dst = operands[0];
11972 rtx src = operands[1];
11973 rtx base;
11974 bool speed_p = !optimize_function_for_size_p (cfun);
11976 /* When optimizing for size, give a better estimate of the length of a
11977 memcpy call, but use the default otherwise. */
11978 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
11980 /* We can't do anything smart if the amount to copy is not constant. */
11981 if (!CONST_INT_P (operands[2]))
11982 return false;
11984 n = UINTVAL (operands[2]);
11986 /* Try to keep the number of instructions low. For cases below 16 bytes we
11987 need to make at most two moves. For cases above 16 bytes it will be one
11988 move for each 16 byte chunk, then at most two additional moves. */
11989 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
11990 return false;
11992 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
11993 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
11995 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
11996 src = adjust_automodify_address (src, VOIDmode, base, 0);
11998 /* Simple cases. Copy 0-3 bytes: first (if applicable) a 2-byte chunk,
11999 then a 1-byte chunk. */
12000 if (n < 4)
12002 if (n >= 2)
12004 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12005 n -= 2;
12008 if (n == 1)
12009 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12011 return true;
12014 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
12015 4-byte chunk, partially overlapping with the previously copied chunk. */
12016 if (n < 8)
12018 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12019 n -= 4;
12020 if (n > 0)
12022 int move = n - 4;
12024 src = aarch64_move_pointer (src, move);
12025 dst = aarch64_move_pointer (dst, move);
12026 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12028 return true;
12031 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
12032 them, then (if applicable) an 8-byte chunk. */
12033 while (n >= 8)
12035 if (n / 16)
12037 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
12038 n -= 16;
12040 else
12042 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
12043 n -= 8;
12047 /* Finish the final bytes of the copy. We can always do this in one
12048 instruction. We either copy the exact amount we need, or partially
12049 overlap with the previous chunk we copied and copy 8 bytes. */
12050 if (n == 0)
12051 return true;
12052 else if (n == 1)
12053 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12054 else if (n == 2)
12055 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12056 else if (n == 4)
12057 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12058 else
12060 if (n == 3)
12062 src = aarch64_move_pointer (src, -1);
12063 dst = aarch64_move_pointer (dst, -1);
12064 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12066 else
12068 int move = n - 8;
12070 src = aarch64_move_pointer (src, move);
12071 dst = aarch64_move_pointer (dst, move);
12072 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
12076 return true;
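/* Worked example of the overlapping tail above: for a 13-byte copy the
   loop emits one 8-byte (DImode) load/store pair for bytes 0-7, then the
   tail code moves both pointers back by 3 bytes and emits a second
   8-byte pair covering bytes 5-12, finishing the copy in two pairs with
   a harmless 3-byte overlap.  */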
12079 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
12081 static unsigned HOST_WIDE_INT
12082 aarch64_asan_shadow_offset (void)
12084 return (HOST_WIDE_INT_1 << 36);
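/* With the usual AddressSanitizer mapping shadow = (address >> 3) + offset,
   this puts the shadow region at 1 << 36, so an access to address A is
   checked via the shadow byte at (A >> 3) + 0x1000000000.  */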
12087 static bool
12088 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
12089 unsigned int align,
12090 enum by_pieces_operation op,
12091 bool speed_p)
12093 /* STORE_BY_PIECES can be used when copying a constant string, but
12094 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
12095 For now we always fail this and let the move_by_pieces code copy
12096 the string from read-only memory. */
12097 if (op == STORE_BY_PIECES)
12098 return false;
12100 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
12103 static enum machine_mode
12104 aarch64_code_to_ccmode (enum rtx_code code)
12106 switch (code)
12108 case NE:
12109 return CC_DNEmode;
12111 case EQ:
12112 return CC_DEQmode;
12114 case LE:
12115 return CC_DLEmode;
12117 case LT:
12118 return CC_DLTmode;
12120 case GE:
12121 return CC_DGEmode;
12123 case GT:
12124 return CC_DGTmode;
12126 case LEU:
12127 return CC_DLEUmode;
12129 case LTU:
12130 return CC_DLTUmode;
12132 case GEU:
12133 return CC_DGEUmode;
12135 case GTU:
12136 return CC_DGTUmode;
12138 default:
12139 return CCmode;
12143 static rtx
12144 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
12145 int code, tree treeop0, tree treeop1)
12147 enum machine_mode op_mode, cmp_mode, cc_mode;
12148 rtx op0, op1, cmp, target;
12149 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
12150 enum insn_code icode;
12151 struct expand_operand ops[4];
12153 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
12154 if (cc_mode == CCmode)
12155 return NULL_RTX;
12157 start_sequence ();
12158 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
12160 op_mode = GET_MODE (op0);
12161 if (op_mode == VOIDmode)
12162 op_mode = GET_MODE (op1);
12164 switch (op_mode)
12166 case QImode:
12167 case HImode:
12168 case SImode:
12169 cmp_mode = SImode;
12170 icode = CODE_FOR_cmpsi;
12171 break;
12173 case DImode:
12174 cmp_mode = DImode;
12175 icode = CODE_FOR_cmpdi;
12176 break;
12178 default:
12179 end_sequence ();
12180 return NULL_RTX;
12183 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
12184 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
12185 if (!op0 || !op1)
12187 end_sequence ();
12188 return NULL_RTX;
12190 *prep_seq = get_insns ();
12191 end_sequence ();
12193 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
12194 target = gen_rtx_REG (CCmode, CC_REGNUM);
12196 create_output_operand (&ops[0], target, CCmode);
12197 create_fixed_operand (&ops[1], cmp);
12198 create_fixed_operand (&ops[2], op0);
12199 create_fixed_operand (&ops[3], op1);
12201 start_sequence ();
12202 if (!maybe_expand_insn (icode, 4, ops))
12204 end_sequence ();
12205 return NULL_RTX;
12207 *gen_seq = get_insns ();
12208 end_sequence ();
12210 return gen_rtx_REG (cc_mode, CC_REGNUM);
12213 static rtx
12214 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
12215 tree treeop0, tree treeop1, int bit_code)
12217 rtx op0, op1, cmp0, cmp1, target;
12218 enum machine_mode op_mode, cmp_mode, cc_mode;
12219 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
12220 enum insn_code icode = CODE_FOR_ccmp_andsi;
12221 struct expand_operand ops[6];
12223 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
12224 if (cc_mode == CCmode)
12225 return NULL_RTX;
12227 push_to_sequence ((rtx_insn*) *prep_seq);
12228 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
12230 op_mode = GET_MODE (op0);
12231 if (op_mode == VOIDmode)
12232 op_mode = GET_MODE (op1);
12234 switch (op_mode)
12236 case QImode:
12237 case HImode:
12238 case SImode:
12239 cmp_mode = SImode;
12240 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
12241 : CODE_FOR_ccmp_iorsi;
12242 break;
12244 case DImode:
12245 cmp_mode = DImode;
12246 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
12247 : CODE_FOR_ccmp_iordi;
12248 break;
12250 default:
12251 end_sequence ();
12252 return NULL_RTX;
12255 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
12256 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
12257 if (!op0 || !op1)
12259 end_sequence ();
12260 return NULL_RTX;
12262 *prep_seq = get_insns ();
12263 end_sequence ();
12265 target = gen_rtx_REG (cc_mode, CC_REGNUM);
12266 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
12267 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
12269 create_fixed_operand (&ops[0], prev);
12270 create_fixed_operand (&ops[1], target);
12271 create_fixed_operand (&ops[2], op0);
12272 create_fixed_operand (&ops[3], op1);
12273 create_fixed_operand (&ops[4], cmp0);
12274 create_fixed_operand (&ops[5], cmp1);
12276 push_to_sequence ((rtx_insn*) *gen_seq);
12277 if (!maybe_expand_insn (icode, 6, ops))
12279 end_sequence ();
12280 return NULL_RTX;
12283 *gen_seq = get_insns ();
12284 end_sequence ();
12286 return target;
12289 #undef TARGET_GEN_CCMP_FIRST
12290 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
12292 #undef TARGET_GEN_CCMP_NEXT
12293 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
12295 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
12296 instruction fusion of some sort. */
12298 static bool
12299 aarch64_macro_fusion_p (void)
12301 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
12305 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
12306 should be kept together during scheduling. */
12308 static bool
12309 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
12311 rtx set_dest;
12312 rtx prev_set = single_set (prev);
12313 rtx curr_set = single_set (curr);
12314 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
12315 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
12317 if (!aarch64_macro_fusion_p ())
12318 return false;
12320 if (simple_sets_p
12321 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
12323 /* We are trying to match:
12324 prev (mov) == (set (reg r0) (const_int imm16))
12325 curr (movk) == (set (zero_extract (reg r0)
12326 (const_int 16)
12327 (const_int 16))
12328 (const_int imm16_1)) */
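/* In assembly terms this is the usual constant-building pair, e.g.
     mov  w0, #0x1234
     movk w0, #0x5678, lsl #16
   which fusion keeps adjacent for cores that can combine them.  */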
12330 set_dest = SET_DEST (curr_set);
12332 if (GET_CODE (set_dest) == ZERO_EXTRACT
12333 && CONST_INT_P (SET_SRC (curr_set))
12334 && CONST_INT_P (SET_SRC (prev_set))
12335 && CONST_INT_P (XEXP (set_dest, 2))
12336 && INTVAL (XEXP (set_dest, 2)) == 16
12337 && REG_P (XEXP (set_dest, 0))
12338 && REG_P (SET_DEST (prev_set))
12339 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
12341 return true;
12345 if (simple_sets_p
12346 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
12349 /* We're trying to match:
12350 prev (adrp) == (set (reg r1)
12351 (high (symbol_ref ("SYM"))))
12352 curr (add) == (set (reg r0)
12353 (lo_sum (reg r1)
12354 (symbol_ref ("SYM"))))
12355 Note that r0 need not be the same as r1, especially
12356 during pre-regalloc scheduling. */
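/* I.e. the small-code-model address materialisation
     adrp x1, sym
     add  x0, x1, :lo12:sym
   kept adjacent so the pair can fuse.  */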
12358 if (satisfies_constraint_Ush (SET_SRC (prev_set))
12359 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
12361 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
12362 && REG_P (XEXP (SET_SRC (curr_set), 0))
12363 && REGNO (XEXP (SET_SRC (curr_set), 0))
12364 == REGNO (SET_DEST (prev_set))
12365 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
12366 XEXP (SET_SRC (curr_set), 1)))
12367 return true;
12371 if (simple_sets_p
12372 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
12375 /* We're trying to match:
12376 prev (movk) == (set (zero_extract (reg r0)
12377 (const_int 16)
12378 (const_int 32))
12379 (const_int imm16_1))
12380 curr (movk) == (set (zero_extract (reg r0)
12381 (const_int 16)
12382 (const_int 48))
12383 (const_int imm16_2)) */
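/* E.g. the upper half of a 64-bit constant:
     movk x0, #0x1234, lsl #32
     movk x0, #0x5678, lsl #48  */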
12385 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
12386 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
12387 && REG_P (XEXP (SET_DEST (prev_set), 0))
12388 && REG_P (XEXP (SET_DEST (curr_set), 0))
12389 && REGNO (XEXP (SET_DEST (prev_set), 0))
12390 == REGNO (XEXP (SET_DEST (curr_set), 0))
12391 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
12392 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
12393 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
12394 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
12395 && CONST_INT_P (SET_SRC (prev_set))
12396 && CONST_INT_P (SET_SRC (curr_set)))
12397 return true;
12400 if (simple_sets_p
12401 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
12403 /* We're trying to match:
12404 prev (adrp) == (set (reg r0)
12405 (high (symbol_ref ("SYM"))))
12406 curr (ldr) == (set (reg r1)
12407 (mem (lo_sum (reg r0)
12408 (symbol_ref ("SYM")))))
12410 curr (ldr) == (set (reg r1)
12411 (zero_extend (mem
12412 (lo_sum (reg r0)
12413 (symbol_ref ("SYM")))))) */
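	 /* For illustration only, e.g.
	       adrp	x0, SYM
	       ldr	x1, [x0, :lo12:SYM]
	    or the zero-extending form that loads a narrower value.  */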
12414 if (satisfies_constraint_Ush (SET_SRC (prev_set))
12415 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
12417 rtx curr_src = SET_SRC (curr_set);
12419 if (GET_CODE (curr_src) == ZERO_EXTEND)
12420 curr_src = XEXP (curr_src, 0);
12422 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
12423 && REG_P (XEXP (XEXP (curr_src, 0), 0))
12424 && REGNO (XEXP (XEXP (curr_src, 0), 0))
12425 == REGNO (SET_DEST (prev_set))
12426 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
12427 XEXP (SET_SRC (prev_set), 0)))
12428 return true;
12432 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
12433 && any_condjump_p (curr))
12435 enum attr_type prev_type = get_attr_type (prev);
12437 /* FIXME: this misses some instructions which are considered simple
12438 arithmetic instructions for ThunderX. Simple shifts are missed here. */
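	 /* As a hand-written illustration, a typical pair fused here is
	       cmp	w0, w1
	       b.ne	.Llabel
	    where the compare is classified as one of the flag-setting
	    ALU/logic types checked below.  */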
12439 if (prev_type == TYPE_ALUS_SREG
12440 || prev_type == TYPE_ALUS_IMM
12441 || prev_type == TYPE_LOGICS_REG
12442 || prev_type == TYPE_LOGICS_IMM)
12443 return true;
12446 return false;
12449 /* If MEM is in the form of [base+offset], extract the two parts
12450 of the address into BASE and OFFSET; otherwise return false
12451 after clearing BASE and OFFSET. */
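/* For example, an address of the form (plus (reg x1) (const_int 16)) yields
   BASE == x1 and OFFSET == (const_int 16), while a plain (reg x1) yields
   OFFSET == (const_int 0); any other form clears both and returns false.  */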
12453 bool
12454 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
12456 rtx addr;
12458 gcc_assert (MEM_P (mem));
12460 addr = XEXP (mem, 0);
12462 if (REG_P (addr))
12464 *base = addr;
12465 *offset = const0_rtx;
12466 return true;
12469 if (GET_CODE (addr) == PLUS
12470 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
12472 *base = XEXP (addr, 0);
12473 *offset = XEXP (addr, 1);
12474 return true;
12477 *base = NULL_RTX;
12478 *offset = NULL_RTX;
12480 return false;
12483 /* Types for scheduling fusion. */
12484 enum sched_fusion_type
12486 SCHED_FUSION_NONE = 0,
12487 SCHED_FUSION_LD_SIGN_EXTEND,
12488 SCHED_FUSION_LD_ZERO_EXTEND,
12489 SCHED_FUSION_LD,
12490 SCHED_FUSION_ST,
12491 SCHED_FUSION_NUM
12494 /* If INSN is a load or store with an address in the form of [base+offset],
12495 extract the two parts into BASE and OFFSET. Return the scheduling
12496 fusion type of this INSN. */
12498 static enum sched_fusion_type
12499 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
12501 rtx x, dest, src;
12502 enum sched_fusion_type fusion = SCHED_FUSION_LD;
12504 gcc_assert (INSN_P (insn));
12505 x = PATTERN (insn);
12506 if (GET_CODE (x) != SET)
12507 return SCHED_FUSION_NONE;
12509 src = SET_SRC (x);
12510 dest = SET_DEST (x);
12512 if (GET_MODE (dest) != SImode && GET_MODE (dest) != DImode
12513 && GET_MODE (dest) != SFmode && GET_MODE (dest) != DFmode)
12514 return SCHED_FUSION_NONE;
12516 if (GET_CODE (src) == SIGN_EXTEND)
12518 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
12519 src = XEXP (src, 0);
12520 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
12521 return SCHED_FUSION_NONE;
12523 else if (GET_CODE (src) == ZERO_EXTEND)
12525 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
12526 src = XEXP (src, 0);
12527 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
12528 return SCHED_FUSION_NONE;
12531 if (GET_CODE (src) == MEM && REG_P (dest))
12532 extract_base_offset_in_addr (src, base, offset);
12533 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
12535 fusion = SCHED_FUSION_ST;
12536 extract_base_offset_in_addr (dest, base, offset);
12538 else
12539 return SCHED_FUSION_NONE;
12541 if (*base == NULL_RTX || *offset == NULL_RTX)
12542 fusion = SCHED_FUSION_NONE;
12544 return fusion;
12547 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
12549 Currently we only support fusing ldr or str instructions, so FUSION_PRI
12550 and PRI are only calculated for these instructions. For other instructions,
12551 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
12552 other instruction types can be added by returning different priorities.
12554 It's important that irrelevant instructions get the largest FUSION_PRI. */
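/* For illustration only: two SImode loads from [x1, 4] and [x1, 8] receive
   the same FUSION_PRI (same fusion type and base register) but different
   PRI, with the smaller offset getting the higher priority, so the scheduler
   tends to keep such a pair adjacent for the ldp/stp peepholes.  */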
12556 static void
12557 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
12558 int *fusion_pri, int *pri)
12560 int tmp, off_val;
12561 rtx base, offset;
12562 enum sched_fusion_type fusion;
12564 gcc_assert (INSN_P (insn));
12566 tmp = max_pri - 1;
12567 fusion = fusion_load_store (insn, &base, &offset);
12568 if (fusion == SCHED_FUSION_NONE)
12570 *pri = tmp;
12571 *fusion_pri = tmp;
12572 return;
12575 /* Set FUSION_PRI according to fusion type and base register. */
12576 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
12578 /* Calculate PRI. */
12579 tmp /= 2;
12581 /* INSN with smaller offset goes first. */
12582 off_val = (int)(INTVAL (offset));
12583 if (off_val >= 0)
12584 tmp -= (off_val & 0xfffff);
12585 else
12586 tmp += ((- off_val) & 0xfffff);
12588 *pri = tmp;
12589 return;
12592 /* Given OPERANDS of consecutive load/store, check if we can merge
12593 them into ldp/stp. LOAD is true if they are load instructions.
12594 MODE is the mode of memory operands. */
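/* For illustration only, a pair such as
       ldr	w0, [x2]
       ldr	w1, [x2, 4]
   passes these checks and can be rewritten as "ldp w0, w1, [x2]", whereas
   accesses with different bases or non-consecutive offsets cannot.  */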
12596 bool
12597 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
12598 enum machine_mode mode)
12600 HOST_WIDE_INT offval_1, offval_2, msize;
12601 enum reg_class rclass_1, rclass_2;
12602 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
12604 if (load)
12606 mem_1 = operands[1];
12607 mem_2 = operands[3];
12608 reg_1 = operands[0];
12609 reg_2 = operands[2];
12610 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
12611 if (REGNO (reg_1) == REGNO (reg_2))
12612 return false;
12614 else
12616 mem_1 = operands[0];
12617 mem_2 = operands[2];
12618 reg_1 = operands[1];
12619 reg_2 = operands[3];
12622 /* The mems cannot be volatile. */
12623 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
12624 return false;
12626 /* Check if the addresses are in the form of [base+offset]. */
12627 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
12628 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
12629 return false;
12630 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
12631 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
12632 return false;
12634 /* Check if the bases are the same. */
12635 if (!rtx_equal_p (base_1, base_2))
12636 return false;
12638 offval_1 = INTVAL (offset_1);
12639 offval_2 = INTVAL (offset_2);
12640 msize = GET_MODE_SIZE (mode);
12641 /* Check if the offsets are consecutive. */
12642 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
12643 return false;
12645 /* Check if the addresses are clobbered by load. */
12646 if (load)
12648 if (reg_mentioned_p (reg_1, mem_1))
12649 return false;
12651 /* In increasing order, the last load can clobber the address. */
12652 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
12653 return false;
12656 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
12657 rclass_1 = FP_REGS;
12658 else
12659 rclass_1 = GENERAL_REGS;
12661 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
12662 rclass_2 = FP_REGS;
12663 else
12664 rclass_2 = GENERAL_REGS;
12666 /* Check if the registers are of same class. */
12667 if (rclass_1 != rclass_2)
12668 return false;
12670 return true;
12673 /* Given OPERANDS of consecutive load/store, check if we can merge
12674 them into ldp/stp by adjusting the offset. LOAD is true if they
12675 are load instructions. MODE is the mode of memory operands.
12677 Given the consecutive stores below:
12679 str w1, [xb, 0x100]
12680 str w1, [xb, 0x104]
12681 str w1, [xb, 0x108]
12682 str w1, [xb, 0x10c]
12684 Though the offsets are out of the range supported by stp, we can
12685 still pair them after adjusting the offset, like:
12687 add scratch, xb, 0x100
12688 stp w1, w1, [scratch]
12689 stp w1, w1, [scratch, 0x8]
12691 The peephole patterns detecting this opportunity should guarantee
12692 the scratch register is available. */
12694 bool
12695 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
12696 enum machine_mode mode)
12698 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
12699 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
12700 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
12701 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
12703 if (load)
12705 reg_1 = operands[0];
12706 mem_1 = operands[1];
12707 reg_2 = operands[2];
12708 mem_2 = operands[3];
12709 reg_3 = operands[4];
12710 mem_3 = operands[5];
12711 reg_4 = operands[6];
12712 mem_4 = operands[7];
12713 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
12714 && REG_P (reg_3) && REG_P (reg_4));
12715 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
12716 return false;
12718 else
12720 mem_1 = operands[0];
12721 reg_1 = operands[1];
12722 mem_2 = operands[2];
12723 reg_2 = operands[3];
12724 mem_3 = operands[4];
12725 reg_3 = operands[5];
12726 mem_4 = operands[6];
12727 reg_4 = operands[7];
12729 /* Skip if the memory operand is by itself valid for ldp/stp. */
12730 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
12731 return false;
12733 /* The mems cannot be volatile. */
12734 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
12735 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
12736 return false;
12738 /* Check if the addresses are in the form of [base+offset]. */
12739 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
12740 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
12741 return false;
12742 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
12743 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
12744 return false;
12745 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
12746 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
12747 return false;
12748 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
12749 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
12750 return false;
12752 /* Check if the bases are the same. */
12753 if (!rtx_equal_p (base_1, base_2)
12754 || !rtx_equal_p (base_2, base_3)
12755 || !rtx_equal_p (base_3, base_4))
12756 return false;
12758 offval_1 = INTVAL (offset_1);
12759 offval_2 = INTVAL (offset_2);
12760 offval_3 = INTVAL (offset_3);
12761 offval_4 = INTVAL (offset_4);
12762 msize = GET_MODE_SIZE (mode);
12763 /* Check if the offsets are consecutive. */
12764 if ((offval_1 != (offval_2 + msize)
12765 || offval_1 != (offval_3 + msize * 2)
12766 || offval_1 != (offval_4 + msize * 3))
12767 && (offval_4 != (offval_3 + msize)
12768 || offval_4 != (offval_2 + msize * 2)
12769 || offval_4 != (offval_1 + msize * 3)))
12770 return false;
12772 /* Check if the addresses are clobbered by load. */
12773 if (load)
12775 if (reg_mentioned_p (reg_1, mem_1)
12776 || reg_mentioned_p (reg_2, mem_2)
12777 || reg_mentioned_p (reg_3, mem_3))
12778 return false;
12780 /* In increasing order, the last load can clobber the address. */
12781 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
12782 return false;
12785 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
12786 rclass_1 = FP_REGS;
12787 else
12788 rclass_1 = GENERAL_REGS;
12790 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
12791 rclass_2 = FP_REGS;
12792 else
12793 rclass_2 = GENERAL_REGS;
12795 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
12796 rclass_3 = FP_REGS;
12797 else
12798 rclass_3 = GENERAL_REGS;
12800 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
12801 rclass_4 = FP_REGS;
12802 else
12803 rclass_4 = GENERAL_REGS;
12805 /* Check if the registers are of same class. */
12806 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
12807 return false;
12809 return true;
12812 /* Given OPERANDS of consecutive load/store, this function pairs them
12813 into ldp/stp after adjusting the offset. It depends on the fact
12814 that addresses of load/store instructions are in increasing order.
12815 MODE is the mode of memory operands. CODE is the rtl operator
12816 which should be applied to all memory operands; it is SIGN_EXTEND,
12817 ZERO_EXTEND or UNKNOWN. */
12819 bool
12820 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
12821 enum machine_mode mode, RTX_CODE code)
12823 rtx base, offset, t1, t2;
12824 rtx mem_1, mem_2, mem_3, mem_4;
12825 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
12827 if (load)
12829 mem_1 = operands[1];
12830 mem_2 = operands[3];
12831 mem_3 = operands[5];
12832 mem_4 = operands[7];
12834 else
12836 mem_1 = operands[0];
12837 mem_2 = operands[2];
12838 mem_3 = operands[4];
12839 mem_4 = operands[6];
12840 gcc_assert (code == UNKNOWN);
12843 extract_base_offset_in_addr (mem_1, &base, &offset);
12844 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
12846 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
12847 msize = GET_MODE_SIZE (mode);
12848 stp_off_limit = msize * 0x40;
12849 off_val = INTVAL (offset);
12850 abs_off = (off_val < 0) ? -off_val : off_val;
12851 new_off = abs_off % stp_off_limit;
12852 adj_off = abs_off - new_off;
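  /* For illustration only: with SImode operands (msize == 4) stp_off_limit
     is 0x100, so an original offset of 0x100 splits into adj_off == 0x100
     and new_off == 0, matching the add + stp example in the comment before
     aarch64_operands_adjust_ok_for_ldpstp above.  */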
12854 /* Further adjust to make sure all offsets are OK. */
12855 if ((new_off + msize * 2) >= stp_off_limit)
12857 adj_off += stp_off_limit;
12858 new_off -= stp_off_limit;
12861 /* Make sure the adjustment can be done with ADD/SUB instructions. */
12862 if (adj_off >= 0x1000)
12863 return false;
12865 if (off_val < 0)
12867 adj_off = -adj_off;
12868 new_off = -new_off;
12871 /* Create new memory references. */
12872 mem_1 = change_address (mem_1, VOIDmode,
12873 plus_constant (DImode, operands[8], new_off));
12875 /* Check if the adjusted address is OK for ldp/stp. */
12876 if (!aarch64_mem_pair_operand (mem_1, mode))
12877 return false;
12879 msize = GET_MODE_SIZE (mode);
12880 mem_2 = change_address (mem_2, VOIDmode,
12881 plus_constant (DImode,
12882 operands[8],
12883 new_off + msize));
12884 mem_3 = change_address (mem_3, VOIDmode,
12885 plus_constant (DImode,
12886 operands[8],
12887 new_off + msize * 2));
12888 mem_4 = change_address (mem_4, VOIDmode,
12889 plus_constant (DImode,
12890 operands[8],
12891 new_off + msize * 3));
12893 if (code == ZERO_EXTEND)
12895 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
12896 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
12897 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
12898 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
12900 else if (code == SIGN_EXTEND)
12902 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
12903 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
12904 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
12905 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
12908 if (load)
12910 operands[1] = mem_1;
12911 operands[3] = mem_2;
12912 operands[5] = mem_3;
12913 operands[7] = mem_4;
12915 else
12917 operands[0] = mem_1;
12918 operands[2] = mem_2;
12919 operands[4] = mem_3;
12920 operands[6] = mem_4;
12923 /* Emit adjusting instruction. */
12924 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
12925 /* Emit ldp/stp instructions. */
12926 t1 = gen_rtx_SET (operands[0], operands[1]);
12927 t2 = gen_rtx_SET (operands[2], operands[3]);
12928 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
12929 t1 = gen_rtx_SET (operands[4], operands[5]);
12930 t2 = gen_rtx_SET (operands[6], operands[7]);
12931 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
12932 return true;
12935 /* Return true if a pseudo register should be created and used to hold
12936 the GOT address for PIC code. */
12938 bool
12939 aarch64_use_pseudo_pic_reg (void)
12941 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
12944 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
12946 static int
12947 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
12949 switch (XINT (x, 1))
12951 case UNSPEC_GOTSMALLPIC:
12952 case UNSPEC_GOTSMALLPIC28K:
12953 case UNSPEC_GOTTINYPIC:
12954 return 0;
12955 default:
12956 break;
12959 return default_unspec_may_trap_p (x, flags);
12962 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
12963 static tree
12964 aarch64_promoted_type (const_tree t)
12966 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
12967 return float_type_node;
12968 return NULL_TREE;
12970 #undef TARGET_ADDRESS_COST
12971 #define TARGET_ADDRESS_COST aarch64_address_cost
12973 /* This hook determines whether unnamed bitfields affect the alignment
12974 of the containing structure. The hook returns true if the structure
12975 should inherit the alignment requirements of an unnamed bitfield's
12976 type. */
12977 #undef TARGET_ALIGN_ANON_BITFIELD
12978 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
12980 #undef TARGET_ASM_ALIGNED_DI_OP
12981 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
12983 #undef TARGET_ASM_ALIGNED_HI_OP
12984 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
12986 #undef TARGET_ASM_ALIGNED_SI_OP
12987 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
12989 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
12990 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
12991 hook_bool_const_tree_hwi_hwi_const_tree_true
12993 #undef TARGET_ASM_OUTPUT_MI_THUNK
12994 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
12996 #undef TARGET_ASM_SELECT_RTX_SECTION
12997 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
12999 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
13000 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
13002 #undef TARGET_BUILD_BUILTIN_VA_LIST
13003 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
13005 #undef TARGET_CALLEE_COPIES
13006 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
13008 #undef TARGET_CAN_ELIMINATE
13009 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
13011 #undef TARGET_CAN_INLINE_P
13012 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
13014 #undef TARGET_CANNOT_FORCE_CONST_MEM
13015 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
13017 #undef TARGET_CONDITIONAL_REGISTER_USAGE
13018 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
13020 /* Only the least significant bit is used for initialization guard
13021 variables. */
13022 #undef TARGET_CXX_GUARD_MASK_BIT
13023 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
13025 #undef TARGET_C_MODE_FOR_SUFFIX
13026 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
13028 #ifdef TARGET_BIG_ENDIAN_DEFAULT
13029 #undef TARGET_DEFAULT_TARGET_FLAGS
13030 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
13031 #endif
13033 #undef TARGET_CLASS_MAX_NREGS
13034 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
13036 #undef TARGET_BUILTIN_DECL
13037 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
13039 #undef TARGET_EXPAND_BUILTIN
13040 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
13042 #undef TARGET_EXPAND_BUILTIN_VA_START
13043 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
13045 #undef TARGET_FOLD_BUILTIN
13046 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
13048 #undef TARGET_FUNCTION_ARG
13049 #define TARGET_FUNCTION_ARG aarch64_function_arg
13051 #undef TARGET_FUNCTION_ARG_ADVANCE
13052 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
13054 #undef TARGET_FUNCTION_ARG_BOUNDARY
13055 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
13057 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
13058 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
13060 #undef TARGET_FUNCTION_VALUE
13061 #define TARGET_FUNCTION_VALUE aarch64_function_value
13063 #undef TARGET_FUNCTION_VALUE_REGNO_P
13064 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
13066 #undef TARGET_FRAME_POINTER_REQUIRED
13067 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
13069 #undef TARGET_GIMPLE_FOLD_BUILTIN
13070 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
13072 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
13073 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
13075 #undef TARGET_INIT_BUILTINS
13076 #define TARGET_INIT_BUILTINS aarch64_init_builtins
13078 #undef TARGET_LEGITIMATE_ADDRESS_P
13079 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
13081 #undef TARGET_LEGITIMATE_CONSTANT_P
13082 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
13084 #undef TARGET_LIBGCC_CMP_RETURN_MODE
13085 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
13087 #undef TARGET_LRA_P
13088 #define TARGET_LRA_P hook_bool_void_true
13090 #undef TARGET_MANGLE_TYPE
13091 #define TARGET_MANGLE_TYPE aarch64_mangle_type
13093 #undef TARGET_MEMORY_MOVE_COST
13094 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
13096 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
13097 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
13099 #undef TARGET_MUST_PASS_IN_STACK
13100 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
13102 /* This target hook should return true if accesses to volatile bitfields
13103 should use the narrowest mode possible. It should return false if these
13104 accesses should use the bitfield container type. */
13105 #undef TARGET_NARROW_VOLATILE_BITFIELD
13106 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
13108 #undef TARGET_OPTION_OVERRIDE
13109 #define TARGET_OPTION_OVERRIDE aarch64_override_options
13111 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
13112 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
13113 aarch64_override_options_after_change
13115 #undef TARGET_OPTION_SAVE
13116 #define TARGET_OPTION_SAVE aarch64_option_save
13118 #undef TARGET_OPTION_RESTORE
13119 #define TARGET_OPTION_RESTORE aarch64_option_restore
13121 #undef TARGET_OPTION_PRINT
13122 #define TARGET_OPTION_PRINT aarch64_option_print
13124 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
13125 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
13127 #undef TARGET_SET_CURRENT_FUNCTION
13128 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
13130 #undef TARGET_PASS_BY_REFERENCE
13131 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
13133 #undef TARGET_PREFERRED_RELOAD_CLASS
13134 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
13136 #undef TARGET_SCHED_REASSOCIATION_WIDTH
13137 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
13139 #undef TARGET_PROMOTED_TYPE
13140 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
13142 #undef TARGET_SECONDARY_RELOAD
13143 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
13145 #undef TARGET_SHIFT_TRUNCATION_MASK
13146 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
13148 #undef TARGET_SETUP_INCOMING_VARARGS
13149 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
13151 #undef TARGET_STRUCT_VALUE_RTX
13152 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
13154 #undef TARGET_REGISTER_MOVE_COST
13155 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
13157 #undef TARGET_RETURN_IN_MEMORY
13158 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
13160 #undef TARGET_RETURN_IN_MSB
13161 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
13163 #undef TARGET_RTX_COSTS
13164 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
13166 #undef TARGET_SCHED_ISSUE_RATE
13167 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
13169 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
13170 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
13171 aarch64_sched_first_cycle_multipass_dfa_lookahead
13173 #undef TARGET_TRAMPOLINE_INIT
13174 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
13176 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
13177 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
13179 #undef TARGET_VECTOR_MODE_SUPPORTED_P
13180 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
13182 #undef TARGET_ARRAY_MODE_SUPPORTED_P
13183 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
13185 #undef TARGET_VECTORIZE_ADD_STMT_COST
13186 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
13188 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
13189 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
13190 aarch64_builtin_vectorization_cost
13192 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
13193 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
13195 #undef TARGET_VECTORIZE_BUILTINS
13196 #define TARGET_VECTORIZE_BUILTINS
13198 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
13199 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
13200 aarch64_builtin_vectorized_function
13202 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
13203 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
13204 aarch64_autovectorize_vector_sizes
13206 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
13207 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
13208 aarch64_atomic_assign_expand_fenv
13210 /* Section anchor support. */
13212 #undef TARGET_MIN_ANCHOR_OFFSET
13213 #define TARGET_MIN_ANCHOR_OFFSET -256
13215 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
13216 byte offset; we can do much more for larger data types, but have no way
13217 to determine the size of the access. We assume accesses are aligned. */
13218 #undef TARGET_MAX_ANCHOR_OFFSET
13219 #define TARGET_MAX_ANCHOR_OFFSET 4095
13221 #undef TARGET_VECTOR_ALIGNMENT
13222 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
13224 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
13225 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
13226 aarch64_simd_vector_alignment_reachable
13228 /* vec_perm support. */
13230 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
13231 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
13232 aarch64_vectorize_vec_perm_const_ok
13234 #undef TARGET_INIT_LIBFUNCS
13235 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
13237 #undef TARGET_FIXED_CONDITION_CODE_REGS
13238 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
13240 #undef TARGET_FLAGS_REGNUM
13241 #define TARGET_FLAGS_REGNUM CC_REGNUM
13243 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
13244 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
13246 #undef TARGET_ASAN_SHADOW_OFFSET
13247 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
13249 #undef TARGET_LEGITIMIZE_ADDRESS
13250 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
13252 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
13253 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
13254 aarch64_use_by_pieces_infrastructure_p
13256 #undef TARGET_CAN_USE_DOLOOP_P
13257 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
13259 #undef TARGET_SCHED_MACRO_FUSION_P
13260 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
13262 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
13263 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
13265 #undef TARGET_SCHED_FUSION_PRIORITY
13266 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
13268 #undef TARGET_UNSPEC_MAY_TRAP_P
13269 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
13271 #undef TARGET_USE_PSEUDO_PIC_REG
13272 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
13274 struct gcc_target targetm = TARGET_INITIALIZER;
13276 #include "gt-aarch64.h"