[AArch64] PR target/68129: Define TARGET_SUPPORTS_WIDE_INT
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2015 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "backend.h"
25 #include "target.h"
26 #include "rtl.h"
27 #include "tree.h"
28 #include "gimple.h"
29 #include "cfghooks.h"
30 #include "cfgloop.h"
31 #include "df.h"
32 #include "tm_p.h"
33 #include "stringpool.h"
34 #include "optabs.h"
35 #include "regs.h"
36 #include "emit-rtl.h"
37 #include "recog.h"
38 #include "diagnostic.h"
39 #include "insn-attr.h"
40 #include "alias.h"
41 #include "fold-const.h"
42 #include "stor-layout.h"
43 #include "calls.h"
44 #include "varasm.h"
45 #include "output.h"
46 #include "flags.h"
47 #include "explow.h"
48 #include "expr.h"
49 #include "reload.h"
50 #include "langhooks.h"
51 #include "opts.h"
52 #include "params.h"
53 #include "gimplify.h"
54 #include "dwarf2.h"
55 #include "gimple-iterator.h"
56 #include "tree-vectorizer.h"
57 #include "aarch64-cost-tables.h"
58 #include "dumpfile.h"
59 #include "builtins.h"
60 #include "rtl-iter.h"
61 #include "tm-constrs.h"
62 #include "sched-int.h"
63 #include "cortex-a57-fma-steering.h"
64 #include "target-globals.h"
66 /* This file should be included last. */
67 #include "target-def.h"
69 /* Defined for convenience. */
70 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
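/* For example, POINTER_BYTES is 8 under the default LP64 ABI and 4 under
   ILP32, since POINTER_SIZE is 64 and 32 bits respectively.  */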
72 /* Classifies an address.
74 ADDRESS_REG_IMM
75 A simple base register plus immediate offset.
77 ADDRESS_REG_WB
78 A base register indexed by immediate offset with writeback.
80 ADDRESS_REG_REG
81 A base register indexed by (optionally scaled) register.
83 ADDRESS_REG_UXTW
84 A base register indexed by (optionally scaled) zero-extended register.
86 ADDRESS_REG_SXTW
87 A base register indexed by (optionally scaled) sign-extended register.
89 ADDRESS_LO_SUM
90 A LO_SUM rtx with a base register and "LO12" symbol relocation.
92 ADDRESS_SYMBOLIC:
93 A constant symbolic address, in pc-relative literal pool. */
95 enum aarch64_address_type {
96 ADDRESS_REG_IMM,
97 ADDRESS_REG_WB,
98 ADDRESS_REG_REG,
99 ADDRESS_REG_UXTW,
100 ADDRESS_REG_SXTW,
101 ADDRESS_LO_SUM,
102 ADDRESS_SYMBOLIC
105 struct aarch64_address_info {
106 enum aarch64_address_type type;
107 rtx base;
108 rtx offset;
109 int shift;
110 enum aarch64_symbol_type symbol_type;
113 struct simd_immediate_info
115 rtx value;
116 int shift;
117 int element_width;
118 bool mvn;
119 bool msl;
122 /* The current code model. */
123 enum aarch64_code_model aarch64_cmodel;
125 #ifdef HAVE_AS_TLS
126 #undef TARGET_HAVE_TLS
127 #define TARGET_HAVE_TLS 1
128 #endif
130 static bool aarch64_composite_type_p (const_tree, machine_mode);
131 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
132 const_tree,
133 machine_mode *, int *,
134 bool *);
135 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
136 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
137 static void aarch64_override_options_after_change (void);
138 static bool aarch64_vector_mode_supported_p (machine_mode);
139 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
140 const unsigned char *sel);
141 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
143 /* Major revision number of the ARM Architecture implemented by the target. */
144 unsigned aarch64_architecture_version;
146 /* The processor for which instructions should be scheduled. */
147 enum aarch64_processor aarch64_tune = cortexa53;
149 /* Mask to specify which instruction scheduling options should be used. */
150 unsigned long aarch64_tune_flags = 0;
152 /* Global flag: when true, PC-relative literal loads are not used. */
153 bool aarch64_nopcrelative_literal_loads;
155 /* Support for command line parsing of boolean flags in the tuning
156 structures. */
157 struct aarch64_flag_desc
159 const char* name;
160 unsigned int flag;
163 #define AARCH64_FUSION_PAIR(name, internal_name) \
164 { name, AARCH64_FUSE_##internal_name },
165 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
167 { "none", AARCH64_FUSE_NOTHING },
168 #include "aarch64-fusion-pairs.def"
169 { "all", AARCH64_FUSE_ALL },
170 { NULL, AARCH64_FUSE_NOTHING }
172 #undef AARCH64_FUSION_PAIR
174 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
175 { name, AARCH64_EXTRA_TUNE_##internal_name },
176 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
178 { "none", AARCH64_EXTRA_TUNE_NONE },
179 #include "aarch64-tuning-flags.def"
180 { "all", AARCH64_EXTRA_TUNE_ALL },
181 { NULL, AARCH64_EXTRA_TUNE_NONE }
183 #undef AARCH64_EXTRA_TUNING_OPTION
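/* The two tables above back the fine-grained tuning overrides handled by
   aarch64_parse_fuse_string and aarch64_parse_tune_string below (the
   "fuse=..." and "tune=..." components of the -moverride option); the
   "none" and "all" entries act as shorthands for clearing or setting
   every flag.  */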
185 /* Tuning parameters. */
187 static const struct cpu_addrcost_table generic_addrcost_table =
190 0, /* hi */
191 0, /* si */
192 0, /* di */
193 0, /* ti */
195 0, /* pre_modify */
196 0, /* post_modify */
197 0, /* register_offset */
198 0, /* register_sextend */
199 0, /* register_zextend */
200 0 /* imm_offset */
203 static const struct cpu_addrcost_table cortexa57_addrcost_table =
206 1, /* hi */
207 0, /* si */
208 0, /* di */
209 1, /* ti */
211 0, /* pre_modify */
212 0, /* post_modify */
213 0, /* register_offset */
214 0, /* register_sextend */
215 0, /* register_zextend */
216 0, /* imm_offset */
219 static const struct cpu_addrcost_table xgene1_addrcost_table =
222 1, /* hi */
223 0, /* si */
224 0, /* di */
225 1, /* ti */
227 1, /* pre_modify */
228 0, /* post_modify */
229 0, /* register_offset */
230 1, /* register_sextend */
231 1, /* register_zextend */
232 0, /* imm_offset */
235 static const struct cpu_regmove_cost generic_regmove_cost =
237 1, /* GP2GP */
238 /* Avoid the use of slow int<->fp moves for spilling by setting
239 their cost higher than memmov_cost. */
240 5, /* GP2FP */
241 5, /* FP2GP */
242 2 /* FP2FP */
245 static const struct cpu_regmove_cost cortexa57_regmove_cost =
247 1, /* GP2GP */
248 /* Avoid the use of slow int<->fp moves for spilling by setting
249 their cost higher than memmov_cost. */
250 5, /* GP2FP */
251 5, /* FP2GP */
252 2 /* FP2FP */
255 static const struct cpu_regmove_cost cortexa53_regmove_cost =
257 1, /* GP2GP */
258 /* Avoid the use of slow int<->fp moves for spilling by setting
259 their cost higher than memmov_cost. */
260 5, /* GP2FP */
261 5, /* FP2GP */
262 2 /* FP2FP */
265 static const struct cpu_regmove_cost thunderx_regmove_cost =
267 2, /* GP2GP */
268 2, /* GP2FP */
269 6, /* FP2GP */
270 4 /* FP2FP */
273 static const struct cpu_regmove_cost xgene1_regmove_cost =
275 1, /* GP2GP */
276 /* Avoid the use of slow int<->fp moves for spilling by setting
277 their cost higher than memmov_cost. */
278 8, /* GP2FP */
279 8, /* FP2GP */
280 2 /* FP2FP */
283 /* Generic costs for vector insn classes. */
284 static const struct cpu_vector_cost generic_vector_cost =
286 1, /* scalar_stmt_cost */
287 1, /* scalar_load_cost */
288 1, /* scalar_store_cost */
289 1, /* vec_stmt_cost */
290 1, /* vec_to_scalar_cost */
291 1, /* scalar_to_vec_cost */
292 1, /* vec_align_load_cost */
293 1, /* vec_unalign_load_cost */
294 1, /* vec_unalign_store_cost */
295 1, /* vec_store_cost */
296 3, /* cond_taken_branch_cost */
297 1 /* cond_not_taken_branch_cost */
300 /* Cortex-A57 costs for vector insn classes. */
301 static const struct cpu_vector_cost cortexa57_vector_cost =
303 1, /* scalar_stmt_cost */
304 4, /* scalar_load_cost */
305 1, /* scalar_store_cost */
306 3, /* vec_stmt_cost */
307 8, /* vec_to_scalar_cost */
308 8, /* scalar_to_vec_cost */
309 5, /* vec_align_load_cost */
310 5, /* vec_unalign_load_cost */
311 1, /* vec_unalign_store_cost */
312 1, /* vec_store_cost */
313 1, /* cond_taken_branch_cost */
314 1 /* cond_not_taken_branch_cost */
317 /* Xgene1 costs for vector insn classes. */
318 static const struct cpu_vector_cost xgene1_vector_cost =
320 1, /* scalar_stmt_cost */
321 5, /* scalar_load_cost */
322 1, /* scalar_store_cost */
323 2, /* vec_stmt_cost */
324 4, /* vec_to_scalar_cost */
325 4, /* scalar_to_vec_cost */
326 10, /* vec_align_load_cost */
327 10, /* vec_unalign_load_cost */
328 2, /* vec_unalign_store_cost */
329 2, /* vec_store_cost */
330 2, /* cond_taken_branch_cost */
331 1 /* cond_not_taken_branch_cost */
334 /* Generic costs for branch instructions. */
335 static const struct cpu_branch_cost generic_branch_cost =
337 2, /* Predictable. */
338 2 /* Unpredictable. */
341 static const struct tune_params generic_tunings =
343 &cortexa57_extra_costs,
344 &generic_addrcost_table,
345 &generic_regmove_cost,
346 &generic_vector_cost,
347 &generic_branch_cost,
348 4, /* memmov_cost */
349 2, /* issue_rate */
350 AARCH64_FUSE_NOTHING, /* fusible_ops */
351 8, /* function_align. */
352 8, /* jump_align. */
353 4, /* loop_align. */
354 2, /* int_reassoc_width. */
355 4, /* fp_reassoc_width. */
356 1, /* vec_reassoc_width. */
357 2, /* min_div_recip_mul_sf. */
358 2, /* min_div_recip_mul_df. */
359 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
360 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
363 static const struct tune_params cortexa53_tunings =
365 &cortexa53_extra_costs,
366 &generic_addrcost_table,
367 &cortexa53_regmove_cost,
368 &generic_vector_cost,
369 &generic_branch_cost,
370 4, /* memmov_cost */
371 2, /* issue_rate */
372 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
373 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
374 8, /* function_align. */
375 8, /* jump_align. */
376 4, /* loop_align. */
377 2, /* int_reassoc_width. */
378 4, /* fp_reassoc_width. */
379 1, /* vec_reassoc_width. */
380 2, /* min_div_recip_mul_sf. */
381 2, /* min_div_recip_mul_df. */
382 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
383 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
386 static const struct tune_params cortexa57_tunings =
388 &cortexa57_extra_costs,
389 &cortexa57_addrcost_table,
390 &cortexa57_regmove_cost,
391 &cortexa57_vector_cost,
392 &generic_branch_cost,
393 4, /* memmov_cost */
394 3, /* issue_rate */
395 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
396 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
397 16, /* function_align. */
398 8, /* jump_align. */
399 4, /* loop_align. */
400 2, /* int_reassoc_width. */
401 4, /* fp_reassoc_width. */
402 1, /* vec_reassoc_width. */
403 2, /* min_div_recip_mul_sf. */
404 2, /* min_div_recip_mul_df. */
405 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
406 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS
407 | AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
410 static const struct tune_params cortexa72_tunings =
412 &cortexa57_extra_costs,
413 &cortexa57_addrcost_table,
414 &cortexa57_regmove_cost,
415 &cortexa57_vector_cost,
416 &generic_branch_cost,
417 4, /* memmov_cost */
418 3, /* issue_rate */
419 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
420 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
421 16, /* function_align. */
422 8, /* jump_align. */
423 4, /* loop_align. */
424 2, /* int_reassoc_width. */
425 4, /* fp_reassoc_width. */
426 1, /* vec_reassoc_width. */
427 2, /* min_div_recip_mul_sf. */
428 2, /* min_div_recip_mul_df. */
429 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
430 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
433 static const struct tune_params thunderx_tunings =
435 &thunderx_extra_costs,
436 &generic_addrcost_table,
437 &thunderx_regmove_cost,
438 &generic_vector_cost,
439 &generic_branch_cost,
440 6, /* memmov_cost */
441 2, /* issue_rate */
442 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
443 8, /* function_align. */
444 8, /* jump_align. */
445 8, /* loop_align. */
446 2, /* int_reassoc_width. */
447 4, /* fp_reassoc_width. */
448 1, /* vec_reassoc_width. */
449 2, /* min_div_recip_mul_sf. */
450 2, /* min_div_recip_mul_df. */
451 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
452 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
455 static const struct tune_params xgene1_tunings =
457 &xgene1_extra_costs,
458 &xgene1_addrcost_table,
459 &xgene1_regmove_cost,
460 &xgene1_vector_cost,
461 &generic_branch_cost,
462 6, /* memmov_cost */
463 4, /* issue_rate */
464 AARCH64_FUSE_NOTHING, /* fusible_ops */
465 16, /* function_align. */
466 8, /* jump_align. */
467 16, /* loop_align. */
468 2, /* int_reassoc_width. */
469 4, /* fp_reassoc_width. */
470 1, /* vec_reassoc_width. */
471 2, /* min_div_recip_mul_sf. */
472 2, /* min_div_recip_mul_df. */
473 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
474 (AARCH64_EXTRA_TUNE_RECIP_SQRT) /* tune_flags. */
477 /* Support for fine-grained override of the tuning structures. */
478 struct aarch64_tuning_override_function
480 const char* name;
481 void (*parse_override)(const char*, struct tune_params*);
484 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
485 static void aarch64_parse_tune_string (const char*, struct tune_params*);
487 static const struct aarch64_tuning_override_function
488 aarch64_tuning_override_functions[] =
490 { "fuse", aarch64_parse_fuse_string },
491 { "tune", aarch64_parse_tune_string },
492 { NULL, NULL }
495 /* A processor implementing AArch64. */
496 struct processor
498 const char *const name;
499 enum aarch64_processor ident;
500 enum aarch64_processor sched_core;
501 enum aarch64_arch arch;
502 unsigned architecture_version;
503 const unsigned long flags;
504 const struct tune_params *const tune;
507 /* Architectures implementing AArch64. */
508 static const struct processor all_architectures[] =
510 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
511 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
512 #include "aarch64-arches.def"
513 #undef AARCH64_ARCH
514 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
517 /* Processor cores implementing AArch64. */
518 static const struct processor all_cores[] =
520 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
521 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
522 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
523 FLAGS, &COSTS##_tunings},
524 #include "aarch64-cores.def"
525 #undef AARCH64_CORE
526 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
527 AARCH64_FL_FOR_ARCH8, &generic_tunings},
528 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
532 /* Target specification. These are populated by the -march, -mtune, -mcpu
533 handling code or by target attributes. */
534 static const struct processor *selected_arch;
535 static const struct processor *selected_cpu;
536 static const struct processor *selected_tune;
538 /* The current tuning set. */
539 struct tune_params aarch64_tune_params = generic_tunings;
541 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
543 /* An ISA extension in the co-processor and main instruction set space. */
544 struct aarch64_option_extension
546 const char *const name;
547 const unsigned long flags_on;
548 const unsigned long flags_off;
551 /* ISA extensions in AArch64. */
552 static const struct aarch64_option_extension all_extensions[] =
554 #define AARCH64_OPT_EXTENSION(NAME, FLAGS_ON, FLAGS_OFF, FEATURE_STRING) \
555 {NAME, FLAGS_ON, FLAGS_OFF},
556 #include "aarch64-option-extensions.def"
557 #undef AARCH64_OPT_EXTENSION
558 {NULL, 0, 0}
561 typedef enum aarch64_cond_code
563 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
564 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
565 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
567 aarch64_cc;
569 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
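/* The AArch64 condition codes are encoded so that each even/odd pair are
   logical inverses (EQ/NE, CS/CC, MI/PL, HI/LS, GE/LT, GT/LE, ...), which
   is why flipping the low bit yields the inverse condition, e.g.
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT.  */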
571 /* The condition codes of the processor, and the inverse function. */
572 static const char * const aarch64_condition_codes[] =
574 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
575 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
578 /* Generate code to enable conditional branches in functions over 1 MiB. */
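/* The callers are expected to pass BRANCH_FORMAT with the condition already
   inverted, so the emitted sequence is a short conditional branch around an
   unconditional B that can reach the far destination, roughly:

       <inverted cond branch>  .Ltmp
       b       <original destination>
     .Ltmp:

   (A sketch of the intended expansion; the exact mnemonics depend on the
   caller.)  */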
579 const char *
580 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
581 const char * branch_format)
583 rtx_code_label * tmp_label = gen_label_rtx ();
584 char label_buf[256];
585 char buffer[128];
586 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
587 CODE_LABEL_NUMBER (tmp_label));
588 const char *label_ptr = targetm.strip_name_encoding (label_buf);
589 rtx dest_label = operands[pos_label];
590 operands[pos_label] = tmp_label;
592 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
593 output_asm_insn (buffer, operands);
595 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
596 operands[pos_label] = dest_label;
597 output_asm_insn (buffer, operands);
598 return "";
601 void
602 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
604 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
605 if (TARGET_GENERAL_REGS_ONLY)
606 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
607 else
608 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
611 static unsigned int
612 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
614 if (GET_MODE_UNIT_SIZE (mode) == 4)
615 return aarch64_tune_params.min_div_recip_mul_sf;
616 return aarch64_tune_params.min_div_recip_mul_df;
619 static int
620 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
621 enum machine_mode mode)
623 if (VECTOR_MODE_P (mode))
624 return aarch64_tune_params.vec_reassoc_width;
625 if (INTEGRAL_MODE_P (mode))
626 return aarch64_tune_params.int_reassoc_width;
627 if (FLOAT_MODE_P (mode))
628 return aarch64_tune_params.fp_reassoc_width;
629 return 1;
632 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
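/* The AArch64 DWARF register numbering is R0-R30 -> 0-30, SP -> 31 and
   V0-V31 -> 64-95, so for example x0 maps to 0, sp to 31 and v0 to 64.  */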
633 unsigned
634 aarch64_dbx_register_number (unsigned regno)
636 if (GP_REGNUM_P (regno))
637 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
638 else if (regno == SP_REGNUM)
639 return AARCH64_DWARF_SP;
640 else if (FP_REGNUM_P (regno))
641 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
643 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
644 equivalent DWARF register. */
645 return DWARF_FRAME_REGISTERS;
648 /* Return TRUE if MODE is one of the large integer modes used for AdvSIMD structure types (OImode, CImode or XImode). */
649 static bool
650 aarch64_vect_struct_mode_p (machine_mode mode)
652 return mode == OImode || mode == CImode || mode == XImode;
655 /* Return TRUE if MODE is any of the vector modes. */
656 static bool
657 aarch64_vector_mode_p (machine_mode mode)
659 return aarch64_vector_mode_supported_p (mode)
660 || aarch64_vect_struct_mode_p (mode);
663 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
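/* For example, this is what allows an array of 2-4 AdvSIMD vectors (such as
   the val[3] array inside the arm_neon.h int32x4x3_t type) to be given one
   of the OI/CI/XI structure modes rather than BLKmode.  */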
664 static bool
665 aarch64_array_mode_supported_p (machine_mode mode,
666 unsigned HOST_WIDE_INT nelems)
668 if (TARGET_SIMD
669 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
670 || AARCH64_VALID_SIMD_DREG_MODE (mode))
671 && (nelems >= 2 && nelems <= 4))
672 return true;
674 return false;
677 /* Implement HARD_REGNO_NREGS. */
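/* For example, an OImode (32-byte) value needs 2 registers in the FP/SIMD
   register file (UNITS_PER_VREG is 16) but 4 registers in the general
   register file (UNITS_PER_WORD is 8).  */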
680 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
682 switch (aarch64_regno_regclass (regno))
684 case FP_REGS:
685 case FP_LO_REGS:
686 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
687 default:
688 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
690 gcc_unreachable ();
693 /* Implement HARD_REGNO_MODE_OK. */
696 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
698 if (GET_MODE_CLASS (mode) == MODE_CC)
699 return regno == CC_REGNUM;
701 if (regno == SP_REGNUM)
702 /* The purpose of comparing with ptr_mode is to support the
703 global register variable associated with the stack pointer
704 register via the syntax of asm ("wsp") in ILP32. */
705 return mode == Pmode || mode == ptr_mode;
707 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
708 return mode == Pmode;
710 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
711 return 1;
713 if (FP_REGNUM_P (regno))
715 if (aarch64_vect_struct_mode_p (mode))
716 return
717 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
718 else
719 return 1;
722 return 0;
725 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
726 machine_mode
727 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
728 machine_mode mode)
730 /* Handle modes that fit within single registers. */
731 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
733 if (GET_MODE_SIZE (mode) >= 4)
734 return mode;
735 else
736 return SImode;
738 /* Fall back to generic for multi-reg and very large modes. */
739 else
740 return choose_hard_reg_mode (regno, nregs, false);
743 /* Return true if calls to DECL should be treated as
744 long-calls (i.e. called via a register). */
745 static bool
746 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
748 return false;
751 /* Return true if calls to symbol-ref SYM should be treated as
752 long-calls (i.e. called via a register). */
753 bool
754 aarch64_is_long_call_p (rtx sym)
756 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
759 /* Return true if calls to symbol-ref SYM should not go through
760 plt stubs. */
762 bool
763 aarch64_is_noplt_call_p (rtx sym)
765 const_tree decl = SYMBOL_REF_DECL (sym);
767 if (flag_pic
768 && decl
769 && (!flag_plt
770 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
771 && !targetm.binds_local_p (decl))
772 return true;
774 return false;
777 /* Return true if the offsets to a zero/sign-extract operation
778 represent an expression that matches an extend operation. The
779 operands represent the parameters from
781 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
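/* For example, MULT_IMM == 4 and EXTRACT_IMM == 34 satisfy the checks below
   and correspond to a zero/sign-extended 32-bit register shifted left by 2,
   i.e. the UXTW/SXTW #2 extended-register form.  */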
782 bool
783 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
784 rtx extract_imm)
786 HOST_WIDE_INT mult_val, extract_val;
788 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
789 return false;
791 mult_val = INTVAL (mult_imm);
792 extract_val = INTVAL (extract_imm);
794 if (extract_val > 8
795 && extract_val < GET_MODE_BITSIZE (mode)
796 && exact_log2 (extract_val & ~7) > 0
797 && (extract_val & 7) <= 4
798 && mult_val == (1 << (extract_val & 7)))
799 return true;
801 return false;
804 /* Emit an insn that's a simple single-set. Both the operands must be
805 known to be valid. */
806 inline static rtx
807 emit_set_insn (rtx x, rtx y)
809 return emit_insn (gen_rtx_SET (x, y));
812 /* X and Y are two things to compare using CODE. Emit the compare insn and
813 return the rtx for the CC register in the appropriate mode. */
815 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
817 machine_mode mode = SELECT_CC_MODE (code, x, y);
818 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
820 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
821 return cc_reg;
824 /* Build the SYMBOL_REF for __tls_get_addr. */
826 static GTY(()) rtx tls_get_addr_libfunc;
829 aarch64_tls_get_addr (void)
831 if (!tls_get_addr_libfunc)
832 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
833 return tls_get_addr_libfunc;
836 /* Return the TLS model to use for ADDR. */
838 static enum tls_model
839 tls_symbolic_operand_type (rtx addr)
841 enum tls_model tls_kind = TLS_MODEL_NONE;
842 rtx sym, addend;
844 if (GET_CODE (addr) == CONST)
846 split_const (addr, &sym, &addend);
847 if (GET_CODE (sym) == SYMBOL_REF)
848 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
850 else if (GET_CODE (addr) == SYMBOL_REF)
851 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
853 return tls_kind;
856 /* We'll allow lo_sum's in addresses in our legitimate addresses
857 so that combine can take care of combining addresses where
858 necessary, but for generation purposes we'll generate the address
859 as:
860 RTL Absolute
861 tmp = hi (symbol_ref); adrp x1, foo
862 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
865 PIC TLS
866 adrp x1, :got:foo adrp tmp, :tlsgd:foo
867 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
868 bl __tls_get_addr
871 Load TLS symbol, depending on TLS mechanism and TLS access model.
873 Global Dynamic - Traditional TLS:
874 adrp tmp, :tlsgd:imm
875 add dest, tmp, #:tlsgd_lo12:imm
876 bl __tls_get_addr
878 Global Dynamic - TLS Descriptors:
879 adrp dest, :tlsdesc:imm
880 ldr tmp, [dest, #:tlsdesc_lo12:imm]
881 add dest, dest, #:tlsdesc_lo12:imm
882 blr tmp
883 mrs tp, tpidr_el0
884 add dest, dest, tp
886 Initial Exec:
887 mrs tp, tpidr_el0
888 adrp tmp, :gottprel:imm
889 ldr dest, [tmp, #:gottprel_lo12:imm]
890 add dest, dest, tp
892 Local Exec:
893 mrs tp, tpidr_el0
894 add t0, tp, #:tprel_hi12:imm, lsl #12
895 add t0, t0, #:tprel_lo12_nc:imm
898 static void
899 aarch64_load_symref_appropriately (rtx dest, rtx imm,
900 enum aarch64_symbol_type type)
902 switch (type)
904 case SYMBOL_SMALL_ABSOLUTE:
906 /* In ILP32, the mode of dest can be either SImode or DImode. */
907 rtx tmp_reg = dest;
908 machine_mode mode = GET_MODE (dest);
910 gcc_assert (mode == Pmode || mode == ptr_mode);
912 if (can_create_pseudo_p ())
913 tmp_reg = gen_reg_rtx (mode);
915 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
916 emit_insn (gen_add_losym (dest, tmp_reg, imm));
917 return;
920 case SYMBOL_TINY_ABSOLUTE:
921 emit_insn (gen_rtx_SET (dest, imm));
922 return;
924 case SYMBOL_SMALL_GOT_28K:
926 machine_mode mode = GET_MODE (dest);
927 rtx gp_rtx = pic_offset_table_rtx;
928 rtx insn;
929 rtx mem;
931 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
932 here before RTL expansion. Tree IVOPTS will generate RTL patterns to
933 decide rtx costs, in which case pic_offset_table_rtx is not
934 initialized. In that case there is no need to generate the first
935 adrp instruction, as the final cost for global variable access is
936 one instruction. */
937 if (gp_rtx != NULL)
939 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
940 use the page base as the GOT base, the first page may be wasted;
941 in the worst case there is only 28K of space for the GOT).
943 The instruction sequence generated for accessing a global variable is:
946 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
948 Only one instruction is needed, but we must initialize
949 pic_offset_table_rtx properly. We generate an initialization insn
950 for every global access, and let CSE remove all redundant ones.
952 The final instruction sequence will look like the following
953 for multiple global variable accesses.
955 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
957 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
958 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
959 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
960 ... */
962 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
963 crtl->uses_pic_offset_table = 1;
964 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
966 if (mode != GET_MODE (gp_rtx))
967 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
970 if (mode == ptr_mode)
972 if (mode == DImode)
973 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
974 else
975 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
977 mem = XVECEXP (SET_SRC (insn), 0, 0);
979 else
981 gcc_assert (mode == Pmode);
983 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
984 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
987 /* The operand is expected to be a MEM. Whenever the related insn
988 pattern is changed, the above code which computes mem should be
989 updated. */
990 gcc_assert (GET_CODE (mem) == MEM);
991 MEM_READONLY_P (mem) = 1;
992 MEM_NOTRAP_P (mem) = 1;
993 emit_insn (insn);
994 return;
997 case SYMBOL_SMALL_GOT_4G:
999 /* In ILP32, the mode of dest can be either SImode or DImode,
1000 while the got entry is always of SImode size. The mode of
1001 dest depends on how dest is used: if dest is assigned to a
1002 pointer (e.g. in the memory), it has SImode; it may have
1003 DImode if dest is dereferenced to access the memory.
1004 This is why we have to handle three different ldr_got_small
1005 patterns here (two patterns for ILP32). */
1007 rtx insn;
1008 rtx mem;
1009 rtx tmp_reg = dest;
1010 machine_mode mode = GET_MODE (dest);
1012 if (can_create_pseudo_p ())
1013 tmp_reg = gen_reg_rtx (mode);
1015 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1016 if (mode == ptr_mode)
1018 if (mode == DImode)
1019 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1020 else
1021 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1023 mem = XVECEXP (SET_SRC (insn), 0, 0);
1025 else
1027 gcc_assert (mode == Pmode);
1029 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1030 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1033 gcc_assert (GET_CODE (mem) == MEM);
1034 MEM_READONLY_P (mem) = 1;
1035 MEM_NOTRAP_P (mem) = 1;
1036 emit_insn (insn);
1037 return;
1040 case SYMBOL_SMALL_TLSGD:
1042 rtx_insn *insns;
1043 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1045 start_sequence ();
1046 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1047 insns = get_insns ();
1048 end_sequence ();
1050 RTL_CONST_CALL_P (insns) = 1;
1051 emit_libcall_block (insns, dest, result, imm);
1052 return;
1055 case SYMBOL_SMALL_TLSDESC:
1057 machine_mode mode = GET_MODE (dest);
1058 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1059 rtx tp;
1061 gcc_assert (mode == Pmode || mode == ptr_mode);
1063 /* In ILP32, the got entry is always of SImode size. Unlike
1064 small GOT, the dest is fixed at reg 0. */
1065 if (TARGET_ILP32)
1066 emit_insn (gen_tlsdesc_small_si (imm));
1067 else
1068 emit_insn (gen_tlsdesc_small_di (imm));
1069 tp = aarch64_load_tp (NULL);
1071 if (mode != Pmode)
1072 tp = gen_lowpart (mode, tp);
1074 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1075 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1076 return;
1079 case SYMBOL_SMALL_TLSIE:
1081 /* In ILP32, the mode of dest can be either SImode or DImode,
1082 while the got entry is always of SImode size. The mode of
1083 dest depends on how dest is used: if dest is assigned to a
1084 pointer (e.g. in the memory), it has SImode; it may have
1085 DImode if dest is dereferenced to access the memory.
1086 This is why we have to handle three different tlsie_small
1087 patterns here (two patterns for ILP32). */
1088 machine_mode mode = GET_MODE (dest);
1089 rtx tmp_reg = gen_reg_rtx (mode);
1090 rtx tp = aarch64_load_tp (NULL);
1092 if (mode == ptr_mode)
1094 if (mode == DImode)
1095 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1096 else
1098 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1099 tp = gen_lowpart (mode, tp);
1102 else
1104 gcc_assert (mode == Pmode);
1105 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1108 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1109 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1110 return;
1113 case SYMBOL_TLSLE12:
1114 case SYMBOL_TLSLE24:
1115 case SYMBOL_TLSLE32:
1116 case SYMBOL_TLSLE48:
1118 machine_mode mode = GET_MODE (dest);
1119 rtx tp = aarch64_load_tp (NULL);
1121 if (mode != Pmode)
1122 tp = gen_lowpart (mode, tp);
1124 switch (type)
1126 case SYMBOL_TLSLE12:
1127 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1128 (dest, tp, imm));
1129 break;
1130 case SYMBOL_TLSLE24:
1131 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1132 (dest, tp, imm));
1133 break;
1134 case SYMBOL_TLSLE32:
1135 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1136 (dest, imm));
1137 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1138 (dest, dest, tp));
1139 break;
1140 case SYMBOL_TLSLE48:
1141 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1142 (dest, imm));
1143 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1144 (dest, dest, tp));
1145 break;
1146 default:
1147 gcc_unreachable ();
1150 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1151 return;
1154 case SYMBOL_TINY_GOT:
1155 emit_insn (gen_ldr_got_tiny (dest, imm));
1156 return;
1158 case SYMBOL_TINY_TLSIE:
1160 machine_mode mode = GET_MODE (dest);
1161 rtx tp = aarch64_load_tp (NULL);
1163 if (mode == ptr_mode)
1165 if (mode == DImode)
1166 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1167 else
1169 tp = gen_lowpart (mode, tp);
1170 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1173 else
1175 gcc_assert (mode == Pmode);
1176 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1179 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1180 return;
1183 default:
1184 gcc_unreachable ();
1188 /* Emit a move from SRC to DEST. Assume that the move expanders can
1189 handle all moves if !can_create_pseudo_p (). The distinction is
1190 important because, unlike emit_move_insn, the move expanders know
1191 how to force Pmode objects into the constant pool even when the
1192 constant pool address is not itself legitimate. */
1193 static rtx
1194 aarch64_emit_move (rtx dest, rtx src)
1196 return (can_create_pseudo_p ()
1197 ? emit_move_insn (dest, src)
1198 : emit_move_insn_1 (dest, src));
1201 /* Split a 128-bit move operation into two 64-bit move operations,
1202 taking care to handle partial overlap of register to register
1203 copies. Special cases are needed when moving between GP regs and
1204 FP regs. SRC can be a register, constant or memory; DST a register
1205 or memory. If either operand is memory it must not have any side
1206 effects. */
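/* For example, copying a TImode value from {x0,x1} to {x1,x2} must move the
   high half first, since x1 (the low half of the destination) overlaps the
   high half of the source; the overlap check below handles this.  */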
1207 void
1208 aarch64_split_128bit_move (rtx dst, rtx src)
1210 rtx dst_lo, dst_hi;
1211 rtx src_lo, src_hi;
1213 machine_mode mode = GET_MODE (dst);
1215 gcc_assert (mode == TImode || mode == TFmode);
1216 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1217 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1219 if (REG_P (dst) && REG_P (src))
1221 int src_regno = REGNO (src);
1222 int dst_regno = REGNO (dst);
1224 /* Handle FP <-> GP regs. */
1225 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1227 src_lo = gen_lowpart (word_mode, src);
1228 src_hi = gen_highpart (word_mode, src);
1230 if (mode == TImode)
1232 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1233 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1235 else
1237 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1238 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1240 return;
1242 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1244 dst_lo = gen_lowpart (word_mode, dst);
1245 dst_hi = gen_highpart (word_mode, dst);
1247 if (mode == TImode)
1249 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1250 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1252 else
1254 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1255 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1257 return;
1261 dst_lo = gen_lowpart (word_mode, dst);
1262 dst_hi = gen_highpart (word_mode, dst);
1263 src_lo = gen_lowpart (word_mode, src);
1264 src_hi = gen_highpart_mode (word_mode, mode, src);
1266 /* At most one pairing may overlap. */
1267 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1269 aarch64_emit_move (dst_hi, src_hi);
1270 aarch64_emit_move (dst_lo, src_lo);
1272 else
1274 aarch64_emit_move (dst_lo, src_lo);
1275 aarch64_emit_move (dst_hi, src_hi);
1279 bool
1280 aarch64_split_128bit_move_p (rtx dst, rtx src)
1282 return (! REG_P (src)
1283 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1286 /* Split a complex SIMD combine. */
1288 void
1289 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1291 machine_mode src_mode = GET_MODE (src1);
1292 machine_mode dst_mode = GET_MODE (dst);
1294 gcc_assert (VECTOR_MODE_P (dst_mode));
1296 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1298 rtx (*gen) (rtx, rtx, rtx);
1300 switch (src_mode)
1302 case V8QImode:
1303 gen = gen_aarch64_simd_combinev8qi;
1304 break;
1305 case V4HImode:
1306 gen = gen_aarch64_simd_combinev4hi;
1307 break;
1308 case V2SImode:
1309 gen = gen_aarch64_simd_combinev2si;
1310 break;
1311 case V4HFmode:
1312 gen = gen_aarch64_simd_combinev4hf;
1313 break;
1314 case V2SFmode:
1315 gen = gen_aarch64_simd_combinev2sf;
1316 break;
1317 case DImode:
1318 gen = gen_aarch64_simd_combinedi;
1319 break;
1320 case DFmode:
1321 gen = gen_aarch64_simd_combinedf;
1322 break;
1323 default:
1324 gcc_unreachable ();
1327 emit_insn (gen (dst, src1, src2));
1328 return;
1332 /* Split a complex SIMD move. */
1334 void
1335 aarch64_split_simd_move (rtx dst, rtx src)
1337 machine_mode src_mode = GET_MODE (src);
1338 machine_mode dst_mode = GET_MODE (dst);
1340 gcc_assert (VECTOR_MODE_P (dst_mode));
1342 if (REG_P (dst) && REG_P (src))
1344 rtx (*gen) (rtx, rtx);
1346 gcc_assert (VECTOR_MODE_P (src_mode));
1348 switch (src_mode)
1350 case V16QImode:
1351 gen = gen_aarch64_split_simd_movv16qi;
1352 break;
1353 case V8HImode:
1354 gen = gen_aarch64_split_simd_movv8hi;
1355 break;
1356 case V4SImode:
1357 gen = gen_aarch64_split_simd_movv4si;
1358 break;
1359 case V2DImode:
1360 gen = gen_aarch64_split_simd_movv2di;
1361 break;
1362 case V8HFmode:
1363 gen = gen_aarch64_split_simd_movv8hf;
1364 break;
1365 case V4SFmode:
1366 gen = gen_aarch64_split_simd_movv4sf;
1367 break;
1368 case V2DFmode:
1369 gen = gen_aarch64_split_simd_movv2df;
1370 break;
1371 default:
1372 gcc_unreachable ();
1375 emit_insn (gen (dst, src));
1376 return;
1380 static rtx
1381 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1383 if (can_create_pseudo_p ())
1384 return force_reg (mode, value);
1385 else
1387 x = aarch64_emit_move (x, value);
1388 return x;
1393 static rtx
1394 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1396 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1398 rtx high;
1399 /* Load the full offset into a register. This
1400 might be improvable in the future. */
1401 high = GEN_INT (offset);
1402 offset = 0;
1403 high = aarch64_force_temporary (mode, temp, high);
1404 reg = aarch64_force_temporary (mode, temp,
1405 gen_rtx_PLUS (mode, high, reg));
1407 return plus_constant (mode, reg, offset);
1410 static int
1411 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1412 machine_mode mode)
1414 int i;
1415 unsigned HOST_WIDE_INT val, val2, mask;
1416 int one_match, zero_match;
1417 int num_insns;
1419 val = INTVAL (imm);
1421 if (aarch64_move_imm (val, mode))
1423 if (generate)
1424 emit_insn (gen_rtx_SET (dest, imm));
1425 return 1;
1428 if ((val >> 32) == 0 || mode == SImode)
1430 if (generate)
1432 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1433 if (mode == SImode)
1434 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1435 GEN_INT ((val >> 16) & 0xffff)));
1436 else
1437 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1438 GEN_INT ((val >> 16) & 0xffff)));
1440 return 2;
1443 /* Remaining cases are all for DImode. */
1445 mask = 0xffff;
1446 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1447 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1448 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1449 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1451 if (zero_match != 2 && one_match != 2)
1453 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1454 For a 64-bit bitmask try whether changing 16 bits to all ones or
1455 zeroes creates a valid bitmask. To check any repeated bitmask,
1456 try using 16 bits from the other 32-bit half of val. */
1458 for (i = 0; i < 64; i += 16, mask <<= 16)
1460 val2 = val & ~mask;
1461 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1462 break;
1463 val2 = val | mask;
1464 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1465 break;
1466 val2 = val2 & ~mask;
1467 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1468 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1469 break;
1471 if (i != 64)
1473 if (generate)
1475 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1476 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1477 GEN_INT ((val >> i) & 0xffff)));
1482 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1483 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1484 otherwise skip zero bits. */
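/* For example, 0x0000123400005678 is emitted as
     mov  dest, #0x5678
     movk dest, #0x1234, lsl #32
   since the two all-zero 16-bit chunks need no MOVK.  */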
1486 num_insns = 1;
1487 mask = 0xffff;
1488 val2 = one_match > zero_match ? ~val : val;
1489 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1491 if (generate)
1492 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1493 ? (val | ~(mask << i))
1494 : (val & (mask << i)))));
1495 for (i += 16; i < 64; i += 16)
1497 if ((val2 & (mask << i)) == 0)
1498 continue;
1499 if (generate)
1500 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1501 GEN_INT ((val >> i) & 0xffff)));
1502 num_insns ++;
1505 return num_insns;
1509 void
1510 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1512 machine_mode mode = GET_MODE (dest);
1514 gcc_assert (mode == SImode || mode == DImode);
1516 /* Check on what type of symbol it is. */
1517 if (GET_CODE (imm) == SYMBOL_REF
1518 || GET_CODE (imm) == LABEL_REF
1519 || GET_CODE (imm) == CONST)
1521 rtx mem, base, offset;
1522 enum aarch64_symbol_type sty;
1524 /* If we have (const (plus symbol offset)), separate out the offset
1525 before we start classifying the symbol. */
1526 split_const (imm, &base, &offset);
1528 sty = aarch64_classify_symbol (base, offset);
1529 switch (sty)
1531 case SYMBOL_FORCE_TO_MEM:
1532 if (offset != const0_rtx
1533 && targetm.cannot_force_const_mem (mode, imm))
1535 gcc_assert (can_create_pseudo_p ());
1536 base = aarch64_force_temporary (mode, dest, base);
1537 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1538 aarch64_emit_move (dest, base);
1539 return;
1542 mem = force_const_mem (ptr_mode, imm);
1543 gcc_assert (mem);
1545 /* If we aren't generating PC relative literals, then
1546 we need to expand the literal pool access carefully.
1547 This is something that needs to be done in a number
1548 of places, so could well live as a separate function. */
1549 if (aarch64_nopcrelative_literal_loads)
1551 gcc_assert (can_create_pseudo_p ());
1552 base = gen_reg_rtx (ptr_mode);
1553 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1554 mem = gen_rtx_MEM (ptr_mode, base);
1557 if (mode != ptr_mode)
1558 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1560 emit_insn (gen_rtx_SET (dest, mem));
1562 return;
1564 case SYMBOL_SMALL_TLSGD:
1565 case SYMBOL_SMALL_TLSDESC:
1566 case SYMBOL_SMALL_TLSIE:
1567 case SYMBOL_SMALL_GOT_28K:
1568 case SYMBOL_SMALL_GOT_4G:
1569 case SYMBOL_TINY_GOT:
1570 case SYMBOL_TINY_TLSIE:
1571 if (offset != const0_rtx)
1573 gcc_assert(can_create_pseudo_p ());
1574 base = aarch64_force_temporary (mode, dest, base);
1575 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1576 aarch64_emit_move (dest, base);
1577 return;
1579 /* FALLTHRU */
1581 case SYMBOL_SMALL_ABSOLUTE:
1582 case SYMBOL_TINY_ABSOLUTE:
1583 case SYMBOL_TLSLE12:
1584 case SYMBOL_TLSLE24:
1585 case SYMBOL_TLSLE32:
1586 case SYMBOL_TLSLE48:
1587 aarch64_load_symref_appropriately (dest, imm, sty);
1588 return;
1590 default:
1591 gcc_unreachable ();
1595 if (!CONST_INT_P (imm))
1597 if (GET_CODE (imm) == HIGH)
1598 emit_insn (gen_rtx_SET (dest, imm));
1599 else
1601 rtx mem = force_const_mem (mode, imm);
1602 gcc_assert (mem);
1603 emit_insn (gen_rtx_SET (dest, mem));
1606 return;
1609 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1612 static bool
1613 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
1614 tree exp ATTRIBUTE_UNUSED)
1616 /* Currently, always true. */
1617 return true;
1620 /* Implement TARGET_PASS_BY_REFERENCE. */
1622 static bool
1623 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
1624 machine_mode mode,
1625 const_tree type,
1626 bool named ATTRIBUTE_UNUSED)
1628 HOST_WIDE_INT size;
1629 machine_mode dummymode;
1630 int nregs;
1632 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
1633 size = (mode == BLKmode && type)
1634 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
1636 /* Aggregates are passed by reference based on their size. */
1637 if (type && AGGREGATE_TYPE_P (type))
1639 size = int_size_in_bytes (type);
1642 /* Variable sized arguments are always passed by reference. */
1643 if (size < 0)
1644 return true;
1646 /* Can this be a candidate to be passed in fp/simd register(s)? */
1647 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1648 &dummymode, &nregs,
1649 NULL))
1650 return false;
1652 /* Arguments which are variable sized or larger than 2 registers are
1653 passed by reference unless they are a homogeneous floating-point
1654 aggregate. */
1655 return size > 2 * UNITS_PER_WORD;
1658 /* Return TRUE if VALTYPE is padded to its least significant bits. */
1659 static bool
1660 aarch64_return_in_msb (const_tree valtype)
1662 machine_mode dummy_mode;
1663 int dummy_int;
1665 /* Never happens in little-endian mode. */
1666 if (!BYTES_BIG_ENDIAN)
1667 return false;
1669 /* Only composite types smaller than or equal to 16 bytes can
1670 be potentially returned in registers. */
1671 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
1672 || int_size_in_bytes (valtype) <= 0
1673 || int_size_in_bytes (valtype) > 16)
1674 return false;
1676 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
1677 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
1678 is always passed/returned in the least significant bits of fp/simd
1679 register(s). */
1680 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
1681 &dummy_mode, &dummy_int, NULL))
1682 return false;
1684 return true;
1687 /* Implement TARGET_FUNCTION_VALUE.
1688 Define how to find the value returned by a function. */
1690 static rtx
1691 aarch64_function_value (const_tree type, const_tree func,
1692 bool outgoing ATTRIBUTE_UNUSED)
1694 machine_mode mode;
1695 int unsignedp;
1696 int count;
1697 machine_mode ag_mode;
1699 mode = TYPE_MODE (type);
1700 if (INTEGRAL_TYPE_P (type))
1701 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
1703 if (aarch64_return_in_msb (type))
1705 HOST_WIDE_INT size = int_size_in_bytes (type);
1707 if (size % UNITS_PER_WORD != 0)
1709 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
1710 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
1714 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
1715 &ag_mode, &count, NULL))
1717 if (!aarch64_composite_type_p (type, mode))
1719 gcc_assert (count == 1 && mode == ag_mode);
1720 return gen_rtx_REG (mode, V0_REGNUM);
1722 else
1724 int i;
1725 rtx par;
1727 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
1728 for (i = 0; i < count; i++)
1730 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
1731 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1732 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
1733 XVECEXP (par, 0, i) = tmp;
1735 return par;
1738 else
1739 return gen_rtx_REG (mode, R0_REGNUM);
1742 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
1743 Return true if REGNO is the number of a hard register in which the value
1744 of a called function may come back. */
1746 static bool
1747 aarch64_function_value_regno_p (const unsigned int regno)
1749 /* Maximum of 16 bytes can be returned in the general registers. Examples
1750 of 16-byte return values are: 128-bit integers and 16-byte small
1751 structures (excluding homogeneous floating-point aggregates). */
1752 if (regno == R0_REGNUM || regno == R1_REGNUM)
1753 return true;
1755 /* Up to four fp/simd registers can return a function value, e.g. a
1756 homogeneous floating-point aggregate having four members. */
1757 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
1758 return TARGET_FLOAT;
1760 return false;
1763 /* Implement TARGET_RETURN_IN_MEMORY.
1765 If the type T of the result of a function is such that
1766 void func (T arg)
1767 would require that arg be passed as a value in a register (or set of
1768 registers) according to the parameter passing rules, then the result
1769 is returned in the same registers as would be used for such an
1770 argument. */
1772 static bool
1773 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
1775 HOST_WIDE_INT size;
1776 machine_mode ag_mode;
1777 int count;
1779 if (!AGGREGATE_TYPE_P (type)
1780 && TREE_CODE (type) != COMPLEX_TYPE
1781 && TREE_CODE (type) != VECTOR_TYPE)
1782 /* Simple scalar types are always returned in registers. */
1783 return false;
1785 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
1786 type,
1787 &ag_mode,
1788 &count,
1789 NULL))
1790 return false;
1792 /* Types larger than 2 registers are returned in memory. */
1793 size = int_size_in_bytes (type);
1794 return (size < 0 || size > 2 * UNITS_PER_WORD);
1797 static bool
1798 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
1799 const_tree type, int *nregs)
1801 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1802 return aarch64_vfp_is_call_or_return_candidate (mode,
1803 type,
1804 &pcum->aapcs_vfp_rmode,
1805 nregs,
1806 NULL);
1809 /* Given MODE and TYPE of a function argument, return the alignment in
1810 bits. The idea is to suppress any stronger alignment requested by
1811 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
1812 This is a helper function for local use only. */
1814 static unsigned int
1815 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
1817 unsigned int alignment;
1819 if (type)
1821 if (!integer_zerop (TYPE_SIZE (type)))
1823 if (TYPE_MODE (type) == mode)
1824 alignment = TYPE_ALIGN (type);
1825 else
1826 alignment = GET_MODE_ALIGNMENT (mode);
1828 else
1829 alignment = 0;
1831 else
1832 alignment = GET_MODE_ALIGNMENT (mode);
1834 return alignment;
1837 /* Layout a function argument according to the AAPCS64 rules. The rule
1838 numbers refer to the rule numbers in the AAPCS64. */
1840 static void
1841 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
1842 const_tree type,
1843 bool named ATTRIBUTE_UNUSED)
1845 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1846 int ncrn, nvrn, nregs;
1847 bool allocate_ncrn, allocate_nvrn;
1848 HOST_WIDE_INT size;
1850 /* We need to do this once per argument. */
1851 if (pcum->aapcs_arg_processed)
1852 return;
1854 pcum->aapcs_arg_processed = true;
1856 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
1857 size
1858 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
1859 UNITS_PER_WORD);
1861 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
1862 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
1863 mode,
1864 type,
1865 &nregs);
1867 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
1868 The following code thus handles passing by SIMD/FP registers first. */
1870 nvrn = pcum->aapcs_nvrn;
1872 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
1873 and homogeneous short-vector aggregates (HVA). */
1874 if (allocate_nvrn)
1876 if (!TARGET_FLOAT)
1877 aarch64_err_no_fpadvsimd (mode, "argument");
1879 if (nvrn + nregs <= NUM_FP_ARG_REGS)
1881 pcum->aapcs_nextnvrn = nvrn + nregs;
1882 if (!aarch64_composite_type_p (type, mode))
1884 gcc_assert (nregs == 1);
1885 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
1887 else
1889 rtx par;
1890 int i;
1891 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1892 for (i = 0; i < nregs; i++)
1894 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
1895 V0_REGNUM + nvrn + i);
1896 tmp = gen_rtx_EXPR_LIST
1897 (VOIDmode, tmp,
1898 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
1899 XVECEXP (par, 0, i) = tmp;
1901 pcum->aapcs_reg = par;
1903 return;
1905 else
1907 /* C.3 NSRN is set to 8. */
1908 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
1909 goto on_stack;
1913 ncrn = pcum->aapcs_ncrn;
1914 nregs = size / UNITS_PER_WORD;
1916 /* C6 - C9, though the sign and zero extension semantics are
1917 handled elsewhere. This is the case where the argument fits
1918 entirely in general registers. */
1919 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
1921 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
1923 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
1925 /* C.8 if the argument has an alignment of 16 then the NGRN is
1926 rounded up to the next even number. */
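/* For example, a 16-byte aggregate with 16-byte alignment arriving when
   NGRN == 1 is allocated x2/x3, and x1 is left unused, as required by C.8.  */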
1927 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
1929 ++ncrn;
1930 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
1932 /* NREGS can be 0 when e.g. an empty structure is to be passed.
1933 A reg is still generated for it, but the caller should be smart
1934 enough not to use it. */
1935 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
1937 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
1939 else
1941 rtx par;
1942 int i;
1944 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
1945 for (i = 0; i < nregs; i++)
1947 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
1948 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
1949 GEN_INT (i * UNITS_PER_WORD));
1950 XVECEXP (par, 0, i) = tmp;
1952 pcum->aapcs_reg = par;
1955 pcum->aapcs_nextncrn = ncrn + nregs;
1956 return;
1959 /* C.11 */
1960 pcum->aapcs_nextncrn = NUM_ARG_REGS;
1962 /* The argument is passed on the stack; record the needed number of words for
1963 this argument and align the total size if necessary. */
1964 on_stack:
1965 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
1966 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
1967 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
1968 16 / UNITS_PER_WORD);
1969 return;
1972 /* Implement TARGET_FUNCTION_ARG. */
1974 static rtx
1975 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
1976 const_tree type, bool named)
1978 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
1979 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
1981 if (mode == VOIDmode)
1982 return NULL_RTX;
1984 aarch64_layout_arg (pcum_v, mode, type, named);
1985 return pcum->aapcs_reg;
1988 void
1989 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
1990 const_tree fntype ATTRIBUTE_UNUSED,
1991 rtx libname ATTRIBUTE_UNUSED,
1992 const_tree fndecl ATTRIBUTE_UNUSED,
1993 unsigned n_named ATTRIBUTE_UNUSED)
1995 pcum->aapcs_ncrn = 0;
1996 pcum->aapcs_nvrn = 0;
1997 pcum->aapcs_nextncrn = 0;
1998 pcum->aapcs_nextnvrn = 0;
1999 pcum->pcs_variant = ARM_PCS_AAPCS64;
2000 pcum->aapcs_reg = NULL_RTX;
2001 pcum->aapcs_arg_processed = false;
2002 pcum->aapcs_stack_words = 0;
2003 pcum->aapcs_stack_size = 0;
2005 if (!TARGET_FLOAT
2006 && fndecl && TREE_PUBLIC (fndecl)
2007 && fntype && fntype != error_mark_node)
2009 const_tree type = TREE_TYPE (fntype);
2010 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2011 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2012 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2013 &mode, &nregs, NULL))
2014 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2016 return;
2019 static void
2020 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2021 machine_mode mode,
2022 const_tree type,
2023 bool named)
2025 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2026 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2028 aarch64_layout_arg (pcum_v, mode, type, named);
2029 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2030 != (pcum->aapcs_stack_words != 0));
2031 pcum->aapcs_arg_processed = false;
2032 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2033 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2034 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2035 pcum->aapcs_stack_words = 0;
2036 pcum->aapcs_reg = NULL_RTX;
2040 bool
2041 aarch64_function_arg_regno_p (unsigned regno)
2043 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2044 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2047 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2048 PARM_BOUNDARY bits of alignment, but will be given anything up
2049 to STACK_BOUNDARY bits if the type requires it. This makes sure
2050 that both before and after the layout of each argument, the Next
2051 Stacked Argument Address (NSAA) will have a minimum alignment of
2052 8 bytes. */
2054 static unsigned int
2055 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2057 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2059 if (alignment < PARM_BOUNDARY)
2060 alignment = PARM_BOUNDARY;
2061 if (alignment > STACK_BOUNDARY)
2062 alignment = STACK_BOUNDARY;
2063 return alignment;
2066 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2068 Return true if an argument passed on the stack should be padded upwards,
2069 i.e. if the least-significant byte of the stack slot has useful data.
2071 Small aggregate types are placed at the lowest memory address.

2073 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2075 bool
2076 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2078 /* On little-endian targets, the least significant byte of every stack
2079 argument is passed at the lowest byte address of the stack slot. */
2080 if (!BYTES_BIG_ENDIAN)
2081 return true;
2083 /* Otherwise, integral, floating-point and pointer types are padded downward:
2084 the least significant byte of a stack argument is passed at the highest
2085 byte address of the stack slot. */
2086 if (type
2087 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2088 || POINTER_TYPE_P (type))
2089 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2090 return false;
2092 /* Everything else is padded upward, i.e. the data sits in the first byte of the stack slot. */
2093 return true;
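/* A hedged sketch of the rule above (illustrative, not authoritative):
   on a big-endian target an 'int' passed in an 8-byte stack slot is
   padded downward, i.e. its four data bytes occupy the high-addressed
   half of the slot, whereas a 6-byte struct is padded upward and its
   data starts at the slot's lowest address.  */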
2096 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2098 It specifies padding for the last (and possibly the only)
2099 element of a block move between registers and memory.  Viewing
2100 the block as it sits in memory, padding upward means that the
2101 last element is padded after its most significant byte, while
2102 padding downward means that the last element is padded at its
2103 least significant byte side.
2105 Small aggregates and small complex types are always padded
2106 upwards.
2108 We don't need to worry about homogeneous floating-point or
2109 short-vector aggregates; their move is not affected by the
2110 padding direction determined here. Regardless of endianness,
2111 each element of such an aggregate is put in the least
2112 significant bits of an fp/simd register.
2114 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2115 register has useful data, and return the opposite if the most
2116 significant byte does. */
2118 bool
2119 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2120 bool first ATTRIBUTE_UNUSED)
2123 /* Small composite types are always padded upward. */
2124 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2126 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2127 : GET_MODE_SIZE (mode));
2128 if (size < 2 * UNITS_PER_WORD)
2129 return true;
2132 /* Otherwise, use the default padding. */
2133 return !BYTES_BIG_ENDIAN;
2136 static machine_mode
2137 aarch64_libgcc_cmp_return_mode (void)
2139 return SImode;
2142 static bool
2143 aarch64_frame_pointer_required (void)
2145 /* In aarch64_override_options_after_change
2146 flag_omit_leaf_frame_pointer turns off the frame pointer by
2147 default. Turn it back on now if we've not got a leaf
2148 function. */
2149 if (flag_omit_leaf_frame_pointer
2150 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2151 return true;
2153 return false;
2156 /* Mark the registers that need to be saved by the callee and calculate
2157 the size of the callee-saved registers area and frame record (both FP
2158 and LR may be omitted). */
2159 static void
2160 aarch64_layout_frame (void)
2162 HOST_WIDE_INT offset = 0;
2163 int regno;
2165 if (reload_completed && cfun->machine->frame.laid_out)
2166 return;
2168 #define SLOT_NOT_REQUIRED (-2)
2169 #define SLOT_REQUIRED (-1)
2171 cfun->machine->frame.wb_candidate1 = FIRST_PSEUDO_REGISTER;
2172 cfun->machine->frame.wb_candidate2 = FIRST_PSEUDO_REGISTER;
2174 /* First mark all the registers that really need to be saved... */
2175 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2176 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2178 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2179 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2181 /* ... that includes the eh data registers (if needed)... */
2182 if (crtl->calls_eh_return)
2183 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2184 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2185 = SLOT_REQUIRED;
2187 /* ... and any callee saved register that dataflow says is live. */
2188 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2189 if (df_regs_ever_live_p (regno)
2190 && (regno == R30_REGNUM
2191 || !call_used_regs[regno]))
2192 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2194 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2195 if (df_regs_ever_live_p (regno)
2196 && !call_used_regs[regno])
2197 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2199 if (frame_pointer_needed)
2201 /* FP and LR are placed in the linkage record. */
2202 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2203 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2204 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2205 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2206 cfun->machine->frame.hardfp_offset = 2 * UNITS_PER_WORD;
2207 offset += 2 * UNITS_PER_WORD;
2210 /* Now assign stack slots for them. */
2211 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2212 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2214 cfun->machine->frame.reg_offset[regno] = offset;
2215 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2216 cfun->machine->frame.wb_candidate1 = regno;
2217 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER)
2218 cfun->machine->frame.wb_candidate2 = regno;
2219 offset += UNITS_PER_WORD;
2222 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2223 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2225 cfun->machine->frame.reg_offset[regno] = offset;
2226 if (cfun->machine->frame.wb_candidate1 == FIRST_PSEUDO_REGISTER)
2227 cfun->machine->frame.wb_candidate1 = regno;
2228 else if (cfun->machine->frame.wb_candidate2 == FIRST_PSEUDO_REGISTER
2229 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2230 cfun->machine->frame.wb_candidate2 = regno;
2231 offset += UNITS_PER_WORD;
2234 cfun->machine->frame.padding0 =
2235 (ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT) - offset);
2236 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2238 cfun->machine->frame.saved_regs_size = offset;
2240 cfun->machine->frame.hard_fp_offset
2241 = ROUND_UP (cfun->machine->frame.saved_varargs_size
2242 + get_frame_size ()
2243 + cfun->machine->frame.saved_regs_size,
2244 STACK_BOUNDARY / BITS_PER_UNIT);
2246 cfun->machine->frame.frame_size
2247 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2248 + crtl->outgoing_args_size,
2249 STACK_BOUNDARY / BITS_PER_UNIT);
2251 cfun->machine->frame.laid_out = true;
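/* A worked example of the layout above (a hypothetical function, not
   taken from the sources): assume frame_pointer_needed, callee saves
   of x19, x20 and d8, no varargs save area, 24 bytes of locals and no
   outgoing arguments.  Then reg_offset[R29] == 0, reg_offset[R30] == 8,
   reg_offset[x19] == 16, reg_offset[x20] == 24, reg_offset[d8] == 32,
   padding0 == 8 and saved_regs_size == 48; wb_candidate1/2 are R29 and
   R30.  With get_frame_size () == 24, hard_fp_offset
   == ROUND_UP (24 + 48, 16) == 80, and since outgoing_args_size is 0,
   frame_size == 80 as well.  */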
2254 static bool
2255 aarch64_register_saved_on_entry (int regno)
2257 return cfun->machine->frame.reg_offset[regno] >= 0;
2260 static unsigned
2261 aarch64_next_callee_save (unsigned regno, unsigned limit)
2263 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2264 regno ++;
2265 return regno;
2268 static void
2269 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2270 HOST_WIDE_INT adjustment)
2272 rtx base_rtx = stack_pointer_rtx;
2273 rtx insn, reg, mem;
2275 reg = gen_rtx_REG (mode, regno);
2276 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2277 plus_constant (Pmode, base_rtx, -adjustment));
2278 mem = gen_rtx_MEM (mode, mem);
2280 insn = emit_move_insn (mem, reg);
2281 RTX_FRAME_RELATED_P (insn) = 1;
2284 static rtx
2285 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2286 HOST_WIDE_INT adjustment)
2288 switch (mode)
2290 case DImode:
2291 return gen_storewb_pairdi_di (base, base, reg, reg2,
2292 GEN_INT (-adjustment),
2293 GEN_INT (UNITS_PER_WORD - adjustment));
2294 case DFmode:
2295 return gen_storewb_pairdf_di (base, base, reg, reg2,
2296 GEN_INT (-adjustment),
2297 GEN_INT (UNITS_PER_WORD - adjustment));
2298 default:
2299 gcc_unreachable ();
2303 static void
2304 aarch64_pushwb_pair_reg (machine_mode mode, unsigned regno1,
2305 unsigned regno2, HOST_WIDE_INT adjustment)
2307 rtx_insn *insn;
2308 rtx reg1 = gen_rtx_REG (mode, regno1);
2309 rtx reg2 = gen_rtx_REG (mode, regno2);
2311 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2312 reg2, adjustment));
2313 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2314 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2315 RTX_FRAME_RELATED_P (insn) = 1;
2318 static rtx
2319 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2320 HOST_WIDE_INT adjustment)
2322 switch (mode)
2324 case DImode:
2325 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2326 GEN_INT (UNITS_PER_WORD));
2327 case DFmode:
2328 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2329 GEN_INT (UNITS_PER_WORD));
2330 default:
2331 gcc_unreachable ();
2335 static rtx
2336 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
2337 rtx reg2)
2339 switch (mode)
2341 case DImode:
2342 return gen_store_pairdi (mem1, reg1, mem2, reg2);
2344 case DFmode:
2345 return gen_store_pairdf (mem1, reg1, mem2, reg2);
2347 default:
2348 gcc_unreachable ();
2352 static rtx
2353 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
2354 rtx mem2)
2356 switch (mode)
2358 case DImode:
2359 return gen_load_pairdi (reg1, mem1, reg2, mem2);
2361 case DFmode:
2362 return gen_load_pairdf (reg1, mem1, reg2, mem2);
2364 default:
2365 gcc_unreachable ();
2370 static void
2371 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
2372 unsigned start, unsigned limit, bool skip_wb)
2374 rtx_insn *insn;
2375 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2376 ? gen_frame_mem : gen_rtx_MEM);
2377 unsigned regno;
2378 unsigned regno2;
2380 for (regno = aarch64_next_callee_save (start, limit);
2381 regno <= limit;
2382 regno = aarch64_next_callee_save (regno + 1, limit))
2384 rtx reg, mem;
2385 HOST_WIDE_INT offset;
2387 if (skip_wb
2388 && (regno == cfun->machine->frame.wb_candidate1
2389 || regno == cfun->machine->frame.wb_candidate2))
2390 continue;
2392 reg = gen_rtx_REG (mode, regno);
2393 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2394 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2395 offset));
2397 regno2 = aarch64_next_callee_save (regno + 1, limit);
2399 if (regno2 <= limit
2400 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2401 == cfun->machine->frame.reg_offset[regno2]))
2404 rtx reg2 = gen_rtx_REG (mode, regno2);
2405 rtx mem2;
2407 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2408 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
2409 offset));
2410 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
2411 reg2));
2413 /* The first part of a frame-related parallel insn is
2414 always assumed to be relevant to the frame
2415 calculations; subsequent parts are only
2416 frame-related if explicitly marked. */
2417 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2418 regno = regno2;
2420 else
2421 insn = emit_move_insn (mem, reg);
2423 RTX_FRAME_RELATED_P (insn) = 1;
2427 static void
2428 aarch64_restore_callee_saves (machine_mode mode,
2429 HOST_WIDE_INT start_offset, unsigned start,
2430 unsigned limit, bool skip_wb, rtx *cfi_ops)
2432 rtx base_rtx = stack_pointer_rtx;
2433 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
2434 ? gen_frame_mem : gen_rtx_MEM);
2435 unsigned regno;
2436 unsigned regno2;
2437 HOST_WIDE_INT offset;
2439 for (regno = aarch64_next_callee_save (start, limit);
2440 regno <= limit;
2441 regno = aarch64_next_callee_save (regno + 1, limit))
2443 rtx reg, mem;
2445 if (skip_wb
2446 && (regno == cfun->machine->frame.wb_candidate1
2447 || regno == cfun->machine->frame.wb_candidate2))
2448 continue;
2450 reg = gen_rtx_REG (mode, regno);
2451 offset = start_offset + cfun->machine->frame.reg_offset[regno];
2452 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2454 regno2 = aarch64_next_callee_save (regno + 1, limit);
2456 if (regno2 <= limit
2457 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
2458 == cfun->machine->frame.reg_offset[regno2]))
2460 rtx reg2 = gen_rtx_REG (mode, regno2);
2461 rtx mem2;
2463 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
2464 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
2465 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
2467 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
2468 regno = regno2;
2470 else
2471 emit_move_insn (reg, mem);
2472 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
2476 /* AArch64 stack frames generated by this compiler look like:
2478 +-------------------------------+
2480 | incoming stack arguments |
2482 +-------------------------------+
2483 | | <-- incoming stack pointer (aligned)
2484 | callee-allocated save area |
2485 | for register varargs |
2487 +-------------------------------+
2488 | local variables | <-- frame_pointer_rtx
2490 +-------------------------------+
2491 | padding0 | \
2492 +-------------------------------+ |
2493 | callee-saved registers | | frame.saved_regs_size
2494 +-------------------------------+ |
2495 | LR' | |
2496 +-------------------------------+ |
2497 | FP' | / <- hard_frame_pointer_rtx (aligned)
2498 +-------------------------------+
2499 | dynamic allocation |
2500 +-------------------------------+
2501 | padding |
2502 +-------------------------------+
2503 | outgoing stack arguments | <-- arg_pointer
2505 +-------------------------------+
2506 | | <-- stack_pointer_rtx (aligned)
2508 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
2509 but leave frame_pointer_rtx and hard_frame_pointer_rtx
2510 unchanged. */
2512 /* Generate the prologue instructions for entry into a function.
2513 Establish the stack frame by decreasing the stack pointer with a
2514 properly calculated size and, if necessary, create a frame record
2515 filled with the values of LR and previous frame pointer. The
2516 current FP is also set up if it is in use. */
2518 void
2519 aarch64_expand_prologue (void)
2521 /* sub sp, sp, #<frame_size>
2522 stp {fp, lr}, [sp, #<frame_size> - 16]
2523 add fp, sp, #<frame_size> - hardfp_offset
2524 stp {cs_reg}, [fp, #-16] etc.
2526 sub sp, sp, <final_adjustment_if_any>
2528 HOST_WIDE_INT frame_size, offset;
2529 HOST_WIDE_INT fp_offset; /* Offset from hard FP to SP. */
2530 HOST_WIDE_INT hard_fp_offset;
2531 rtx_insn *insn;
2533 aarch64_layout_frame ();
2535 offset = frame_size = cfun->machine->frame.frame_size;
2536 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2537 fp_offset = frame_size - hard_fp_offset;
2539 if (flag_stack_usage_info)
2540 current_function_static_stack_size = frame_size;
2542 /* Store pairs and load pairs have a range of only -512 to 504. */
2543 if (offset >= 512)
2545 /* When the frame has a large size, an initial decrease is done on
2546 the stack pointer to jump over the callee-allocated save area for
2547 register varargs, the local variable area and/or the callee-saved
2548 register area. This will allow the pre-index write-back
2549 store pair instructions to be used for setting up the stack frame
2550 efficiently. */
2551 offset = hard_fp_offset;
2552 if (offset >= 512)
2553 offset = cfun->machine->frame.saved_regs_size;
2555 frame_size -= (offset + crtl->outgoing_args_size);
2556 fp_offset = 0;
2558 if (frame_size >= 0x1000000)
2560 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2561 emit_move_insn (op0, GEN_INT (-frame_size));
2562 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2564 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2565 gen_rtx_SET (stack_pointer_rtx,
2566 plus_constant (Pmode, stack_pointer_rtx,
2567 -frame_size)));
2568 RTX_FRAME_RELATED_P (insn) = 1;
2570 else if (frame_size > 0)
2572 int hi_ofs = frame_size & 0xfff000;
2573 int lo_ofs = frame_size & 0x000fff;
2575 if (hi_ofs)
2577 insn = emit_insn (gen_add2_insn
2578 (stack_pointer_rtx, GEN_INT (-hi_ofs)));
2579 RTX_FRAME_RELATED_P (insn) = 1;
2581 if (lo_ofs)
2583 insn = emit_insn (gen_add2_insn
2584 (stack_pointer_rtx, GEN_INT (-lo_ofs)));
2585 RTX_FRAME_RELATED_P (insn) = 1;
2589 else
2590 frame_size = -1;
2592 if (offset > 0)
2594 bool skip_wb = false;
2596 if (frame_pointer_needed)
2598 skip_wb = true;
2600 if (fp_offset)
2602 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2603 GEN_INT (-offset)));
2604 RTX_FRAME_RELATED_P (insn) = 1;
2606 aarch64_save_callee_saves (DImode, fp_offset, R29_REGNUM,
2607 R30_REGNUM, false);
2609 else
2610 aarch64_pushwb_pair_reg (DImode, R29_REGNUM, R30_REGNUM, offset);
2612 /* Set up frame pointer to point to the location of the
2613 previous frame pointer on the stack. */
2614 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
2615 stack_pointer_rtx,
2616 GEN_INT (fp_offset)));
2617 RTX_FRAME_RELATED_P (insn) = 1;
2618 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
2620 else
2622 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2623 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2625 if (fp_offset
2626 || reg1 == FIRST_PSEUDO_REGISTER
2627 || (reg2 == FIRST_PSEUDO_REGISTER
2628 && offset >= 256))
2630 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2631 GEN_INT (-offset)));
2632 RTX_FRAME_RELATED_P (insn) = 1;
2634 else
2636 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2638 skip_wb = true;
2640 if (reg2 == FIRST_PSEUDO_REGISTER)
2641 aarch64_pushwb_single_reg (mode1, reg1, offset);
2642 else
2643 aarch64_pushwb_pair_reg (mode1, reg1, reg2, offset);
2647 aarch64_save_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2648 skip_wb);
2649 aarch64_save_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2650 skip_wb);
2653 /* When offset >= 512,
2654 sub sp, sp, #<outgoing_args_size> */
2655 if (frame_size > -1)
2657 if (crtl->outgoing_args_size > 0)
2659 insn = emit_insn (gen_add2_insn
2660 (stack_pointer_rtx,
2661 GEN_INT (- crtl->outgoing_args_size)));
2662 RTX_FRAME_RELATED_P (insn) = 1;
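/* A hedged sketch of the large-frame path above (register choices and
   exact encodings are illustrative): if the remaining adjustment, after
   the save area and outgoing-args region have been peeled off, is
   0x12345 bytes, it is split into hi_ofs == 0x12000 and lo_ofs ==
   0x345, giving roughly

      sub	sp, sp, #0x12000
      sub	sp, sp, #0x345

   while a remaining adjustment of 0x1000000 bytes or more instead goes
   through the IP0 scratch register (typically x16):

      mov	x16, #-<frame_size>	// whatever mov/movk sequence is needed
      add	sp, sp, x16

   before the callee saves are stored.  */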
2667 /* Return TRUE if we can use a simple_return insn.
2669 This function checks whether the callee saved stack is empty, which
2670 means no restore actions are needed. The pro_and_epilogue pass uses
2671 this to check whether the shrink-wrapping optimization is feasible. */
2673 bool
2674 aarch64_use_return_insn_p (void)
2676 if (!reload_completed)
2677 return false;
2679 if (crtl->profile)
2680 return false;
2682 aarch64_layout_frame ();
2684 return cfun->machine->frame.frame_size == 0;
2687 /* Generate the epilogue instructions for returning from a function. */
2688 void
2689 aarch64_expand_epilogue (bool for_sibcall)
2691 HOST_WIDE_INT frame_size, offset;
2692 HOST_WIDE_INT fp_offset;
2693 HOST_WIDE_INT hard_fp_offset;
2694 rtx_insn *insn;
2695 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
2696 bool need_barrier_p = (get_frame_size () != 0
2697 || cfun->machine->frame.saved_varargs_size);
2699 aarch64_layout_frame ();
2701 offset = frame_size = cfun->machine->frame.frame_size;
2702 hard_fp_offset = cfun->machine->frame.hard_fp_offset;
2703 fp_offset = frame_size - hard_fp_offset;
2705 /* Store pairs and load pairs have a range of only -512 to 504. */
2706 if (offset >= 512)
2708 offset = hard_fp_offset;
2709 if (offset >= 512)
2710 offset = cfun->machine->frame.saved_regs_size;
2712 frame_size -= (offset + crtl->outgoing_args_size);
2713 fp_offset = 0;
2714 if (!frame_pointer_needed && crtl->outgoing_args_size > 0)
2716 insn = emit_insn (gen_add2_insn
2717 (stack_pointer_rtx,
2718 GEN_INT (crtl->outgoing_args_size)));
2719 RTX_FRAME_RELATED_P (insn) = 1;
2722 else
2723 frame_size = -1;
2725 /* If there were outgoing arguments or we've done dynamic stack
2726 allocation, then restore the stack pointer from the frame
2727 pointer. This is at most one insn and more efficient than using
2728 GCC's internal mechanism. */
2729 if (frame_pointer_needed
2730 && (crtl->outgoing_args_size || cfun->calls_alloca))
2732 if (cfun->calls_alloca)
2733 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2735 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
2736 hard_frame_pointer_rtx,
2737 GEN_INT (0)));
2738 offset = offset - fp_offset;
2741 if (offset > 0)
2743 unsigned reg1 = cfun->machine->frame.wb_candidate1;
2744 unsigned reg2 = cfun->machine->frame.wb_candidate2;
2745 bool skip_wb = true;
2746 rtx cfi_ops = NULL;
2748 if (frame_pointer_needed)
2749 fp_offset = 0;
2750 else if (fp_offset
2751 || reg1 == FIRST_PSEUDO_REGISTER
2752 || (reg2 == FIRST_PSEUDO_REGISTER
2753 && offset >= 256))
2754 skip_wb = false;
2756 aarch64_restore_callee_saves (DImode, fp_offset, R0_REGNUM, R30_REGNUM,
2757 skip_wb, &cfi_ops);
2758 aarch64_restore_callee_saves (DFmode, fp_offset, V0_REGNUM, V31_REGNUM,
2759 skip_wb, &cfi_ops);
2761 if (need_barrier_p)
2762 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2764 if (skip_wb)
2766 machine_mode mode1 = (reg1 <= R30_REGNUM) ? DImode : DFmode;
2767 rtx rreg1 = gen_rtx_REG (mode1, reg1);
2769 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg1, cfi_ops);
2770 if (reg2 == FIRST_PSEUDO_REGISTER)
2772 rtx mem = plus_constant (Pmode, stack_pointer_rtx, offset);
2773 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2774 mem = gen_rtx_MEM (mode1, mem);
2775 insn = emit_move_insn (rreg1, mem);
2777 else
2779 rtx rreg2 = gen_rtx_REG (mode1, reg2);
2781 cfi_ops = alloc_reg_note (REG_CFA_RESTORE, rreg2, cfi_ops);
2782 insn = emit_insn (aarch64_gen_loadwb_pair
2783 (mode1, stack_pointer_rtx, rreg1,
2784 rreg2, offset));
2787 else
2789 insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
2790 GEN_INT (offset)));
2793 /* Reset the CFA to be SP + FRAME_SIZE. */
2794 rtx new_cfa = stack_pointer_rtx;
2795 if (frame_size > 0)
2796 new_cfa = plus_constant (Pmode, new_cfa, frame_size);
2797 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
2798 REG_NOTES (insn) = cfi_ops;
2799 RTX_FRAME_RELATED_P (insn) = 1;
2802 if (frame_size > 0)
2804 if (need_barrier_p)
2805 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
2807 if (frame_size >= 0x1000000)
2809 rtx op0 = gen_rtx_REG (Pmode, IP0_REGNUM);
2810 emit_move_insn (op0, GEN_INT (frame_size));
2811 insn = emit_insn (gen_add2_insn (stack_pointer_rtx, op0));
2813 else
2815 int hi_ofs = frame_size & 0xfff000;
2816 int lo_ofs = frame_size & 0x000fff;
2818 if (hi_ofs && lo_ofs)
2820 insn = emit_insn (gen_add2_insn
2821 (stack_pointer_rtx, GEN_INT (hi_ofs)));
2822 RTX_FRAME_RELATED_P (insn) = 1;
2823 frame_size = lo_ofs;
2825 insn = emit_insn (gen_add2_insn
2826 (stack_pointer_rtx, GEN_INT (frame_size)));
2829 /* Reset the CFA to be SP + 0. */
2830 add_reg_note (insn, REG_CFA_DEF_CFA, stack_pointer_rtx);
2831 RTX_FRAME_RELATED_P (insn) = 1;
2834 /* Stack adjustment for exception handler. */
2835 if (crtl->calls_eh_return)
2837 /* We need to unwind the stack by the offset computed by
2838 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
2839 to be SP; letting the CFA move during this adjustment
2840 is just as correct as retaining the CFA from the body
2841 of the function. Therefore, do nothing special. */
2842 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
2845 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
2846 if (!for_sibcall)
2847 emit_jump_insn (ret_rtx);
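/* For a small frame the code above typically reduces to something like
   (illustrative only; assumes frame_pointer_needed, saves of x29, x30,
   x19 and x20, and no locals or outgoing arguments):

      ldp	x19, x20, [sp, #16]
      ldp	x29, x30, [sp], #32
      ret

   i.e. plain restores of the remaining callee saves followed by a
   single post-indexed load pair that also deallocates the frame.  */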
2850 /* Return the place to copy the exception unwinding return address to.
2851 This will probably be a stack slot, but could (in theory) be the
2852 return register). */
2854 aarch64_final_eh_return_addr (void)
2856 HOST_WIDE_INT fp_offset;
2858 aarch64_layout_frame ();
2860 fp_offset = cfun->machine->frame.frame_size
2861 - cfun->machine->frame.hard_fp_offset;
2863 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
2864 return gen_rtx_REG (DImode, LR_REGNUM);
2866 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
2867 result in a store to save LR introduced by builtin_eh_return () being
2868 incorrectly deleted because the alias is not detected.
2869 So in the calculation of the address to copy the exception unwinding
2870 return address to, we distinguish two cases.
2871 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
2872 we return a SP-relative location since all the addresses are SP-relative
2873 in this case. This prevents the store from being optimized away.
2874 If the fp_offset is not 0, then the addresses will be FP-relative and
2875 therefore we return a FP-relative location. */
2877 if (frame_pointer_needed)
2879 if (fp_offset)
2880 return gen_frame_mem (DImode,
2881 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
2882 else
2883 return gen_frame_mem (DImode,
2884 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
2887 /* If FP is not needed, we calculate the location of LR, which would be
2888 at the top of the saved registers block. */
2890 return gen_frame_mem (DImode,
2891 plus_constant (Pmode,
2892 stack_pointer_rtx,
2893 fp_offset
2894 + cfun->machine->frame.saved_regs_size
2895 - 2 * UNITS_PER_WORD));
2898 /* Possibly output code to build up a constant in a register. For
2899 the benefit of the costs infrastructure, returns the number of
2900 instructions which would be emitted. GENERATE inhibits or
2901 enables code generation. */
2903 static int
2904 aarch64_build_constant (int regnum, HOST_WIDE_INT val, bool generate)
2906 int insns = 0;
2908 if (aarch64_bitmask_imm (val, DImode))
2910 if (generate)
2911 emit_move_insn (gen_rtx_REG (Pmode, regnum), GEN_INT (val));
2912 insns = 1;
2914 else
2916 int i;
2917 int ncount = 0;
2918 int zcount = 0;
2919 HOST_WIDE_INT valp = val >> 16;
2920 HOST_WIDE_INT valm;
2921 HOST_WIDE_INT tval;
2923 for (i = 16; i < 64; i += 16)
2925 valm = (valp & 0xffff);
2927 if (valm != 0)
2928 ++ zcount;
2930 if (valm != 0xffff)
2931 ++ ncount;
2933 valp >>= 16;
2936 /* zcount contains the number of additional MOVK instructions
2937 required if the constant is built up with an initial MOVZ instruction,
2938 while ncount is the number of MOVK instructions required if starting
2939 with a MOVN instruction. Choose the sequence that yields the fewest
2940 instructions, preferring MOVZ instructions when both counts are
2941 the same. */
2942 if (ncount < zcount)
2944 if (generate)
2945 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2946 GEN_INT (val | ~(HOST_WIDE_INT) 0xffff));
2947 tval = 0xffff;
2948 insns++;
2950 else
2952 if (generate)
2953 emit_move_insn (gen_rtx_REG (Pmode, regnum),
2954 GEN_INT (val & 0xffff));
2955 tval = 0;
2956 insns++;
2959 val >>= 16;
2961 for (i = 16; i < 64; i += 16)
2963 if ((val & 0xffff) != tval)
2965 if (generate)
2966 emit_insn (gen_insv_immdi (gen_rtx_REG (Pmode, regnum),
2967 GEN_INT (i),
2968 GEN_INT (val & 0xffff)));
2969 insns++;
2971 val >>= 16;
2974 return insns;
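/* Worked example (illustrative, not from the sources): for
   val == 0xffffffff00001234 the three upper 16-bit chunks are 0x0000,
   0xffff and 0xffff, so zcount == 2 (chunks that would need a MOVK
   after an initial MOVZ) and ncount == 1 (chunks that would need a
   MOVK after an initial MOVN).  Since ncount < zcount the MOVN route
   is chosen: a single move of 0xffffffffffff1234 (a MOVN-style
   constant) followed by one MOVK of 0x0000 into bits [31:16], i.e.
   two instructions instead of three.  */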
2977 static void
2978 aarch64_add_constant (int regnum, int scratchreg, HOST_WIDE_INT delta)
2980 HOST_WIDE_INT mdelta = delta;
2981 rtx this_rtx = gen_rtx_REG (Pmode, regnum);
2982 rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
2984 if (mdelta < 0)
2985 mdelta = -mdelta;
2987 if (mdelta >= 4096 * 4096)
2989 (void) aarch64_build_constant (scratchreg, delta, true);
2990 emit_insn (gen_add3_insn (this_rtx, this_rtx, scratch_rtx));
2992 else if (mdelta > 0)
2994 if (mdelta >= 4096)
2996 emit_insn (gen_rtx_SET (scratch_rtx, GEN_INT (mdelta / 4096)));
2997 rtx shift = gen_rtx_ASHIFT (Pmode, scratch_rtx, GEN_INT (12));
2998 if (delta < 0)
2999 emit_insn (gen_rtx_SET (this_rtx,
3000 gen_rtx_MINUS (Pmode, this_rtx, shift)));
3001 else
3002 emit_insn (gen_rtx_SET (this_rtx,
3003 gen_rtx_PLUS (Pmode, this_rtx, shift)));
3005 if (mdelta % 4096 != 0)
3007 scratch_rtx = GEN_INT ((delta < 0 ? -1 : 1) * (mdelta % 4096));
3008 emit_insn (gen_rtx_SET (this_rtx,
3009 gen_rtx_PLUS (Pmode, this_rtx, scratch_rtx)));
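/* Illustrative example (register names are placeholders): a delta of
   10000 takes the second branch above.  10000 / 4096 == 2 is moved
   into the scratch register and added shifted left by 12 (adding
   8192), after which the remainder 10000 % 4096 == 1808 is added
   directly, giving roughly

      mov	<scratch>, #2
      add	<this>, <this>, <scratch>, lsl #12
      add	<this>, <this>, #1808

   A delta of 4096 * 4096 or more instead builds the full constant via
   aarch64_build_constant and adds it with a single register-register
   add.  */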
3014 /* Output code to add DELTA to the first argument, and then jump
3015 to FUNCTION. Used for C++ multiple inheritance. */
3016 static void
3017 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3018 HOST_WIDE_INT delta,
3019 HOST_WIDE_INT vcall_offset,
3020 tree function)
3022 /* The this pointer is always in x0. Note that this differs from
3023 Arm where the this pointer may be bumped to r1 if r0 is required
3024 to return a pointer to an aggregate. On AArch64 a result value
3025 pointer will be in x8. */
3026 int this_regno = R0_REGNUM;
3027 rtx this_rtx, temp0, temp1, addr, funexp;
3028 rtx_insn *insn;
3030 reload_completed = 1;
3031 emit_note (NOTE_INSN_PROLOGUE_END);
3033 if (vcall_offset == 0)
3034 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3035 else
3037 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3039 this_rtx = gen_rtx_REG (Pmode, this_regno);
3040 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3041 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3043 addr = this_rtx;
3044 if (delta != 0)
3046 if (delta >= -256 && delta < 256)
3047 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3048 plus_constant (Pmode, this_rtx, delta));
3049 else
3050 aarch64_add_constant (this_regno, IP1_REGNUM, delta);
3053 if (Pmode == ptr_mode)
3054 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3055 else
3056 aarch64_emit_move (temp0,
3057 gen_rtx_ZERO_EXTEND (Pmode,
3058 gen_rtx_MEM (ptr_mode, addr)));
3060 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3061 addr = plus_constant (Pmode, temp0, vcall_offset);
3062 else
3064 (void) aarch64_build_constant (IP1_REGNUM, vcall_offset, true);
3065 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3068 if (Pmode == ptr_mode)
3069 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3070 else
3071 aarch64_emit_move (temp1,
3072 gen_rtx_SIGN_EXTEND (Pmode,
3073 gen_rtx_MEM (ptr_mode, addr)));
3075 emit_insn (gen_add2_insn (this_rtx, temp1));
3078 /* Generate a tail call to the target function. */
3079 if (!TREE_USED (function))
3081 assemble_external (function);
3082 TREE_USED (function) = 1;
3084 funexp = XEXP (DECL_RTL (function), 0);
3085 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3086 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3087 SIBLING_CALL_P (insn) = 1;
3089 insn = get_insns ();
3090 shorten_branches (insn);
3091 final_start_function (insn, file, 1);
3092 final (insn, file, 1);
3093 final_end_function ();
3095 /* Stop pretending to be a post-reload pass. */
3096 reload_completed = 0;
3099 static bool
3100 aarch64_tls_referenced_p (rtx x)
3102 if (!TARGET_HAVE_TLS)
3103 return false;
3104 subrtx_iterator::array_type array;
3105 FOR_EACH_SUBRTX (iter, array, x, ALL)
3107 const_rtx x = *iter;
3108 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3109 return true;
3110 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3111 TLS offsets, not real symbol references. */
3112 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3113 iter.skip_subrtxes ();
3115 return false;
3119 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3120 a left shift of 0 or 12 bits. */
3121 bool
3122 aarch64_uimm12_shift (HOST_WIDE_INT val)
3124 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3125 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3130 /* Return true if val is an immediate that can be loaded into a
3131 register by a MOVZ instruction. */
3132 static bool
3133 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3135 if (GET_MODE_SIZE (mode) > 4)
3137 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3138 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3139 return 1;
3141 else
3143 /* Ignore sign extension. */
3144 val &= (HOST_WIDE_INT) 0xffffffff;
3146 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3147 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
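/* For instance (illustrative): 0x12340000 has its only non-zero bits
   in the single 16-bit chunk at position 16, so it is MOVZ-encodable
   in both SImode and DImode; 0xabcd000000000000 fits the chunk at
   position 48 and is accepted for DImode; 0x12345678 spans two chunks
   and is rejected here, leaving it to aarch64_bitmask_imm (or a
   multi-instruction sequence).  Note that aarch64_move_imm below also
   tries ~val, which picks up the corresponding MOVN forms.  */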
3150 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3152 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3154 0x0000000100000001ull,
3155 0x0001000100010001ull,
3156 0x0101010101010101ull,
3157 0x1111111111111111ull,
3158 0x5555555555555555ull,
3162 /* Return true if val is a valid bitmask immediate. */
3164 bool
3165 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3167 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3168 int bits;
3170 /* Check for a single sequence of one bits and return quickly if so.
3171 The special cases of all ones and all zeroes return false. */
3172 val = (unsigned HOST_WIDE_INT) val_in;
3173 tmp = val + (val & -val);
3175 if (tmp == (tmp & -tmp))
3176 return (val + 1) > 1;
3178 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3179 if (mode == SImode)
3180 val = (val << 32) | (val & 0xffffffff);
3182 /* Invert if the immediate doesn't start with a zero bit - this means we
3183 only need to search for sequences of one bits. */
3184 if (val & 1)
3185 val = ~val;
3187 /* Find the first set bit and set tmp to val with the first sequence of one
3188 bits removed. Return success if there is a single sequence of ones. */
3189 first_one = val & -val;
3190 tmp = val & (val + first_one);
3192 if (tmp == 0)
3193 return true;
3195 /* Find the next set bit and compute the difference in bit position. */
3196 next_one = tmp & -tmp;
3197 bits = clz_hwi (first_one) - clz_hwi (next_one);
3198 mask = val ^ tmp;
3200 /* Check the bit position difference is a power of 2, and that the first
3201 sequence of one bits fits within 'bits' bits. */
3202 if ((mask >> bits) != 0 || bits != (bits & -bits))
3203 return false;
3205 /* Check the sequence of one bits is repeated 64/bits times. */
3206 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
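/* A worked run of the algorithm above (illustrative): val ==
   0x3333333333333333, the 64-bit repetition of binary 0011.  The quick
   single-run test fails; the value starts with a one bit, so it is
   inverted to 0xcccccccccccccccc; the first run of ones starts at bit 2
   and the next run 4 bit positions later; the run fits within those 4
   bits; and 0xc * 0x1111111111111111 reproduces the inverted value, so
   the constant is accepted as a bitmask (logical) immediate.  By
   contrast, 0x12345678 is rejected.  */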
3210 /* Return true if val is an immediate that can be loaded into a
3211 register in a single instruction. */
3212 bool
3213 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3215 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3216 return 1;
3217 return aarch64_bitmask_imm (val, mode);
3220 static bool
3221 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3223 rtx base, offset;
3225 if (GET_CODE (x) == HIGH)
3226 return true;
3228 split_const (x, &base, &offset);
3229 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3231 if (aarch64_classify_symbol (base, offset)
3232 != SYMBOL_FORCE_TO_MEM)
3233 return true;
3234 else
3235 /* Avoid generating a 64-bit relocation in ILP32; leave it
3236 to aarch64_expand_mov_immediate to handle it properly. */
3237 return mode != ptr_mode;
3240 return aarch64_tls_referenced_p (x);
3243 /* Return true if register REGNO is a valid index register.
3244 STRICT_P is true if REG_OK_STRICT is in effect. */
3246 bool
3247 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3249 if (!HARD_REGISTER_NUM_P (regno))
3251 if (!strict_p)
3252 return true;
3254 if (!reg_renumber)
3255 return false;
3257 regno = reg_renumber[regno];
3259 return GP_REGNUM_P (regno);
3262 /* Return true if register REGNO is a valid base register for mode MODE.
3263 STRICT_P is true if REG_OK_STRICT is in effect. */
3265 bool
3266 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3268 if (!HARD_REGISTER_NUM_P (regno))
3270 if (!strict_p)
3271 return true;
3273 if (!reg_renumber)
3274 return false;
3276 regno = reg_renumber[regno];
3279 /* The fake registers will be eliminated to either the stack or
3280 hard frame pointer, both of which are usually valid base registers.
3281 Reload deals with the cases where the eliminated form isn't valid. */
3282 return (GP_REGNUM_P (regno)
3283 || regno == SP_REGNUM
3284 || regno == FRAME_POINTER_REGNUM
3285 || regno == ARG_POINTER_REGNUM);
3288 /* Return true if X is a valid base register for mode MODE.
3289 STRICT_P is true if REG_OK_STRICT is in effect. */
3291 static bool
3292 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3294 if (!strict_p && GET_CODE (x) == SUBREG)
3295 x = SUBREG_REG (x);
3297 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3300 /* Return true if address offset is a valid index. If it is, fill in INFO
3301 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3303 static bool
3304 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3305 machine_mode mode, bool strict_p)
3307 enum aarch64_address_type type;
3308 rtx index;
3309 int shift;
3311 /* (reg:P) */
3312 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3313 && GET_MODE (x) == Pmode)
3315 type = ADDRESS_REG_REG;
3316 index = x;
3317 shift = 0;
3319 /* (sign_extend:DI (reg:SI)) */
3320 else if ((GET_CODE (x) == SIGN_EXTEND
3321 || GET_CODE (x) == ZERO_EXTEND)
3322 && GET_MODE (x) == DImode
3323 && GET_MODE (XEXP (x, 0)) == SImode)
3325 type = (GET_CODE (x) == SIGN_EXTEND)
3326 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3327 index = XEXP (x, 0);
3328 shift = 0;
3330 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3331 else if (GET_CODE (x) == MULT
3332 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3333 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3334 && GET_MODE (XEXP (x, 0)) == DImode
3335 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3336 && CONST_INT_P (XEXP (x, 1)))
3338 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3339 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3340 index = XEXP (XEXP (x, 0), 0);
3341 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3343 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3344 else if (GET_CODE (x) == ASHIFT
3345 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3346 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3347 && GET_MODE (XEXP (x, 0)) == DImode
3348 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3349 && CONST_INT_P (XEXP (x, 1)))
3351 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3352 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3353 index = XEXP (XEXP (x, 0), 0);
3354 shift = INTVAL (XEXP (x, 1));
3356 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3357 else if ((GET_CODE (x) == SIGN_EXTRACT
3358 || GET_CODE (x) == ZERO_EXTRACT)
3359 && GET_MODE (x) == DImode
3360 && GET_CODE (XEXP (x, 0)) == MULT
3361 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3362 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3364 type = (GET_CODE (x) == SIGN_EXTRACT)
3365 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3366 index = XEXP (XEXP (x, 0), 0);
3367 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3368 if (INTVAL (XEXP (x, 1)) != 32 + shift
3369 || INTVAL (XEXP (x, 2)) != 0)
3370 shift = -1;
3372 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3373 (const_int 0xffffffff<<shift)) */
3374 else if (GET_CODE (x) == AND
3375 && GET_MODE (x) == DImode
3376 && GET_CODE (XEXP (x, 0)) == MULT
3377 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3378 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3379 && CONST_INT_P (XEXP (x, 1)))
3381 type = ADDRESS_REG_UXTW;
3382 index = XEXP (XEXP (x, 0), 0);
3383 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3384 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3385 shift = -1;
3387 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3388 else if ((GET_CODE (x) == SIGN_EXTRACT
3389 || GET_CODE (x) == ZERO_EXTRACT)
3390 && GET_MODE (x) == DImode
3391 && GET_CODE (XEXP (x, 0)) == ASHIFT
3392 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3393 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3395 type = (GET_CODE (x) == SIGN_EXTRACT)
3396 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3397 index = XEXP (XEXP (x, 0), 0);
3398 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3399 if (INTVAL (XEXP (x, 1)) != 32 + shift
3400 || INTVAL (XEXP (x, 2)) != 0)
3401 shift = -1;
3403 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3404 (const_int 0xffffffff<<shift)) */
3405 else if (GET_CODE (x) == AND
3406 && GET_MODE (x) == DImode
3407 && GET_CODE (XEXP (x, 0)) == ASHIFT
3408 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3409 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3410 && CONST_INT_P (XEXP (x, 1)))
3412 type = ADDRESS_REG_UXTW;
3413 index = XEXP (XEXP (x, 0), 0);
3414 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3415 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3416 shift = -1;
3418 /* (mult:P (reg:P) (const_int scale)) */
3419 else if (GET_CODE (x) == MULT
3420 && GET_MODE (x) == Pmode
3421 && GET_MODE (XEXP (x, 0)) == Pmode
3422 && CONST_INT_P (XEXP (x, 1)))
3424 type = ADDRESS_REG_REG;
3425 index = XEXP (x, 0);
3426 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3428 /* (ashift:P (reg:P) (const_int shift)) */
3429 else if (GET_CODE (x) == ASHIFT
3430 && GET_MODE (x) == Pmode
3431 && GET_MODE (XEXP (x, 0)) == Pmode
3432 && CONST_INT_P (XEXP (x, 1)))
3434 type = ADDRESS_REG_REG;
3435 index = XEXP (x, 0);
3436 shift = INTVAL (XEXP (x, 1));
3438 else
3439 return false;
3441 if (GET_CODE (index) == SUBREG)
3442 index = SUBREG_REG (index);
3444 if ((shift == 0 ||
3445 (shift > 0 && shift <= 3
3446 && (1 << shift) == GET_MODE_SIZE (mode)))
3447 && REG_P (index)
3448 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3450 info->type = type;
3451 info->offset = index;
3452 info->shift = shift;
3453 return true;
3456 return false;
3459 bool
3460 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3462 return (offset >= -64 * GET_MODE_SIZE (mode)
3463 && offset < 64 * GET_MODE_SIZE (mode)
3464 && offset % GET_MODE_SIZE (mode) == 0);
3467 static inline bool
3468 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3469 HOST_WIDE_INT offset)
3471 return offset >= -256 && offset < 256;
3474 static inline bool
3475 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3477 return (offset >= 0
3478 && offset < 4096 * GET_MODE_SIZE (mode)
3479 && offset % GET_MODE_SIZE (mode) == 0);
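/* Concretely (illustrative, for 8-byte DImode accesses): the 7-bit
   signed scaled form covers multiples of 8 in [-512, 504], the 9-bit
   signed unscaled form covers any offset in [-256, 255], and the
   12-bit unsigned scaled form covers multiples of 8 in [0, 32760].
   So an offset of 264 is representable only in the scaled forms,
   while an offset of -12 fits only the 9-bit unscaled form.  */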
3482 /* Return true if MODE is one of the modes for which we
3483 support LDP/STP operations. */
3485 static bool
3486 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3488 return mode == SImode || mode == DImode
3489 || mode == SFmode || mode == DFmode
3490 || (aarch64_vector_mode_supported_p (mode)
3491 && GET_MODE_SIZE (mode) == 8);
3494 /* Return true if X is a valid address for machine mode MODE. If it is,
3495 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3496 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3498 static bool
3499 aarch64_classify_address (struct aarch64_address_info *info,
3500 rtx x, machine_mode mode,
3501 RTX_CODE outer_code, bool strict_p)
3503 enum rtx_code code = GET_CODE (x);
3504 rtx op0, op1;
3506 /* On BE, we use load/store pair for all large int mode load/stores. */
3507 bool load_store_pair_p = (outer_code == PARALLEL
3508 || (BYTES_BIG_ENDIAN
3509 && aarch64_vect_struct_mode_p (mode)));
3511 bool allow_reg_index_p =
3512 !load_store_pair_p
3513 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3514 && !aarch64_vect_struct_mode_p (mode);
3516 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3517 REG addressing. */
3518 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3519 && (code != POST_INC && code != REG))
3520 return false;
3522 switch (code)
3524 case REG:
3525 case SUBREG:
3526 info->type = ADDRESS_REG_IMM;
3527 info->base = x;
3528 info->offset = const0_rtx;
3529 return aarch64_base_register_rtx_p (x, strict_p);
3531 case PLUS:
3532 op0 = XEXP (x, 0);
3533 op1 = XEXP (x, 1);
3535 if (! strict_p
3536 && REG_P (op0)
3537 && (op0 == virtual_stack_vars_rtx
3538 || op0 == frame_pointer_rtx
3539 || op0 == arg_pointer_rtx)
3540 && CONST_INT_P (op1))
3542 info->type = ADDRESS_REG_IMM;
3543 info->base = op0;
3544 info->offset = op1;
3546 return true;
3549 if (GET_MODE_SIZE (mode) != 0
3550 && CONST_INT_P (op1)
3551 && aarch64_base_register_rtx_p (op0, strict_p))
3553 HOST_WIDE_INT offset = INTVAL (op1);
3555 info->type = ADDRESS_REG_IMM;
3556 info->base = op0;
3557 info->offset = op1;
3559 /* TImode and TFmode values are allowed in both pairs of X
3560 registers and individual Q registers. The available
3561 address modes are:
3562 X,X: 7-bit signed scaled offset
3563 Q: 9-bit signed offset
3564 We conservatively require an offset representable in either mode.
3566 if (mode == TImode || mode == TFmode)
3567 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3568 && offset_9bit_signed_unscaled_p (mode, offset));
3570 /* A 7-bit offset check because OImode will emit an ldp/stp
3571 instruction (only big endian will get here).
3572 For ldp/stp instructions, the offset is scaled for the size of a
3573 single element of the pair. */
3574 if (mode == OImode)
3575 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
3577 /* Three 9/12-bit offset checks because CImode will emit three
3578 ldr/str instructions (only big endian will get here). */
3579 if (mode == CImode)
3580 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3581 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
3582 || offset_12bit_unsigned_scaled_p (V16QImode,
3583 offset + 32)));
3585 /* Two 7-bit offset checks because XImode will emit two ldp/stp
3586 instructions (only big endian will get here). */
3587 if (mode == XImode)
3588 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
3589 && aarch64_offset_7bit_signed_scaled_p (TImode,
3590 offset + 32));
3592 if (load_store_pair_p)
3593 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3594 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3595 else
3596 return (offset_9bit_signed_unscaled_p (mode, offset)
3597 || offset_12bit_unsigned_scaled_p (mode, offset));
3600 if (allow_reg_index_p)
3602 /* Look for base + (scaled/extended) index register. */
3603 if (aarch64_base_register_rtx_p (op0, strict_p)
3604 && aarch64_classify_index (info, op1, mode, strict_p))
3606 info->base = op0;
3607 return true;
3609 if (aarch64_base_register_rtx_p (op1, strict_p)
3610 && aarch64_classify_index (info, op0, mode, strict_p))
3612 info->base = op1;
3613 return true;
3617 return false;
3619 case POST_INC:
3620 case POST_DEC:
3621 case PRE_INC:
3622 case PRE_DEC:
3623 info->type = ADDRESS_REG_WB;
3624 info->base = XEXP (x, 0);
3625 info->offset = NULL_RTX;
3626 return aarch64_base_register_rtx_p (info->base, strict_p);
3628 case POST_MODIFY:
3629 case PRE_MODIFY:
3630 info->type = ADDRESS_REG_WB;
3631 info->base = XEXP (x, 0);
3632 if (GET_CODE (XEXP (x, 1)) == PLUS
3633 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
3634 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
3635 && aarch64_base_register_rtx_p (info->base, strict_p))
3637 HOST_WIDE_INT offset;
3638 info->offset = XEXP (XEXP (x, 1), 1);
3639 offset = INTVAL (info->offset);
3641 /* TImode and TFmode values are allowed in both pairs of X
3642 registers and individual Q registers. The available
3643 address modes are:
3644 X,X: 7-bit signed scaled offset
3645 Q: 9-bit signed offset
3646 We conservatively require an offset representable in either mode.
3648 if (mode == TImode || mode == TFmode)
3649 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
3650 && offset_9bit_signed_unscaled_p (mode, offset));
3652 if (load_store_pair_p)
3653 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
3654 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
3655 else
3656 return offset_9bit_signed_unscaled_p (mode, offset);
3658 return false;
3660 case CONST:
3661 case SYMBOL_REF:
3662 case LABEL_REF:
3663 /* load literal: pc-relative constant pool entry. Only supported
3664 for SI mode or larger. */
3665 info->type = ADDRESS_SYMBOLIC;
3667 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
3669 rtx sym, addend;
3671 split_const (x, &sym, &addend);
3672 return ((GET_CODE (sym) == LABEL_REF
3673 || (GET_CODE (sym) == SYMBOL_REF
3674 && CONSTANT_POOL_ADDRESS_P (sym)
3675 && !aarch64_nopcrelative_literal_loads)));
3677 return false;
3679 case LO_SUM:
3680 info->type = ADDRESS_LO_SUM;
3681 info->base = XEXP (x, 0);
3682 info->offset = XEXP (x, 1);
3683 if (allow_reg_index_p
3684 && aarch64_base_register_rtx_p (info->base, strict_p))
3686 rtx sym, offs;
3687 split_const (info->offset, &sym, &offs);
3688 if (GET_CODE (sym) == SYMBOL_REF
3689 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
3691 /* The symbol and offset must be aligned to the access size. */
3692 unsigned int align;
3693 unsigned int ref_size;
3695 if (CONSTANT_POOL_ADDRESS_P (sym))
3696 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
3697 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
3699 tree exp = SYMBOL_REF_DECL (sym);
3700 align = TYPE_ALIGN (TREE_TYPE (exp));
3701 align = CONSTANT_ALIGNMENT (exp, align);
3703 else if (SYMBOL_REF_DECL (sym))
3704 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
3705 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
3706 && SYMBOL_REF_BLOCK (sym) != NULL)
3707 align = SYMBOL_REF_BLOCK (sym)->alignment;
3708 else
3709 align = BITS_PER_UNIT;
3711 ref_size = GET_MODE_SIZE (mode);
3712 if (ref_size == 0)
3713 ref_size = GET_MODE_SIZE (DImode);
3715 return ((INTVAL (offs) & (ref_size - 1)) == 0
3716 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
3719 return false;
3721 default:
3722 return false;
3726 bool
3727 aarch64_symbolic_address_p (rtx x)
3729 rtx offset;
3731 split_const (x, &x, &offset);
3732 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
3735 /* Classify the base of symbolic expression X. */
3737 enum aarch64_symbol_type
3738 aarch64_classify_symbolic_expression (rtx x)
3740 rtx offset;
3742 split_const (x, &x, &offset);
3743 return aarch64_classify_symbol (x, offset);
3747 /* Return TRUE if X is a legitimate address for accessing memory in
3748 mode MODE. */
3749 static bool
3750 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
3752 struct aarch64_address_info addr;
3754 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
3757 /* Return TRUE if X is a legitimate address for accessing memory in
3758 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
3759 pair operation. */
3760 bool
3761 aarch64_legitimate_address_p (machine_mode mode, rtx x,
3762 RTX_CODE outer_code, bool strict_p)
3764 struct aarch64_address_info addr;
3766 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
3769 /* Return TRUE if rtx X is immediate constant 0.0 */
3770 bool
3771 aarch64_float_const_zero_rtx_p (rtx x)
3773 if (GET_MODE (x) == VOIDmode)
3774 return false;
3776 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
3777 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
3778 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
3781 /* Return the fixed registers used for condition codes. */
3783 static bool
3784 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
3786 *p1 = CC_REGNUM;
3787 *p2 = INVALID_REGNUM;
3788 return true;
3791 /* Emit call insn with PAT and do aarch64-specific handling. */
3793 void
3794 aarch64_emit_call_insn (rtx pat)
3796 rtx insn = emit_call_insn (pat);
3798 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
3799 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
3800 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
3803 machine_mode
3804 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
3806 /* All floating point compares return CCFP if it is an equality
3807 comparison, and CCFPE otherwise. */
3808 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
3810 switch (code)
3812 case EQ:
3813 case NE:
3814 case UNORDERED:
3815 case ORDERED:
3816 case UNLT:
3817 case UNLE:
3818 case UNGT:
3819 case UNGE:
3820 case UNEQ:
3821 case LTGT:
3822 return CCFPmode;
3824 case LT:
3825 case LE:
3826 case GT:
3827 case GE:
3828 return CCFPEmode;
3830 default:
3831 gcc_unreachable ();
3835 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3836 && y == const0_rtx
3837 && (code == EQ || code == NE || code == LT || code == GE)
3838 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
3839 || GET_CODE (x) == NEG))
3840 return CC_NZmode;
3842 /* A compare with a shifted operand. Because of canonicalization,
3843 the comparison will have to be swapped when we emit the assembly
3844 code. */
3845 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3846 && (REG_P (y) || GET_CODE (y) == SUBREG)
3847 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
3848 || GET_CODE (x) == LSHIFTRT
3849 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
3850 return CC_SWPmode;
3852 /* Similarly for a negated operand, but we can only do this for
3853 equalities. */
3854 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
3855 && (REG_P (y) || GET_CODE (y) == SUBREG)
3856 && (code == EQ || code == NE)
3857 && GET_CODE (x) == NEG)
3858 return CC_Zmode;
3860 /* A compare of a mode narrower than SI mode against zero can be done
3861 by extending the value in the comparison. */
3862 if ((GET_MODE (x) == QImode || GET_MODE (x) == HImode)
3863 && y == const0_rtx)
3864 /* Only use sign-extension if we really need it. */
3865 return ((code == GT || code == GE || code == LE || code == LT)
3866 ? CC_SESWPmode : CC_ZESWPmode);
3868 /* For everything else, return CCmode. */
3869 return CCmode;
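/* A hedged example of the swapped-operand case above: for a comparison
   such as (lt (ashift:DI x 3) y), the shifted operand has to appear as
   the second operand of the actual CMP instruction (roughly
   "cmp y, x, lsl #3"), so CC_SWPmode is returned and
   aarch64_get_condition_code_1 below maps LT to the AArch64 GT
   condition for the swapped comparison.  */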
3872 static int
3873 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
3876 aarch64_get_condition_code (rtx x)
3878 machine_mode mode = GET_MODE (XEXP (x, 0));
3879 enum rtx_code comp_code = GET_CODE (x);
3881 if (GET_MODE_CLASS (mode) != MODE_CC)
3882 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
3883 return aarch64_get_condition_code_1 (mode, comp_code);
3886 static int
3887 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
3889 int ne = -1, eq = -1;
3890 switch (mode)
3892 case CCFPmode:
3893 case CCFPEmode:
3894 switch (comp_code)
3896 case GE: return AARCH64_GE;
3897 case GT: return AARCH64_GT;
3898 case LE: return AARCH64_LS;
3899 case LT: return AARCH64_MI;
3900 case NE: return AARCH64_NE;
3901 case EQ: return AARCH64_EQ;
3902 case ORDERED: return AARCH64_VC;
3903 case UNORDERED: return AARCH64_VS;
3904 case UNLT: return AARCH64_LT;
3905 case UNLE: return AARCH64_LE;
3906 case UNGT: return AARCH64_HI;
3907 case UNGE: return AARCH64_PL;
3908 default: return -1;
3910 break;
3912 case CC_DNEmode:
3913 ne = AARCH64_NE;
3914 eq = AARCH64_EQ;
3915 break;
3917 case CC_DEQmode:
3918 ne = AARCH64_EQ;
3919 eq = AARCH64_NE;
3920 break;
3922 case CC_DGEmode:
3923 ne = AARCH64_GE;
3924 eq = AARCH64_LT;
3925 break;
3927 case CC_DLTmode:
3928 ne = AARCH64_LT;
3929 eq = AARCH64_GE;
3930 break;
3932 case CC_DGTmode:
3933 ne = AARCH64_GT;
3934 eq = AARCH64_LE;
3935 break;
3937 case CC_DLEmode:
3938 ne = AARCH64_LE;
3939 eq = AARCH64_GT;
3940 break;
3942 case CC_DGEUmode:
3943 ne = AARCH64_CS;
3944 eq = AARCH64_CC;
3945 break;
3947 case CC_DLTUmode:
3948 ne = AARCH64_CC;
3949 eq = AARCH64_CS;
3950 break;
3952 case CC_DGTUmode:
3953 ne = AARCH64_HI;
3954 eq = AARCH64_LS;
3955 break;
3957 case CC_DLEUmode:
3958 ne = AARCH64_LS;
3959 eq = AARCH64_HI;
3960 break;
3962 case CCmode:
3963 switch (comp_code)
3965 case NE: return AARCH64_NE;
3966 case EQ: return AARCH64_EQ;
3967 case GE: return AARCH64_GE;
3968 case GT: return AARCH64_GT;
3969 case LE: return AARCH64_LE;
3970 case LT: return AARCH64_LT;
3971 case GEU: return AARCH64_CS;
3972 case GTU: return AARCH64_HI;
3973 case LEU: return AARCH64_LS;
3974 case LTU: return AARCH64_CC;
3975 default: return -1;
3977 break;
3979 case CC_SWPmode:
3980 case CC_ZESWPmode:
3981 case CC_SESWPmode:
3982 switch (comp_code)
3984 case NE: return AARCH64_NE;
3985 case EQ: return AARCH64_EQ;
3986 case GE: return AARCH64_LE;
3987 case GT: return AARCH64_LT;
3988 case LE: return AARCH64_GE;
3989 case LT: return AARCH64_GT;
3990 case GEU: return AARCH64_LS;
3991 case GTU: return AARCH64_CC;
3992 case LEU: return AARCH64_CS;
3993 case LTU: return AARCH64_HI;
3994 default: return -1;
3996 break;
3998 case CC_NZmode:
3999 switch (comp_code)
4001 case NE: return AARCH64_NE;
4002 case EQ: return AARCH64_EQ;
4003 case GE: return AARCH64_PL;
4004 case LT: return AARCH64_MI;
4005 default: return -1;
4007 break;
4009 case CC_Zmode:
4010 switch (comp_code)
4012 case NE: return AARCH64_NE;
4013 case EQ: return AARCH64_EQ;
4014 default: return -1;
4016 break;
4018 default:
4019 return -1;
4020 break;
4023 if (comp_code == NE)
4024 return ne;
4026 if (comp_code == EQ)
4027 return eq;
4029 return -1;
4032 bool
4033 aarch64_const_vec_all_same_in_range_p (rtx x,
4034 HOST_WIDE_INT minval,
4035 HOST_WIDE_INT maxval)
4037 HOST_WIDE_INT firstval;
4038 int count, i;
4040 if (GET_CODE (x) != CONST_VECTOR
4041 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4042 return false;
4044 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4045 if (firstval < minval || firstval > maxval)
4046 return false;
4048 count = CONST_VECTOR_NUNITS (x);
4049 for (i = 1; i < count; i++)
4050 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4051 return false;
4053 return true;
4056 bool
4057 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4059 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4063 /* N Z C V. */
4064 #define AARCH64_CC_V 1
4065 #define AARCH64_CC_C (1 << 1)
4066 #define AARCH64_CC_Z (1 << 2)
4067 #define AARCH64_CC_N (1 << 3)
4069 /* N Z C V flags for ccmp. The first code is for AND op and the other
4070 is for IOR op. Indexed by AARCH64_COND_CODE. */
4071 static const int aarch64_nzcv_codes[][2] =
4073 {AARCH64_CC_Z, 0}, /* EQ, Z == 1. */
4074 {0, AARCH64_CC_Z}, /* NE, Z == 0. */
4075 {AARCH64_CC_C, 0}, /* CS, C == 1. */
4076 {0, AARCH64_CC_C}, /* CC, C == 0. */
4077 {AARCH64_CC_N, 0}, /* MI, N == 1. */
4078 {0, AARCH64_CC_N}, /* PL, N == 0. */
4079 {AARCH64_CC_V, 0}, /* VS, V == 1. */
4080 {0, AARCH64_CC_V}, /* VC, V == 0. */
4081 {AARCH64_CC_C, 0}, /* HI, C == 1 && Z == 0. */
4082 {0, AARCH64_CC_C}, /* LS, !(C == 1 && Z == 0). */
4083 {0, AARCH64_CC_V}, /* GE, N == V. */
4084 {AARCH64_CC_V, 0}, /* LT, N != V. */
4085 {0, AARCH64_CC_Z}, /* GT, Z == 0 && N == V. */
4086 {AARCH64_CC_Z, 0}, /* LE, !(Z == 0 && N == V). */
4087 {0, 0}, /* AL, Any. */
4088 {0, 0}, /* NV, Any. */
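/* Editorial illustration (added; not part of the original source): each
   entry above packs an immediate for the CCMP instruction using the bit
   layout defined by AARCH64_CC_{N,Z,C,V} above (N=8, Z=4, C=2, V=1).  The
   first column is used when conditions are combined with AND, the second
   with IOR; they are emitted through the '%K' and '%k' operand modifiers
   further down in aarch64_print_operand.  A readable, non-compiled
   sketch:  */
#if 0
int and_nzcv = aarch64_nzcv_codes[AARCH64_MI][0];   /* AARCH64_CC_N == 8 */
int ior_nzcv = aarch64_nzcv_codes[AARCH64_MI][1];   /* 0 */
#endif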
4092 aarch64_ccmp_mode_to_code (enum machine_mode mode)
4094 switch (mode)
4096 case CC_DNEmode:
4097 return NE;
4099 case CC_DEQmode:
4100 return EQ;
4102 case CC_DLEmode:
4103 return LE;
4105 case CC_DGTmode:
4106 return GT;
4108 case CC_DLTmode:
4109 return LT;
4111 case CC_DGEmode:
4112 return GE;
4114 case CC_DLEUmode:
4115 return LEU;
4117 case CC_DGTUmode:
4118 return GTU;
4120 case CC_DLTUmode:
4121 return LTU;
4123 case CC_DGEUmode:
4124 return GEU;
4126 default:
4127 gcc_unreachable ();
4132 static void
4133 aarch64_print_operand (FILE *f, rtx x, int code)
4135 switch (code)
4137 /* An integer or symbol address without a preceding # sign. */
4138 case 'c':
4139 switch (GET_CODE (x))
4141 case CONST_INT:
4142 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4143 break;
4145 case SYMBOL_REF:
4146 output_addr_const (f, x);
4147 break;
4149 case CONST:
4150 if (GET_CODE (XEXP (x, 0)) == PLUS
4151 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4153 output_addr_const (f, x);
4154 break;
4156 /* Fall through. */
4158 default:
4159 output_operand_lossage ("Unsupported operand for code '%c'", code);
4161 break;
4163 case 'e':
4164 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4166 int n;
4168 if (!CONST_INT_P (x)
4169 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4171 output_operand_lossage ("invalid operand for '%%%c'", code);
4172 return;
4175 switch (n)
4177 case 3:
4178 fputc ('b', f);
4179 break;
4180 case 4:
4181 fputc ('h', f);
4182 break;
4183 case 5:
4184 fputc ('w', f);
4185 break;
4186 default:
4187 output_operand_lossage ("invalid operand for '%%%c'", code);
4188 return;
4191 break;
4193 case 'p':
4195 int n;
4197 /* Print N such that 2^N == X. */
4198 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4200 output_operand_lossage ("invalid operand for '%%%c'", code);
4201 return;
4204 asm_fprintf (f, "%d", n);
4206 break;
4208 case 'P':
4209 /* Print the number of non-zero bits in X (a const_int). */
4210 if (!CONST_INT_P (x))
4212 output_operand_lossage ("invalid operand for '%%%c'", code);
4213 return;
4216 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4217 break;
4219 case 'H':
4220 /* Print the higher numbered register of a pair (TImode) of regs. */
4221 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4223 output_operand_lossage ("invalid operand for '%%%c'", code);
4224 return;
4227 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4228 break;
4230 case 'm':
4232 int cond_code;
4233 /* Print a condition (eq, ne, etc). */
4235 /* CONST_TRUE_RTX means always -- that's the default. */
4236 if (x == const_true_rtx)
4237 return;
4239 if (!COMPARISON_P (x))
4241 output_operand_lossage ("invalid operand for '%%%c'", code);
4242 return;
4245 cond_code = aarch64_get_condition_code (x);
4246 gcc_assert (cond_code >= 0);
4247 fputs (aarch64_condition_codes[cond_code], f);
4249 break;
4251 case 'M':
4253 int cond_code;
4254 /* Print the inverse of a condition (eq <-> ne, etc). */
4256 /* CONST_TRUE_RTX means never -- that's the default. */
4257 if (x == const_true_rtx)
4259 fputs ("nv", f);
4260 return;
4263 if (!COMPARISON_P (x))
4265 output_operand_lossage ("invalid operand for '%%%c'", code);
4266 return;
4268 cond_code = aarch64_get_condition_code (x);
4269 gcc_assert (cond_code >= 0);
4270 fputs (aarch64_condition_codes[AARCH64_INVERSE_CONDITION_CODE
4271 (cond_code)], f);
4273 break;
4275 case 'b':
4276 case 'h':
4277 case 's':
4278 case 'd':
4279 case 'q':
4280 /* Print a scalar FP/SIMD register name. */
4281 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4283 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4284 return;
4286 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4287 break;
4289 case 'S':
4290 case 'T':
4291 case 'U':
4292 case 'V':
4293 /* Print the first FP/SIMD register name in a list. */
4294 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4296 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4297 return;
4299 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4300 break;
4302 case 'R':
4303 /* Print a scalar FP/SIMD register name + 1. */
4304 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4306 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4307 return;
4309 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4310 break;
4312 case 'X':
4313 /* Print bottom 16 bits of integer constant in hex. */
4314 if (!CONST_INT_P (x))
4316 output_operand_lossage ("invalid operand for '%%%c'", code);
4317 return;
4319 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4320 break;
4322 case 'w':
4323 case 'x':
4324 /* Print a general register name or the zero register (32-bit or
4325 64-bit). */
4326 if (x == const0_rtx
4327 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4329 asm_fprintf (f, "%czr", code);
4330 break;
4333 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4335 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4336 break;
4339 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4341 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4342 break;
4345 /* Fall through */
4347 case 0:
4348 /* Print a normal operand. If it's a general register, then we
4349 assume DImode. */
4350 if (x == NULL)
4352 output_operand_lossage ("missing operand");
4353 return;
4356 switch (GET_CODE (x))
4358 case REG:
4359 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4360 break;
4362 case MEM:
4363 output_address (GET_MODE (x), XEXP (x, 0));
4364 break;
4366 case CONST:
4367 case LABEL_REF:
4368 case SYMBOL_REF:
4369 output_addr_const (asm_out_file, x);
4370 break;
4372 case CONST_INT:
4373 asm_fprintf (f, "%wd", INTVAL (x));
4374 break;
4376 case CONST_VECTOR:
4377 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4379 gcc_assert (
4380 aarch64_const_vec_all_same_in_range_p (x,
4381 HOST_WIDE_INT_MIN,
4382 HOST_WIDE_INT_MAX));
4383 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4385 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4387 fputc ('0', f);
4389 else
4390 gcc_unreachable ();
4391 break;
4393 case CONST_DOUBLE:
4394 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4395 be getting CONST_DOUBLEs holding integers. */
4396 gcc_assert (GET_MODE (x) != VOIDmode);
4397 if (aarch64_float_const_zero_rtx_p (x))
4399 fputc ('0', f);
4400 break;
4402 else if (aarch64_float_const_representable_p (x))
4404 #define buf_size 20
4405 char float_buf[buf_size] = {'\0'};
4406 real_to_decimal_for_mode (float_buf,
4407 CONST_DOUBLE_REAL_VALUE (x),
4408 buf_size, buf_size,
4409 1, GET_MODE (x));
4410 asm_fprintf (asm_out_file, "%s", float_buf);
4411 break;
4412 #undef buf_size
4414 output_operand_lossage ("invalid constant");
4415 return;
4416 default:
4417 output_operand_lossage ("invalid operand");
4418 return;
4420 break;
4422 case 'A':
4423 if (GET_CODE (x) == HIGH)
4424 x = XEXP (x, 0);
4426 switch (aarch64_classify_symbolic_expression (x))
4428 case SYMBOL_SMALL_GOT_4G:
4429 asm_fprintf (asm_out_file, ":got:");
4430 break;
4432 case SYMBOL_SMALL_TLSGD:
4433 asm_fprintf (asm_out_file, ":tlsgd:");
4434 break;
4436 case SYMBOL_SMALL_TLSDESC:
4437 asm_fprintf (asm_out_file, ":tlsdesc:");
4438 break;
4440 case SYMBOL_SMALL_TLSIE:
4441 asm_fprintf (asm_out_file, ":gottprel:");
4442 break;
4444 case SYMBOL_TLSLE24:
4445 asm_fprintf (asm_out_file, ":tprel:");
4446 break;
4448 case SYMBOL_TINY_GOT:
4449 gcc_unreachable ();
4450 break;
4452 default:
4453 break;
4455 output_addr_const (asm_out_file, x);
4456 break;
4458 case 'L':
4459 switch (aarch64_classify_symbolic_expression (x))
4461 case SYMBOL_SMALL_GOT_4G:
4462 asm_fprintf (asm_out_file, ":lo12:");
4463 break;
4465 case SYMBOL_SMALL_TLSGD:
4466 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4467 break;
4469 case SYMBOL_SMALL_TLSDESC:
4470 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4471 break;
4473 case SYMBOL_SMALL_TLSIE:
4474 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4475 break;
4477 case SYMBOL_TLSLE12:
4478 asm_fprintf (asm_out_file, ":tprel_lo12:");
4479 break;
4481 case SYMBOL_TLSLE24:
4482 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4483 break;
4485 case SYMBOL_TINY_GOT:
4486 asm_fprintf (asm_out_file, ":got:");
4487 break;
4489 case SYMBOL_TINY_TLSIE:
4490 asm_fprintf (asm_out_file, ":gottprel:");
4491 break;
4493 default:
4494 break;
4496 output_addr_const (asm_out_file, x);
4497 break;
4499 case 'G':
4501 switch (aarch64_classify_symbolic_expression (x))
4503 case SYMBOL_TLSLE24:
4504 asm_fprintf (asm_out_file, ":tprel_hi12:");
4505 break;
4506 default:
4507 break;
4509 output_addr_const (asm_out_file, x);
4510 break;
4512 case 'K':
4514 int cond_code;
4515 /* Print nzcv. */
4517 if (!COMPARISON_P (x))
4519 output_operand_lossage ("invalid operand for '%%%c'", code);
4520 return;
4523 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4524 gcc_assert (cond_code >= 0);
4525 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][0]);
4527 break;
4529 case 'k':
4531 int cond_code;
4532 /* Print nzcv. */
4534 if (!COMPARISON_P (x))
4536 output_operand_lossage ("invalid operand for '%%%c'", code);
4537 return;
4540 cond_code = aarch64_get_condition_code_1 (CCmode, GET_CODE (x));
4541 gcc_assert (cond_code >= 0);
4542 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code][1]);
4544 break;
4546 default:
4547 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4548 return;
4552 static void
4553 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4555 struct aarch64_address_info addr;
4557 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4558 switch (addr.type)
4560 case ADDRESS_REG_IMM:
4561 if (addr.offset == const0_rtx)
4562 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4563 else
4564 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4565 INTVAL (addr.offset));
4566 return;
4568 case ADDRESS_REG_REG:
4569 if (addr.shift == 0)
4570 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4571 reg_names [REGNO (addr.offset)]);
4572 else
4573 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4574 reg_names [REGNO (addr.offset)], addr.shift);
4575 return;
4577 case ADDRESS_REG_UXTW:
4578 if (addr.shift == 0)
4579 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4580 REGNO (addr.offset) - R0_REGNUM);
4581 else
4582 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4583 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4584 return;
4586 case ADDRESS_REG_SXTW:
4587 if (addr.shift == 0)
4588 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4589 REGNO (addr.offset) - R0_REGNUM);
4590 else
4591 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4592 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4593 return;
4595 case ADDRESS_REG_WB:
4596 switch (GET_CODE (x))
4598 case PRE_INC:
4599 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4600 GET_MODE_SIZE (mode));
4601 return;
4602 case POST_INC:
4603 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4604 GET_MODE_SIZE (mode));
4605 return;
4606 case PRE_DEC:
4607 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4608 GET_MODE_SIZE (mode));
4609 return;
4610 case POST_DEC:
4611 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4612 GET_MODE_SIZE (mode));
4613 return;
4614 case PRE_MODIFY:
4615 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4616 INTVAL (addr.offset));
4617 return;
4618 case POST_MODIFY:
4619 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4620 INTVAL (addr.offset));
4621 return;
4622 default:
4623 break;
4625 break;
4627 case ADDRESS_LO_SUM:
4628 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4629 output_addr_const (f, addr.offset);
4630 asm_fprintf (f, "]");
4631 return;
4633 case ADDRESS_SYMBOLIC:
4634 break;
4637 output_addr_const (f, x);
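/* Editorial illustration (added; not part of the original source): with
   hypothetical registers and offsets, the cases above print addresses in
   the usual AArch64 assembly forms, e.g.:

     ADDRESS_REG_IMM    [x0]            or  [x0, 16]
     ADDRESS_REG_REG    [x0, x1]        or  [x0, x1, lsl 3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw]  or  [x0, w1, uxtw 2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw]  or  [x0, w1, sxtw 2]
     ADDRESS_REG_WB     [x0, 16]!   [x0], 16   [x0, -16]!   [x0], -16
     ADDRESS_LO_SUM     [x0, #:lo12:some_symbol]

   Anything not handled falls through to output_addr_const.  */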
4640 bool
4641 aarch64_label_mentioned_p (rtx x)
4643 const char *fmt;
4644 int i;
4646 if (GET_CODE (x) == LABEL_REF)
4647 return true;
4649 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4650 referencing instruction, but they are constant offsets, not
4651 symbols. */
4652 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4653 return false;
4655 fmt = GET_RTX_FORMAT (GET_CODE (x));
4656 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4658 if (fmt[i] == 'E')
4660 int j;
4662 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4663 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4664 return 1;
4666 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4667 return 1;
4670 return 0;
4673 /* Implement REGNO_REG_CLASS. */
4675 enum reg_class
4676 aarch64_regno_regclass (unsigned regno)
4678 if (GP_REGNUM_P (regno))
4679 return GENERAL_REGS;
4681 if (regno == SP_REGNUM)
4682 return STACK_REG;
4684 if (regno == FRAME_POINTER_REGNUM
4685 || regno == ARG_POINTER_REGNUM)
4686 return POINTER_REGS;
4688 if (FP_REGNUM_P (regno))
4689 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
4691 return NO_REGS;
4694 static rtx
4695 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
4697 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
4698 where mask is selected by alignment and size of the offset.
4699 We try to pick as large a range for the offset as possible to
4700 maximize the chance of a CSE. However, for aligned addresses
4701 we limit the range to 4k so that structures with different sized
4702 elements are likely to use the same base. */
4704 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
4706 HOST_WIDE_INT offset = INTVAL (XEXP (x, 1));
4707 HOST_WIDE_INT base_offset;
4709 /* Does it look like we'll need a load/store-pair operation? */
4710 if (GET_MODE_SIZE (mode) > 16
4711 || mode == TImode)
4712 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
4713 & ~((128 * GET_MODE_SIZE (mode)) - 1));
4714 /* For offsets that aren't a multiple of the access size, the limit is
4715 -256...255. */
4716 else if (offset & (GET_MODE_SIZE (mode) - 1))
4717 base_offset = (offset + 0x100) & ~0x1ff;
4718 else
4719 base_offset = offset & ~0xfff;
4721 if (base_offset == 0)
4722 return x;
4724 offset -= base_offset;
4725 rtx base_reg = gen_reg_rtx (Pmode);
4726 rtx val = force_operand (plus_constant (Pmode, XEXP (x, 0), base_offset),
4727 NULL_RTX);
4728 emit_move_insn (base_reg, val);
4729 x = plus_constant (Pmode, base_reg, offset);
4732 return x;
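/* Editorial illustration (added; not part of the original source): a
   standalone sketch of the anchor/offset split above, using plain C in
   place of the rtl machinery ("mode_size" stands for GET_MODE_SIZE; the
   TImode special case is folded into the size check for brevity).  */
#if 0
#include <stdio.h>

static long
split_base_offset (long offset, long mode_size)
{
  if (mode_size > 16)                   /* Likely a load/store-pair.  */
    return (offset + 64 * mode_size) & ~(128 * mode_size - 1);
  else if (offset & (mode_size - 1))    /* Misaligned: -256..255 range.  */
    return (offset + 0x100) & ~0x1ff;
  else                                  /* Aligned: 4k range.  */
    return offset & ~0xfff;
}

int
main (void)
{
  /* Aligned DImode access at base + 0x12008: anchor 0x12000, residual 8.  */
  printf ("%lx\n", split_base_offset (0x12008, 8));
  /* Misaligned SImode access at base + 0x12345: anchor 0x12400,
     residual -0xbb, which fits the signed 9-bit offset form.  */
  printf ("%lx\n", split_base_offset (0x12345, 4));
  return 0;
}
#endif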
4735 /* Try a machine-dependent way of reloading an illegitimate address
4736 operand. If we find one, push the reload and return the new rtx. */
4739 aarch64_legitimize_reload_address (rtx *x_p,
4740 machine_mode mode,
4741 int opnum, int type,
4742 int ind_levels ATTRIBUTE_UNUSED)
4744 rtx x = *x_p;
4746 /* Do not allow mem (plus (reg, const)) if vector struct mode. */
4747 if (aarch64_vect_struct_mode_p (mode)
4748 && GET_CODE (x) == PLUS
4749 && REG_P (XEXP (x, 0))
4750 && CONST_INT_P (XEXP (x, 1)))
4752 rtx orig_rtx = x;
4753 x = copy_rtx (x);
4754 push_reload (orig_rtx, NULL_RTX, x_p, NULL,
4755 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4756 opnum, (enum reload_type) type);
4757 return x;
4760 /* We must recognize output that we have already generated ourselves. */
4761 if (GET_CODE (x) == PLUS
4762 && GET_CODE (XEXP (x, 0)) == PLUS
4763 && REG_P (XEXP (XEXP (x, 0), 0))
4764 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4765 && CONST_INT_P (XEXP (x, 1)))
4767 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4768 BASE_REG_CLASS, GET_MODE (x), VOIDmode, 0, 0,
4769 opnum, (enum reload_type) type);
4770 return x;
4773 /* We wish to handle large displacements off a base register by splitting
4774 the addend across an add and the mem insn. This can cut the number of
4775 extra insns needed from 3 to 1. It is only useful for load/store of a
4776 single register with 12 bit offset field. */
4777 if (GET_CODE (x) == PLUS
4778 && REG_P (XEXP (x, 0))
4779 && CONST_INT_P (XEXP (x, 1))
4780 && HARD_REGISTER_P (XEXP (x, 0))
4781 && mode != TImode
4782 && mode != TFmode
4783 && aarch64_regno_ok_for_base_p (REGNO (XEXP (x, 0)), true))
4785 HOST_WIDE_INT val = INTVAL (XEXP (x, 1));
4786 HOST_WIDE_INT low = val & 0xfff;
4787 HOST_WIDE_INT high = val - low;
4788 HOST_WIDE_INT offs;
4789 rtx cst;
4790 machine_mode xmode = GET_MODE (x);
4792 /* In ILP32, xmode can be either DImode or SImode. */
4793 gcc_assert (xmode == DImode || xmode == SImode);
4795 /* Reload non-zero BLKmode offsets. This is because we cannot ascertain
4796 BLKmode alignment. */
4797 if (GET_MODE_SIZE (mode) == 0)
4798 return NULL_RTX;
4800 offs = low % GET_MODE_SIZE (mode);
4802 /* Align misaligned offset by adjusting high part to compensate. */
4803 if (offs != 0)
4805 if (aarch64_uimm12_shift (high + offs))
4807 /* Align down. */
4808 low = low - offs;
4809 high = high + offs;
4811 else
4813 /* Align up. */
4814 offs = GET_MODE_SIZE (mode) - offs;
4815 low = low + offs;
4816 high = high + (low & 0x1000) - offs;
4817 low &= 0xfff;
4821 /* Check for overflow. */
4822 if (high + low != val)
4823 return NULL_RTX;
4825 cst = GEN_INT (high);
4826 if (!aarch64_uimm12_shift (high))
4827 cst = force_const_mem (xmode, cst);
4829 /* Reload high part into base reg, leaving the low part
4830 in the mem instruction.
4831 Note that replacing this gen_rtx_PLUS with plus_constant is
4832 wrong in this case because we rely on the
4833 (plus (plus reg c1) c2) structure being preserved so that
4834 XEXP (*p, 0) in push_reload below uses the correct term. */
4835 x = gen_rtx_PLUS (xmode,
4836 gen_rtx_PLUS (xmode, XEXP (x, 0), cst),
4837 GEN_INT (low));
4839 push_reload (XEXP (x, 0), NULL_RTX, &XEXP (x, 0), NULL,
4840 BASE_REG_CLASS, xmode, VOIDmode, 0, 0,
4841 opnum, (enum reload_type) type);
4842 return x;
4845 return NULL_RTX;
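/* Editorial illustration (added; not part of the original source): for a
   DImode load at base + 0x13008 the split above gives low = 0x008 and
   high = 0x13000; high is a 12-bit immediate shifted by 12, so the reload
   comes out roughly as (with hypothetical register numbers):

     add  x16, x0, #0x13000
     ldr  x1, [x16, 8]

   A readable, non-compiled fragment of the arithmetic:  */
#if 0
long val  = 0x13008;
long low  = val & 0xfff;        /* 0x008: stays in the load/store.  */
long high = val - low;          /* 0x13000: passes aarch64_uimm12_shift.  */
#endif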
4849 /* Return the reload icode required for a constant pool in mode. */
4850 static enum insn_code
4851 aarch64_constant_pool_reload_icode (machine_mode mode)
4853 switch (mode)
4855 case SFmode:
4856 return CODE_FOR_aarch64_reload_movcpsfdi;
4858 case DFmode:
4859 return CODE_FOR_aarch64_reload_movcpdfdi;
4861 case TFmode:
4862 return CODE_FOR_aarch64_reload_movcptfdi;
4864 case V8QImode:
4865 return CODE_FOR_aarch64_reload_movcpv8qidi;
4867 case V16QImode:
4868 return CODE_FOR_aarch64_reload_movcpv16qidi;
4870 case V4HImode:
4871 return CODE_FOR_aarch64_reload_movcpv4hidi;
4873 case V8HImode:
4874 return CODE_FOR_aarch64_reload_movcpv8hidi;
4876 case V2SImode:
4877 return CODE_FOR_aarch64_reload_movcpv2sidi;
4879 case V4SImode:
4880 return CODE_FOR_aarch64_reload_movcpv4sidi;
4882 case V2DImode:
4883 return CODE_FOR_aarch64_reload_movcpv2didi;
4885 case V2DFmode:
4886 return CODE_FOR_aarch64_reload_movcpv2dfdi;
4888 default:
4889 gcc_unreachable ();
4892 gcc_unreachable ();
4894 static reg_class_t
4895 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
4896 reg_class_t rclass,
4897 machine_mode mode,
4898 secondary_reload_info *sri)
4901 /* If we have to disable direct literal pool loads and stores because the
4902 function is too big, then we need a scratch register. */
4903 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
4904 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
4905 || targetm.vector_mode_supported_p (GET_MODE (x)))
4906 && aarch64_nopcrelative_literal_loads)
4908 sri->icode = aarch64_constant_pool_reload_icode (mode);
4909 return NO_REGS;
4912 /* Without the TARGET_SIMD instructions we cannot move a Q register
4913 to a Q register directly. We need a scratch. */
4914 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
4915 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
4916 && reg_class_subset_p (rclass, FP_REGS))
4918 if (mode == TFmode)
4919 sri->icode = CODE_FOR_aarch64_reload_movtf;
4920 else if (mode == TImode)
4921 sri->icode = CODE_FOR_aarch64_reload_movti;
4922 return NO_REGS;
4925 /* A TFmode or TImode memory access should be handled via FP_REGS
4926 because AArch64 has richer addressing modes for LDR/STR instructions
4927 than for LDP/STP instructions. */
4928 if (TARGET_FLOAT && rclass == GENERAL_REGS
4929 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
4930 return FP_REGS;
4932 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
4933 return GENERAL_REGS;
4935 return NO_REGS;
4938 static bool
4939 aarch64_can_eliminate (const int from, const int to)
4941 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
4942 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
4944 if (frame_pointer_needed)
4946 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4947 return true;
4948 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
4949 return false;
4950 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
4951 && !cfun->calls_alloca)
4952 return true;
4953 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
4954 return true;
4956 return false;
4958 else
4960 /* If we decided that we didn't need a leaf frame pointer but then used
4961 LR in the function, then we'll want a frame pointer after all, so
4962 prevent this elimination to ensure a frame pointer is used. */
4963 if (to == STACK_POINTER_REGNUM
4964 && flag_omit_leaf_frame_pointer
4965 && df_regs_ever_live_p (LR_REGNUM))
4966 return false;
4969 return true;
4972 HOST_WIDE_INT
4973 aarch64_initial_elimination_offset (unsigned from, unsigned to)
4975 aarch64_layout_frame ();
4977 if (to == HARD_FRAME_POINTER_REGNUM)
4979 if (from == ARG_POINTER_REGNUM)
4980 return cfun->machine->frame.frame_size - crtl->outgoing_args_size;
4982 if (from == FRAME_POINTER_REGNUM)
4983 return (cfun->machine->frame.hard_fp_offset
4984 - cfun->machine->frame.saved_varargs_size);
4987 if (to == STACK_POINTER_REGNUM)
4989 if (from == FRAME_POINTER_REGNUM)
4990 return (cfun->machine->frame.frame_size
4991 - cfun->machine->frame.saved_varargs_size);
4994 return cfun->machine->frame.frame_size;
4997 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
4998 previous frame. */
5001 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5003 if (count != 0)
5004 return const0_rtx;
5005 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5009 static void
5010 aarch64_asm_trampoline_template (FILE *f)
5012 if (TARGET_ILP32)
5014 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5015 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5017 else
5019 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5020 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5022 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5023 assemble_aligned_integer (4, const0_rtx);
5024 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5025 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5028 static void
5029 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5031 rtx fnaddr, mem, a_tramp;
5032 const int tramp_code_sz = 16;
5034 /* Don't need to copy the trailing D-words, we fill those in below. */
5035 emit_block_move (m_tramp, assemble_trampoline_template (),
5036 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5037 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5038 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5039 if (GET_MODE (fnaddr) != ptr_mode)
5040 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5041 emit_move_insn (mem, fnaddr);
5043 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5044 emit_move_insn (mem, chain_value);
5046 /* XXX We should really define a "clear_cache" pattern and use
5047 gen_clear_cache(). */
5048 a_tramp = XEXP (m_tramp, 0);
5049 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5050 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5051 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5052 ptr_mode);
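/* Editorial illustration (added; not part of the original source): in the
   LP64 case the initialised trampoline laid out by the two functions above
   looks roughly like this (IP1 and the static-chain register are shown with
   their usual names x17 and x18):

     0:  ldr  x17, .+16    ; loads the target address from offset 16
     4:  ldr  x18, .+20    ; loads the static chain from offset 24
     8:  br   x17
     12: .word 0           ; padding
     16: <address of the nested function>
     24: <static chain value>
*/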
5055 static unsigned char
5056 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5058 switch (regclass)
5060 case CALLER_SAVE_REGS:
5061 case POINTER_REGS:
5062 case GENERAL_REGS:
5063 case ALL_REGS:
5064 case FP_REGS:
5065 case FP_LO_REGS:
5066 return
5067 aarch64_vector_mode_p (mode)
5068 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5069 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5070 case STACK_REG:
5071 return 1;
5073 case NO_REGS:
5074 return 0;
5076 default:
5077 break;
5079 gcc_unreachable ();
5082 static reg_class_t
5083 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5085 if (regclass == POINTER_REGS)
5086 return GENERAL_REGS;
5088 if (regclass == STACK_REG)
5090 if (REG_P (x)
5091 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5092 return regclass;
5094 return NO_REGS;
5097 /* If it's an integer immediate that MOVI can't handle, then
5098 FP_REGS is not an option, so we return NO_REGS instead. */
5099 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5100 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5101 return NO_REGS;
5103 /* Register elimination can result in a request for
5104 SP+constant->FP_REGS. We cannot support such operations, which
5105 use SP as source and an FP_REG as destination, so reject them
5106 right now. */
5107 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5109 rtx lhs = XEXP (x, 0);
5111 /* Look through a possible SUBREG introduced by ILP32. */
5112 if (GET_CODE (lhs) == SUBREG)
5113 lhs = SUBREG_REG (lhs);
5115 gcc_assert (REG_P (lhs));
5116 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5117 POINTER_REGS));
5118 return NO_REGS;
5121 return regclass;
5124 void
5125 aarch64_asm_output_labelref (FILE* f, const char *name)
5127 asm_fprintf (f, "%U%s", name);
5130 static void
5131 aarch64_elf_asm_constructor (rtx symbol, int priority)
5133 if (priority == DEFAULT_INIT_PRIORITY)
5134 default_ctor_section_asm_out_constructor (symbol, priority);
5135 else
5137 section *s;
5138 char buf[18];
5139 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5140 s = get_section (buf, SECTION_WRITE, NULL);
5141 switch_to_section (s);
5142 assemble_align (POINTER_SIZE);
5143 assemble_aligned_integer (POINTER_BYTES, symbol);
5147 static void
5148 aarch64_elf_asm_destructor (rtx symbol, int priority)
5150 if (priority == DEFAULT_INIT_PRIORITY)
5151 default_dtor_section_asm_out_destructor (symbol, priority);
5152 else
5154 section *s;
5155 char buf[18];
5156 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5157 s = get_section (buf, SECTION_WRITE, NULL);
5158 switch_to_section (s);
5159 assemble_align (POINTER_SIZE);
5160 assemble_aligned_integer (POINTER_BYTES, symbol);
5164 const char*
5165 aarch64_output_casesi (rtx *operands)
5167 char buf[100];
5168 char label[100];
5169 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5170 int index;
5171 static const char *const patterns[4][2] =
5174 "ldrb\t%w3, [%0,%w1,uxtw]",
5175 "add\t%3, %4, %w3, sxtb #2"
5178 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5179 "add\t%3, %4, %w3, sxth #2"
5182 "ldr\t%w3, [%0,%w1,uxtw #2]",
5183 "add\t%3, %4, %w3, sxtw #2"
5185 /* We assume that DImode is only generated when not optimizing and
5186 that we don't really need 64-bit address offsets. That would
5187 imply an object file with 8GB of code in a single function! */
5189 "ldr\t%w3, [%0,%w1,uxtw #2]",
5190 "add\t%3, %4, %w3, sxtw #2"
5194 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5196 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5198 gcc_assert (index >= 0 && index <= 3);
5200 /* Need to implement table size reduction, by changing the code below. */
5201 output_asm_insn (patterns[index][0], operands);
5202 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5203 snprintf (buf, sizeof (buf),
5204 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5205 output_asm_insn (buf, operands);
5206 output_asm_insn (patterns[index][1], operands);
5207 output_asm_insn ("br\t%3", operands);
5208 assemble_label (asm_out_file, label);
5209 return "";
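/* Editorial illustration (added; not part of the original source): for a
   byte-sized dispatch table the sequence above comes out roughly as (label
   number and register numbers are hypothetical):

     ldrb  w3, [x0, w1, uxtw]    ; load the table entry
     adr   x4, .Lrtx4            ; address of the table anchor label
     add   x3, x4, w3, sxtb #2   ; entry is the target offset scaled by 4
     br    x3
   .Lrtx4:
*/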
5213 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5214 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5215 operator. */
5218 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5220 if (shift >= 0 && shift <= 3)
5222 int size;
5223 for (size = 8; size <= 32; size *= 2)
5225 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5226 if (mask == bits << shift)
5227 return size;
5230 return 0;
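/* Editorial illustration (added; not part of the original source): a few
   expected results, as a readable (non-compiled) fragment:  */
#if 0
aarch64_uxt_size (2, 0x3fc) == 8;     /* 0xff << 2: usable as uxtb #2.  */
aarch64_uxt_size (0, 0xffff) == 16;   /* 0xffff: usable as uxth.  */
aarch64_uxt_size (1, 0xff) == 0;      /* Mask does not match the shift.  */
#endif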
5233 /* Constant pools are per-function only when PC-relative
5234 literal loads are enabled or we are in the large memory
5235 model. */
5237 static inline bool
5238 aarch64_can_use_per_function_literal_pools_p (void)
5240 return (!aarch64_nopcrelative_literal_loads
5241 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5244 static bool
5245 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5247 /* We can't use blocks for constants when we're using a per-function
5248 constant pool. */
5249 return !aarch64_can_use_per_function_literal_pools_p ();
5252 /* Select appropriate section for constants depending
5253 on where we place literal pools. */
5255 static section *
5256 aarch64_select_rtx_section (machine_mode mode,
5257 rtx x,
5258 unsigned HOST_WIDE_INT align)
5260 if (aarch64_can_use_per_function_literal_pools_p ())
5261 return function_section (current_function_decl);
5263 return default_elf_select_rtx_section (mode, x, align);
5266 /* Costs. */
5268 /* Helper function for rtx cost calculation. Strip a shift expression
5269 from X. Returns the inner operand if successful, or the original
5270 expression on failure. */
5271 static rtx
5272 aarch64_strip_shift (rtx x)
5274 rtx op = x;
5276 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5277 we can convert both to ROR during final output. */
5278 if ((GET_CODE (op) == ASHIFT
5279 || GET_CODE (op) == ASHIFTRT
5280 || GET_CODE (op) == LSHIFTRT
5281 || GET_CODE (op) == ROTATERT
5282 || GET_CODE (op) == ROTATE)
5283 && CONST_INT_P (XEXP (op, 1)))
5284 return XEXP (op, 0);
5286 if (GET_CODE (op) == MULT
5287 && CONST_INT_P (XEXP (op, 1))
5288 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5289 return XEXP (op, 0);
5291 return x;
5294 /* Helper function for rtx cost calculation. Strip an extend
5295 expression from X. Returns the inner operand if successful, or the
5296 original expression on failure. We deal with a number of possible
5297 canonicalization variations here. */
5298 static rtx
5299 aarch64_strip_extend (rtx x)
5301 rtx op = x;
5303 /* Zero and sign extraction of a widened value. */
5304 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5305 && XEXP (op, 2) == const0_rtx
5306 && GET_CODE (XEXP (op, 0)) == MULT
5307 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5308 XEXP (op, 1)))
5309 return XEXP (XEXP (op, 0), 0);
5311 /* It can also be represented (for zero-extend) as an AND with an
5312 immediate. */
5313 if (GET_CODE (op) == AND
5314 && GET_CODE (XEXP (op, 0)) == MULT
5315 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5316 && CONST_INT_P (XEXP (op, 1))
5317 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5318 INTVAL (XEXP (op, 1))) != 0)
5319 return XEXP (XEXP (op, 0), 0);
5321 /* Now handle extended register, as this may also have an optional
5322 left shift by 1..4. */
5323 if (GET_CODE (op) == ASHIFT
5324 && CONST_INT_P (XEXP (op, 1))
5325 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5326 op = XEXP (op, 0);
5328 if (GET_CODE (op) == ZERO_EXTEND
5329 || GET_CODE (op) == SIGN_EXTEND)
5330 op = XEXP (op, 0);
5332 if (op != x)
5333 return op;
5335 return x;
5338 /* Return true iff CODE is a shift supported in combination
5339 with arithmetic instructions. */
5341 static bool
5342 aarch64_shift_p (enum rtx_code code)
5344 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5347 /* Helper function for rtx cost calculation. Calculate the cost of
5348 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5349 Return the calculated cost of the expression, recursing manually in to
5350 operands where needed. */
5352 static int
5353 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5355 rtx op0, op1;
5356 const struct cpu_cost_table *extra_cost
5357 = aarch64_tune_params.insn_extra_cost;
5358 int cost = 0;
5359 bool compound_p = (outer == PLUS || outer == MINUS);
5360 machine_mode mode = GET_MODE (x);
5362 gcc_checking_assert (code == MULT);
5364 op0 = XEXP (x, 0);
5365 op1 = XEXP (x, 1);
5367 if (VECTOR_MODE_P (mode))
5368 mode = GET_MODE_INNER (mode);
5370 /* Integer multiply/fma. */
5371 if (GET_MODE_CLASS (mode) == MODE_INT)
5373 /* The multiply will be canonicalized as a shift, cost it as such. */
5374 if (aarch64_shift_p (GET_CODE (x))
5375 || (CONST_INT_P (op1)
5376 && exact_log2 (INTVAL (op1)) > 0))
5378 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5379 || GET_CODE (op0) == SIGN_EXTEND;
5380 if (speed)
5382 if (compound_p)
5384 if (REG_P (op1))
5385 /* ARITH + shift-by-register. */
5386 cost += extra_cost->alu.arith_shift_reg;
5387 else if (is_extend)
5388 /* ARITH + extended register. We don't have a cost field
5389 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5390 cost += extra_cost->alu.extend_arith;
5391 else
5392 /* ARITH + shift-by-immediate. */
5393 cost += extra_cost->alu.arith_shift;
5395 else
5396 /* LSL (immediate). */
5397 cost += extra_cost->alu.shift;
5400 /* Strip extends as we will have costed them in the case above. */
5401 if (is_extend)
5402 op0 = aarch64_strip_extend (op0);
5404 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5406 return cost;
5409 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5410 compound and let the below cases handle it. After all, MNEG is a
5411 special-case alias of MSUB. */
5412 if (GET_CODE (op0) == NEG)
5414 op0 = XEXP (op0, 0);
5415 compound_p = true;
5418 /* Integer multiplies or FMAs have zero/sign extending variants. */
5419 if ((GET_CODE (op0) == ZERO_EXTEND
5420 && GET_CODE (op1) == ZERO_EXTEND)
5421 || (GET_CODE (op0) == SIGN_EXTEND
5422 && GET_CODE (op1) == SIGN_EXTEND))
5424 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5425 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5427 if (speed)
5429 if (compound_p)
5430 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5431 cost += extra_cost->mult[0].extend_add;
5432 else
5433 /* MUL/SMULL/UMULL. */
5434 cost += extra_cost->mult[0].extend;
5437 return cost;
5440 /* This is either an integer multiply or a MADD. In both cases
5441 we want to recurse and cost the operands. */
5442 cost += rtx_cost (op0, mode, MULT, 0, speed);
5443 cost += rtx_cost (op1, mode, MULT, 1, speed);
5445 if (speed)
5447 if (compound_p)
5448 /* MADD/MSUB. */
5449 cost += extra_cost->mult[mode == DImode].add;
5450 else
5451 /* MUL. */
5452 cost += extra_cost->mult[mode == DImode].simple;
5455 return cost;
5457 else
5459 if (speed)
5461 /* Floating-point FMA/FMUL can also support negations of the
5462 operands, unless the rounding mode is upward or downward in
5463 which case FNMUL is different from FMUL with operand negation. */
5464 bool neg0 = GET_CODE (op0) == NEG;
5465 bool neg1 = GET_CODE (op1) == NEG;
5466 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5468 if (neg0)
5469 op0 = XEXP (op0, 0);
5470 if (neg1)
5471 op1 = XEXP (op1, 0);
5474 if (compound_p)
5475 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5476 cost += extra_cost->fp[mode == DFmode].fma;
5477 else
5478 /* FMUL/FNMUL. */
5479 cost += extra_cost->fp[mode == DFmode].mult;
5482 cost += rtx_cost (op0, mode, MULT, 0, speed);
5483 cost += rtx_cost (op1, mode, MULT, 1, speed);
5484 return cost;
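/* Editorial illustration (added; not part of the original source): under
   the rules above, an rtx such as

     (plus:DI (mult:DI (sign_extend:DI (reg:SI a))
                       (sign_extend:DI (reg:SI b)))
              (reg:DI c))

   is costed as a widening multiply-accumulate (extend_add, i.e. SMADDL),
   a plain multiply-add as MADD, and a multiply by a power of two as a
   shift folded into the enclosing ADD/SUB where possible.  */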
5488 static int
5489 aarch64_address_cost (rtx x,
5490 machine_mode mode,
5491 addr_space_t as ATTRIBUTE_UNUSED,
5492 bool speed)
5494 enum rtx_code c = GET_CODE (x);
5495 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5496 struct aarch64_address_info info;
5497 int cost = 0;
5498 info.shift = 0;
5500 if (!aarch64_classify_address (&info, x, mode, c, false))
5502 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5504 /* This is a CONST or SYMBOL ref which will be split
5505 in a different way depending on the code model in use.
5506 Cost it through the generic infrastructure. */
5507 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5508 /* Divide through by the cost of one instruction to
5509 bring it to the same units as the address costs. */
5510 cost_symbol_ref /= COSTS_N_INSNS (1);
5511 /* The cost is then the cost of preparing the address,
5512 followed by an immediate (possibly 0) offset. */
5513 return cost_symbol_ref + addr_cost->imm_offset;
5515 else
5517 /* This is most likely a jump table from a case
5518 statement. */
5519 return addr_cost->register_offset;
5523 switch (info.type)
5525 case ADDRESS_LO_SUM:
5526 case ADDRESS_SYMBOLIC:
5527 case ADDRESS_REG_IMM:
5528 cost += addr_cost->imm_offset;
5529 break;
5531 case ADDRESS_REG_WB:
5532 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5533 cost += addr_cost->pre_modify;
5534 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5535 cost += addr_cost->post_modify;
5536 else
5537 gcc_unreachable ();
5539 break;
5541 case ADDRESS_REG_REG:
5542 cost += addr_cost->register_offset;
5543 break;
5545 case ADDRESS_REG_SXTW:
5546 cost += addr_cost->register_sextend;
5547 break;
5549 case ADDRESS_REG_UXTW:
5550 cost += addr_cost->register_zextend;
5551 break;
5553 default:
5554 gcc_unreachable ();
5558 if (info.shift > 0)
5560 /* For the sake of calculating the cost of the shifted register
5561 component, we can treat same sized modes in the same way. */
5562 switch (GET_MODE_BITSIZE (mode))
5564 case 16:
5565 cost += addr_cost->addr_scale_costs.hi;
5566 break;
5568 case 32:
5569 cost += addr_cost->addr_scale_costs.si;
5570 break;
5572 case 64:
5573 cost += addr_cost->addr_scale_costs.di;
5574 break;
5576 /* We can't tell, or this is a 128-bit vector. */
5577 default:
5578 cost += addr_cost->addr_scale_costs.ti;
5579 break;
5583 return cost;
5586 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5587 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5588 to be taken. */
5591 aarch64_branch_cost (bool speed_p, bool predictable_p)
5593 /* When optimizing for speed, use the cost of unpredictable branches. */
5594 const struct cpu_branch_cost *branch_costs =
5595 aarch64_tune_params.branch_costs;
5597 if (!speed_p || predictable_p)
5598 return branch_costs->predictable;
5599 else
5600 return branch_costs->unpredictable;
5603 /* Return true if the RTX X in mode MODE is a zero or sign extract
5604 usable in an ADD or SUB (extended register) instruction. */
5605 static bool
5606 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5608 /* Catch add with a sign extract.
5609 This is add_<optab><mode>_multp2. */
5610 if (GET_CODE (x) == SIGN_EXTRACT
5611 || GET_CODE (x) == ZERO_EXTRACT)
5613 rtx op0 = XEXP (x, 0);
5614 rtx op1 = XEXP (x, 1);
5615 rtx op2 = XEXP (x, 2);
5617 if (GET_CODE (op0) == MULT
5618 && CONST_INT_P (op1)
5619 && op2 == const0_rtx
5620 && CONST_INT_P (XEXP (op0, 1))
5621 && aarch64_is_extend_from_extract (mode,
5622 XEXP (op0, 1),
5623 op1))
5625 return true;
5628 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5629 No shift. */
5630 else if (GET_CODE (x) == SIGN_EXTEND
5631 || GET_CODE (x) == ZERO_EXTEND)
5632 return REG_P (XEXP (x, 0));
5634 return false;
5637 static bool
5638 aarch64_frint_unspec_p (unsigned int u)
5640 switch (u)
5642 case UNSPEC_FRINTZ:
5643 case UNSPEC_FRINTP:
5644 case UNSPEC_FRINTM:
5645 case UNSPEC_FRINTA:
5646 case UNSPEC_FRINTN:
5647 case UNSPEC_FRINTX:
5648 case UNSPEC_FRINTI:
5649 return true;
5651 default:
5652 return false;
5656 /* Return true iff X is an rtx that will match an extr instruction
5657 i.e. as described in the *extr<mode>5_insn family of patterns.
5658 OP0 and OP1 will be set to the operands of the shifts involved
5659 on success and will be NULL_RTX otherwise. */
5661 static bool
5662 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5664 rtx op0, op1;
5665 machine_mode mode = GET_MODE (x);
5667 *res_op0 = NULL_RTX;
5668 *res_op1 = NULL_RTX;
5670 if (GET_CODE (x) != IOR)
5671 return false;
5673 op0 = XEXP (x, 0);
5674 op1 = XEXP (x, 1);
5676 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5677 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5679 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5680 if (GET_CODE (op1) == ASHIFT)
5681 std::swap (op0, op1);
5683 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5684 return false;
5686 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5687 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5689 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5690 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5692 *res_op0 = XEXP (op0, 0);
5693 *res_op1 = XEXP (op1, 0);
5694 return true;
5698 return false;
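/* Editorial illustration (added; not part of the original source): in
   DImode, (ior (ashift x 16) (lshiftrt y 48)) satisfies the check above
   because the shift amounts sum to 64, so it can be emitted as a single
   EXTR instruction that concatenates bits taken from the two shifted
   operands.  */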
5701 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5702 storing it in *COST. Result is true if the total cost of the operation
5703 has now been calculated. */
5704 static bool
5705 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5707 rtx inner;
5708 rtx comparator;
5709 enum rtx_code cmpcode;
5711 if (COMPARISON_P (op0))
5713 inner = XEXP (op0, 0);
5714 comparator = XEXP (op0, 1);
5715 cmpcode = GET_CODE (op0);
5717 else
5719 inner = op0;
5720 comparator = const0_rtx;
5721 cmpcode = NE;
5724 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5726 /* Conditional branch. */
5727 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5728 return true;
5729 else
5731 if (cmpcode == NE || cmpcode == EQ)
5733 if (comparator == const0_rtx)
5735 /* TBZ/TBNZ/CBZ/CBNZ. */
5736 if (GET_CODE (inner) == ZERO_EXTRACT)
5737 /* TBZ/TBNZ. */
5738 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
5739 ZERO_EXTRACT, 0, speed);
5740 else
5741 /* CBZ/CBNZ. */
5742 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
5744 return true;
5747 else if (cmpcode == LT || cmpcode == GE)
5749 /* TBZ/TBNZ. */
5750 if (comparator == const0_rtx)
5751 return true;
5755 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5757 /* It's a conditional operation based on the status flags,
5758 so it must be some flavor of CSEL. */
5760 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
5761 if (GET_CODE (op1) == NEG
5762 || GET_CODE (op1) == NOT
5763 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
5764 op1 = XEXP (op1, 0);
5766 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
5767 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
5768 return true;
5771 /* We don't know what this is, cost all operands. */
5772 return false;
5775 /* Calculate the cost of calculating X, storing it in *COST. Result
5776 is true if the total cost of the operation has now been calculated. */
5777 static bool
5778 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
5779 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
5781 rtx op0, op1, op2;
5782 const struct cpu_cost_table *extra_cost
5783 = aarch64_tune_params.insn_extra_cost;
5784 int code = GET_CODE (x);
5786 /* By default, assume that everything has equivalent cost to the
5787 cheapest instruction. Any additional costs are applied as a delta
5788 above this default. */
5789 *cost = COSTS_N_INSNS (1);
5791 switch (code)
5793 case SET:
5794 /* The cost depends entirely on the operands to SET. */
5795 *cost = 0;
5796 op0 = SET_DEST (x);
5797 op1 = SET_SRC (x);
5799 switch (GET_CODE (op0))
5801 case MEM:
5802 if (speed)
5804 rtx address = XEXP (op0, 0);
5805 if (VECTOR_MODE_P (mode))
5806 *cost += extra_cost->ldst.storev;
5807 else if (GET_MODE_CLASS (mode) == MODE_INT)
5808 *cost += extra_cost->ldst.store;
5809 else if (mode == SFmode)
5810 *cost += extra_cost->ldst.storef;
5811 else if (mode == DFmode)
5812 *cost += extra_cost->ldst.stored;
5814 *cost +=
5815 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5816 0, speed));
5819 *cost += rtx_cost (op1, mode, SET, 1, speed);
5820 return true;
5822 case SUBREG:
5823 if (! REG_P (SUBREG_REG (op0)))
5824 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
5826 /* Fall through. */
5827 case REG:
5828 /* The cost is one per vector-register copied. */
5829 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
5831 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5832 / GET_MODE_SIZE (V4SImode);
5833 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5835 /* const0_rtx is in general free, but we will use an
5836 instruction to set a register to 0. */
5837 else if (REG_P (op1) || op1 == const0_rtx)
5839 /* The cost is 1 per register copied. */
5840 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
5841 / UNITS_PER_WORD;
5842 *cost = COSTS_N_INSNS (n_minus_1 + 1);
5844 else
5845 /* Cost is just the cost of the RHS of the set. */
5846 *cost += rtx_cost (op1, mode, SET, 1, speed);
5847 return true;
5849 case ZERO_EXTRACT:
5850 case SIGN_EXTRACT:
5851 /* Bit-field insertion. Strip any redundant widening of
5852 the RHS to meet the width of the target. */
5853 if (GET_CODE (op1) == SUBREG)
5854 op1 = SUBREG_REG (op1);
5855 if ((GET_CODE (op1) == ZERO_EXTEND
5856 || GET_CODE (op1) == SIGN_EXTEND)
5857 && CONST_INT_P (XEXP (op0, 1))
5858 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
5859 >= INTVAL (XEXP (op0, 1))))
5860 op1 = XEXP (op1, 0);
5862 if (CONST_INT_P (op1))
5864 /* MOV immediate is assumed to always be cheap. */
5865 *cost = COSTS_N_INSNS (1);
5867 else
5869 /* BFM. */
5870 if (speed)
5871 *cost += extra_cost->alu.bfi;
5872 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
5875 return true;
5877 default:
5878 /* We can't make sense of this, assume default cost. */
5879 *cost = COSTS_N_INSNS (1);
5880 return false;
5882 return false;
5884 case CONST_INT:
5885 /* If an instruction can incorporate a constant within the
5886 instruction, the instruction's expression avoids calling
5887 rtx_cost() on the constant. If rtx_cost() is called on a
5888 constant, then it is usually because the constant must be
5889 moved into a register by one or more instructions.
5891 The exception is constant 0, which can be expressed
5892 as XZR/WZR and is therefore free. The exception to this is
5893 if we have (set (reg) (const0_rtx)) in which case we must cost
5894 the move. However, we can catch that when we cost the SET, so
5895 we don't need to consider that here. */
5896 if (x == const0_rtx)
5897 *cost = 0;
5898 else
5900 /* To an approximation, building any other constant is
5901 proportionally expensive to the number of instructions
5902 required to build that constant. This is true whether we
5903 are compiling for SPEED or otherwise. */
5904 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
5905 (NULL_RTX, x, false, mode));
5907 return true;
5909 case CONST_DOUBLE:
5910 if (speed)
5912 /* mov[df,sf]_aarch64. */
5913 if (aarch64_float_const_representable_p (x))
5914 /* FMOV (scalar immediate). */
5915 *cost += extra_cost->fp[mode == DFmode].fpconst;
5916 else if (!aarch64_float_const_zero_rtx_p (x))
5918 /* This will be a load from memory. */
5919 if (mode == DFmode)
5920 *cost += extra_cost->ldst.loadd;
5921 else
5922 *cost += extra_cost->ldst.loadf;
5924 else
5925 /* Otherwise this is +0.0. We get this using MOVI d0, #0
5926 or MOV v0.s[0], wzr - neither of which are modeled by the
5927 cost tables. Just use the default cost. */
5932 return true;
5934 case MEM:
5935 if (speed)
5937 /* For loads we want the base cost of a load, plus an
5938 approximation for the additional cost of the addressing
5939 mode. */
5940 rtx address = XEXP (x, 0);
5941 if (VECTOR_MODE_P (mode))
5942 *cost += extra_cost->ldst.loadv;
5943 else if (GET_MODE_CLASS (mode) == MODE_INT)
5944 *cost += extra_cost->ldst.load;
5945 else if (mode == SFmode)
5946 *cost += extra_cost->ldst.loadf;
5947 else if (mode == DFmode)
5948 *cost += extra_cost->ldst.loadd;
5950 *cost +=
5951 COSTS_N_INSNS (aarch64_address_cost (address, mode,
5952 0, speed));
5955 return true;
5957 case NEG:
5958 op0 = XEXP (x, 0);
5960 if (VECTOR_MODE_P (mode))
5962 if (speed)
5964 /* FNEG. */
5965 *cost += extra_cost->vect.alu;
5967 return false;
5970 if (GET_MODE_CLASS (mode) == MODE_INT)
5972 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
5973 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
5975 /* CSETM. */
5976 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
5977 return true;
5980 /* Cost this as SUB wzr, X. */
5981 op0 = CONST0_RTX (mode);
5982 op1 = XEXP (x, 0);
5983 goto cost_minus;
5986 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
5988 /* Support (neg(fma...)) as a single instruction only if
5989 sign of zeros is unimportant. This matches the decision
5990 making in aarch64.md. */
5991 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
5993 /* FNMADD. */
5994 *cost = rtx_cost (op0, mode, NEG, 0, speed);
5995 return true;
5997 if (GET_CODE (op0) == MULT)
5999 /* FNMUL. */
6000 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6001 return true;
6003 if (speed)
6004 /* FNEG. */
6005 *cost += extra_cost->fp[mode == DFmode].neg;
6006 return false;
6009 return false;
6011 case CLRSB:
6012 case CLZ:
6013 if (speed)
6015 if (VECTOR_MODE_P (mode))
6016 *cost += extra_cost->vect.alu;
6017 else
6018 *cost += extra_cost->alu.clz;
6021 return false;
6023 case COMPARE:
6024 op0 = XEXP (x, 0);
6025 op1 = XEXP (x, 1);
6027 if (op1 == const0_rtx
6028 && GET_CODE (op0) == AND)
6030 x = op0;
6031 mode = GET_MODE (op0);
6032 goto cost_logic;
6035 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6037 /* TODO: A write to the CC flags possibly costs extra, this
6038 needs encoding in the cost tables. */
6040 /* CC_ZESWPmode supports zero extend for free. */
6041 if (mode == CC_ZESWPmode && GET_CODE (op0) == ZERO_EXTEND)
6042 op0 = XEXP (op0, 0);
6044 mode = GET_MODE (op0);
6045 /* ANDS. */
6046 if (GET_CODE (op0) == AND)
6048 x = op0;
6049 goto cost_logic;
6052 if (GET_CODE (op0) == PLUS)
6054 /* ADDS (and CMN alias). */
6055 x = op0;
6056 goto cost_plus;
6059 if (GET_CODE (op0) == MINUS)
6061 /* SUBS. */
6062 x = op0;
6063 goto cost_minus;
6066 if (GET_CODE (op1) == NEG)
6068 /* CMN. */
6069 if (speed)
6070 *cost += extra_cost->alu.arith;
6072 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6073 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6074 return true;
6077 /* CMP.
6079 Compare can freely swap the order of operands, and
6080 canonicalization puts the more complex operation first.
6081 But the integer MINUS logic expects the shift/extend
6082 operation in op1. */
6083 if (! (REG_P (op0)
6084 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6086 op0 = XEXP (x, 1);
6087 op1 = XEXP (x, 0);
6089 goto cost_minus;
6092 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6094 /* FCMP. */
6095 if (speed)
6096 *cost += extra_cost->fp[mode == DFmode].compare;
6098 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6100 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6101 /* FCMP supports constant 0.0 for no extra cost. */
6102 return true;
6104 return false;
6107 if (VECTOR_MODE_P (mode))
6109 /* Vector compare. */
6110 if (speed)
6111 *cost += extra_cost->vect.alu;
6113 if (aarch64_float_const_zero_rtx_p (op1))
6115 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6116 cost. */
6117 return true;
6119 return false;
6121 return false;
6123 case MINUS:
6125 op0 = XEXP (x, 0);
6126 op1 = XEXP (x, 1);
6128 cost_minus:
6129 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6131 /* Detect valid immediates. */
6132 if ((GET_MODE_CLASS (mode) == MODE_INT
6133 || (GET_MODE_CLASS (mode) == MODE_CC
6134 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6135 && CONST_INT_P (op1)
6136 && aarch64_uimm12_shift (INTVAL (op1)))
6138 if (speed)
6139 /* SUB(S) (immediate). */
6140 *cost += extra_cost->alu.arith;
6141 return true;
6144 /* Look for SUB (extended register). */
6145 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6147 if (speed)
6148 *cost += extra_cost->alu.extend_arith;
6150 op1 = aarch64_strip_extend (op1);
6151 *cost += rtx_cost (op1, VOIDmode,
6152 (enum rtx_code) GET_CODE (op1), 0, speed);
6153 return true;
6156 rtx new_op1 = aarch64_strip_extend (op1);
6158 /* Cost this as an FMA-alike operation. */
6159 if ((GET_CODE (new_op1) == MULT
6160 || aarch64_shift_p (GET_CODE (new_op1)))
6161 && code != COMPARE)
6163 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6164 (enum rtx_code) code,
6165 speed);
6166 return true;
6169 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6171 if (speed)
6173 if (VECTOR_MODE_P (mode))
6175 /* Vector SUB. */
6176 *cost += extra_cost->vect.alu;
6178 else if (GET_MODE_CLASS (mode) == MODE_INT)
6180 /* SUB(S). */
6181 *cost += extra_cost->alu.arith;
6183 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6185 /* FSUB. */
6186 *cost += extra_cost->fp[mode == DFmode].addsub;
6189 return true;
6192 case PLUS:
6194 rtx new_op0;
6196 op0 = XEXP (x, 0);
6197 op1 = XEXP (x, 1);
6199 cost_plus:
6200 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6201 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6203 /* CSINC. */
6204 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6205 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6206 return true;
6209 if (GET_MODE_CLASS (mode) == MODE_INT
6210 && CONST_INT_P (op1)
6211 && aarch64_uimm12_shift (INTVAL (op1)))
6213 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6215 if (speed)
6216 /* ADD (immediate). */
6217 *cost += extra_cost->alu.arith;
6218 return true;
6221 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6223 /* Look for ADD (extended register). */
6224 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6226 if (speed)
6227 *cost += extra_cost->alu.extend_arith;
6229 op0 = aarch64_strip_extend (op0);
6230 *cost += rtx_cost (op0, VOIDmode,
6231 (enum rtx_code) GET_CODE (op0), 0, speed);
6232 return true;
6235 /* Strip any extend, leave shifts behind as we will
6236 cost them through mult_cost. */
6237 new_op0 = aarch64_strip_extend (op0);
6239 if (GET_CODE (new_op0) == MULT
6240 || aarch64_shift_p (GET_CODE (new_op0)))
6242 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6243 speed);
6244 return true;
6247 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6249 if (speed)
6251 if (VECTOR_MODE_P (mode))
6253 /* Vector ADD. */
6254 *cost += extra_cost->vect.alu;
6256 else if (GET_MODE_CLASS (mode) == MODE_INT)
6258 /* ADD. */
6259 *cost += extra_cost->alu.arith;
6261 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6263 /* FADD. */
6264 *cost += extra_cost->fp[mode == DFmode].addsub;
6267 return true;
6270 case BSWAP:
6271 *cost = COSTS_N_INSNS (1);
6273 if (speed)
6275 if (VECTOR_MODE_P (mode))
6276 *cost += extra_cost->vect.alu;
6277 else
6278 *cost += extra_cost->alu.rev;
6280 return false;
6282 case IOR:
6283 if (aarch_rev16_p (x))
6285 *cost = COSTS_N_INSNS (1);
6287 if (speed)
6289 if (VECTOR_MODE_P (mode))
6290 *cost += extra_cost->vect.alu;
6291 else
6292 *cost += extra_cost->alu.rev;
6294 return true;
6297 if (aarch64_extr_rtx_p (x, &op0, &op1))
6299 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6300 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6301 if (speed)
6302 *cost += extra_cost->alu.shift;
6304 return true;
6306 /* Fall through. */
6307 case XOR:
6308 case AND:
6309 cost_logic:
6310 op0 = XEXP (x, 0);
6311 op1 = XEXP (x, 1);
6313 if (VECTOR_MODE_P (mode))
6315 if (speed)
6316 *cost += extra_cost->vect.alu;
6317 return true;
6320 if (code == AND
6321 && GET_CODE (op0) == MULT
6322 && CONST_INT_P (XEXP (op0, 1))
6323 && CONST_INT_P (op1)
6324 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6325 INTVAL (op1)) != 0)
6327 /* This is a UBFM/SBFM. */
6328 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6329 if (speed)
6330 *cost += extra_cost->alu.bfx;
6331 return true;
6334 if (GET_MODE_CLASS (mode) == MODE_INT)
6336 /* We possibly get the immediate for free; this is not
6337 modelled. */
6338 if (CONST_INT_P (op1)
6339 && aarch64_bitmask_imm (INTVAL (op1), mode))
6341 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6343 if (speed)
6344 *cost += extra_cost->alu.logical;
6346 return true;
6348 else
6350 rtx new_op0 = op0;
6352 /* Handle ORN, EON, or BIC. */
6353 if (GET_CODE (op0) == NOT)
6354 op0 = XEXP (op0, 0);
6356 new_op0 = aarch64_strip_shift (op0);
6358 /* If we had a shift on op0 then this is a logical-shift-
6359 by-register/immediate operation. Otherwise, this is just
6360 a logical operation. */
6361 if (speed)
6363 if (new_op0 != op0)
6365 /* Shift by immediate. */
6366 if (CONST_INT_P (XEXP (op0, 1)))
6367 *cost += extra_cost->alu.log_shift;
6368 else
6369 *cost += extra_cost->alu.log_shift_reg;
6371 else
6372 *cost += extra_cost->alu.logical;
6375 /* In both cases we want to cost both operands. */
6376 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6377 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6379 return true;
6382 return false;
6384 case NOT:
6385 x = XEXP (x, 0);
6386 op0 = aarch64_strip_shift (x);
6388 if (VECTOR_MODE_P (mode))
6390 /* Vector NOT. */
6391 *cost += extra_cost->vect.alu;
6392 return false;
6395 /* MVN-shifted-reg. */
6396 if (op0 != x)
6398 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6400 if (speed)
6401 *cost += extra_cost->alu.log_shift;
6403 return true;
6405 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
6406 Handle the second form here, taking care that 'a' in the above can
6407 be a shift. */
6408 else if (GET_CODE (op0) == XOR)
6410 rtx newop0 = XEXP (op0, 0);
6411 rtx newop1 = XEXP (op0, 1);
6412 rtx op0_stripped = aarch64_strip_shift (newop0);
6414 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6415 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6417 if (speed)
6419 if (op0_stripped != newop0)
6420 *cost += extra_cost->alu.log_shift;
6421 else
6422 *cost += extra_cost->alu.logical;
6425 return true;
6427 /* MVN. */
6428 if (speed)
6429 *cost += extra_cost->alu.logical;
6431 return false;
6433 case ZERO_EXTEND:
6435 op0 = XEXP (x, 0);
6436 /* If a value is written in SI mode, then zero extended to DI
6437 mode, the operation will in general be free as a write to
6438 a 'w' register implicitly zeroes the upper bits of an 'x'
6439 register. However, if this is
6441 (set (reg) (zero_extend (reg)))
6443 we must cost the explicit register move. */
6444 if (mode == DImode
6445 && GET_MODE (op0) == SImode
6446 && outer == SET)
6448 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6450 if (!op_cost && speed)
6451 /* MOV. */
6452 *cost += extra_cost->alu.extend;
6453 else
6454 /* Free, the cost is that of the SI mode operation. */
6455 *cost = op_cost;
6457 return true;
6459 else if (MEM_P (op0))
6461 /* All loads can zero extend to any size for free. */
6462 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6463 return true;
6466 if (speed)
6468 if (VECTOR_MODE_P (mode))
6470 /* UMOV. */
6471 *cost += extra_cost->vect.alu;
6473 else
6475 /* UXTB/UXTH. */
6476 *cost += extra_cost->alu.extend;
6479 return false;
6481 case SIGN_EXTEND:
6482 if (MEM_P (XEXP (x, 0)))
6484 /* LDRSH. */
6485 if (speed)
6487 rtx address = XEXP (XEXP (x, 0), 0);
6488 *cost += extra_cost->ldst.load_sign_extend;
6490 *cost +=
6491 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6492 0, speed));
6494 return true;
6497 if (speed)
6499 if (VECTOR_MODE_P (mode))
6500 *cost += extra_cost->vect.alu;
6501 else
6502 *cost += extra_cost->alu.extend;
6504 return false;
6506 case ASHIFT:
6507 op0 = XEXP (x, 0);
6508 op1 = XEXP (x, 1);
6510 if (CONST_INT_P (op1))
6512 if (speed)
6514 if (VECTOR_MODE_P (mode))
6516 /* Vector shift (immediate). */
6517 *cost += extra_cost->vect.alu;
6519 else
6521 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6522 aliases. */
6523 *cost += extra_cost->alu.shift;
6527 /* We can incorporate zero/sign extend for free. */
6528 if (GET_CODE (op0) == ZERO_EXTEND
6529 || GET_CODE (op0) == SIGN_EXTEND)
6530 op0 = XEXP (op0, 0);
6532 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
6533 return true;
6535 else
6537 if (speed)
6539 if (VECTOR_MODE_P (mode))
6541 /* Vector shift (register). */
6542 *cost += extra_cost->vect.alu;
6544 else
6546 /* LSLV. */
6547 *cost += extra_cost->alu.shift_reg;
6550 return false; /* All arguments need to be in registers. */
6553 case ROTATE:
6554 case ROTATERT:
6555 case LSHIFTRT:
6556 case ASHIFTRT:
6557 op0 = XEXP (x, 0);
6558 op1 = XEXP (x, 1);
6560 if (CONST_INT_P (op1))
6562 /* ASR (immediate) and friends. */
6563 if (speed)
6565 if (VECTOR_MODE_P (mode))
6566 *cost += extra_cost->vect.alu;
6567 else
6568 *cost += extra_cost->alu.shift;
6571 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6572 return true;
6574 else
6577 /* ASR (register) and friends. */
6578 if (speed)
6580 if (VECTOR_MODE_P (mode))
6581 *cost += extra_cost->vect.alu;
6582 else
6583 *cost += extra_cost->alu.shift_reg;
6585 return false; /* All arguments need to be in registers. */
6588 case SYMBOL_REF:
6590 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6591 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6593 /* LDR. */
6594 if (speed)
6595 *cost += extra_cost->ldst.load;
6597 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6598 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6600 /* ADRP, followed by ADD. */
6601 *cost += COSTS_N_INSNS (1);
6602 if (speed)
6603 *cost += 2 * extra_cost->alu.arith;
6605 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6606 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6608 /* ADR. */
6609 if (speed)
6610 *cost += extra_cost->alu.arith;
6613 if (flag_pic)
6615 /* One extra load instruction, after accessing the GOT. */
6616 *cost += COSTS_N_INSNS (1);
6617 if (speed)
6618 *cost += extra_cost->ldst.load;
6620 return true;
6622 case HIGH:
6623 case LO_SUM:
6624 /* ADRP/ADD (immediate). */
6625 if (speed)
6626 *cost += extra_cost->alu.arith;
6627 return true;
6629 case ZERO_EXTRACT:
6630 case SIGN_EXTRACT:
6631 /* UBFX/SBFX. */
6632 if (speed)
6634 if (VECTOR_MODE_P (mode))
6635 *cost += extra_cost->vect.alu;
6636 else
6637 *cost += extra_cost->alu.bfx;
6640 /* We can trust that the immediates used will be correct (there
6641 are no by-register forms), so we need only cost op0. */
6642 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
6643 return true;
6645 case MULT:
6646 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
6647 /* aarch64_rtx_mult_cost always handles recursion to its
6648 operands. */
6649 return true;
6651 case MOD:
6652 /* We can expand signed mod by power of 2 using a NEGS, two parallel
6653 ANDs and a CSNEG. Assume here that CSNEG has the same cost as
6654 an unconditional negate. This case should only ever be reached through
6655 the set_smod_pow2_cheap check in expmed.c. */
6656 if (CONST_INT_P (XEXP (x, 1))
6657 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
6658 && (mode == SImode || mode == DImode))
6660 /* We expand to 4 instructions. Reset the baseline. */
6661 *cost = COSTS_N_INSNS (4);
6663 if (speed)
6664 *cost += 2 * extra_cost->alu.logical
6665 + 2 * extra_cost->alu.arith;
6667 return true;
6670 /* Fall-through. */
6671 case UMOD:
6672 if (speed)
6674 if (VECTOR_MODE_P (mode))
6675 *cost += extra_cost->vect.alu;
6676 else if (GET_MODE_CLASS (mode) == MODE_INT)
6677 *cost += (extra_cost->mult[mode == DImode].add
6678 + extra_cost->mult[mode == DImode].idiv);
6679 else if (mode == DFmode)
6680 *cost += (extra_cost->fp[1].mult
6681 + extra_cost->fp[1].div);
6682 else if (mode == SFmode)
6683 *cost += (extra_cost->fp[0].mult
6684 + extra_cost->fp[0].div);
6686 return false; /* All arguments need to be in registers. */
6688 case DIV:
6689 case UDIV:
6690 case SQRT:
6691 if (speed)
6693 if (VECTOR_MODE_P (mode))
6694 *cost += extra_cost->vect.alu;
6695 else if (GET_MODE_CLASS (mode) == MODE_INT)
6696 /* There is no integer SQRT, so only DIV and UDIV can get
6697 here. */
6698 *cost += extra_cost->mult[mode == DImode].idiv;
6699 else
6700 *cost += extra_cost->fp[mode == DFmode].div;
6702 return false; /* All arguments need to be in registers. */
6704 case IF_THEN_ELSE:
6705 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
6706 XEXP (x, 2), cost, speed);
6708 case EQ:
6709 case NE:
6710 case GT:
6711 case GTU:
6712 case LT:
6713 case LTU:
6714 case GE:
6715 case GEU:
6716 case LE:
6717 case LEU:
6719 return false; /* All arguments must be in registers. */
6721 case FMA:
6722 op0 = XEXP (x, 0);
6723 op1 = XEXP (x, 1);
6724 op2 = XEXP (x, 2);
6726 if (speed)
6728 if (VECTOR_MODE_P (mode))
6729 *cost += extra_cost->vect.alu;
6730 else
6731 *cost += extra_cost->fp[mode == DFmode].fma;
6734 /* FMSUB, FNMADD, and FNMSUB are free. */
6735 if (GET_CODE (op0) == NEG)
6736 op0 = XEXP (op0, 0);
6738 if (GET_CODE (op2) == NEG)
6739 op2 = XEXP (op2, 0);
6741 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
6742 and the by-element operand as operand 0. */
6743 if (GET_CODE (op1) == NEG)
6744 op1 = XEXP (op1, 0);
6746 /* Catch vector-by-element operations. The by-element operand can
6747 either be (vec_duplicate (vec_select (x))) or just
6748 (vec_select (x)), depending on whether we are multiplying by
6749 a vector or a scalar.
6751 Canonicalization is not very good in these cases: FMA4 will put the
6752 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
6753 if (GET_CODE (op0) == VEC_DUPLICATE)
6754 op0 = XEXP (op0, 0);
6755 else if (GET_CODE (op1) == VEC_DUPLICATE)
6756 op1 = XEXP (op1, 0);
6758 if (GET_CODE (op0) == VEC_SELECT)
6759 op0 = XEXP (op0, 0);
6760 else if (GET_CODE (op1) == VEC_SELECT)
6761 op1 = XEXP (op1, 0);
6763 /* If the remaining parameters are not registers,
6764 get the cost to put them into registers. */
6765 *cost += rtx_cost (op0, mode, FMA, 0, speed);
6766 *cost += rtx_cost (op1, mode, FMA, 1, speed);
6767 *cost += rtx_cost (op2, mode, FMA, 2, speed);
6768 return true;
6770 case FLOAT:
6771 case UNSIGNED_FLOAT:
6772 if (speed)
6773 *cost += extra_cost->fp[mode == DFmode].fromint;
6774 return false;
6776 case FLOAT_EXTEND:
6777 if (speed)
6779 if (VECTOR_MODE_P (mode))
6781 /* Vector widening conversion. */
6782 *cost += extra_cost->vect.alu;
6784 else
6785 *cost += extra_cost->fp[mode == DFmode].widen;
6787 return false;
6789 case FLOAT_TRUNCATE:
6790 if (speed)
6792 if (VECTOR_MODE_P (mode))
6794 /* Vector narrowing conversion. */
6795 *cost += extra_cost->vect.alu;
6797 else
6798 *cost += extra_cost->fp[mode == DFmode].narrow;
6800 return false;
6802 case FIX:
6803 case UNSIGNED_FIX:
6804 x = XEXP (x, 0);
6805 /* Strip the rounding part. They will all be implemented
6806 by the fcvt* family of instructions anyway. */
6807 if (GET_CODE (x) == UNSPEC)
6809 unsigned int uns_code = XINT (x, 1);
6811 if (uns_code == UNSPEC_FRINTA
6812 || uns_code == UNSPEC_FRINTM
6813 || uns_code == UNSPEC_FRINTN
6814 || uns_code == UNSPEC_FRINTP
6815 || uns_code == UNSPEC_FRINTZ)
6816 x = XVECEXP (x, 0, 0);
6819 if (speed)
6821 if (VECTOR_MODE_P (mode))
6822 *cost += extra_cost->vect.alu;
6823 else
6824 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
6827 /* We can combine fmul by a power of 2 followed by a fcvt into a single
6828 fixed-point fcvt. */
6829 if (GET_CODE (x) == MULT
6830 && ((VECTOR_MODE_P (mode)
6831 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
6832 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
6834 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
6835 0, speed);
6836 return true;
6839 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
6840 return true;
6842 case ABS:
6843 if (VECTOR_MODE_P (mode))
6845 /* ABS (vector). */
6846 if (speed)
6847 *cost += extra_cost->vect.alu;
6849 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6851 op0 = XEXP (x, 0);
6853 /* FABD, which is analogous to FADD. */
6854 if (GET_CODE (op0) == MINUS)
6856 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
6857 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
6858 if (speed)
6859 *cost += extra_cost->fp[mode == DFmode].addsub;
6861 return true;
6863 /* Simple FABS is analogous to FNEG. */
6864 if (speed)
6865 *cost += extra_cost->fp[mode == DFmode].neg;
6867 else
6869 /* Integer ABS will either be split into
6870 two arithmetic instructions, or will be an ABS
6871 (scalar), which we don't model. */
6872 *cost = COSTS_N_INSNS (2);
6873 if (speed)
6874 *cost += 2 * extra_cost->alu.arith;
6876 return false;
6878 case SMAX:
6879 case SMIN:
6880 if (speed)
6882 if (VECTOR_MODE_P (mode))
6883 *cost += extra_cost->vect.alu;
6884 else
6886 /* FMAXNM/FMINNM/FMAX/FMIN.
6887 TODO: This may not be accurate for all implementations, but
6888 we do not model this in the cost tables. */
6889 *cost += extra_cost->fp[mode == DFmode].addsub;
6892 return false;
6894 case UNSPEC:
6895 /* The floating point round to integer frint* instructions. */
6896 if (aarch64_frint_unspec_p (XINT (x, 1)))
6898 if (speed)
6899 *cost += extra_cost->fp[mode == DFmode].roundint;
6901 return false;
6904 if (XINT (x, 1) == UNSPEC_RBIT)
6906 if (speed)
6907 *cost += extra_cost->alu.rev;
6909 return false;
6911 break;
6913 case TRUNCATE:
6915 /* Decompose <su>muldi3_highpart. */
6916 if (/* (truncate:DI */
6917 mode == DImode
6918 /* (lshiftrt:TI */
6919 && GET_MODE (XEXP (x, 0)) == TImode
6920 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
6921 /* (mult:TI */
6922 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
6923 /* (ANY_EXTEND:TI (reg:DI))
6924 (ANY_EXTEND:TI (reg:DI))) */
6925 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
6926 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
6927 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
6928 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
6929 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
6930 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
6931 /* (const_int 64) */
6932 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
6933 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
6935 /* UMULH/SMULH. */
6936 if (speed)
6937 *cost += extra_cost->mult[mode == DImode].extend;
6938 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
6939 mode, MULT, 0, speed);
6940 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
6941 mode, MULT, 1, speed);
6942 return true;
6945 /* Fall through. */
6946 default:
6947 break;
6950 if (dump_file && (dump_flags & TDF_DETAILS))
6951 fprintf (dump_file,
6952 "\nFailed to cost RTX. Assuming default cost.\n");
6954 return true;
6957 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
6958 calculated for X. This cost is stored in *COST. Returns true
6959 if the total cost of X was calculated. */
6960 static bool
6961 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
6962 int param, int *cost, bool speed)
6964 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
6966 if (dump_file && (dump_flags & TDF_DETAILS))
6968 print_rtl_single (dump_file, x);
6969 fprintf (dump_file, "\n%s cost: %d (%s)\n",
6970 speed ? "Hot" : "Cold",
6971 *cost, result ? "final" : "partial");
6974 return result;
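/* Implement TARGET_REGISTER_MOVE_COST. Return the cost of moving a value
   of MODE between register classes FROM_I and TO_I, based on the
   regmove_cost table of the current tuning. */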
6977 static int
6978 aarch64_register_move_cost (machine_mode mode,
6979 reg_class_t from_i, reg_class_t to_i)
6981 enum reg_class from = (enum reg_class) from_i;
6982 enum reg_class to = (enum reg_class) to_i;
6983 const struct cpu_regmove_cost *regmove_cost
6984 = aarch64_tune_params.regmove_cost;
6986 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
6987 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
6988 to = GENERAL_REGS;
6990 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
6991 from = GENERAL_REGS;
6993 /* Moving between GPR and stack cost is the same as GP2GP. */
6994 if ((from == GENERAL_REGS && to == STACK_REG)
6995 || (to == GENERAL_REGS && from == STACK_REG))
6996 return regmove_cost->GP2GP;
6998 /* To/From the stack register, we move via the gprs. */
6999 if (to == STACK_REG || from == STACK_REG)
7000 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7001 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7003 if (GET_MODE_SIZE (mode) == 16)
7005 /* 128-bit operations on general registers require 2 instructions. */
7006 if (from == GENERAL_REGS && to == GENERAL_REGS)
7007 return regmove_cost->GP2GP * 2;
7008 else if (from == GENERAL_REGS)
7009 return regmove_cost->GP2FP * 2;
7010 else if (to == GENERAL_REGS)
7011 return regmove_cost->FP2GP * 2;
7013 /* When AdvSIMD instructions are disabled it is not possible to move
7014 a 128-bit value directly between Q registers. This is handled in
7015 secondary reload. A general register is used as a scratch to move
7016 the upper DI value and the lower DI value is moved directly,
7017 hence the cost is the sum of three moves. */
7018 if (! TARGET_SIMD)
7019 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7021 return regmove_cost->FP2FP;
7024 if (from == GENERAL_REGS && to == GENERAL_REGS)
7025 return regmove_cost->GP2GP;
7026 else if (from == GENERAL_REGS)
7027 return regmove_cost->GP2FP;
7028 else if (to == GENERAL_REGS)
7029 return regmove_cost->FP2GP;
7031 return regmove_cost->FP2FP;
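/* Implement TARGET_MEMORY_MOVE_COST. All modes and register classes use
   the single memmov_cost value from the current tuning. */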
7034 static int
7035 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7036 reg_class_t rclass ATTRIBUTE_UNUSED,
7037 bool in ATTRIBUTE_UNUSED)
7039 return aarch64_tune_params.memmov_cost;
7042 /* Function to decide when to use
7043 reciprocal square root builtins. */
7045 static tree
7046 aarch64_builtin_reciprocal (unsigned int fn,
7047 bool md_fn,
7048 bool)
7050 if (flag_trapping_math
7051 || !flag_unsafe_math_optimizations
7052 || optimize_size
7053 || ! (aarch64_tune_params.extra_tuning_flags
7054 & AARCH64_EXTRA_TUNE_RECIP_SQRT))
7056 return NULL_TREE;
7059 return aarch64_builtin_rsqrt (fn, md_fn);
7062 typedef rtx (*rsqrte_type) (rtx, rtx);
7064 /* Select reciprocal square root initial estimate
7065 insn depending on machine mode. */
7067 rsqrte_type
7068 get_rsqrte_type (machine_mode mode)
7070 switch (mode)
7072 case DFmode: return gen_aarch64_rsqrte_df2;
7073 case SFmode: return gen_aarch64_rsqrte_sf2;
7074 case V2DFmode: return gen_aarch64_rsqrte_v2df2;
7075 case V2SFmode: return gen_aarch64_rsqrte_v2sf2;
7076 case V4SFmode: return gen_aarch64_rsqrte_v4sf2;
7077 default: gcc_unreachable ();
7081 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7083 /* Select reciprocal square root Newton-Raphson step
7084 insn depending on machine mode. */
7086 rsqrts_type
7087 get_rsqrts_type (machine_mode mode)
7089 switch (mode)
7091 case DFmode: return gen_aarch64_rsqrts_df3;
7092 case SFmode: return gen_aarch64_rsqrts_sf3;
7093 case V2DFmode: return gen_aarch64_rsqrts_v2df3;
7094 case V2SFmode: return gen_aarch64_rsqrts_v2sf3;
7095 case V4SFmode: return gen_aarch64_rsqrts_v4sf3;
7096 default: gcc_unreachable ();
7100 /* Emit instruction sequence to compute
7101 reciprocal square root. Use two Newton-Raphson steps
7102 for single precision and three for double precision. */
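/* Each iteration below refines the estimate as x1 = x0 * RSQRTS (src, x0 * x0),
   where the RSQRTS step computes (3 - src * x0 * x0) / 2; this is the
   standard Newton-Raphson update for 1 / sqrt (src). */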
7104 void
7105 aarch64_emit_swrsqrt (rtx dst, rtx src)
7107 machine_mode mode = GET_MODE (src);
7108 gcc_assert (
7109 mode == SFmode || mode == V2SFmode || mode == V4SFmode
7110 || mode == DFmode || mode == V2DFmode);
7112 rtx xsrc = gen_reg_rtx (mode);
7113 emit_move_insn (xsrc, src);
7114 rtx x0 = gen_reg_rtx (mode);
7116 emit_insn ((*get_rsqrte_type (mode)) (x0, xsrc));
7118 bool double_mode = (mode == DFmode || mode == V2DFmode);
7120 int iterations = double_mode ? 3 : 2;
7122 if (flag_mrecip_low_precision_sqrt)
7123 iterations--;
7125 for (int i = 0; i < iterations; ++i)
7127 rtx x1 = gen_reg_rtx (mode);
7128 rtx x2 = gen_reg_rtx (mode);
7129 rtx x3 = gen_reg_rtx (mode);
7130 emit_set_insn (x2, gen_rtx_MULT (mode, x0, x0));
7132 emit_insn ((*get_rsqrts_type (mode)) (x3, xsrc, x2));
7134 emit_set_insn (x1, gen_rtx_MULT (mode, x0, x3));
7135 x0 = x1;
7138 emit_move_insn (dst, x0);
7141 /* Return the number of instructions that can be issued per cycle. */
7142 static int
7143 aarch64_sched_issue_rate (void)
7145 return aarch64_tune_params.issue_rate;
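/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD. Use the
   issue rate as the lookahead depth on multi-issue cores, and disable
   lookahead when scheduling for fusion. */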
7148 static int
7149 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7151 int issue_rate = aarch64_sched_issue_rate ();
7153 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7157 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7158 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7159 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7161 static int
7162 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7163 int ready_index)
7165 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7169 /* Vectorizer cost model target hooks. */
7171 /* Implement targetm.vectorize.builtin_vectorization_cost. */
7172 static int
7173 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7174 tree vectype,
7175 int misalign ATTRIBUTE_UNUSED)
7177 unsigned elements;
7179 switch (type_of_cost)
7181 case scalar_stmt:
7182 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7184 case scalar_load:
7185 return aarch64_tune_params.vec_costs->scalar_load_cost;
7187 case scalar_store:
7188 return aarch64_tune_params.vec_costs->scalar_store_cost;
7190 case vector_stmt:
7191 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7193 case vector_load:
7194 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7196 case vector_store:
7197 return aarch64_tune_params.vec_costs->vec_store_cost;
7199 case vec_to_scalar:
7200 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7202 case scalar_to_vec:
7203 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7205 case unaligned_load:
7206 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7208 case unaligned_store:
7209 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7211 case cond_branch_taken:
7212 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7214 case cond_branch_not_taken:
7215 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7217 case vec_perm:
7218 case vec_promote_demote:
7219 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7221 case vec_construct:
7222 elements = TYPE_VECTOR_SUBPARTS (vectype);
7223 return elements / 2 + 1;
7225 default:
7226 gcc_unreachable ();
7230 /* Implement targetm.vectorize.add_stmt_cost. */
7231 static unsigned
7232 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7233 struct _stmt_vec_info *stmt_info, int misalign,
7234 enum vect_cost_model_location where)
7236 unsigned *cost = (unsigned *) data;
7237 unsigned retval = 0;
7239 if (flag_vect_cost_model)
7241 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7242 int stmt_cost =
7243 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7245 /* Statements in an inner loop relative to the loop being
7246 vectorized are weighted more heavily. The value here is
7247 arbitrary and could potentially be improved with analysis. */
7248 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7249 count *= 50; /* FIXME */
7251 retval = (unsigned) (count * stmt_cost);
7252 cost[where] += retval;
7255 return retval;
7258 static void initialize_aarch64_code_model (struct gcc_options *);
7260 /* Enum describing the various ways that the
7261 aarch64_parse_{arch,tune,cpu,extension} functions can fail.
7262 This way their callers can choose what kind of error to give. */
7264 enum aarch64_parse_opt_result
7266 AARCH64_PARSE_OK, /* Parsing was successful. */
7267 AARCH64_PARSE_MISSING_ARG, /* Missing argument. */
7268 AARCH64_PARSE_INVALID_FEATURE, /* Invalid feature modifier. */
7269 AARCH64_PARSE_INVALID_ARG /* Invalid arch, tune, cpu arg. */
7272 /* Parse the architecture extension string STR and update ISA_FLAGS
7273 with the architecture features turned on or off. Return a
7274 aarch64_parse_opt_result describing the result. */
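/* For example, parsing "+crc+nofp" first turns on the features implied by
   "crc" and then turns off those implied by "fp". */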
7276 static enum aarch64_parse_opt_result
7277 aarch64_parse_extension (char *str, unsigned long *isa_flags)
7279 /* The extension string is parsed left to right. */
7280 const struct aarch64_option_extension *opt = NULL;
7282 /* Flag to say whether we are adding or removing an extension. */
7283 int adding_ext = -1;
7285 while (str != NULL && *str != 0)
7287 char *ext;
7288 size_t len;
7290 str++;
7291 ext = strchr (str, '+');
7293 if (ext != NULL)
7294 len = ext - str;
7295 else
7296 len = strlen (str);
7298 if (len >= 2 && strncmp (str, "no", 2) == 0)
7300 adding_ext = 0;
7301 len -= 2;
7302 str += 2;
7304 else if (len > 0)
7305 adding_ext = 1;
7307 if (len == 0)
7308 return AARCH64_PARSE_MISSING_ARG;
7311 /* Scan over the extensions table trying to find an exact match. */
7312 for (opt = all_extensions; opt->name != NULL; opt++)
7314 if (strlen (opt->name) == len && strncmp (opt->name, str, len) == 0)
7316 /* Add or remove the extension. */
7317 if (adding_ext)
7318 *isa_flags |= opt->flags_on;
7319 else
7320 *isa_flags &= ~(opt->flags_off);
7321 break;
7325 if (opt->name == NULL)
7327 /* Extension not found in list. */
7328 return AARCH64_PARSE_INVALID_FEATURE;
7331 str = ext;
7334 return AARCH64_PARSE_OK;
7337 /* Parse the TO_PARSE string and put the architecture struct that it
7338 selects into RES and the architectural features into ISA_FLAGS.
7339 Return an aarch64_parse_opt_result describing the parse result.
7340 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
7342 static enum aarch64_parse_opt_result
7343 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7344 unsigned long *isa_flags)
7346 char *ext;
7347 const struct processor *arch;
7348 char *str = (char *) alloca (strlen (to_parse) + 1);
7349 size_t len;
7351 strcpy (str, to_parse);
7353 ext = strchr (str, '+');
7355 if (ext != NULL)
7356 len = ext - str;
7357 else
7358 len = strlen (str);
7360 if (len == 0)
7361 return AARCH64_PARSE_MISSING_ARG;
7364 /* Loop through the list of supported ARCHes to find a match. */
7365 for (arch = all_architectures; arch->name != NULL; arch++)
7367 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7369 unsigned long isa_temp = arch->flags;
7371 if (ext != NULL)
7373 /* TO_PARSE string contains at least one extension. */
7374 enum aarch64_parse_opt_result ext_res
7375 = aarch64_parse_extension (ext, &isa_temp);
7377 if (ext_res != AARCH64_PARSE_OK)
7378 return ext_res;
7380 /* Extension parsing was successful. Confirm the result
7381 arch and ISA flags. */
7382 *res = arch;
7383 *isa_flags = isa_temp;
7384 return AARCH64_PARSE_OK;
7388 /* ARCH name not found in list. */
7389 return AARCH64_PARSE_INVALID_ARG;
7392 /* Parse the TO_PARSE string and put the result tuning in RES and the
7393 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
7394 describing the parse result. If there is an error parsing, RES and
7395 ISA_FLAGS are left unchanged. */
7397 static enum aarch64_parse_opt_result
7398 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7399 unsigned long *isa_flags)
7401 char *ext;
7402 const struct processor *cpu;
7403 char *str = (char *) alloca (strlen (to_parse) + 1);
7404 size_t len;
7406 strcpy (str, to_parse);
7408 ext = strchr (str, '+');
7410 if (ext != NULL)
7411 len = ext - str;
7412 else
7413 len = strlen (str);
7415 if (len == 0)
7416 return AARCH64_PARSE_MISSING_ARG;
7419 /* Loop through the list of supported CPUs to find a match. */
7420 for (cpu = all_cores; cpu->name != NULL; cpu++)
7422 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7424 unsigned long isa_temp = cpu->flags;
7427 if (ext != NULL)
7429 /* TO_PARSE string contains at least one extension. */
7430 enum aarch64_parse_opt_result ext_res
7431 = aarch64_parse_extension (ext, &isa_temp);
7433 if (ext_res != AARCH64_PARSE_OK)
7434 return ext_res;
7436 /* Extension parsing was successful. Confirm the result
7437 cpu and ISA flags. */
7438 *res = cpu;
7439 *isa_flags = isa_temp;
7440 return AARCH64_PARSE_OK;
7444 /* CPU name not found in list. */
7445 return AARCH64_PARSE_INVALID_ARG;
7448 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7449 Return an aarch64_parse_opt_result describing the parse result.
7450 If the parsing fails, RES does not change. */
7452 static enum aarch64_parse_opt_result
7453 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7455 const struct processor *cpu;
7456 char *str = (char *) alloca (strlen (to_parse) + 1);
7458 strcpy (str, to_parse);
7460 /* Loop through the list of supported CPUs to find a match. */
7461 for (cpu = all_cores; cpu->name != NULL; cpu++)
7463 if (strcmp (cpu->name, str) == 0)
7465 *res = cpu;
7466 return AARCH64_PARSE_OK;
7470 /* CPU name not found in list. */
7471 return AARCH64_PARSE_INVALID_ARG;
7474 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7475 described in FLAG. If it is, return the index bit for that fusion type.
7476 If not, error (printing OPTION_NAME) and return zero. */
7478 static unsigned int
7479 aarch64_parse_one_option_token (const char *token,
7480 size_t length,
7481 const struct aarch64_flag_desc *flag,
7482 const char *option_name)
7484 for (; flag->name != NULL; flag++)
7486 if (length == strlen (flag->name)
7487 && !strncmp (flag->name, token, length))
7488 return flag->flag;
7491 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7492 return 0;
7495 /* Parse OPTION which is a comma-separated list of flags to enable.
7496 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7497 default state we inherit from the CPU tuning structures. OPTION_NAME
7498 gives the top-level option we are parsing in the -moverride string,
7499 for use in error messages. */
7501 static unsigned int
7502 aarch64_parse_boolean_options (const char *option,
7503 const struct aarch64_flag_desc *flags,
7504 unsigned int initial_state,
7505 const char *option_name)
7507 const char separator = '.';
7508 const char* specs = option;
7509 const char* ntoken = option;
7510 unsigned int found_flags = initial_state;
7512 while ((ntoken = strchr (specs, separator)))
7514 size_t token_length = ntoken - specs;
7515 unsigned token_ops = aarch64_parse_one_option_token (specs,
7516 token_length,
7517 flags,
7518 option_name);
7519 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7520 in the token stream, reset the supported operations. So:
7522 adrp+add.cmp+branch.none.adrp+add
7524 would have the result of turning on only adrp+add fusion. */
7525 if (!token_ops)
7526 found_flags = 0;
7528 found_flags |= token_ops;
7529 specs = ++ntoken;
7532 /* The string ended with a trailing separator; diagnose it as ill-formed. */
7533 if (!(*specs))
7535 error ("%s string ill-formed\n", option_name);
7536 return 0;
7539 /* We still have one more token to parse. */
7540 size_t token_length = strlen (specs);
7541 unsigned token_ops = aarch64_parse_one_option_token (specs,
7542 token_length,
7543 flags,
7544 option_name);
7545 if (!token_ops)
7546 found_flags = 0;
7548 found_flags |= token_ops;
7549 return found_flags;
7552 /* Support for overriding instruction fusion. */
7554 static void
7555 aarch64_parse_fuse_string (const char *fuse_string,
7556 struct tune_params *tune)
7558 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
7559 aarch64_fusible_pairs,
7560 tune->fusible_ops,
7561 "fuse=");
7564 /* Support for overriding other tuning flags. */
7566 static void
7567 aarch64_parse_tune_string (const char *tune_string,
7568 struct tune_params *tune)
7570 tune->extra_tuning_flags
7571 = aarch64_parse_boolean_options (tune_string,
7572 aarch64_tuning_flags,
7573 tune->extra_tuning_flags,
7574 "tune=");
7577 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
7578 we understand. If it is, extract the option string and hand it off to
7579 the appropriate function. */
7581 void
7582 aarch64_parse_one_override_token (const char* token,
7583 size_t length,
7584 struct tune_params *tune)
7586 const struct aarch64_tuning_override_function *fn
7587 = aarch64_tuning_override_functions;
7589 const char *option_part = strchr (token, '=');
7590 if (!option_part)
7592 error ("tuning string missing in option (%s)", token);
7593 return;
7596 /* Get the length of the option name. */
7597 length = option_part - token;
7598 /* Skip the '=' to get to the option string. */
7599 option_part++;
7601 for (; fn->name != NULL; fn++)
7603 if (!strncmp (fn->name, token, length))
7605 fn->parse_override (option_part, tune);
7606 return;
7610 error ("unknown tuning option (%s)",token);
7611 return;
7614 /* Validate and clamp the TLS size to what the selected code model supports. */
7616 static void
7617 initialize_aarch64_tls_size (struct gcc_options *opts)
7619 if (aarch64_tls_size == 0)
7620 aarch64_tls_size = 24;
7622 switch (opts->x_aarch64_cmodel_var)
7624 case AARCH64_CMODEL_TINY:
7625 /* Both the default and maximum TLS size allowed under tiny is 1M which
7626 needs two instructions to address, so we clamp the size to 24. */
7627 if (aarch64_tls_size > 24)
7628 aarch64_tls_size = 24;
7629 break;
7630 case AARCH64_CMODEL_SMALL:
7631 /* The maximum TLS size allowed under small is 4G. */
7632 if (aarch64_tls_size > 32)
7633 aarch64_tls_size = 32;
7634 break;
7635 case AARCH64_CMODEL_LARGE:
7636 /* The maximum TLS size allowed under large is 16E.
7637 FIXME: 16E needs a 64-bit offset, but we only support a 48-bit offset now. */
7638 if (aarch64_tls_size > 48)
7639 aarch64_tls_size = 48;
7640 break;
7641 default:
7642 gcc_unreachable ();
7645 return;
7648 /* Parse STRING looking for options in the format:
7649 string :: option:string
7650 option :: name=substring
7651 name :: {a-z}
7652 substring :: defined by option. */
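/* For example, "-moverride=fuse=adrp+add.cmp+branch" enables the two named
   fusion pairs; several options can be chained with ':', and the boolean
   sub-options within one option are separated by '.'. */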
7654 static void
7655 aarch64_parse_override_string (const char* input_string,
7656 struct tune_params* tune)
7658 const char separator = ':';
7659 size_t string_length = strlen (input_string) + 1;
7660 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
7661 char *string = string_root;
7662 strncpy (string, input_string, string_length);
7663 string[string_length - 1] = '\0';
7665 char* ntoken = string;
7667 while ((ntoken = strchr (string, separator)))
7669 size_t token_length = ntoken - string;
7670 /* Make this substring look like a string. */
7671 *ntoken = '\0';
7672 aarch64_parse_one_override_token (string, token_length, tune);
7673 string = ++ntoken;
7676 /* One last option to parse. */
7677 aarch64_parse_one_override_token (string, strlen (string), tune);
7678 free (string_root);
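/* Shared worker for aarch64_override_options_internal and
   aarch64_override_options_after_change: reconcile the frame-pointer flags,
   apply the tuning's default alignments when not optimizing for size, and
   decide whether PC-relative literal loads are used. */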
7682 static void
7683 aarch64_override_options_after_change_1 (struct gcc_options *opts)
7685 if (opts->x_flag_omit_frame_pointer)
7686 opts->x_flag_omit_leaf_frame_pointer = false;
7687 else if (opts->x_flag_omit_leaf_frame_pointer)
7688 opts->x_flag_omit_frame_pointer = true;
7690 /* If not optimizing for size, set the default
7691 alignment to what the target wants. */
7692 if (!opts->x_optimize_size)
7694 if (opts->x_align_loops <= 0)
7695 opts->x_align_loops = aarch64_tune_params.loop_align;
7696 if (opts->x_align_jumps <= 0)
7697 opts->x_align_jumps = aarch64_tune_params.jump_align;
7698 if (opts->x_align_functions <= 0)
7699 opts->x_align_functions = aarch64_tune_params.function_align;
7702 /* If nopcrelative_literal_loads is set on the command line, this
7703 implies that the user asked for PC relative literal loads. */
7704 if (opts->x_nopcrelative_literal_loads == 1)
7705 aarch64_nopcrelative_literal_loads = false;
7707 /* If it is not set on the command line, we default to no
7708 pc relative literal loads. */
7709 if (opts->x_nopcrelative_literal_loads == 2)
7710 aarch64_nopcrelative_literal_loads = true;
7712 /* In the tiny memory model it makes no sense
7713 to disallow non-PC-relative literal pool loads,
7714 as many other things will break anyway. */
7715 if (opts->x_nopcrelative_literal_loads
7716 && (aarch64_cmodel == AARCH64_CMODEL_TINY
7717 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC))
7718 aarch64_nopcrelative_literal_loads = false;
7721 /* 'Unpack' the internal tuning structs and update the options
7722 in OPTS. The caller must have set up selected_tune and selected_arch
7723 as all the other target-specific codegen decisions are
7724 derived from them. */
7726 void
7727 aarch64_override_options_internal (struct gcc_options *opts)
7729 aarch64_tune_flags = selected_tune->flags;
7730 aarch64_tune = selected_tune->sched_core;
7731 /* Make a copy of the tuning parameters attached to the core, which
7732 we may later overwrite. */
7733 aarch64_tune_params = *(selected_tune->tune);
7734 aarch64_architecture_version = selected_arch->architecture_version;
7736 if (opts->x_aarch64_override_tune_string)
7737 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
7738 &aarch64_tune_params);
7740 /* This target defaults to strict volatile bitfields. */
7741 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
7742 opts->x_flag_strict_volatile_bitfields = 1;
7744 /* -mgeneral-regs-only sets a mask in target_flags, make sure that
7745 aarch64_isa_flags does not contain the FP/SIMD/Crypto feature flags
7746 in case some code tries reading aarch64_isa_flags directly to check if
7747 FP is available. Reuse the aarch64_parse_extension machinery since it
7748 knows how to disable any other flags that fp implies. */
7749 if (TARGET_GENERAL_REGS_ONLY_P (opts->x_target_flags))
7751 /* aarch64_parse_extension takes char* rather than const char* because
7752 it is usually called from within other parsing functions. */
7753 char tmp_str[] = "+nofp";
7754 aarch64_parse_extension (tmp_str, &opts->x_aarch64_isa_flags);
7757 initialize_aarch64_code_model (opts);
7758 initialize_aarch64_tls_size (opts);
7760 int queue_depth = 0;
7761 switch (aarch64_tune_params.autoprefetcher_model)
7763 case tune_params::AUTOPREFETCHER_OFF:
7764 queue_depth = -1;
7765 break;
7766 case tune_params::AUTOPREFETCHER_WEAK:
7767 queue_depth = 0;
7768 break;
7769 case tune_params::AUTOPREFETCHER_STRONG:
7770 queue_depth = max_insn_queue_index + 1;
7771 break;
7772 default:
7773 gcc_unreachable ();
7776 /* We don't mind passing in global_options_set here as we don't use
7777 the *options_set structs anyway. */
7778 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
7779 queue_depth,
7780 opts->x_param_values,
7781 global_options_set.x_param_values);
7783 aarch64_override_options_after_change_1 (opts);
7786 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
7787 specified in STR and throw errors if appropriate. Put the results if
7788 they are valid in RES and ISA_FLAGS. Return whether the option is
7789 valid. */
7791 static bool
7792 aarch64_validate_mcpu (const char *str, const struct processor **res,
7793 unsigned long *isa_flags)
7795 enum aarch64_parse_opt_result parse_res
7796 = aarch64_parse_cpu (str, res, isa_flags);
7798 if (parse_res == AARCH64_PARSE_OK)
7799 return true;
7801 switch (parse_res)
7803 case AARCH64_PARSE_MISSING_ARG:
7804 error ("missing cpu name in -mcpu=%qs", str);
7805 break;
7806 case AARCH64_PARSE_INVALID_ARG:
7807 error ("unknown value %qs for -mcpu", str);
7808 break;
7809 case AARCH64_PARSE_INVALID_FEATURE:
7810 error ("invalid feature modifier in -mcpu=%qs", str);
7811 break;
7812 default:
7813 gcc_unreachable ();
7816 return false;
7819 /* Validate a command-line -march option. Parse the arch and extensions
7820 (if any) specified in STR and throw errors if appropriate. Put the
7821 results, if they are valid, in RES and ISA_FLAGS. Return whether the
7822 option is valid. */
7824 static bool
7825 aarch64_validate_march (const char *str, const struct processor **res,
7826 unsigned long *isa_flags)
7828 enum aarch64_parse_opt_result parse_res
7829 = aarch64_parse_arch (str, res, isa_flags);
7831 if (parse_res == AARCH64_PARSE_OK)
7832 return true;
7834 switch (parse_res)
7836 case AARCH64_PARSE_MISSING_ARG:
7837 error ("missing arch name in -march=%qs", str);
7838 break;
7839 case AARCH64_PARSE_INVALID_ARG:
7840 error ("unknown value %qs for -march", str);
7841 break;
7842 case AARCH64_PARSE_INVALID_FEATURE:
7843 error ("invalid feature modifier in -march=%qs", str);
7844 break;
7845 default:
7846 gcc_unreachable ();
7849 return false;
7852 /* Validate a command-line -mtune option. Parse the cpu
7853 specified in STR and throw errors if appropriate. Put the
7854 result, if it is valid, in RES. Return whether the option is
7855 valid. */
7857 static bool
7858 aarch64_validate_mtune (const char *str, const struct processor **res)
7860 enum aarch64_parse_opt_result parse_res
7861 = aarch64_parse_tune (str, res);
7863 if (parse_res == AARCH64_PARSE_OK)
7864 return true;
7866 switch (parse_res)
7868 case AARCH64_PARSE_MISSING_ARG:
7869 error ("missing cpu name in -mtune=%qs", str);
7870 break;
7871 case AARCH64_PARSE_INVALID_ARG:
7872 error ("unknown value %qs for -mtune", str);
7873 break;
7874 default:
7875 gcc_unreachable ();
7877 return false;
7880 /* Return the CPU corresponding to the enum CPU.
7881 If it doesn't specify a cpu, return the default. */
7883 static const struct processor *
7884 aarch64_get_tune_cpu (enum aarch64_processor cpu)
7886 if (cpu != aarch64_none)
7887 return &all_cores[cpu];
7889 /* The & 0x3f is to extract the bottom 6 bits that encode the
7890 default cpu as selected by the --with-cpu GCC configure option
7891 in config.gcc.
7892 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
7893 flags mechanism should be reworked to make it more sane. */
7894 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7897 /* Return the architecture corresponding to the enum ARCH.
7898 If it doesn't specify a valid architecture, return the default. */
7900 static const struct processor *
7901 aarch64_get_arch (enum aarch64_arch arch)
7903 if (arch != aarch64_no_arch)
7904 return &all_architectures[arch];
7906 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
7908 return &all_architectures[cpu->arch];
7911 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
7912 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
7913 tuning structs. In particular it must set selected_tune and
7914 aarch64_isa_flags that define the available ISA features and tuning
7915 decisions. It must also set selected_arch as this will be used to
7916 output the .arch asm tags for each function. */
7918 static void
7919 aarch64_override_options (void)
7921 unsigned long cpu_isa = 0;
7922 unsigned long arch_isa = 0;
7923 aarch64_isa_flags = 0;
7925 bool valid_cpu = true;
7926 bool valid_tune = true;
7927 bool valid_arch = true;
7929 selected_cpu = NULL;
7930 selected_arch = NULL;
7931 selected_tune = NULL;
7933 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
7934 If either of -march or -mtune is given, they override their
7935 respective component of -mcpu. */
7936 if (aarch64_cpu_string)
7937 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
7938 &cpu_isa);
7940 if (aarch64_arch_string)
7941 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
7942 &arch_isa);
7944 if (aarch64_tune_string)
7945 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
7947 /* If the user did not specify a processor, choose the default
7948 one for them. This will be the CPU set during configuration using
7949 --with-cpu, otherwise it is "generic". */
7950 if (!selected_cpu)
7952 if (selected_arch)
7954 selected_cpu = &all_cores[selected_arch->ident];
7955 aarch64_isa_flags = arch_isa;
7956 explicit_arch = selected_arch->arch;
7958 else
7960 /* Get default configure-time CPU. */
7961 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
7962 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
7965 if (selected_tune)
7966 explicit_tune_core = selected_tune->ident;
7968 /* If both -mcpu and -march are specified check that they are architecturally
7969 compatible, warn if they're not and prefer the -march ISA flags. */
7970 else if (selected_arch)
7972 if (selected_arch->arch != selected_cpu->arch)
7974 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
7975 all_architectures[selected_cpu->arch].name,
7976 selected_arch->name);
7978 aarch64_isa_flags = arch_isa;
7979 explicit_arch = selected_arch->arch;
7980 explicit_tune_core = selected_tune ? selected_tune->ident
7981 : selected_cpu->ident;
7983 else
7985 /* -mcpu but no -march. */
7986 aarch64_isa_flags = cpu_isa;
7987 explicit_tune_core = selected_tune ? selected_tune->ident
7988 : selected_cpu->ident;
7989 gcc_assert (selected_cpu);
7990 selected_arch = &all_architectures[selected_cpu->arch];
7991 explicit_arch = selected_arch->arch;
7994 /* Set the arch as well, since we will need it when outputting
7995 the .arch directive in assembly. */
7996 if (!selected_arch)
7998 gcc_assert (selected_cpu);
7999 selected_arch = &all_architectures[selected_cpu->arch];
8002 if (!selected_tune)
8003 selected_tune = selected_cpu;
8005 #ifndef HAVE_AS_MABI_OPTION
8006 /* The compiler may have been configured with 2.23.* binutils, which does
8007 not have support for ILP32. */
8008 if (TARGET_ILP32)
8009 error ("Assembler does not support -mabi=ilp32");
8010 #endif
8012 /* Make sure we properly set up the explicit options. */
8013 if ((aarch64_cpu_string && valid_cpu)
8014 || (aarch64_tune_string && valid_tune))
8015 gcc_assert (explicit_tune_core != aarch64_none);
8017 if ((aarch64_cpu_string && valid_cpu)
8018 || (aarch64_arch_string && valid_arch))
8019 gcc_assert (explicit_arch != aarch64_no_arch);
8021 aarch64_override_options_internal (&global_options);
8023 /* Save these options as the default ones in case we push and pop them later
8024 while processing functions with potential target attributes. */
8025 target_option_default_node = target_option_current_node
8026 = build_target_option_node (&global_options);
8028 aarch64_register_fma_steering ();
8032 /* Implement targetm.override_options_after_change. */
8034 static void
8035 aarch64_override_options_after_change (void)
8037 aarch64_override_options_after_change_1 (&global_options);
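/* Allocate and return a zero-initialized machine_function structure for the
   function currently being compiled. */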
8040 static struct machine_function *
8041 aarch64_init_machine_status (void)
8043 struct machine_function *machine;
8044 machine = ggc_cleared_alloc<machine_function> ();
8045 return machine;
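/* Set up per-function expansion state: install aarch64_init_machine_status
   as the machine_function allocator. */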
8048 void
8049 aarch64_init_expanders (void)
8051 init_machine_status = aarch64_init_machine_status;
8054 /* Select the code model to use, switching to the PIC variants of the tiny and small models when -fpic/-fPIC is in effect. */
8055 static void
8056 initialize_aarch64_code_model (struct gcc_options *opts)
8058 if (opts->x_flag_pic)
8060 switch (opts->x_aarch64_cmodel_var)
8062 case AARCH64_CMODEL_TINY:
8063 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8064 break;
8065 case AARCH64_CMODEL_SMALL:
8066 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8067 aarch64_cmodel = (flag_pic == 2
8068 ? AARCH64_CMODEL_SMALL_PIC
8069 : AARCH64_CMODEL_SMALL_SPIC);
8070 #else
8071 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8072 #endif
8073 break;
8074 case AARCH64_CMODEL_LARGE:
8075 sorry ("code model %qs with -f%s", "large",
8076 opts->x_flag_pic > 1 ? "PIC" : "pic");
8077 break;
8078 default:
8079 gcc_unreachable ();
8082 else
8083 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8086 /* Implement TARGET_OPTION_SAVE. */
8088 static void
8089 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8091 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8094 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8095 using the information saved in PTR. */
8097 static void
8098 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8100 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8101 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8102 opts->x_explicit_arch = ptr->x_explicit_arch;
8103 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8104 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8106 aarch64_override_options_internal (opts);
8109 /* Implement TARGET_OPTION_PRINT. */
8111 static void
8112 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8114 const struct processor *cpu
8115 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8116 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8117 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8118 std::string extension
8119 = aarch64_get_extension_string_for_isa_flags (isa_flags);
8121 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8122 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8123 arch->name, extension.c_str ());
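/* The last function whose target options were applied via
   aarch64_set_current_function; cached so that repeated calls for the same
   function do not redo the work. */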
8126 static GTY(()) tree aarch64_previous_fndecl;
8128 void
8129 aarch64_reset_previous_fndecl (void)
8131 aarch64_previous_fndecl = NULL;
8134 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8135 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8136 of the function, if such exists. This function may be called multiple
8137 times on a single function so use aarch64_previous_fndecl to avoid
8138 setting up identical state. */
8140 static void
8141 aarch64_set_current_function (tree fndecl)
8143 tree old_tree = (aarch64_previous_fndecl
8144 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8145 : NULL_TREE);
8147 tree new_tree = (fndecl
8148 ? DECL_FUNCTION_SPECIFIC_TARGET (fndecl)
8149 : NULL_TREE);
8152 if (fndecl && fndecl != aarch64_previous_fndecl)
8154 aarch64_previous_fndecl = fndecl;
8155 if (old_tree == new_tree)
8158 else if (new_tree && new_tree != target_option_default_node)
8160 cl_target_option_restore (&global_options,
8161 TREE_TARGET_OPTION (new_tree));
8162 if (TREE_TARGET_GLOBALS (new_tree))
8163 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8164 else
8165 TREE_TARGET_GLOBALS (new_tree)
8166 = save_target_globals_default_opts ();
8169 else if (old_tree && old_tree != target_option_default_node)
8171 new_tree = target_option_current_node;
8172 cl_target_option_restore (&global_options,
8173 TREE_TARGET_OPTION (new_tree));
8174 if (TREE_TARGET_GLOBALS (new_tree))
8175 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8176 else if (new_tree == target_option_default_node)
8177 restore_target_globals (&default_target_globals);
8178 else
8179 TREE_TARGET_GLOBALS (new_tree)
8180 = save_target_globals_default_opts ();
8184 if (!fndecl)
8185 return;
8187 /* If we turned on SIMD make sure that any vector parameters are re-laid out
8188 so that they use proper vector modes. */
8189 if (TARGET_SIMD)
8191 tree parms = DECL_ARGUMENTS (fndecl);
8192 for (; parms && parms != void_list_node; parms = TREE_CHAIN (parms))
8194 if (TREE_CODE (parms) == PARM_DECL
8195 && VECTOR_TYPE_P (TREE_TYPE (parms))
8196 && DECL_MODE (parms) != TYPE_MODE (TREE_TYPE (parms)))
8197 relayout_decl (parms);
8202 /* Enum describing the various ways we can handle attributes.
8203 In many cases we can reuse the generic option handling machinery. */
8205 enum aarch64_attr_opt_type
8207 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8208 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8209 aarch64_attr_enum, /* Attribute sets an enum variable. */
8210 aarch64_attr_custom /* Attribute requires a custom handling function. */
8213 /* All the information needed to handle a target attribute.
8214 NAME is the name of the attribute.
8215 ATTR_TYPE specifies the type of behaviour of the attribute as described
8216 in the definition of enum aarch64_attr_opt_type.
8217 ALLOW_NEG is true if the attribute supports a "no-" form.
8218 HANDLER is the function that takes the attribute string and whether
8219 it is a pragma or attribute and handles the option. It is needed only
8220 when the ATTR_TYPE is aarch64_attr_custom.
8221 OPT_NUM is the enum specifying the option that the attribute modifies.
8222 This is needed for attributes that mirror the behaviour of a command-line
8223 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8224 aarch64_attr_enum. */
8226 struct aarch64_attribute_info
8228 const char *name;
8229 enum aarch64_attr_opt_type attr_type;
8230 bool allow_neg;
8231 bool (*handler) (const char *, const char *);
8232 enum opt_code opt_num;
8235 /* Handle the ARCH_STR argument to the arch= target attribute.
8236 PRAGMA_OR_ATTR is used in potential error messages. */
8238 static bool
8239 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8241 const struct processor *tmp_arch = NULL;
8242 enum aarch64_parse_opt_result parse_res
8243 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8245 if (parse_res == AARCH64_PARSE_OK)
8247 gcc_assert (tmp_arch);
8248 selected_arch = tmp_arch;
8249 explicit_arch = selected_arch->arch;
8250 return true;
8253 switch (parse_res)
8255 case AARCH64_PARSE_MISSING_ARG:
8256 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8257 break;
8258 case AARCH64_PARSE_INVALID_ARG:
8259 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8260 break;
8261 case AARCH64_PARSE_INVALID_FEATURE:
8262 error ("invalid feature modifier %qs for 'arch' target %s",
8263 str, pragma_or_attr);
8264 break;
8265 default:
8266 gcc_unreachable ();
8269 return false;
8272 /* Handle the argument CPU_STR to the cpu= target attribute.
8273 PRAGMA_OR_ATTR is used in potential error messages. */
8275 static bool
8276 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8278 const struct processor *tmp_cpu = NULL;
8279 enum aarch64_parse_opt_result parse_res
8280 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8282 if (parse_res == AARCH64_PARSE_OK)
8284 gcc_assert (tmp_cpu);
8285 selected_tune = tmp_cpu;
8286 explicit_tune_core = selected_tune->ident;
8288 selected_arch = &all_architectures[tmp_cpu->arch];
8289 explicit_arch = selected_arch->arch;
8290 return true;
8293 switch (parse_res)
8295 case AARCH64_PARSE_MISSING_ARG:
8296 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8297 break;
8298 case AARCH64_PARSE_INVALID_ARG:
8299 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8300 break;
8301 case AARCH64_PARSE_INVALID_FEATURE:
8302 error ("invalid feature modifier %qs for 'cpu' target %s",
8303 str, pragma_or_attr);
8304 break;
8305 default:
8306 gcc_unreachable ();
8309 return false;
8312 /* Handle the argument STR to the tune= target attribute.
8313 PRAGMA_OR_ATTR is used in potential error messages. */
8315 static bool
8316 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8318 const struct processor *tmp_tune = NULL;
8319 enum aarch64_parse_opt_result parse_res
8320 = aarch64_parse_tune (str, &tmp_tune);
8322 if (parse_res == AARCH64_PARSE_OK)
8324 gcc_assert (tmp_tune);
8325 selected_tune = tmp_tune;
8326 explicit_tune_core = selected_tune->ident;
8327 return true;
8330 switch (parse_res)
8332 case AARCH64_PARSE_INVALID_ARG:
8333 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8334 break;
8335 default:
8336 gcc_unreachable ();
8339 return false;
8342 /* Parse an architecture extensions target attribute string specified in STR.
8343 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8344 if successful. Update aarch64_isa_flags to reflect the ISA features
8345 modified.
8346 PRAGMA_OR_ATTR is used in potential error messages. */
8348 static bool
8349 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8351 enum aarch64_parse_opt_result parse_res;
8352 unsigned long isa_flags = aarch64_isa_flags;
8354 /* We allow "+nothing" at the beginning to clear out all architectural
8355 features if the user wants to handpick specific features. */
8356 if (strncmp ("+nothing", str, 8) == 0)
8358 isa_flags = 0;
8359 str += 8;
8362 parse_res = aarch64_parse_extension (str, &isa_flags);
8364 if (parse_res == AARCH64_PARSE_OK)
8366 aarch64_isa_flags = isa_flags;
8367 return true;
8370 switch (parse_res)
8372 case AARCH64_PARSE_MISSING_ARG:
8373 error ("missing feature modifier in target %s %qs",
8374 pragma_or_attr, str);
8375 break;
8377 case AARCH64_PARSE_INVALID_FEATURE:
8378 error ("invalid feature modifier in target %s %qs",
8379 pragma_or_attr, str);
8380 break;
8382 default:
8383 gcc_unreachable ();
8386 return false;
8389 /* The target attributes that we support. On top of these we also support just
8390 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8391 handled explicitly in aarch64_process_one_target_attr. */
8393 static const struct aarch64_attribute_info aarch64_attributes[] =
8395 { "general-regs-only", aarch64_attr_mask, false, NULL,
8396 OPT_mgeneral_regs_only },
8397 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8398 OPT_mfix_cortex_a53_835769 },
8399 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8400 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8401 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8402 OPT_momit_leaf_frame_pointer },
8403 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8404 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8405 OPT_march_ },
8406 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8407 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8408 OPT_mtune_ },
8409 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
8412 /* Parse ARG_STR which contains the definition of one target attribute.
8413 Show appropriate errors if any or return true if the attribute is valid.
8414 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8415 we're processing a target attribute or pragma. */
8417 static bool
8418 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8420 bool invert = false;
8422 size_t len = strlen (arg_str);
8424 if (len == 0)
8426 error ("malformed target %s", pragma_or_attr);
8427 return false;
8430 char *str_to_check = (char *) alloca (len + 1);
8431 strcpy (str_to_check, arg_str);
8433 /* Skip leading whitespace. */
8434 while (*str_to_check == ' ' || *str_to_check == '\t')
8435 str_to_check++;
8437 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8438 It is easier to detect and handle it explicitly here rather than going
8439 through the machinery for the rest of the target attributes in this
8440 function. */
8441 if (*str_to_check == '+')
8442 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8444 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8446 invert = true;
8447 str_to_check += 3;
8449 char *arg = strchr (str_to_check, '=');
8451 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8452 and point ARG to "foo". */
8453 if (arg)
8455 *arg = '\0';
8456 arg++;
8458 const struct aarch64_attribute_info *p_attr;
8459 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8461 /* If the names don't match up, or the user has given an argument
8462 to an attribute that doesn't accept one, or didn't give an argument
8463 to an attribute that expects one, fail to match. */
8464 if (strcmp (str_to_check, p_attr->name) != 0)
8465 continue;
8467 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8468 || p_attr->attr_type == aarch64_attr_enum;
8470 if (attr_need_arg_p ^ (arg != NULL))
8472 error ("target %s %qs does not accept an argument",
8473 pragma_or_attr, str_to_check);
8474 return false;
8477 /* If the name matches but the attribute does not allow "no-" versions
8478 then we can't match. */
8479 if (invert && !p_attr->allow_neg)
8481 error ("target %s %qs does not allow a negated form",
8482 pragma_or_attr, str_to_check);
8483 return false;
8486 switch (p_attr->attr_type)
8488 /* Has a custom handler registered.
8489 For example, cpu=, arch=, tune=. */
8490 case aarch64_attr_custom:
8491 gcc_assert (p_attr->handler);
8492 if (!p_attr->handler (arg, pragma_or_attr))
8493 return false;
8494 break;
8496 /* Either set or unset a boolean option. */
8497 case aarch64_attr_bool:
8499 struct cl_decoded_option decoded;
8501 generate_option (p_attr->opt_num, NULL, !invert,
8502 CL_TARGET, &decoded);
8503 aarch64_handle_option (&global_options, &global_options_set,
8504 &decoded, input_location);
8505 break;
8507 /* Set or unset a bit in the target_flags. aarch64_handle_option
8508 should know what mask to apply given the option number. */
8509 case aarch64_attr_mask:
8511 struct cl_decoded_option decoded;
8512 /* We only need to specify the option number.
8513 aarch64_handle_option will know which mask to apply. */
8514 decoded.opt_index = p_attr->opt_num;
8515 decoded.value = !invert;
8516 aarch64_handle_option (&global_options, &global_options_set,
8517 &decoded, input_location);
8518 break;
8520 /* Use the option setting machinery to set an option to an enum. */
8521 case aarch64_attr_enum:
8523 gcc_assert (arg);
8524 bool valid;
8525 int value;
8526 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8527 &value, CL_TARGET);
8528 if (valid)
8530 set_option (&global_options, NULL, p_attr->opt_num, value,
8531 NULL, DK_UNSPECIFIED, input_location,
8532 global_dc);
8534 else
8536 error ("target %s %s=%s is not valid",
8537 pragma_or_attr, str_to_check, arg);
8539 break;
8541 default:
8542 gcc_unreachable ();
8546 return true;
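/* Illustrative decomposition of single attribute tokens by the routine
   above, using names from the aarch64_attributes table:

     "arch=armv8-a+crc"           -> name "arch", ARG "armv8-a+crc"  (custom)
     "no-omit-leaf-frame-pointer" -> name "omit-leaf-frame-pointer",
                                     INVERT set                      (bool)
     "cmodel=small"               -> name "cmodel", ARG "small"      (enum)
     "+fp+nosimd"                 -> handed directly to
                                     aarch64_handle_attr_isa_flags.  */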
8549 /* Count how many times the character C appears in
8550 NULL-terminated string STR. */
8552 static unsigned int
8553 num_occurences_in_str (char c, char *str)
8555 unsigned int res = 0;
8556 while (*str != '\0')
8558 if (*str == c)
8559 res++;
8561 str++;
8564 return res;
8567 /* Parse the tree in ARGS that contains the target attribute information
8568 and update the global target options space. PRAGMA_OR_ATTR is a string
8569 to be used in error messages, specifying whether this is processing
8570 a target attribute or a target pragma. */
8572 bool
8573 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
8575 if (TREE_CODE (args) == TREE_LIST)
8579 tree head = TREE_VALUE (args);
8580 if (head)
8582 if (!aarch64_process_target_attr (head, pragma_or_attr))
8583 return false;
8585 args = TREE_CHAIN (args);
8586 } while (args);
8588 return true;
8590 /* We expect to find a string to parse. */
8591 gcc_assert (TREE_CODE (args) == STRING_CST);
8593 size_t len = strlen (TREE_STRING_POINTER (args));
8594 char *str_to_check = (char *) alloca (len + 1);
8595 strcpy (str_to_check, TREE_STRING_POINTER (args));
8597 if (len == 0)
8599 error ("malformed target %s value", pragma_or_attr);
8600 return false;
8603 /* Used to catch empty attribute strings between commas, i.e.
8604 attribute ((target ("attr1,,attr2"))). */
8605 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
8607 /* Handle multiple target attributes separated by ','. */
8608 char *token = strtok (str_to_check, ",");
8610 unsigned int num_attrs = 0;
8611 while (token)
8613 num_attrs++;
8614 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
8616 error ("target %s %qs is invalid", pragma_or_attr, token);
8617 return false;
8620 token = strtok (NULL, ",");
8623 if (num_attrs != num_commas + 1)
8625 error ("malformed target %s list %qs",
8626 pragma_or_attr, TREE_STRING_POINTER (args));
8627 return false;
8630 return true;
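/* Illustrative behaviour of the comma-count check above:

     "arch=armv8-a,strict-align"   1 comma, strtok yields 2 tokens -> accepted
     "arch=armv8-a,,strict-align"  2 commas, but strtok still yields only
                                   2 tokens (adjacent delimiters are merged),
                                   so num_attrs != num_commas + 1 and the
                                   list is rejected as malformed.  */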
8633 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
8634 process attribute ((target ("..."))). */
8636 static bool
8637 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
8639 struct cl_target_option cur_target;
8640 bool ret;
8641 tree old_optimize;
8642 tree new_target, new_optimize;
8643 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8645 /* If what we're processing is the current pragma string then the
8646 target option node is already stored in target_option_current_node
8647 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
8648 having to re-parse the string. This is especially useful to keep
8649 arm_neon.h compile times down since that header contains a lot
8650 of intrinsics enclosed in pragmas. */
8651 if (!existing_target && args == current_target_pragma)
8653 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
8654 return true;
8656 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
8658 old_optimize = build_optimization_node (&global_options);
8659 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
8661 /* If the function changed the optimization levels as well as setting
8662 target options, start with the optimizations specified. */
8663 if (func_optimize && func_optimize != old_optimize)
8664 cl_optimization_restore (&global_options,
8665 TREE_OPTIMIZATION (func_optimize));
8667 /* Save the current target options to restore at the end. */
8668 cl_target_option_save (&cur_target, &global_options);
8670 /* If fndecl already has some target attributes applied to it, unpack
8671 them so that we add this attribute on top of them, rather than
8672 overwriting them. */
8673 if (existing_target)
8675 struct cl_target_option *existing_options
8676 = TREE_TARGET_OPTION (existing_target);
8678 if (existing_options)
8679 cl_target_option_restore (&global_options, existing_options);
8681 else
8682 cl_target_option_restore (&global_options,
8683 TREE_TARGET_OPTION (target_option_current_node));
8686 ret = aarch64_process_target_attr (args, "attribute");
8688 /* Set up any additional state. */
8689 if (ret)
8691 aarch64_override_options_internal (&global_options);
8692 /* Initialize SIMD builtins if we haven't already.
8693 Set current_target_pragma to NULL for the duration so that
8694 the builtin initialization code doesn't try to tag the functions
8695 being built with the attributes specified by any current pragma, thus
8696 going into an infinite recursion. */
8697 if (TARGET_SIMD)
8699 tree saved_current_target_pragma = current_target_pragma;
8700 current_target_pragma = NULL;
8701 aarch64_init_simd_builtins ();
8702 current_target_pragma = saved_current_target_pragma;
8704 new_target = build_target_option_node (&global_options);
8706 else
8707 new_target = NULL;
8709 new_optimize = build_optimization_node (&global_options);
8711 if (fndecl && ret)
8713 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
8715 if (old_optimize != new_optimize)
8716 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
8719 cl_target_option_restore (&global_options, &cur_target);
8721 if (old_optimize != new_optimize)
8722 cl_optimization_restore (&global_options,
8723 TREE_OPTIMIZATION (old_optimize));
8724 return ret;
8727 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
8728 tri-bool options (yes, no, don't care) and the default value is
8729 DEF, determine whether to reject inlining. */
8731 static bool
8732 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
8733 int dont_care, int def)
8735 /* If the callee doesn't care, always allow inlining. */
8736 if (callee == dont_care)
8737 return true;
8739 /* If the caller doesn't care, always allow inlining. */
8740 if (caller == dont_care)
8741 return true;
8743 /* Otherwise, allow inlining if either the callee and caller values
8744 agree, or if the callee is using the default value. */
8745 return (callee == caller || callee == def);
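/* Illustrative truth table for the rule above, with DONT_CARE == 2 and
   DEF == 1 as used for -momit-leaf-frame-pointer further down:

     caller  callee   inline?
       2      any       yes   (caller doesn't care)
      any      2        yes   (callee doesn't care)
       0       0        yes   (explicit values agree)
       0       1        yes   (callee uses the default)
       1       0        no    (explicit values disagree, callee non-default)  */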
8748 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
8749 to inline CALLEE into CALLER based on target-specific info.
8750 Make sure that the caller and callee have compatible architectural
8751 features. Then go through the other possible target attributes
8752 and see if they can block inlining. Try not to reject always_inline
8753 callees unless they are incompatible architecturally. */
8755 static bool
8756 aarch64_can_inline_p (tree caller, tree callee)
8758 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
8759 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
8761 /* If callee has no option attributes, then it is ok to inline. */
8762 if (!callee_tree)
8763 return true;
8765 struct cl_target_option *caller_opts
8766 = TREE_TARGET_OPTION (caller_tree ? caller_tree
8767 : target_option_default_node);
8769 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
8772 /* Callee's ISA flags should be a subset of the caller's. */
8773 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
8774 != callee_opts->x_aarch64_isa_flags)
8775 return false;
8777 /* Allow a non-strict-aligned function to be inlined into a
8778 strict-aligned one. */
8779 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
8780 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
8781 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
8782 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
8783 return false;
8785 bool always_inline = lookup_attribute ("always_inline",
8786 DECL_ATTRIBUTES (callee));
8788 /* If the architectural features match up and the callee is always_inline
8789 then the other attributes don't matter. */
8790 if (always_inline)
8791 return true;
8793 if (caller_opts->x_aarch64_cmodel_var
8794 != callee_opts->x_aarch64_cmodel_var)
8795 return false;
8797 if (caller_opts->x_aarch64_tls_dialect
8798 != callee_opts->x_aarch64_tls_dialect)
8799 return false;
8801 /* Honour explicit requests to work around errata. */
8802 if (!aarch64_tribools_ok_for_inlining_p (
8803 caller_opts->x_aarch64_fix_a53_err835769,
8804 callee_opts->x_aarch64_fix_a53_err835769,
8805 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
8806 return false;
8808 /* If the user explicitly specified -momit-leaf-frame-pointer for the
8809 caller and callee and they don't match up, reject inlining. */
8810 if (!aarch64_tribools_ok_for_inlining_p (
8811 caller_opts->x_flag_omit_leaf_frame_pointer,
8812 callee_opts->x_flag_omit_leaf_frame_pointer,
8813 2, 1))
8814 return false;
8816 /* If the callee has specific tuning overrides, respect them. */
8817 if (callee_opts->x_aarch64_override_tune_string != NULL
8818 && caller_opts->x_aarch64_override_tune_string == NULL)
8819 return false;
8821 /* If the user specified tuning override strings for the
8822 caller and callee and they don't match up, reject inlining.
8823 We just do a string compare here, we don't analyze the meaning
8824 of the string, as it would be too costly for little gain. */
8825 if (callee_opts->x_aarch64_override_tune_string
8826 && caller_opts->x_aarch64_override_tune_string
8827 && (strcmp (callee_opts->x_aarch64_override_tune_string,
8828 caller_opts->x_aarch64_override_tune_string) != 0))
8829 return false;
8831 return true;
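/* Illustrative application of the ISA-subset rule above, assuming the usual
   "+feature" modifier names:

     caller: __attribute__ ((target ("+simd+crc")))
     callee: __attribute__ ((target ("+simd")))         -> may be inlined
     callee: __attribute__ ((target ("+simd+crypto")))  -> rejected; the
             callee's flags include CRYPTO, which is not a subset of the
             caller's flags.  */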
8834 /* Return true if SYMBOL_REF X binds locally. */
8836 static bool
8837 aarch64_symbol_binds_local_p (const_rtx x)
8839 return (SYMBOL_REF_DECL (x)
8840 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
8841 : SYMBOL_REF_LOCAL_P (x));
8844 /* Return true if SYMBOL_REF X is thread-local. */
8845 static bool
8846 aarch64_tls_symbol_p (rtx x)
8848 if (! TARGET_HAVE_TLS)
8849 return false;
8851 if (GET_CODE (x) != SYMBOL_REF)
8852 return false;
8854 return SYMBOL_REF_TLS_MODEL (x) != 0;
8857 /* Classify a TLS symbol into one of the TLS kinds. */
8858 enum aarch64_symbol_type
8859 aarch64_classify_tls_symbol (rtx x)
8861 enum tls_model tls_kind = tls_symbolic_operand_type (x);
8863 switch (tls_kind)
8865 case TLS_MODEL_GLOBAL_DYNAMIC:
8866 case TLS_MODEL_LOCAL_DYNAMIC:
8867 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
8869 case TLS_MODEL_INITIAL_EXEC:
8870 switch (aarch64_cmodel)
8872 case AARCH64_CMODEL_TINY:
8873 case AARCH64_CMODEL_TINY_PIC:
8874 return SYMBOL_TINY_TLSIE;
8875 default:
8876 return SYMBOL_SMALL_TLSIE;
8879 case TLS_MODEL_LOCAL_EXEC:
8880 if (aarch64_tls_size == 12)
8881 return SYMBOL_TLSLE12;
8882 else if (aarch64_tls_size == 24)
8883 return SYMBOL_TLSLE24;
8884 else if (aarch64_tls_size == 32)
8885 return SYMBOL_TLSLE32;
8886 else if (aarch64_tls_size == 48)
8887 return SYMBOL_TLSLE48;
8888 else
8889 gcc_unreachable ();
8891 case TLS_MODEL_EMULATED:
8892 case TLS_MODEL_NONE:
8893 return SYMBOL_FORCE_TO_MEM;
8895 default:
8896 gcc_unreachable ();
8900 /* Return the method that should be used to access SYMBOL_REF or
8901 LABEL_REF X. */
8903 enum aarch64_symbol_type
8904 aarch64_classify_symbol (rtx x, rtx offset)
8906 if (GET_CODE (x) == LABEL_REF)
8908 switch (aarch64_cmodel)
8910 case AARCH64_CMODEL_LARGE:
8911 return SYMBOL_FORCE_TO_MEM;
8913 case AARCH64_CMODEL_TINY_PIC:
8914 case AARCH64_CMODEL_TINY:
8915 return SYMBOL_TINY_ABSOLUTE;
8917 case AARCH64_CMODEL_SMALL_SPIC:
8918 case AARCH64_CMODEL_SMALL_PIC:
8919 case AARCH64_CMODEL_SMALL:
8920 return SYMBOL_SMALL_ABSOLUTE;
8922 default:
8923 gcc_unreachable ();
8927 if (GET_CODE (x) == SYMBOL_REF)
8929 if (aarch64_cmodel == AARCH64_CMODEL_LARGE)
8931 /* This is alright even in PIC code as the constant
8932 pool reference is always PC relative and within
8933 the same translation unit. */
8934 if (nopcrelative_literal_loads
8935 && CONSTANT_POOL_ADDRESS_P (x))
8936 return SYMBOL_SMALL_ABSOLUTE;
8937 else
8938 return SYMBOL_FORCE_TO_MEM;
8941 if (aarch64_tls_symbol_p (x))
8942 return aarch64_classify_tls_symbol (x);
8944 switch (aarch64_cmodel)
8946 case AARCH64_CMODEL_TINY:
8947 /* When we retrieve a symbol + offset address, we have to make sure
8948 the offset does not cause overflow of the final address. But
8949 we have no way of knowing the address of the symbol at compile time,
8950 so we can't accurately say whether the distance between the PC and
8951 symbol + offset is outside the addressable range of +/-1M in the
8952 TINY code model. So we rely on images not being greater than
8953 1M, cap the offset at 1M, and load anything beyond that using an
8954 alternative mechanism. */
8955 if (SYMBOL_REF_WEAK (x)
8956 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
8957 return SYMBOL_FORCE_TO_MEM;
8958 return SYMBOL_TINY_ABSOLUTE;
8960 case AARCH64_CMODEL_SMALL:
8961 /* Same reasoning as the tiny code model, but the offset cap here is
8962 4G. */
8963 if (SYMBOL_REF_WEAK (x)
8964 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
8965 HOST_WIDE_INT_C (4294967264)))
8966 return SYMBOL_FORCE_TO_MEM;
8967 return SYMBOL_SMALL_ABSOLUTE;
8969 case AARCH64_CMODEL_TINY_PIC:
8970 if (!aarch64_symbol_binds_local_p (x))
8971 return SYMBOL_TINY_GOT;
8972 return SYMBOL_TINY_ABSOLUTE;
8974 case AARCH64_CMODEL_SMALL_SPIC:
8975 case AARCH64_CMODEL_SMALL_PIC:
8976 if (!aarch64_symbol_binds_local_p (x))
8977 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
8978 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
8979 return SYMBOL_SMALL_ABSOLUTE;
8981 default:
8982 gcc_unreachable ();
8986 /* By default push everything into the constant pool. */
8987 return SYMBOL_FORCE_TO_MEM;
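/* Illustrative classifications from the function above under the default
   small code model (-mcmodel=small), assuming a zero offset:

     int x;                        -> SYMBOL_SMALL_ABSOLUTE (adrp/add pair)
     extern int y;  (with -fPIC,   -> SYMBOL_SMALL_GOT_4G when the symbol
                     non-local)       does not bind locally
     __thread int t;               -> one of the TLS kinds, via
                                      aarch64_classify_tls_symbol
     __attribute__((weak)) int w;  -> SYMBOL_FORCE_TO_MEM, since a weak
                                      symbol's address cannot be range-checked
                                      at compile time.  */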
8990 bool
8991 aarch64_constant_address_p (rtx x)
8993 return (CONSTANT_P (x) && memory_address_p (DImode, x));
8996 bool
8997 aarch64_legitimate_pic_operand_p (rtx x)
8999 if (GET_CODE (x) == SYMBOL_REF
9000 || (GET_CODE (x) == CONST
9001 && GET_CODE (XEXP (x, 0)) == PLUS
9002 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9003 return false;
9005 return true;
9008 /* Return true if X holds either a quarter-precision floating-point
9009 constant or the floating-point constant +0.0. */
9010 static bool
9011 aarch64_valid_floating_const (machine_mode mode, rtx x)
9013 if (!CONST_DOUBLE_P (x))
9014 return false;
9016 if (aarch64_float_const_zero_rtx_p (x))
9017 return true;
9019 /* We only handle moving 0.0 to a TFmode register. */
9020 if (!(mode == SFmode || mode == DFmode))
9021 return false;
9023 return aarch64_float_const_representable_p (x);
9026 static bool
9027 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9029 /* Do not allow vector struct mode constants. We could support
9030 0 and -1 easily, but they need support in aarch64-simd.md. */
9031 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9032 return false;
9034 /* This could probably go away because
9035 we now decompose CONST_INTs according to expand_mov_immediate. */
9036 if ((GET_CODE (x) == CONST_VECTOR
9037 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9038 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9039 return !targetm.cannot_force_const_mem (mode, x);
9041 if (GET_CODE (x) == HIGH
9042 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9043 return true;
9045 return aarch64_constant_address_p (x);
9049 aarch64_load_tp (rtx target)
9051 if (!target
9052 || GET_MODE (target) != Pmode
9053 || !register_operand (target, Pmode))
9054 target = gen_reg_rtx (Pmode);
9056 /* Can return in any reg. */
9057 emit_insn (gen_aarch64_load_tp_hard (target));
9058 return target;
9061 /* On AAPCS systems, this is the "struct __va_list". */
9062 static GTY(()) tree va_list_type;
9064 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9065 Return the type to use as __builtin_va_list.
9067 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9069 struct __va_list
9071 void *__stack;
9072 void *__gr_top;
9073 void *__vr_top;
9074 int __gr_offs;
9075 int __vr_offs;
9076 }; */
9078 static tree
9079 aarch64_build_builtin_va_list (void)
9081 tree va_list_name;
9082 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9084 /* Create the type. */
9085 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9086 /* Give it the required name. */
9087 va_list_name = build_decl (BUILTINS_LOCATION,
9088 TYPE_DECL,
9089 get_identifier ("__va_list"),
9090 va_list_type);
9091 DECL_ARTIFICIAL (va_list_name) = 1;
9092 TYPE_NAME (va_list_type) = va_list_name;
9093 TYPE_STUB_DECL (va_list_type) = va_list_name;
9095 /* Create the fields. */
9096 f_stack = build_decl (BUILTINS_LOCATION,
9097 FIELD_DECL, get_identifier ("__stack"),
9098 ptr_type_node);
9099 f_grtop = build_decl (BUILTINS_LOCATION,
9100 FIELD_DECL, get_identifier ("__gr_top"),
9101 ptr_type_node);
9102 f_vrtop = build_decl (BUILTINS_LOCATION,
9103 FIELD_DECL, get_identifier ("__vr_top"),
9104 ptr_type_node);
9105 f_groff = build_decl (BUILTINS_LOCATION,
9106 FIELD_DECL, get_identifier ("__gr_offs"),
9107 integer_type_node);
9108 f_vroff = build_decl (BUILTINS_LOCATION,
9109 FIELD_DECL, get_identifier ("__vr_offs"),
9110 integer_type_node);
9112 DECL_ARTIFICIAL (f_stack) = 1;
9113 DECL_ARTIFICIAL (f_grtop) = 1;
9114 DECL_ARTIFICIAL (f_vrtop) = 1;
9115 DECL_ARTIFICIAL (f_groff) = 1;
9116 DECL_ARTIFICIAL (f_vroff) = 1;
9118 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9119 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9120 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9121 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9122 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9124 TYPE_FIELDS (va_list_type) = f_stack;
9125 DECL_CHAIN (f_stack) = f_grtop;
9126 DECL_CHAIN (f_grtop) = f_vrtop;
9127 DECL_CHAIN (f_vrtop) = f_groff;
9128 DECL_CHAIN (f_groff) = f_vroff;
9130 /* Compute its layout. */
9131 layout_type (va_list_type);
9133 return va_list_type;
9136 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9137 static void
9138 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9140 const CUMULATIVE_ARGS *cum;
9141 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9142 tree stack, grtop, vrtop, groff, vroff;
9143 tree t;
9144 int gr_save_area_size;
9145 int vr_save_area_size;
9146 int vr_offset;
9148 cum = &crtl->args.info;
9149 gr_save_area_size
9150 = (NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD;
9151 vr_save_area_size
9152 = (NUM_FP_ARG_REGS - cum->aapcs_nvrn) * UNITS_PER_VREG;
9154 if (!TARGET_FLOAT)
9156 gcc_assert (cum->aapcs_nvrn == 0);
9157 vr_save_area_size = 0;
9160 f_stack = TYPE_FIELDS (va_list_type_node);
9161 f_grtop = DECL_CHAIN (f_stack);
9162 f_vrtop = DECL_CHAIN (f_grtop);
9163 f_groff = DECL_CHAIN (f_vrtop);
9164 f_vroff = DECL_CHAIN (f_groff);
9166 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9167 NULL_TREE);
9168 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9169 NULL_TREE);
9170 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9171 NULL_TREE);
9172 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9173 NULL_TREE);
9174 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9175 NULL_TREE);
9177 /* Emit code to initialize STACK, which points to the next varargs stack
9178 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9179 by named arguments. STACK is 8-byte aligned. */
9180 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9181 if (cum->aapcs_stack_size > 0)
9182 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9183 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9184 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9186 /* Emit code to initialize GRTOP, the top of the GR save area.
9187 virtual_incoming_args_rtx should have been 16 byte aligned. */
9188 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9189 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9190 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9192 /* Emit code to initialize VRTOP, the top of the VR save area.
9193 This address is gr_save_area_bytes below GRTOP, rounded
9194 down to the next 16-byte boundary. */
9195 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9196 vr_offset = ROUND_UP (gr_save_area_size,
9197 STACK_BOUNDARY / BITS_PER_UNIT);
9199 if (vr_offset)
9200 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9201 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9202 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9204 /* Emit code to initialize GROFF, the offset from GRTOP of the
9205 next GPR argument. */
9206 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9207 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9208 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9210 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9211 of the next VR argument. */
9212 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9213 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9214 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
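/* Worked example of the va_list set up above, for a callee

     void f (int fixed, ...);

   where one GP argument register and no FP/SIMD registers are used by named
   arguments (assuming the usual AArch64 values NUM_ARG_REGS == 8,
   NUM_FP_ARG_REGS == 8, UNITS_PER_WORD == 8, UNITS_PER_VREG == 16):

     gr_save_area_size = 7 * 8  = 56   so  __gr_offs = -56
     vr_save_area_size = 8 * 16 = 128  so  __vr_offs = -128
     __gr_top  = incoming argument pointer (top of the GR save area)
     __vr_top  = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __stack   = first stack-passed anonymous argument, 8-byte aligned.  */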
9217 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9219 static tree
9220 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9221 gimple_seq *post_p ATTRIBUTE_UNUSED)
9223 tree addr;
9224 bool indirect_p;
9225 bool is_ha; /* is HFA or HVA. */
9226 bool dw_align; /* double-word align. */
9227 machine_mode ag_mode = VOIDmode;
9228 int nregs;
9229 machine_mode mode;
9231 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9232 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9233 HOST_WIDE_INT size, rsize, adjust, align;
9234 tree t, u, cond1, cond2;
9236 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9237 if (indirect_p)
9238 type = build_pointer_type (type);
9240 mode = TYPE_MODE (type);
9242 f_stack = TYPE_FIELDS (va_list_type_node);
9243 f_grtop = DECL_CHAIN (f_stack);
9244 f_vrtop = DECL_CHAIN (f_grtop);
9245 f_groff = DECL_CHAIN (f_vrtop);
9246 f_vroff = DECL_CHAIN (f_groff);
9248 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9249 f_stack, NULL_TREE);
9250 size = int_size_in_bytes (type);
9251 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9253 dw_align = false;
9254 adjust = 0;
9255 if (aarch64_vfp_is_call_or_return_candidate (mode,
9256 type,
9257 &ag_mode,
9258 &nregs,
9259 &is_ha))
9261 /* TYPE passed in fp/simd registers. */
9262 if (!TARGET_FLOAT)
9263 aarch64_err_no_fpadvsimd (mode, "varargs");
9265 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9266 unshare_expr (valist), f_vrtop, NULL_TREE);
9267 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9268 unshare_expr (valist), f_vroff, NULL_TREE);
9270 rsize = nregs * UNITS_PER_VREG;
9272 if (is_ha)
9274 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9275 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9277 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9278 && size < UNITS_PER_VREG)
9280 adjust = UNITS_PER_VREG - size;
9283 else
9285 /* TYPE passed in general registers. */
9286 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9287 unshare_expr (valist), f_grtop, NULL_TREE);
9288 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9289 unshare_expr (valist), f_groff, NULL_TREE);
9290 rsize = ROUND_UP (size, UNITS_PER_WORD);
9291 nregs = rsize / UNITS_PER_WORD;
9293 if (align > 8)
9294 dw_align = true;
9296 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9297 && size < UNITS_PER_WORD)
9299 adjust = UNITS_PER_WORD - size;
9303 /* Get a local temporary for the field value. */
9304 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9306 /* Emit code to branch if off >= 0. */
9307 t = build2 (GE_EXPR, boolean_type_node, off,
9308 build_int_cst (TREE_TYPE (off), 0));
9309 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9311 if (dw_align)
9313 /* Emit: offs = (offs + 15) & -16. */
9314 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9315 build_int_cst (TREE_TYPE (off), 15));
9316 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9317 build_int_cst (TREE_TYPE (off), -16));
9318 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9320 else
9321 roundup = NULL;
9323 /* Update ap.__[g|v]r_offs */
9324 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9325 build_int_cst (TREE_TYPE (off), rsize));
9326 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9328 /* String up. */
9329 if (roundup)
9330 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9332 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9333 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9334 build_int_cst (TREE_TYPE (f_off), 0));
9335 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9337 /* String up: make sure the assignment happens before the use. */
9338 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9339 COND_EXPR_ELSE (cond1) = t;
9341 /* Prepare the trees handling the argument that is passed on the stack;
9342 the top-level node will be stored in ON_STACK. */
9343 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9344 if (align > 8)
9346 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9347 t = fold_convert (intDI_type_node, arg);
9348 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9349 build_int_cst (TREE_TYPE (t), 15));
9350 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9351 build_int_cst (TREE_TYPE (t), -16));
9352 t = fold_convert (TREE_TYPE (arg), t);
9353 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9355 else
9356 roundup = NULL;
9357 /* Advance ap.__stack */
9358 t = fold_convert (intDI_type_node, arg);
9359 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9360 build_int_cst (TREE_TYPE (t), size + 7));
9361 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9362 build_int_cst (TREE_TYPE (t), -8));
9363 t = fold_convert (TREE_TYPE (arg), t);
9364 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9365 /* String up roundup and advance. */
9366 if (roundup)
9367 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9368 /* String up with arg */
9369 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9370 /* Big-endianness related address adjustment. */
9371 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9372 && size < UNITS_PER_WORD)
9374 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9375 size_int (UNITS_PER_WORD - size));
9376 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9379 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9380 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9382 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9383 t = off;
9384 if (adjust)
9385 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9386 build_int_cst (TREE_TYPE (off), adjust));
9388 t = fold_convert (sizetype, t);
9389 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9391 if (is_ha)
9393 /* type ha; // treat as "struct {ftype field[n];}"
9394 ... [computing offs]
9395 for (i = 0; i <nregs; ++i, offs += 16)
9396 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9397 return ha; */
9398 int i;
9399 tree tmp_ha, field_t, field_ptr_t;
9401 /* Declare a local variable. */
9402 tmp_ha = create_tmp_var_raw (type, "ha");
9403 gimple_add_tmp_var (tmp_ha);
9405 /* Establish the base type. */
9406 switch (ag_mode)
9408 case SFmode:
9409 field_t = float_type_node;
9410 field_ptr_t = float_ptr_type_node;
9411 break;
9412 case DFmode:
9413 field_t = double_type_node;
9414 field_ptr_t = double_ptr_type_node;
9415 break;
9416 case TFmode:
9417 field_t = long_double_type_node;
9418 field_ptr_t = long_double_ptr_type_node;
9419 break;
9420 /* Half-precision and quad-precision types are not fully supported yet.
9421 Enable the following code once that support is complete; the correct
9422 type node for __fp16 * still needs to be found. */
9423 #if 0
9424 case HFmode:
9425 field_t = float_type_node;
9426 field_ptr_t = float_ptr_type_node;
9427 break;
9428 #endif
9429 case V2SImode:
9430 case V4SImode:
9432 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9433 field_t = build_vector_type_for_mode (innertype, ag_mode);
9434 field_ptr_t = build_pointer_type (field_t);
9436 break;
9437 default:
9438 gcc_assert (0);
9441 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
9442 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9443 addr = t;
9444 t = fold_convert (field_ptr_t, addr);
9445 t = build2 (MODIFY_EXPR, field_t,
9446 build1 (INDIRECT_REF, field_t, tmp_ha),
9447 build1 (INDIRECT_REF, field_t, t));
9449 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9450 for (i = 1; i < nregs; ++i)
9452 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9453 u = fold_convert (field_ptr_t, addr);
9454 u = build2 (MODIFY_EXPR, field_t,
9455 build2 (MEM_REF, field_t, tmp_ha,
9456 build_int_cst (field_ptr_t,
9457 (i *
9458 int_size_in_bytes (field_t)))),
9459 build1 (INDIRECT_REF, field_t, u));
9460 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9463 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9464 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9467 COND_EXPR_ELSE (cond2) = t;
9468 addr = fold_convert (build_pointer_type (type), cond1);
9469 addr = build_va_arg_indirect_ref (addr);
9471 if (indirect_p)
9472 addr = build_va_arg_indirect_ref (addr);
9474 return addr;
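/* Worked example for the homogeneous-aggregate path above:

     struct hfa { float a, b, c; };

   aarch64_vfp_is_call_or_return_candidate reports is_ha = true, nregs = 3
   and ag_mode = SFmode, so va_arg reserves 3 * UNITS_PER_VREG = 48 bytes of
   the VR save area and gathers the three single-precision values from
   16-byte-spaced slots into the local temporary "ha" built above.  */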
9477 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9479 static void
9480 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9481 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9482 int no_rtl)
9484 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9485 CUMULATIVE_ARGS local_cum;
9486 int gr_saved, vr_saved;
9488 /* The caller has advanced CUM up to, but not beyond, the last named
9489 argument. Advance a local copy of CUM past the last "real" named
9490 argument, to find out how many registers are left over. */
9491 local_cum = *cum;
9492 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
9494 /* Find out how many registers we need to save. */
9495 gr_saved = NUM_ARG_REGS - local_cum.aapcs_ncrn;
9496 vr_saved = NUM_FP_ARG_REGS - local_cum.aapcs_nvrn;
9498 if (!TARGET_FLOAT)
9500 gcc_assert (local_cum.aapcs_nvrn == 0);
9501 vr_saved = 0;
9504 if (!no_rtl)
9506 if (gr_saved > 0)
9508 rtx ptr, mem;
9510 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9511 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9512 - gr_saved * UNITS_PER_WORD);
9513 mem = gen_frame_mem (BLKmode, ptr);
9514 set_mem_alias_set (mem, get_varargs_alias_set ());
9516 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9517 mem, gr_saved);
9519 if (vr_saved > 0)
9521 /* We can't use move_block_from_reg, because it will use
9522 the wrong mode, storing D regs only. */
9523 machine_mode mode = TImode;
9524 int off, i;
9526 /* Set OFF to the offset from virtual_incoming_args_rtx of
9527 the first vector register. The VR save area lies below
9528 the GR one, and is aligned to 16 bytes. */
9529 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
9530 STACK_BOUNDARY / BITS_PER_UNIT);
9531 off -= vr_saved * UNITS_PER_VREG;
9533 for (i = local_cum.aapcs_nvrn; i < NUM_FP_ARG_REGS; ++i)
9535 rtx ptr, mem;
9537 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
9538 mem = gen_frame_mem (mode, ptr);
9539 set_mem_alias_set (mem, get_varargs_alias_set ());
9540 aarch64_emit_move (mem, gen_rtx_REG (mode, V0_REGNUM + i));
9541 off += UNITS_PER_VREG;
9546 /* We don't save the size into *PRETEND_SIZE because we want to avoid
9547 any complication of having crtl->args.pretend_args_size changed. */
9548 cfun->machine->frame.saved_varargs_size
9549 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
9550 STACK_BOUNDARY / BITS_PER_UNIT)
9551 + vr_saved * UNITS_PER_VREG);
9554 static void
9555 aarch64_conditional_register_usage (void)
9557 int i;
9558 if (!TARGET_FLOAT)
9560 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
9562 fixed_regs[i] = 1;
9563 call_used_regs[i] = 1;
9568 /* Walk down the type tree of TYPE counting consecutive base elements.
9569 If *MODEP is VOIDmode, then set it to the first valid floating point
9570 type. If a non-floating point type is found, or if a floating point
9571 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
9572 otherwise return the count in the sub-tree. */
9573 static int
9574 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
9576 machine_mode mode;
9577 HOST_WIDE_INT size;
9579 switch (TREE_CODE (type))
9581 case REAL_TYPE:
9582 mode = TYPE_MODE (type);
9583 if (mode != DFmode && mode != SFmode && mode != TFmode)
9584 return -1;
9586 if (*modep == VOIDmode)
9587 *modep = mode;
9589 if (*modep == mode)
9590 return 1;
9592 break;
9594 case COMPLEX_TYPE:
9595 mode = TYPE_MODE (TREE_TYPE (type));
9596 if (mode != DFmode && mode != SFmode && mode != TFmode)
9597 return -1;
9599 if (*modep == VOIDmode)
9600 *modep = mode;
9602 if (*modep == mode)
9603 return 2;
9605 break;
9607 case VECTOR_TYPE:
9608 /* Use V2SImode and V4SImode as representatives of all 64-bit
9609 and 128-bit vector types. */
9610 size = int_size_in_bytes (type);
9611 switch (size)
9613 case 8:
9614 mode = V2SImode;
9615 break;
9616 case 16:
9617 mode = V4SImode;
9618 break;
9619 default:
9620 return -1;
9623 if (*modep == VOIDmode)
9624 *modep = mode;
9626 /* Vector modes are considered to be opaque: two vectors are
9627 equivalent for the purposes of being homogeneous aggregates
9628 if they are the same size. */
9629 if (*modep == mode)
9630 return 1;
9632 break;
9634 case ARRAY_TYPE:
9636 int count;
9637 tree index = TYPE_DOMAIN (type);
9639 /* Can't handle incomplete types nor sizes that are not
9640 fixed. */
9641 if (!COMPLETE_TYPE_P (type)
9642 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9643 return -1;
9645 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
9646 if (count == -1
9647 || !index
9648 || !TYPE_MAX_VALUE (index)
9649 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
9650 || !TYPE_MIN_VALUE (index)
9651 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
9652 || count < 0)
9653 return -1;
9655 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
9656 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
9658 /* There must be no padding. */
9659 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9660 return -1;
9662 return count;
9665 case RECORD_TYPE:
9667 int count = 0;
9668 int sub_count;
9669 tree field;
9671 /* Can't handle incomplete types nor sizes that are not
9672 fixed. */
9673 if (!COMPLETE_TYPE_P (type)
9674 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9675 return -1;
9677 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
9679 if (TREE_CODE (field) != FIELD_DECL)
9680 continue;
9682 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
9683 if (sub_count < 0)
9684 return -1;
9685 count += sub_count;
9688 /* There must be no padding. */
9689 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9690 return -1;
9692 return count;
9695 case UNION_TYPE:
9696 case QUAL_UNION_TYPE:
9698 /* These aren't very interesting except in a degenerate case. */
9699 int count = 0;
9700 int sub_count;
9701 tree field;
9703 /* Can't handle incomplete types nor sizes that are not
9704 fixed. */
9705 if (!COMPLETE_TYPE_P (type)
9706 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
9707 return -1;
9709 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
9711 if (TREE_CODE (field) != FIELD_DECL)
9712 continue;
9714 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
9715 if (sub_count < 0)
9716 return -1;
9717 count = count > sub_count ? count : sub_count;
9720 /* There must be no padding. */
9721 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
9722 return -1;
9724 return count;
9727 default:
9728 break;
9731 return -1;
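/* Illustrative results of the walk above:

     struct { double a, b; }         -> 2, *MODEP = DFmode
     _Complex double                 -> 2, *MODEP = DFmode
     struct { float a[3]; }          -> 3, *MODEP = SFmode
     struct { float a; double b; }   -> -1 (mixed element modes)
     struct { float a; int b; }      -> -1 (non-floating-point member)  */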
9734 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
9735 type as described in AAPCS64 \S 4.1.2.
9737 See the comment above aarch64_composite_type_p for the notes on MODE. */
9739 static bool
9740 aarch64_short_vector_p (const_tree type,
9741 machine_mode mode)
9743 HOST_WIDE_INT size = -1;
9745 if (type && TREE_CODE (type) == VECTOR_TYPE)
9746 size = int_size_in_bytes (type);
9747 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
9748 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
9749 size = GET_MODE_SIZE (mode);
9751 return (size == 8 || size == 16);
9754 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
9755 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
9756 array types. The C99 floating-point complex types are also considered
9757 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
9758 types, which are GCC extensions and out of the scope of AAPCS64, are
9759 treated as composite types here as well.
9761 Note that MODE itself is not sufficient in determining whether a type
9762 is such a composite type or not. This is because
9763 stor-layout.c:compute_record_mode may have already changed the MODE
9764 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
9765 structure with only one field may have its MODE set to the mode of the
9766 field. Also an integer mode whose size matches the size of the
9767 RECORD_TYPE type may be used to substitute the original mode
9768 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
9769 solely relied on. */
9771 static bool
9772 aarch64_composite_type_p (const_tree type,
9773 machine_mode mode)
9775 if (aarch64_short_vector_p (type, mode))
9776 return false;
9778 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
9779 return true;
9781 if (mode == BLKmode
9782 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
9783 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
9784 return true;
9786 return false;
9789 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
9790 shall be passed or returned in simd/fp register(s) (providing these
9791 parameter passing registers are available).
9793 Upon successful return, *COUNT returns the number of needed registers,
9794 *BASE_MODE returns the mode of the individual register and when IS_HA
9795 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
9796 floating-point aggregate or a homogeneous short-vector aggregate. */
9798 static bool
9799 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
9800 const_tree type,
9801 machine_mode *base_mode,
9802 int *count,
9803 bool *is_ha)
9805 machine_mode new_mode = VOIDmode;
9806 bool composite_p = aarch64_composite_type_p (type, mode);
9808 if (is_ha != NULL) *is_ha = false;
9810 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
9811 || aarch64_short_vector_p (type, mode))
9813 *count = 1;
9814 new_mode = mode;
9816 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
9818 if (is_ha != NULL) *is_ha = true;
9819 *count = 2;
9820 new_mode = GET_MODE_INNER (mode);
9822 else if (type && composite_p)
9824 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
9826 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
9828 if (is_ha != NULL) *is_ha = true;
9829 *count = ag_count;
9831 else
9832 return false;
9834 else
9835 return false;
9837 *base_mode = new_mode;
9838 return true;
9841 /* Implement TARGET_STRUCT_VALUE_RTX. */
9843 static rtx
9844 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
9845 int incoming ATTRIBUTE_UNUSED)
9847 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
9850 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
9851 static bool
9852 aarch64_vector_mode_supported_p (machine_mode mode)
9854 if (TARGET_SIMD
9855 && (mode == V4SImode || mode == V8HImode
9856 || mode == V16QImode || mode == V2DImode
9857 || mode == V2SImode || mode == V4HImode
9858 || mode == V8QImode || mode == V2SFmode
9859 || mode == V4SFmode || mode == V2DFmode
9860 || mode == V4HFmode || mode == V8HFmode
9861 || mode == V1DFmode))
9862 return true;
9864 return false;
9867 /* Return appropriate SIMD container
9868 for MODE within a vector of WIDTH bits. */
9869 static machine_mode
9870 aarch64_simd_container_mode (machine_mode mode, unsigned width)
9872 gcc_assert (width == 64 || width == 128);
9873 if (TARGET_SIMD)
9875 if (width == 128)
9876 switch (mode)
9878 case DFmode:
9879 return V2DFmode;
9880 case SFmode:
9881 return V4SFmode;
9882 case SImode:
9883 return V4SImode;
9884 case HImode:
9885 return V8HImode;
9886 case QImode:
9887 return V16QImode;
9888 case DImode:
9889 return V2DImode;
9890 default:
9891 break;
9893 else
9894 switch (mode)
9896 case SFmode:
9897 return V2SFmode;
9898 case SImode:
9899 return V2SImode;
9900 case HImode:
9901 return V4HImode;
9902 case QImode:
9903 return V8QImode;
9904 default:
9905 break;
9908 return word_mode;
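/* Illustrative results of the mapping above when SIMD is enabled:

     aarch64_simd_container_mode (SImode, 128) == V4SImode
     aarch64_simd_container_mode (HImode,  64) == V4HImode

   Without SIMD both queries fall back to word_mode (DImode).  */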
9911 /* Return 128-bit container as the preferred SIMD mode for MODE. */
9912 static machine_mode
9913 aarch64_preferred_simd_mode (machine_mode mode)
9915 return aarch64_simd_container_mode (mode, 128);
9918 /* Return the bitmask of possible vector sizes for the vectorizer
9919 to iterate over. */
9920 static unsigned int
9921 aarch64_autovectorize_vector_sizes (void)
9923 return (16 | 8);
9926 /* Implement TARGET_MANGLE_TYPE. */
9928 static const char *
9929 aarch64_mangle_type (const_tree type)
9931 /* The AArch64 ABI documents say that "__va_list" has to be
9932 mangled as if it is in the "std" namespace. */
9933 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
9934 return "St9__va_list";
9936 /* Half-precision float. */
9937 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
9938 return "Dh";
9940 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
9941 builtin types. */
9942 if (TYPE_NAME (type) != NULL)
9943 return aarch64_mangle_builtin_type (type);
9945 /* Use the default mangling. */
9946 return NULL;
9950 /* Return true if the rtx_insn contains a MEM RTX somewhere
9951 in it. */
9953 static bool
9954 has_memory_op (rtx_insn *mem_insn)
9956 subrtx_iterator::array_type array;
9957 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
9958 if (MEM_P (*iter))
9959 return true;
9961 return false;
9964 /* Find the first rtx_insn before insn that will generate an assembly
9965 instruction. */
9967 static rtx_insn *
9968 aarch64_prev_real_insn (rtx_insn *insn)
9970 if (!insn)
9971 return NULL;
9975 insn = prev_real_insn (insn);
9977 while (insn && recog_memoized (insn) < 0);
9979 return insn;
9982 static bool
9983 is_madd_op (enum attr_type t1)
9985 unsigned int i;
9986 /* A number of these may be AArch32 only. */
9987 enum attr_type mlatypes[] = {
9988 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
9989 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
9990 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
9993 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
9995 if (t1 == mlatypes[i])
9996 return true;
9999 return false;
10002 /* Check if there is a register dependency between a load and the insn
10003 for which we hold recog_data. */
10005 static bool
10006 dep_between_memop_and_curr (rtx memop)
10008 rtx load_reg;
10009 int opno;
10011 gcc_assert (GET_CODE (memop) == SET);
10013 if (!REG_P (SET_DEST (memop)))
10014 return false;
10016 load_reg = SET_DEST (memop);
10017 for (opno = 1; opno < recog_data.n_operands; opno++)
10019 rtx operand = recog_data.operand[opno];
10020 if (REG_P (operand)
10021 && reg_overlap_mentioned_p (load_reg, operand))
10022 return true;
10025 return false;
10029 /* When working around the Cortex-A53 erratum 835769,
10030 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10031 instruction and has a preceding memory instruction such that a NOP
10032 should be inserted between them. */
10034 bool
10035 aarch64_madd_needs_nop (rtx_insn* insn)
10037 enum attr_type attr_type;
10038 rtx_insn *prev;
10039 rtx body;
10041 if (!TARGET_FIX_ERR_A53_835769)
10042 return false;
10044 if (recog_memoized (insn) < 0)
10045 return false;
10047 attr_type = get_attr_type (insn);
10048 if (!is_madd_op (attr_type))
10049 return false;
10051 prev = aarch64_prev_real_insn (insn);
10052 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10053 Restore recog state to INSN to avoid state corruption. */
10054 extract_constrain_insn_cached (insn);
10056 if (!prev || !has_memory_op (prev))
10057 return false;
10059 body = single_set (prev);
10061 /* If the previous insn is a memory op and there is no dependency between
10062 it and the DImode madd, emit a NOP between them. If body is NULL then we
10063 have a complex memory operation, probably a load/store pair.
10064 Be conservative for now and emit a NOP. */
10065 if (GET_MODE (recog_data.operand[0]) == DImode
10066 && (!body || !dep_between_memop_and_curr (body)))
10067 return true;
10069 return false;
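/* Illustrative sequence targeted by the workaround above when
   -mfix-cortex-a53-835769 is in effect:

     ldr   x1, [x2]            // 64-bit memory operation
     madd  x3, x4, x5, x6      // 64-bit multiply-accumulate with no register
                               // dependency on the preceding load

   aarch64_final_prescan_insn (below) emits a NOP between the two.  */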
10074 /* Implement FINAL_PRESCAN_INSN. */
10076 void
10077 aarch64_final_prescan_insn (rtx_insn *insn)
10079 if (aarch64_madd_needs_nop (insn))
10080 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10084 /* Return the equivalent letter for size. */
10085 static char
10086 sizetochar (int size)
10088 switch (size)
10090 case 64: return 'd';
10091 case 32: return 's';
10092 case 16: return 'h';
10093 case 8 : return 'b';
10094 default: gcc_unreachable ();
10098 /* Return true iff X is a uniform vector of floating-point
10099 constants, and the constant can be represented in
10100 quarter-precision form. Note that, as aarch64_float_const_representable_p
10101 rejects both +0.0 and -0.0, we will also reject them here. */
10102 static bool
10103 aarch64_vect_float_const_representable_p (rtx x)
10105 rtx elt;
10106 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10107 && const_vec_duplicate_p (x, &elt)
10108 && aarch64_float_const_representable_p (elt));
10111 /* Return true if OP is a valid SIMD immediate for MODE, false otherwise. */
10112 bool
10113 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10114 struct simd_immediate_info *info)
10116 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10117 matches = 1; \
10118 for (i = 0; i < idx; i += (STRIDE)) \
10119 if (!(TEST)) \
10120 matches = 0; \
10121 if (matches) \
10123 immtype = (CLASS); \
10124 elsize = (ELSIZE); \
10125 eshift = (SHIFT); \
10126 emvn = (NEG); \
10127 break; \
10130 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10131 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10132 unsigned char bytes[16];
10133 int immtype = -1, matches;
10134 unsigned int invmask = inverse ? 0xff : 0;
10135 int eshift, emvn;
10137 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10139 if (! (aarch64_simd_imm_zero_p (op, mode)
10140 || aarch64_vect_float_const_representable_p (op)))
10141 return false;
10143 if (info)
10145 info->value = CONST_VECTOR_ELT (op, 0);
10146 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10147 info->mvn = false;
10148 info->shift = 0;
10151 return true;
10154 /* Splat vector constant out into a byte vector. */
10155 for (i = 0; i < n_elts; i++)
10157 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10158 it must be laid out in the vector register in reverse order. */
10159 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10160 unsigned HOST_WIDE_INT elpart;
10161 unsigned int part, parts;
10163 if (CONST_INT_P (el))
10165 elpart = INTVAL (el);
10166 parts = 1;
10168 else if (GET_CODE (el) == CONST_DOUBLE)
10170 elpart = CONST_DOUBLE_LOW (el);
10171 parts = 2;
10173 else
10174 gcc_unreachable ();
10176 for (part = 0; part < parts; part++)
10178 unsigned int byte;
10179 for (byte = 0; byte < innersize; byte++)
10181 bytes[idx++] = (elpart & 0xff) ^ invmask;
10182 elpart >>= BITS_PER_UNIT;
10184 if (GET_CODE (el) == CONST_DOUBLE)
10185 elpart = CONST_DOUBLE_HIGH (el);
10189 /* Sanity check. */
10190 gcc_assert (idx == GET_MODE_SIZE (mode));
10194 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10195 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10197 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10198 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10200 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10201 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10203 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10204 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10206 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10208 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10210 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10211 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10213 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10214 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10216 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10217 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10219 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10220 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10222 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10224 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10226 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10227 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10229 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10230 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10232 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10233 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10235 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10236 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10238 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10240 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10241 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10243 while (0);
10245 if (immtype == -1)
10246 return false;
10248 if (info)
10250 info->element_width = elsize;
10251 info->mvn = emvn != 0;
10252 info->shift = eshift;
10254 unsigned HOST_WIDE_INT imm = 0;
10256 if (immtype >= 12 && immtype <= 15)
10257 info->msl = true;
10259 /* Un-invert bytes of recognized vector, if necessary. */
10260 if (invmask != 0)
10261 for (i = 0; i < idx; i++)
10262 bytes[i] ^= invmask;
10264 if (immtype == 17)
10266 /* FIXME: Broken on 32-bit H_W_I hosts. */
10267 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10269 for (i = 0; i < 8; i++)
10270 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10271 << (i * BITS_PER_UNIT);
10274 info->value = GEN_INT (imm);
10276 else
10278 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10279 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10281 /* Construct 'abcdefgh' because the assembler cannot handle
10282 generic constants. */
10283 if (info->mvn)
10284 imm = ~imm;
10285 imm = (imm >> info->shift) & 0xff;
10286 info->value = GEN_INT (imm);
10290 return true;
10291 #undef CHECK
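/* Worked example for the recognizer above: the V4SI constant with every
   element equal to 0x2500 matches the "byte 1 set, other bytes zero" case,
   so INFO reports element_width = 32, shift = 8, mvn = false and
   value = 0x25, corresponding to

     movi  v0.4s, #0x25, lsl #8                                              */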
10294 /* Check if immediate shift constants are within range. */
10295 bool
10296 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10298 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10299 if (left)
10300 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10301 else
10302 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10305 /* Return true if X is a uniform vector where all elements
10306 are either the floating-point constant 0.0 or the
10307 integer constant 0. */
10308 bool
10309 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10311 return x == CONST0_RTX (mode);
10314 bool
10315 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10317 HOST_WIDE_INT imm = INTVAL (x);
10318 int i;
10320 for (i = 0; i < 8; i++)
10322 unsigned int byte = imm & 0xff;
10323 if (byte != 0xff && byte != 0)
10324 return false;
10325 imm >>= 8;
10328 return true;
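/* Illustrative values for the predicate above, which accepts 64-bit values
   in which every byte is either 0x00 or 0xff (the byte-mask form of the
   64-bit MOVI immediate):

     0x00000000000000ff  -> true
     0xff00ff00ff00ff00  -> true
     0x0000000000000123  -> false (byte 0 is 0x23)  */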
10331 bool
10332 aarch64_mov_operand_p (rtx x, machine_mode mode)
10334 if (GET_CODE (x) == HIGH
10335 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10336 return true;
10338 if (CONST_INT_P (x))
10339 return true;
10341 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10342 return true;
10344 return aarch64_classify_symbolic_expression (x)
10345 == SYMBOL_TINY_ABSOLUTE;
10348 /* Return a const_int vector of VAL. */
10350 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10352 int nunits = GET_MODE_NUNITS (mode);
10353 rtvec v = rtvec_alloc (nunits);
10354 int i;
10356 for (i=0; i < nunits; i++)
10357 RTVEC_ELT (v, i) = GEN_INT (val);
10359 return gen_rtx_CONST_VECTOR (mode, v);
10362 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10364 bool
10365 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10367 machine_mode vmode;
10369 gcc_assert (!VECTOR_MODE_P (mode));
10370 vmode = aarch64_preferred_simd_mode (mode);
10371 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10372 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10375 /* Construct and return a PARALLEL RTX vector with elements numbering the
10376 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10377 the vector - from the perspective of the architecture. This does not
10378 line up with GCC's perspective on lane numbers, so we end up with
10379 different masks depending on our target endian-ness. The diagram
10380 below may help. We must draw the distinction when building masks
10381 which select one half of the vector. An instruction selecting
10382 architectural low-lanes for a big-endian target must be described using
10383 a mask selecting GCC high-lanes.
10385 Big-Endian Little-Endian
10387 GCC 0 1 2 3 3 2 1 0
10388 | x | x | x | x | | x | x | x | x |
10389 Architecture 3 2 1 0 3 2 1 0
10391 Low Mask: { 2, 3 } { 0, 1 }
10392 High Mask: { 0, 1 } { 2, 3 }
10395 rtx
10396 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10398 int nunits = GET_MODE_NUNITS (mode);
10399 rtvec v = rtvec_alloc (nunits / 2);
10400 int high_base = nunits / 2;
10401 int low_base = 0;
10402 int base;
10403 rtx t1;
10404 int i;
10406 if (BYTES_BIG_ENDIAN)
10407 base = high ? low_base : high_base;
10408 else
10409 base = high ? high_base : low_base;
10411 for (i = 0; i < nunits / 2; i++)
10412 RTVEC_ELT (v, i) = GEN_INT (base + i);
10414 t1 = gen_rtx_PARALLEL (mode, v);
10415 return t1;
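/* Illustrative example: aarch64_simd_vect_par_cnst_half (V4SImode, true)
   yields (parallel [(const_int 2) (const_int 3)]) for little-endian but
   (parallel [(const_int 0) (const_int 1)]) for big-endian, matching the
   "High Mask" row of the diagram above.  */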
10418 /* Check OP for validity as a PARALLEL RTX vector with elements
10419 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10420 from the perspective of the architecture. See the diagram above
10421 aarch64_simd_vect_par_cnst_half for more details. */
10423 bool
10424 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10425 bool high)
10427 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10428 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10429 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10430 int i = 0;
10432 if (!VECTOR_MODE_P (mode))
10433 return false;
10435 if (count_op != count_ideal)
10436 return false;
10438 for (i = 0; i < count_ideal; i++)
10440 rtx elt_op = XVECEXP (op, 0, i);
10441 rtx elt_ideal = XVECEXP (ideal, 0, i);
10443 if (!CONST_INT_P (elt_op)
10444 || INTVAL (elt_ideal) != INTVAL (elt_op))
10445 return false;
10447 return true;
10450 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10451 HIGH (exclusive). */
10452 void
10453 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10454 const_tree exp)
10456 HOST_WIDE_INT lane;
10457 gcc_assert (CONST_INT_P (operand));
10458 lane = INTVAL (operand);
10460 if (lane < low || lane >= high)
10462 if (exp)
10463 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10464 else
10465 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10469 /* Return TRUE if OP is a valid vector addressing mode. */
10470 bool
10471 aarch64_simd_mem_operand_p (rtx op)
10473 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10474 || REG_P (XEXP (op, 0)));
10477 /* Emit a register copy from operand to operand, taking care not to
10478 early-clobber source registers in the process.
10480 COUNT is the number of components into which the copy needs to be
10481 decomposed. */
10482 void
10483 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10484 unsigned int count)
10486 unsigned int i;
10487 int rdest = REGNO (operands[0]);
10488 int rsrc = REGNO (operands[1]);
10490 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10491 || rdest < rsrc)
10492 for (i = 0; i < count; i++)
10493 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10494 gen_rtx_REG (mode, rsrc + i));
10495 else
10496 for (i = 0; i < count; i++)
10497 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10498 gen_rtx_REG (mode, rsrc + count - i - 1));
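/* Example of why the copy order above matters: moving a two-register value
   from {v1, v2} into {v2, v3} must be done high-to-low (v3 <- v2, then
   v2 <- v1), because copying low-to-high would clobber v2 before it is
   read.  When the ranges do not overlap, or the destination starts below
   the source, the simple low-to-high order is safe.  The register names
   are only illustrative.  */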
10501 /* Compute and return the length of aarch64_simd_mov<mode>, where <mode> is
10502 one of the VSTRUCT modes: OI, CI or XI. */
10503 int
10504 aarch64_simd_attr_length_move (rtx_insn *insn)
10506 machine_mode mode;
10508 extract_insn_cached (insn);
10510 if (REG_P (recog_data.operand[0]) && REG_P (recog_data.operand[1]))
10512 mode = GET_MODE (recog_data.operand[0]);
10513 switch (mode)
10515 case OImode:
10516 return 8;
10517 case CImode:
10518 return 12;
10519 case XImode:
10520 return 16;
10521 default:
10522 gcc_unreachable ();
10525 return 4;
10528 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
10529 one of the VSTRUCT modes: OI, CI, or XI. */
10530 int
10531 aarch64_simd_attr_length_rglist (enum machine_mode mode)
10533 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
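/* For instance, a register-to-register OImode move is split into two
   128-bit register moves, so aarch64_simd_attr_length_move returns 8 bytes
   (CImode: 12, XImode: 16), while a move involving memory is a single
   LD1/ST1-style instruction of 4 bytes.  Likewise
   aarch64_simd_attr_length_rglist gives 4 bytes per vector register in the
   mode.  This is just a restatement of the arithmetic above.  */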
10536 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
10537 alignment of a vector to 128 bits. */
10538 static HOST_WIDE_INT
10539 aarch64_simd_vector_alignment (const_tree type)
10541 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
10542 return MIN (align, 128);
10545 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
10546 static bool
10547 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
10549 if (is_packed)
10550 return false;
10552 /* We guarantee alignment for vectors up to 128-bits. */
10553 if (tree_int_cst_compare (TYPE_SIZE (type),
10554 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
10555 return false;
10557 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
10558 return true;
10561 /* If VALS is a vector constant that can be loaded into a register
10562 using DUP, generate instructions to do so and return an RTX to
10563 assign to the register. Otherwise return NULL_RTX. */
10564 static rtx
10565 aarch64_simd_dup_constant (rtx vals)
10567 machine_mode mode = GET_MODE (vals);
10568 machine_mode inner_mode = GET_MODE_INNER (mode);
10569 rtx x;
10571 if (!const_vec_duplicate_p (vals, &x))
10572 return NULL_RTX;
10574 /* We can load this constant by using DUP and a constant in a
10575 single ARM register. This will be cheaper than a vector
10576 load. */
10577 x = copy_to_mode_reg (inner_mode, x);
10578 return gen_rtx_VEC_DUPLICATE (mode, x);
10582 /* Generate code to load VALS, which is a PARALLEL containing only
10583 constants (for vec_init) or CONST_VECTOR, efficiently into a
10584 register. Returns an RTX to copy into the register, or NULL_RTX
10585 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
10586 static rtx
10587 aarch64_simd_make_constant (rtx vals)
10589 machine_mode mode = GET_MODE (vals);
10590 rtx const_dup;
10591 rtx const_vec = NULL_RTX;
10592 int n_elts = GET_MODE_NUNITS (mode);
10593 int n_const = 0;
10594 int i;
10596 if (GET_CODE (vals) == CONST_VECTOR)
10597 const_vec = vals;
10598 else if (GET_CODE (vals) == PARALLEL)
10600 /* A CONST_VECTOR must contain only CONST_INTs and
10601 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
10602 Only store valid constants in a CONST_VECTOR. */
10603 for (i = 0; i < n_elts; ++i)
10605 rtx x = XVECEXP (vals, 0, i);
10606 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10607 n_const++;
10609 if (n_const == n_elts)
10610 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
10612 else
10613 gcc_unreachable ();
10615 if (const_vec != NULL_RTX
10616 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
10617 /* Load using MOVI/MVNI. */
10618 return const_vec;
10619 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
10620 /* Loaded using DUP. */
10621 return const_dup;
10622 else if (const_vec != NULL_RTX)
10623 /* Load from constant pool. We cannot take advantage of single-cycle
10624 LD1 because we need a PC-relative addressing mode. */
10625 return const_vec;
10626 else
10627 /* A PARALLEL containing something not valid inside CONST_VECTOR.
10628 We cannot construct an initializer. */
10629 return NULL_RTX;
10632 void
10633 aarch64_expand_vector_init (rtx target, rtx vals)
10635 machine_mode mode = GET_MODE (target);
10636 machine_mode inner_mode = GET_MODE_INNER (mode);
10637 int n_elts = GET_MODE_NUNITS (mode);
10638 int n_var = 0;
10639 rtx any_const = NULL_RTX;
10640 bool all_same = true;
10642 for (int i = 0; i < n_elts; ++i)
10644 rtx x = XVECEXP (vals, 0, i);
10645 if (!CONST_INT_P (x) && !CONST_DOUBLE_P (x))
10646 ++n_var;
10647 else
10648 any_const = x;
10650 if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
10651 all_same = false;
10654 if (n_var == 0)
10656 rtx constant = aarch64_simd_make_constant (vals);
10657 if (constant != NULL_RTX)
10659 emit_move_insn (target, constant);
10660 return;
10664 /* Splat a single non-constant element if we can. */
10665 if (all_same)
10667 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, 0));
10668 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
10669 return;
10672 /* Half the fields (or fewer) are non-constant. Load constant then overwrite
10673 varying fields. Hope that this is more efficient than using the stack. */
10674 if (n_var <= n_elts/2)
10676 rtx copy = copy_rtx (vals);
10678 /* Load constant part of vector. We really don't care what goes into the
10679 parts we will overwrite, but we're more likely to be able to load the
10680 constant efficiently if it has fewer, larger, repeating parts
10681 (see aarch64_simd_valid_immediate). */
10682 for (int i = 0; i < n_elts; i++)
10684 rtx x = XVECEXP (vals, 0, i);
10685 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10686 continue;
10687 rtx subst = any_const;
10688 for (int bit = n_elts / 2; bit > 0; bit /= 2)
10690 /* Look in the copied vector, as more elements are const. */
10691 rtx test = XVECEXP (copy, 0, i ^ bit);
10692 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
10694 subst = test;
10695 break;
10698 XVECEXP (copy, 0, i) = subst;
10700 aarch64_expand_vector_init (target, copy);
10702 /* Insert variables. */
10703 enum insn_code icode = optab_handler (vec_set_optab, mode);
10704 gcc_assert (icode != CODE_FOR_nothing);
10706 for (int i = 0; i < n_elts; i++)
10708 rtx x = XVECEXP (vals, 0, i);
10709 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
10710 continue;
10711 x = copy_to_mode_reg (inner_mode, x);
10712 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
10714 return;
10717 /* Construct the vector in memory one field at a time
10718 and load the whole vector. */
10719 rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
10720 for (int i = 0; i < n_elts; i++)
10721 emit_move_insn (adjust_address_nv (mem, inner_mode,
10722 i * GET_MODE_SIZE (inner_mode)),
10723 XVECEXP (vals, 0, i));
10724 emit_move_insn (target, mem);
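/* Illustrative trace of the strategy above, assuming V4SImode and an
   initializer {1, 2, x, 4} where x is the only variable element:
   n_var == 1 <= n_elts / 2, so the code first materialises the constant
   vector {1, 2, 1, 4} (the variable slot is filled from a neighbouring
   constant found via the i ^ bit search), then overwrites lane 2 with x
   through the vec_set pattern.  The values and lane numbers here are only
   an example.  */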
10728 static unsigned HOST_WIDE_INT
10729 aarch64_shift_truncation_mask (machine_mode mode)
10731 return
10732 (aarch64_vector_mode_supported_p (mode)
10733 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
10736 /* Select a format to encode pointers in exception handling data. */
10737 int
10738 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
10740 int type;
10741 switch (aarch64_cmodel)
10743 case AARCH64_CMODEL_TINY:
10744 case AARCH64_CMODEL_TINY_PIC:
10745 case AARCH64_CMODEL_SMALL:
10746 case AARCH64_CMODEL_SMALL_PIC:
10747 case AARCH64_CMODEL_SMALL_SPIC:
10748 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
10749 for everything. */
10750 type = DW_EH_PE_sdata4;
10751 break;
10752 default:
10753 /* No assumptions here. 8-byte relocs required. */
10754 type = DW_EH_PE_sdata8;
10755 break;
10757 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
10760 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
10761 by the function fndecl. */
10763 void
10764 aarch64_declare_function_name (FILE *stream, const char* name,
10765 tree fndecl)
10767 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10769 struct cl_target_option *targ_options;
10770 if (target_parts)
10771 targ_options = TREE_TARGET_OPTION (target_parts);
10772 else
10773 targ_options = TREE_TARGET_OPTION (target_option_current_node);
10774 gcc_assert (targ_options);
10776 const struct processor *this_arch
10777 = aarch64_get_arch (targ_options->x_explicit_arch);
10779 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
10780 std::string extension
10781 = aarch64_get_extension_string_for_isa_flags (isa_flags);
10782 asm_fprintf (asm_out_file, "\t.arch %s%s\n",
10783 this_arch->name, extension.c_str ());
10785 /* Print the cpu name we're tuning for in the comments; this might be
10786 useful to readers of the generated asm. */
10788 const struct processor *this_tune
10789 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
10791 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
10792 this_tune->name);
10794 /* Don't forget the type directive for ELF. */
10795 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
10796 ASM_OUTPUT_LABEL (stream, name);
10799 /* Emit load exclusive. */
10801 static void
10802 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
10803 rtx mem, rtx model_rtx)
10805 rtx (*gen) (rtx, rtx, rtx);
10807 switch (mode)
10809 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
10810 case HImode: gen = gen_aarch64_load_exclusivehi; break;
10811 case SImode: gen = gen_aarch64_load_exclusivesi; break;
10812 case DImode: gen = gen_aarch64_load_exclusivedi; break;
10813 default:
10814 gcc_unreachable ();
10817 emit_insn (gen (rval, mem, model_rtx));
10820 /* Emit store exclusive. */
10822 static void
10823 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
10824 rtx rval, rtx mem, rtx model_rtx)
10826 rtx (*gen) (rtx, rtx, rtx, rtx);
10828 switch (mode)
10830 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
10831 case HImode: gen = gen_aarch64_store_exclusivehi; break;
10832 case SImode: gen = gen_aarch64_store_exclusivesi; break;
10833 case DImode: gen = gen_aarch64_store_exclusivedi; break;
10834 default:
10835 gcc_unreachable ();
10838 emit_insn (gen (bval, rval, mem, model_rtx));
10841 /* Emit the jump instruction INSN and mark it as unlikely to be taken. */
10843 static void
10844 aarch64_emit_unlikely_jump (rtx insn)
10846 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
10848 insn = emit_jump_insn (insn);
10849 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
10852 /* Expand a compare and swap pattern. */
10854 void
10855 aarch64_expand_compare_and_swap (rtx operands[])
10857 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
10858 machine_mode mode, cmp_mode;
10859 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
10860 int idx;
10861 gen_cas_fn gen;
10862 const gen_cas_fn split_cas[] =
10864 gen_aarch64_compare_and_swapqi,
10865 gen_aarch64_compare_and_swaphi,
10866 gen_aarch64_compare_and_swapsi,
10867 gen_aarch64_compare_and_swapdi
10869 const gen_cas_fn atomic_cas[] =
10871 gen_aarch64_compare_and_swapqi_lse,
10872 gen_aarch64_compare_and_swaphi_lse,
10873 gen_aarch64_compare_and_swapsi_lse,
10874 gen_aarch64_compare_and_swapdi_lse
10877 bval = operands[0];
10878 rval = operands[1];
10879 mem = operands[2];
10880 oldval = operands[3];
10881 newval = operands[4];
10882 is_weak = operands[5];
10883 mod_s = operands[6];
10884 mod_f = operands[7];
10885 mode = GET_MODE (mem);
10886 cmp_mode = mode;
10888 /* Normally the succ memory model must be stronger than fail, but in the
10889 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
10890 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
10892 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
10893 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
10894 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
10896 switch (mode)
10898 case QImode:
10899 case HImode:
10900 /* For short modes, we're going to perform the comparison in SImode,
10901 so do the zero-extension now. */
10902 cmp_mode = SImode;
10903 rval = gen_reg_rtx (SImode);
10904 oldval = convert_modes (SImode, mode, oldval, true);
10905 /* Fall through. */
10907 case SImode:
10908 case DImode:
10909 /* Force the value into a register if needed. */
10910 if (!aarch64_plus_operand (oldval, mode))
10911 oldval = force_reg (cmp_mode, oldval);
10912 break;
10914 default:
10915 gcc_unreachable ();
10918 switch (mode)
10920 case QImode: idx = 0; break;
10921 case HImode: idx = 1; break;
10922 case SImode: idx = 2; break;
10923 case DImode: idx = 3; break;
10924 default:
10925 gcc_unreachable ();
10927 if (TARGET_LSE)
10928 gen = atomic_cas[idx];
10929 else
10930 gen = split_cas[idx];
10932 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
10934 if (mode == QImode || mode == HImode)
10935 emit_move_insn (operands[1], gen_lowpart (mode, rval));
10937 x = gen_rtx_REG (CCmode, CC_REGNUM);
10938 x = gen_rtx_EQ (SImode, x, const0_rtx);
10939 emit_insn (gen_rtx_SET (bval, x));
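/* As an example, a strong DImode compare-and-swap with SEQ_CST ordering
   uses gen_aarch64_compare_and_swapdi_lse when TARGET_LSE is set, which is
   intended to end up as a single compare-and-swap instruction (compare
   aarch64_gen_atomic_cas below), and gen_aarch64_compare_and_swapdi
   otherwise, which is later split into a load-exclusive/store-exclusive
   loop by aarch64_split_compare_and_swap below.  */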
10942 /* Test whether the target supports using an atomic load-operate instruction
10943 for operation CODE. Returns FALSE if the operation isn't supported by the
10944 architecture. */
10948 bool
10949 aarch64_atomic_ldop_supported_p (enum rtx_code code)
10951 if (!TARGET_LSE)
10952 return false;
10954 switch (code)
10956 case SET:
10957 case AND:
10958 case IOR:
10959 case XOR:
10960 case MINUS:
10961 case PLUS:
10962 return true;
10963 default:
10964 return false;
10968 /* Emit a barrier appropriate for memory model MODEL at the end of a
10969 sequence implementing an atomic operation. */
10971 static void
10972 aarch64_emit_post_barrier (enum memmodel model)
10974 const enum memmodel base_model = memmodel_base (model);
10976 if (is_mm_sync (model)
10977 && (base_model == MEMMODEL_ACQUIRE
10978 || base_model == MEMMODEL_ACQ_REL
10979 || base_model == MEMMODEL_SEQ_CST))
10981 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
10985 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
10986 for the data in memory. EXPECTED is the value expected to be in memory.
10987 DESIRED is the value to store to memory. MEM is the memory location. MODEL
10988 is the memory ordering to use. */
10990 void
10991 aarch64_gen_atomic_cas (rtx rval, rtx mem,
10992 rtx expected, rtx desired,
10993 rtx model)
10995 rtx (*gen) (rtx, rtx, rtx, rtx);
10996 machine_mode mode;
10998 mode = GET_MODE (mem);
11000 switch (mode)
11002 case QImode: gen = gen_aarch64_atomic_casqi; break;
11003 case HImode: gen = gen_aarch64_atomic_cashi; break;
11004 case SImode: gen = gen_aarch64_atomic_cassi; break;
11005 case DImode: gen = gen_aarch64_atomic_casdi; break;
11006 default:
11007 gcc_unreachable ();
11010 /* Move the expected value into the CAS destination register. */
11011 emit_insn (gen_rtx_SET (rval, expected));
11013 /* Emit the CAS. */
11014 emit_insn (gen (rval, mem, desired, model));
11016 /* Compare the expected value with the value loaded by the CAS, to establish
11017 whether the swap was made. */
11018 aarch64_gen_compare_reg (EQ, rval, expected);
11021 /* Split a compare and swap pattern. */
11023 void
11024 aarch64_split_compare_and_swap (rtx operands[])
11026 rtx rval, mem, oldval, newval, scratch;
11027 machine_mode mode;
11028 bool is_weak;
11029 rtx_code_label *label1, *label2;
11030 rtx x, cond;
11031 enum memmodel model;
11032 rtx model_rtx;
11034 rval = operands[0];
11035 mem = operands[1];
11036 oldval = operands[2];
11037 newval = operands[3];
11038 is_weak = (operands[4] != const0_rtx);
11039 model_rtx = operands[5];
11040 scratch = operands[7];
11041 mode = GET_MODE (mem);
11042 model = memmodel_from_int (INTVAL (model_rtx));
11044 label1 = NULL;
11045 if (!is_weak)
11047 label1 = gen_label_rtx ();
11048 emit_label (label1);
11050 label2 = gen_label_rtx ();
11052 /* The initial load can be relaxed for a __sync operation since a final
11053 barrier will be emitted to stop code hoisting. */
11054 if (is_mm_sync (model))
11055 aarch64_emit_load_exclusive (mode, rval, mem,
11056 GEN_INT (MEMMODEL_RELAXED));
11057 else
11058 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11060 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11061 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11062 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11063 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11064 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11066 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11068 if (!is_weak)
11070 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11071 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11072 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11073 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11075 else
11077 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11078 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11079 emit_insn (gen_rtx_SET (cond, x));
11082 emit_label (label2);
11084 /* Emit any final barrier needed for a __sync operation. */
11085 if (is_mm_sync (model))
11086 aarch64_emit_post_barrier (model);
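/* The split sequence above corresponds roughly to the following assembly
   for a strong DImode compare-and-swap with SEQ_CST ordering (register
   names are only illustrative):

       .Lretry:
           ldaxr   x0, [x1]        // load rval exclusively
           cmp     x0, x2          // compare with oldval
           b.ne    .Ldone
           stlxr   w3, x4, [x1]    // try to store newval; w3 = scratch
           cbnz    w3, .Lretry     // strong CAS retries on spurious failure
       .Ldone:

   A weak CAS instead just copies the store-exclusive status into the
   condition flags, and a __sync variant is followed by a final barrier.  */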
11089 /* Emit a BIC instruction. */
11091 static void
11092 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11094 rtx shift_rtx = GEN_INT (shift);
11095 rtx (*gen) (rtx, rtx, rtx, rtx);
11097 switch (mode)
11099 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11100 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11101 default:
11102 gcc_unreachable ();
11105 emit_insn (gen (dst, s2, shift_rtx, s1));
11108 /* Emit an atomic swap. */
11110 static void
11111 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11112 rtx mem, rtx model)
11114 rtx (*gen) (rtx, rtx, rtx, rtx);
11116 switch (mode)
11118 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11119 case HImode: gen = gen_aarch64_atomic_swphi; break;
11120 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11121 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11122 default:
11123 gcc_unreachable ();
11126 emit_insn (gen (dst, mem, value, model));
11129 /* Operations supported by aarch64_emit_atomic_load_op. */
11131 enum aarch64_atomic_load_op_code
11133 AARCH64_LDOP_PLUS, /* A + B */
11134 AARCH64_LDOP_XOR, /* A ^ B */
11135 AARCH64_LDOP_OR, /* A | B */
11136 AARCH64_LDOP_BIC /* A & ~B */
11139 /* Emit an atomic load-operate. */
11141 static void
11142 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11143 machine_mode mode, rtx dst, rtx src,
11144 rtx mem, rtx model)
11146 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11147 const aarch64_atomic_load_op_fn plus[] =
11149 gen_aarch64_atomic_loadaddqi,
11150 gen_aarch64_atomic_loadaddhi,
11151 gen_aarch64_atomic_loadaddsi,
11152 gen_aarch64_atomic_loadadddi
11154 const aarch64_atomic_load_op_fn eor[] =
11156 gen_aarch64_atomic_loadeorqi,
11157 gen_aarch64_atomic_loadeorhi,
11158 gen_aarch64_atomic_loadeorsi,
11159 gen_aarch64_atomic_loadeordi
11161 const aarch64_atomic_load_op_fn ior[] =
11163 gen_aarch64_atomic_loadsetqi,
11164 gen_aarch64_atomic_loadsethi,
11165 gen_aarch64_atomic_loadsetsi,
11166 gen_aarch64_atomic_loadsetdi
11168 const aarch64_atomic_load_op_fn bic[] =
11170 gen_aarch64_atomic_loadclrqi,
11171 gen_aarch64_atomic_loadclrhi,
11172 gen_aarch64_atomic_loadclrsi,
11173 gen_aarch64_atomic_loadclrdi
11175 aarch64_atomic_load_op_fn gen;
11176 int idx = 0;
11178 switch (mode)
11180 case QImode: idx = 0; break;
11181 case HImode: idx = 1; break;
11182 case SImode: idx = 2; break;
11183 case DImode: idx = 3; break;
11184 default:
11185 gcc_unreachable ();
11188 switch (code)
11190 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11191 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11192 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11193 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11194 default:
11195 gcc_unreachable ();
11198 emit_insn (gen (dst, mem, src, model));
11201 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11202 location to store the data read from memory. OUT_RESULT is the location to
11203 store the result of the operation. MEM is the memory location to read and
11204 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11205 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11206 be NULL. */
11208 void
11209 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11210 rtx mem, rtx value, rtx model_rtx)
11212 machine_mode mode = GET_MODE (mem);
11213 machine_mode wmode = (mode == DImode ? DImode : SImode);
11214 const bool short_mode = (mode < SImode);
11215 aarch64_atomic_load_op_code ldop_code;
11216 rtx src;
11217 rtx x;
11219 if (out_data)
11220 out_data = gen_lowpart (mode, out_data);
11222 if (out_result)
11223 out_result = gen_lowpart (mode, out_result);
11225 /* Make sure the value is in a register, putting it into a destination
11226 register if it needs to be manipulated. */
11227 if (!register_operand (value, mode)
11228 || code == AND || code == MINUS)
11230 src = out_result ? out_result : out_data;
11231 emit_move_insn (src, gen_lowpart (mode, value));
11233 else
11234 src = value;
11235 gcc_assert (register_operand (src, mode));
11237 /* Preprocess the data for the operation as necessary. If the operation is
11238 a SET then emit a swap instruction and finish. */
11239 switch (code)
11241 case SET:
11242 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11243 return;
11245 case MINUS:
11246 /* Negate the value and treat it as a PLUS. */
11248 rtx neg_src;
11250 /* Resize the value if necessary. */
11251 if (short_mode)
11252 src = gen_lowpart (wmode, src);
11254 neg_src = gen_rtx_NEG (wmode, src);
11255 emit_insn (gen_rtx_SET (src, neg_src));
11257 if (short_mode)
11258 src = gen_lowpart (mode, src);
11260 /* Fall-through. */
11261 case PLUS:
11262 ldop_code = AARCH64_LDOP_PLUS;
11263 break;
11265 case IOR:
11266 ldop_code = AARCH64_LDOP_OR;
11267 break;
11269 case XOR:
11270 ldop_code = AARCH64_LDOP_XOR;
11271 break;
11273 case AND:
11275 rtx not_src;
11277 /* Resize the value if necessary. */
11278 if (short_mode)
11279 src = gen_lowpart (wmode, src);
11281 not_src = gen_rtx_NOT (wmode, src);
11282 emit_insn (gen_rtx_SET (src, not_src));
11284 if (short_mode)
11285 src = gen_lowpart (mode, src);
11287 ldop_code = AARCH64_LDOP_BIC;
11288 break;
11290 default:
11291 /* The operation can't be done with atomic instructions. */
11292 gcc_unreachable ();
11295 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11297 /* If necessary, calculate the data in memory after the update by redoing the
11298 operation from values in registers. */
11299 if (!out_result)
11300 return;
11302 if (short_mode)
11304 src = gen_lowpart (wmode, src);
11305 out_data = gen_lowpart (wmode, out_data);
11306 out_result = gen_lowpart (wmode, out_result);
11309 x = NULL_RTX;
11311 switch (code)
11313 case MINUS:
11314 case PLUS:
11315 x = gen_rtx_PLUS (wmode, out_data, src);
11316 break;
11317 case IOR:
11318 x = gen_rtx_IOR (wmode, out_data, src);
11319 break;
11320 case XOR:
11321 x = gen_rtx_XOR (wmode, out_data, src);
11322 break;
11323 case AND:
11324 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11325 return;
11326 default:
11327 gcc_unreachable ();
11330 emit_set_insn (out_result, x);
11332 return;
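/* Two examples of the preprocessing above, assuming TARGET_LSE:
   - __atomic_fetch_sub: MINUS is handled by negating VALUE into the
     destination register and then issuing the LDADD-based pattern, since
     there is no load-and-subtract instruction.
   - __atomic_fetch_and: AND is handled by inverting VALUE and issuing the
     LDCLR-based pattern (A & ~B), so memory ends up ANDed with the
     original VALUE; the "data after" result is then rebuilt with BIC.
   The builtin names are just the usual C front-end spellings.  */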
11335 /* Split an atomic operation. */
11337 void
11338 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11339 rtx value, rtx model_rtx, rtx cond)
11341 machine_mode mode = GET_MODE (mem);
11342 machine_mode wmode = (mode == DImode ? DImode : SImode);
11343 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11344 const bool is_sync = is_mm_sync (model);
11345 rtx_code_label *label;
11346 rtx x;
11348 /* Split the atomic operation into a sequence. */
11349 label = gen_label_rtx ();
11350 emit_label (label);
11352 if (new_out)
11353 new_out = gen_lowpart (wmode, new_out);
11354 if (old_out)
11355 old_out = gen_lowpart (wmode, old_out);
11356 else
11357 old_out = new_out;
11358 value = simplify_gen_subreg (wmode, value, mode, 0);
11360 /* The initial load can be relaxed for a __sync operation since a final
11361 barrier will be emitted to stop code hoisting. */
11362 if (is_sync)
11363 aarch64_emit_load_exclusive (mode, old_out, mem,
11364 GEN_INT (MEMMODEL_RELAXED));
11365 else
11366 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11368 switch (code)
11370 case SET:
11371 new_out = value;
11372 break;
11374 case NOT:
11375 x = gen_rtx_AND (wmode, old_out, value);
11376 emit_insn (gen_rtx_SET (new_out, x));
11377 x = gen_rtx_NOT (wmode, new_out);
11378 emit_insn (gen_rtx_SET (new_out, x));
11379 break;
11381 case MINUS:
11382 if (CONST_INT_P (value))
11384 value = GEN_INT (-INTVAL (value));
11385 code = PLUS;
11387 /* Fall through. */
11389 default:
11390 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11391 emit_insn (gen_rtx_SET (new_out, x));
11392 break;
11395 aarch64_emit_store_exclusive (mode, cond, mem,
11396 gen_lowpart (mode, new_out), model_rtx);
11398 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11399 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11400 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11401 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11403 /* Emit any final barrier needed for a __sync operation. */
11404 if (is_sync)
11405 aarch64_emit_post_barrier (model);
11408 static void
11409 aarch64_init_libfuncs (void)
11411 /* Half-precision float operations. The compiler handles all operations
11412 with NULL libfuncs by converting to SFmode. */
11414 /* Conversions. */
11415 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11416 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11418 /* Arithmetic. */
11419 set_optab_libfunc (add_optab, HFmode, NULL);
11420 set_optab_libfunc (sdiv_optab, HFmode, NULL);
11421 set_optab_libfunc (smul_optab, HFmode, NULL);
11422 set_optab_libfunc (neg_optab, HFmode, NULL);
11423 set_optab_libfunc (sub_optab, HFmode, NULL);
11425 /* Comparisons. */
11426 set_optab_libfunc (eq_optab, HFmode, NULL);
11427 set_optab_libfunc (ne_optab, HFmode, NULL);
11428 set_optab_libfunc (lt_optab, HFmode, NULL);
11429 set_optab_libfunc (le_optab, HFmode, NULL);
11430 set_optab_libfunc (ge_optab, HFmode, NULL);
11431 set_optab_libfunc (gt_optab, HFmode, NULL);
11432 set_optab_libfunc (unord_optab, HFmode, NULL);
11435 /* Target hook for c_mode_for_suffix. */
11436 static machine_mode
11437 aarch64_c_mode_for_suffix (char suffix)
11439 if (suffix == 'q')
11440 return TFmode;
11442 return VOIDmode;
11445 /* We can only represent floating point constants which will fit in
11446 "quarter-precision" values. These values are characterised by
11447 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given
11448 by the formula:
11450 (-1)^s * (n/16) * 2^r
11452 Where:
11453 's' is the sign bit.
11454 'n' is an integer in the range 16 <= n <= 31.
11455 'r' is an integer in the range -3 <= r <= 4. */
11457 /* Return true iff X can be represented as a quarter-precision
11458 floating point immediate operand. Note, we cannot represent 0.0. */
11459 bool
11460 aarch64_float_const_representable_p (rtx x)
11462 /* This represents our current view of how many bits
11463 make up the mantissa. */
11464 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11465 int exponent;
11466 unsigned HOST_WIDE_INT mantissa, mask;
11467 REAL_VALUE_TYPE r, m;
11468 bool fail;
11470 if (!CONST_DOUBLE_P (x))
11471 return false;
11473 /* We don't support HFmode constants yet. */
11474 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11475 return false;
11477 r = *CONST_DOUBLE_REAL_VALUE (x);
11479 /* We cannot represent infinities, NaNs or +/-zero. We won't
11480 know if we have +zero until we analyse the mantissa, but we
11481 can reject the other invalid values. */
11482 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11483 || REAL_VALUE_MINUS_ZERO (r))
11484 return false;
11486 /* Extract exponent. */
11487 r = real_value_abs (&r);
11488 exponent = REAL_EXP (&r);
11490 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11491 highest (sign) bit, with a fixed binary point at bit point_pos.
11492 m1 holds the low part of the mantissa, m2 the high part.
11493 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11494 bits for the mantissa, this can fail (low bits will be lost). */
11495 real_ldexp (&m, &r, point_pos - exponent);
11496 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11498 /* If the low part of the mantissa has bits set we cannot represent
11499 the value. */
11500 if (w.elt (0) != 0)
11501 return false;
11502 /* We have rejected the lower HOST_WIDE_INT, so update our
11503 understanding of how many bits lie in the mantissa and
11504 look only at the high HOST_WIDE_INT. */
11505 mantissa = w.elt (1);
11506 point_pos -= HOST_BITS_PER_WIDE_INT;
11508 /* We can only represent values with a mantissa of the form 1.xxxx. */
11509 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11510 if ((mantissa & mask) != 0)
11511 return false;
11513 /* Having filtered unrepresentable values, we may now remove all
11514 but the highest 5 bits. */
11515 mantissa >>= point_pos - 5;
11517 /* We cannot represent the value 0.0, so reject it. This is handled
11518 elsewhere. */
11519 if (mantissa == 0)
11520 return false;
11522 /* Then, as bit 4 is always set, we can mask it off, leaving
11523 the mantissa in the range [0, 15]. */
11524 mantissa &= ~(1 << 4);
11525 gcc_assert (mantissa <= 15);
11527 /* GCC internally does not use IEEE754-like encoding (where normalized
11528 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
11529 Our mantissa values are shifted 4 places to the left relative to
11530 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
11531 by 5 places to correct for GCC's representation. */
11532 exponent = 5 - exponent;
11534 return (exponent >= 0 && exponent <= 7);
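/* Worked examples of the encoding checked above:
     0.25 = (-1)^0 * (16/16) * 2^-2   -- representable (n = 16, r = -2).
     2.75 = (-1)^0 * (22/16) * 2^1    -- representable (n = 22, r = 1).
     0.1 and 3.1 are not representable: no n in [16, 31] and r in [-3, 4]
     reproduce them exactly.
   The representable magnitudes therefore range from 0.125 (16/16 * 2^-3)
   up to 31.0 (31/16 * 2^4).  */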
11537 char*
11538 aarch64_output_simd_mov_immediate (rtx const_vector,
11539 machine_mode mode,
11540 unsigned width)
11542 bool is_valid;
11543 static char templ[40];
11544 const char *mnemonic;
11545 const char *shift_op;
11546 unsigned int lane_count = 0;
11547 char element_char;
11549 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
11551 /* This will return true to show that CONST_VECTOR is legal for use as
11552 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
11553 also update INFO to show how the immediate should be generated. */
11554 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
11555 gcc_assert (is_valid);
11557 element_char = sizetochar (info.element_width);
11558 lane_count = width / info.element_width;
11560 mode = GET_MODE_INNER (mode);
11561 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
11563 gcc_assert (info.shift == 0 && ! info.mvn);
11564 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
11565 move immediate path. */
11566 if (aarch64_float_const_zero_rtx_p (info.value))
11567 info.value = GEN_INT (0);
11568 else
11570 #define buf_size 20
11571 char float_buf[buf_size] = {'\0'};
11572 real_to_decimal_for_mode (float_buf,
11573 CONST_DOUBLE_REAL_VALUE (info.value),
11574 buf_size, buf_size, 1, mode);
11575 #undef buf_size
11577 if (lane_count == 1)
11578 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
11579 else
11580 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
11581 lane_count, element_char, float_buf);
11582 return templ;
11586 mnemonic = info.mvn ? "mvni" : "movi";
11587 shift_op = info.msl ? "msl" : "lsl";
11589 gcc_assert (CONST_INT_P (info.value));
11590 if (lane_count == 1)
11591 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
11592 mnemonic, UINTVAL (info.value));
11593 else if (info.shift)
11594 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
11595 ", %s %d", mnemonic, lane_count, element_char,
11596 UINTVAL (info.value), shift_op, info.shift);
11597 else
11598 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
11599 mnemonic, lane_count, element_char, UINTVAL (info.value));
11600 return templ;
11603 char*
11604 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
11605 machine_mode mode)
11607 machine_mode vmode;
11609 gcc_assert (!VECTOR_MODE_P (mode));
11610 vmode = aarch64_simd_container_mode (mode, 64);
11611 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
11612 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
11615 /* Split operands into moves from op[1] + op[2] into op[0]. */
11617 void
11618 aarch64_split_combinev16qi (rtx operands[3])
11620 unsigned int dest = REGNO (operands[0]);
11621 unsigned int src1 = REGNO (operands[1]);
11622 unsigned int src2 = REGNO (operands[2]);
11623 machine_mode halfmode = GET_MODE (operands[1]);
11624 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
11625 rtx destlo, desthi;
11627 gcc_assert (halfmode == V16QImode);
11629 if (src1 == dest && src2 == dest + halfregs)
11631 /* No-op move. Can't split to nothing; emit something. */
11632 emit_note (NOTE_INSN_DELETED);
11633 return;
11636 /* Preserve register attributes for variable tracking. */
11637 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
11638 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
11639 GET_MODE_SIZE (halfmode));
11641 /* Special case of reversed high/low parts. */
11642 if (reg_overlap_mentioned_p (operands[2], destlo)
11643 && reg_overlap_mentioned_p (operands[1], desthi))
11645 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
11646 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
11647 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
11649 else if (!reg_overlap_mentioned_p (operands[2], destlo))
11651 /* Try to avoid unnecessary moves if part of the result
11652 is in the right place already. */
11653 if (src1 != dest)
11654 emit_move_insn (destlo, operands[1]);
11655 if (src2 != dest + halfregs)
11656 emit_move_insn (desthi, operands[2]);
11658 else
11660 if (src2 != dest + halfregs)
11661 emit_move_insn (desthi, operands[2]);
11662 if (src1 != dest)
11663 emit_move_insn (destlo, operands[1]);
11667 /* vec_perm support. */
11669 #define MAX_VECT_LEN 16
11671 struct expand_vec_perm_d
11673 rtx target, op0, op1;
11674 unsigned char perm[MAX_VECT_LEN];
11675 machine_mode vmode;
11676 unsigned char nelt;
11677 bool one_vector_p;
11678 bool testing_p;
11681 /* Generate a variable permutation. */
11683 static void
11684 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
11686 machine_mode vmode = GET_MODE (target);
11687 bool one_vector_p = rtx_equal_p (op0, op1);
11689 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
11690 gcc_checking_assert (GET_MODE (op0) == vmode);
11691 gcc_checking_assert (GET_MODE (op1) == vmode);
11692 gcc_checking_assert (GET_MODE (sel) == vmode);
11693 gcc_checking_assert (TARGET_SIMD);
11695 if (one_vector_p)
11697 if (vmode == V8QImode)
11699 /* Expand the argument to a V16QI mode by duplicating it. */
11700 rtx pair = gen_reg_rtx (V16QImode);
11701 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
11702 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
11704 else
11706 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
11709 else
11711 rtx pair;
11713 if (vmode == V8QImode)
11715 pair = gen_reg_rtx (V16QImode);
11716 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
11717 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
11719 else
11721 pair = gen_reg_rtx (OImode);
11722 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
11723 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
11728 void
11729 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
11731 machine_mode vmode = GET_MODE (target);
11732 unsigned int nelt = GET_MODE_NUNITS (vmode);
11733 bool one_vector_p = rtx_equal_p (op0, op1);
11734 rtx mask;
11736 /* The TBL instruction does not use a modulo index, so we must take care
11737 of that ourselves. */
11738 mask = aarch64_simd_gen_const_vector_dup (vmode,
11739 one_vector_p ? nelt - 1 : 2 * nelt - 1);
11740 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
11742 /* For big-endian, we also need to reverse the index within the vector
11743 (but not which vector). */
11744 if (BYTES_BIG_ENDIAN)
11746 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
11747 if (!one_vector_p)
11748 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
11749 sel = expand_simple_binop (vmode, XOR, sel, mask,
11750 NULL, 0, OPTAB_LIB_WIDEN);
11752 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
11755 /* Recognize patterns suitable for the TRN instructions. */
11756 static bool
11757 aarch64_evpc_trn (struct expand_vec_perm_d *d)
11759 unsigned int i, odd, mask, nelt = d->nelt;
11760 rtx out, in0, in1, x;
11761 rtx (*gen) (rtx, rtx, rtx);
11762 machine_mode vmode = d->vmode;
11764 if (GET_MODE_UNIT_SIZE (vmode) > 8)
11765 return false;
11767 /* Note that these are little-endian tests.
11768 We correct for big-endian later. */
11769 if (d->perm[0] == 0)
11770 odd = 0;
11771 else if (d->perm[0] == 1)
11772 odd = 1;
11773 else
11774 return false;
11775 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
11777 for (i = 0; i < nelt; i += 2)
11779 if (d->perm[i] != i + odd)
11780 return false;
11781 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
11782 return false;
11785 /* Success! */
11786 if (d->testing_p)
11787 return true;
11789 in0 = d->op0;
11790 in1 = d->op1;
11791 if (BYTES_BIG_ENDIAN)
11793 x = in0, in0 = in1, in1 = x;
11794 odd = !odd;
11796 out = d->target;
11798 if (odd)
11800 switch (vmode)
11802 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
11803 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
11804 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
11805 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
11806 case V4SImode: gen = gen_aarch64_trn2v4si; break;
11807 case V2SImode: gen = gen_aarch64_trn2v2si; break;
11808 case V2DImode: gen = gen_aarch64_trn2v2di; break;
11809 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
11810 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
11811 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
11812 default:
11813 return false;
11816 else
11818 switch (vmode)
11820 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
11821 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
11822 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
11823 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
11824 case V4SImode: gen = gen_aarch64_trn1v4si; break;
11825 case V2SImode: gen = gen_aarch64_trn1v2si; break;
11826 case V2DImode: gen = gen_aarch64_trn1v2di; break;
11827 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
11828 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
11829 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
11830 default:
11831 return false;
11835 emit_insn (gen (out, in0, in1));
11836 return true;
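/* Example of a permutation accepted above, for V4SImode with two distinct
   operands (little-endian indices): { 0, 4, 2, 6 } matches with odd == 0
   and is emitted as TRN1, while { 1, 5, 3, 7 } matches with odd == 1 and
   is emitted as TRN2.  */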
11839 /* Recognize patterns suitable for the UZP instructions. */
11840 static bool
11841 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
11843 unsigned int i, odd, mask, nelt = d->nelt;
11844 rtx out, in0, in1, x;
11845 rtx (*gen) (rtx, rtx, rtx);
11846 machine_mode vmode = d->vmode;
11848 if (GET_MODE_UNIT_SIZE (vmode) > 8)
11849 return false;
11851 /* Note that these are little-endian tests.
11852 We correct for big-endian later. */
11853 if (d->perm[0] == 0)
11854 odd = 0;
11855 else if (d->perm[0] == 1)
11856 odd = 1;
11857 else
11858 return false;
11859 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
11861 for (i = 0; i < nelt; i++)
11863 unsigned elt = (i * 2 + odd) & mask;
11864 if (d->perm[i] != elt)
11865 return false;
11868 /* Success! */
11869 if (d->testing_p)
11870 return true;
11872 in0 = d->op0;
11873 in1 = d->op1;
11874 if (BYTES_BIG_ENDIAN)
11876 x = in0, in0 = in1, in1 = x;
11877 odd = !odd;
11879 out = d->target;
11881 if (odd)
11883 switch (vmode)
11885 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
11886 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
11887 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
11888 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
11889 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
11890 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
11891 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
11892 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
11893 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
11894 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
11895 default:
11896 return false;
11899 else
11901 switch (vmode)
11903 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
11904 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
11905 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
11906 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
11907 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
11908 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
11909 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
11910 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
11911 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
11912 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
11913 default:
11914 return false;
11918 emit_insn (gen (out, in0, in1));
11919 return true;
11922 /* Recognize patterns suitable for the ZIP instructions. */
11923 static bool
11924 aarch64_evpc_zip (struct expand_vec_perm_d *d)
11926 unsigned int i, high, mask, nelt = d->nelt;
11927 rtx out, in0, in1, x;
11928 rtx (*gen) (rtx, rtx, rtx);
11929 machine_mode vmode = d->vmode;
11931 if (GET_MODE_UNIT_SIZE (vmode) > 8)
11932 return false;
11934 /* Note that these are little-endian tests.
11935 We correct for big-endian later. */
11936 high = nelt / 2;
11937 if (d->perm[0] == high)
11938 /* Do Nothing. */
11940 else if (d->perm[0] == 0)
11941 high = 0;
11942 else
11943 return false;
11944 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
11946 for (i = 0; i < nelt / 2; i++)
11948 unsigned elt = (i + high) & mask;
11949 if (d->perm[i * 2] != elt)
11950 return false;
11951 elt = (elt + nelt) & mask;
11952 if (d->perm[i * 2 + 1] != elt)
11953 return false;
11956 /* Success! */
11957 if (d->testing_p)
11958 return true;
11960 in0 = d->op0;
11961 in1 = d->op1;
11962 if (BYTES_BIG_ENDIAN)
11964 x = in0, in0 = in1, in1 = x;
11965 high = !high;
11967 out = d->target;
11969 if (high)
11971 switch (vmode)
11973 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
11974 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
11975 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
11976 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
11977 case V4SImode: gen = gen_aarch64_zip2v4si; break;
11978 case V2SImode: gen = gen_aarch64_zip2v2si; break;
11979 case V2DImode: gen = gen_aarch64_zip2v2di; break;
11980 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
11981 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
11982 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
11983 default:
11984 return false;
11987 else
11989 switch (vmode)
11991 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
11992 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
11993 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
11994 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
11995 case V4SImode: gen = gen_aarch64_zip1v4si; break;
11996 case V2SImode: gen = gen_aarch64_zip1v2si; break;
11997 case V2DImode: gen = gen_aarch64_zip1v2di; break;
11998 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
11999 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12000 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12001 default:
12002 return false;
12006 emit_insn (gen (out, in0, in1));
12007 return true;
12010 /* Recognize patterns for the EXT insn. */
12012 static bool
12013 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12015 unsigned int i, nelt = d->nelt;
12016 rtx (*gen) (rtx, rtx, rtx, rtx);
12017 rtx offset;
12019 unsigned int location = d->perm[0]; /* Always < nelt. */
12021 /* Check if the extracted indices are increasing by one. */
12022 for (i = 1; i < nelt; i++)
12024 unsigned int required = location + i;
12025 if (d->one_vector_p)
12027 /* We'll pass the same vector in twice, so allow indices to wrap. */
12028 required &= (nelt - 1);
12030 if (d->perm[i] != required)
12031 return false;
12034 switch (d->vmode)
12036 case V16QImode: gen = gen_aarch64_extv16qi; break;
12037 case V8QImode: gen = gen_aarch64_extv8qi; break;
12038 case V4HImode: gen = gen_aarch64_extv4hi; break;
12039 case V8HImode: gen = gen_aarch64_extv8hi; break;
12040 case V2SImode: gen = gen_aarch64_extv2si; break;
12041 case V4SImode: gen = gen_aarch64_extv4si; break;
12042 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12043 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12044 case V2DImode: gen = gen_aarch64_extv2di; break;
12045 case V2DFmode: gen = gen_aarch64_extv2df; break;
12046 default:
12047 return false;
12050 /* Success! */
12051 if (d->testing_p)
12052 return true;
12054 /* The case where (location == 0) is a no-op for both big- and little-endian,
12055 and is removed by the mid-end at optimization levels -O1 and higher. */
12057 if (BYTES_BIG_ENDIAN && (location != 0))
12059 /* After setup, we want the high elements of the first vector (stored
12060 at the LSB end of the register), and the low elements of the second
12061 vector (stored at the MSB end of the register). So swap. */
12062 std::swap (d->op0, d->op1);
12063 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12064 location = nelt - location;
12067 offset = GEN_INT (location);
12068 emit_insn (gen (d->target, d->op0, d->op1, offset));
12069 return true;
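/* Example: for V4SImode with two operands, the permutation { 1, 2, 3, 4 }
   has location == 1 and strictly increasing indices, so it is accepted
   here and emitted as an EXT of the two inputs with an element offset
   of 1.  */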
12072 /* Recognize patterns for the REV insns. */
12074 static bool
12075 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12077 unsigned int i, j, diff, nelt = d->nelt;
12078 rtx (*gen) (rtx, rtx);
12080 if (!d->one_vector_p)
12081 return false;
12083 diff = d->perm[0];
12084 switch (diff)
12086 case 7:
12087 switch (d->vmode)
12089 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12090 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12091 default:
12092 return false;
12094 break;
12095 case 3:
12096 switch (d->vmode)
12098 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12099 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12100 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12101 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12102 default:
12103 return false;
12105 break;
12106 case 1:
12107 switch (d->vmode)
12109 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12110 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12111 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12112 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12113 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12114 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12115 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12116 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12117 default:
12118 return false;
12120 break;
12121 default:
12122 return false;
12125 for (i = 0; i < nelt ; i += diff + 1)
12126 for (j = 0; j <= diff; j += 1)
12128 /* This is guaranteed to be true because the value of diff
12129 is 7, 3 or 1, and we always have enough elements in the
12130 queue to generate this. Getting a vector mask with a
12131 value of diff other than these values implies that
12132 something is wrong by the time we get here. */
12133 gcc_assert (i + j < nelt);
12134 if (d->perm[i + j] != i + diff - j)
12135 return false;
12138 /* Success! */
12139 if (d->testing_p)
12140 return true;
12142 emit_insn (gen (d->target, d->op0));
12143 return true;
12146 static bool
12147 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12149 rtx (*gen) (rtx, rtx, rtx);
12150 rtx out = d->target;
12151 rtx in0;
12152 machine_mode vmode = d->vmode;
12153 unsigned int i, elt, nelt = d->nelt;
12154 rtx lane;
12156 elt = d->perm[0];
12157 for (i = 1; i < nelt; i++)
12159 if (elt != d->perm[i])
12160 return false;
12163 /* The generic preparation in aarch64_expand_vec_perm_const_1
12164 swaps the operand order and the permute indices if it finds
12165 d->perm[0] to be in the second operand. Thus, we can always
12166 use d->op0 and need not do any extra arithmetic to get the
12167 correct lane number. */
12168 in0 = d->op0;
12169 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12171 switch (vmode)
12173 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12174 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12175 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12176 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12177 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12178 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12179 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12180 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12181 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12182 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12183 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12184 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12185 default:
12186 return false;
12189 emit_insn (gen (out, in0, lane));
12190 return true;
12193 static bool
12194 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12196 rtx rperm[MAX_VECT_LEN], sel;
12197 machine_mode vmode = d->vmode;
12198 unsigned int i, nelt = d->nelt;
12200 if (d->testing_p)
12201 return true;
12203 /* Generic code will try constant permutation twice: once with the
12204 original mode and again with the elements lowered to QImode.
12205 So wait and don't do the selector expansion ourselves. */
12206 if (vmode != V8QImode && vmode != V16QImode)
12207 return false;
12209 for (i = 0; i < nelt; ++i)
12211 int nunits = GET_MODE_NUNITS (vmode);
12213 /* If big-endian and two vectors we end up with a weird mixed-endian
12214 mode on NEON. Reverse the index within each word but not the word
12215 itself. */
12216 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12217 : d->perm[i]);
12219 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12220 sel = force_reg (vmode, sel);
12222 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12223 return true;
12226 static bool
12227 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12229 /* The pattern matching functions above are written to look for a small
12230 number to begin the sequence (0, 1, N/2). If we begin with an index
12231 from the second operand, we can swap the operands. */
12232 if (d->perm[0] >= d->nelt)
12234 unsigned i, nelt = d->nelt;
12236 gcc_assert (nelt == (nelt & -nelt));
12237 for (i = 0; i < nelt; ++i)
12238 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12240 std::swap (d->op0, d->op1);
12243 if (TARGET_SIMD)
12245 if (aarch64_evpc_rev (d))
12246 return true;
12247 else if (aarch64_evpc_ext (d))
12248 return true;
12249 else if (aarch64_evpc_dup (d))
12250 return true;
12251 else if (aarch64_evpc_zip (d))
12252 return true;
12253 else if (aarch64_evpc_uzp (d))
12254 return true;
12255 else if (aarch64_evpc_trn (d))
12256 return true;
12257 return aarch64_evpc_tbl (d);
12259 return false;
12262 /* Expand a vec_perm_const pattern. */
12264 bool
12265 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12267 struct expand_vec_perm_d d;
12268 int i, nelt, which;
12270 d.target = target;
12271 d.op0 = op0;
12272 d.op1 = op1;
12274 d.vmode = GET_MODE (target);
12275 gcc_assert (VECTOR_MODE_P (d.vmode));
12276 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12277 d.testing_p = false;
12279 for (i = which = 0; i < nelt; ++i)
12281 rtx e = XVECEXP (sel, 0, i);
12282 int ei = INTVAL (e) & (2 * nelt - 1);
12283 which |= (ei < nelt ? 1 : 2);
12284 d.perm[i] = ei;
12287 switch (which)
12289 default:
12290 gcc_unreachable ();
12292 case 3:
12293 d.one_vector_p = false;
12294 if (!rtx_equal_p (op0, op1))
12295 break;
12297 /* The elements of PERM do not suggest that only the first operand
12298 is used, but both operands are identical. Allow easier matching
12299 of the permutation by folding the permutation into the single
12300 input vector. */
12301 /* Fall Through. */
12302 case 2:
12303 for (i = 0; i < nelt; ++i)
12304 d.perm[i] &= nelt - 1;
12305 d.op0 = op1;
12306 d.one_vector_p = true;
12307 break;
12309 case 1:
12310 d.op1 = op0;
12311 d.one_vector_p = true;
12312 break;
12315 return aarch64_expand_vec_perm_const_1 (&d);
12318 static bool
12319 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12320 const unsigned char *sel)
12322 struct expand_vec_perm_d d;
12323 unsigned int i, nelt, which;
12324 bool ret;
12326 d.vmode = vmode;
12327 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12328 d.testing_p = true;
12329 memcpy (d.perm, sel, nelt);
12331 /* Calculate whether all elements are in one vector. */
12332 for (i = which = 0; i < nelt; ++i)
12334 unsigned char e = d.perm[i];
12335 gcc_assert (e < 2 * nelt);
12336 which |= (e < nelt ? 1 : 2);
12339 /* If all elements are from the second vector, reindex as if from the
12340 first vector. */
12341 if (which == 2)
12342 for (i = 0; i < nelt; ++i)
12343 d.perm[i] -= nelt;
12345 /* Check whether the mask can be applied to a single vector. */
12346 d.one_vector_p = (which != 3);
12348 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12349 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12350 if (!d.one_vector_p)
12351 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12353 start_sequence ();
12354 ret = aarch64_expand_vec_perm_const_1 (&d);
12355 end_sequence ();
12357 return ret;
12360 rtx
12361 aarch64_reverse_mask (enum machine_mode mode)
12363 /* We have to reverse each vector because we don't have
12364 a permuted load that can reverse-load according to ABI rules. */
12365 rtx mask;
12366 rtvec v = rtvec_alloc (16);
12367 int i, j;
12368 int nunits = GET_MODE_NUNITS (mode);
12369 int usize = GET_MODE_UNIT_SIZE (mode);
12371 gcc_assert (BYTES_BIG_ENDIAN);
12372 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12374 for (i = 0; i < nunits; i++)
12375 for (j = 0; j < usize; j++)
12376 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12377 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12378 return force_reg (V16QImode, mask);
12381 /* Implement MODES_TIEABLE_P. */
12383 bool
12384 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12386 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12387 return true;
12389 /* We specifically want to allow elements of "structure" modes to
12390 be tieable to the structure. This more general condition allows
12391 other rarer situations too. */
12392 if (TARGET_SIMD
12393 && aarch64_vector_mode_p (mode1)
12394 && aarch64_vector_mode_p (mode2))
12395 return true;
12397 return false;
12400 /* Return a new RTX holding the result of moving POINTER forward by
12401 AMOUNT bytes. */
12403 static rtx
12404 aarch64_move_pointer (rtx pointer, int amount)
12406 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12408 return adjust_automodify_address (pointer, GET_MODE (pointer),
12409 next, amount);
12412 /* Return a new RTX holding the result of moving POINTER forward by the
12413 size of the mode it points to. */
12415 static rtx
12416 aarch64_progress_pointer (rtx pointer)
12418 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12420 return aarch64_move_pointer (pointer, amount);
12423 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
12424 MODE bytes. */
12426 static void
12427 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12428 machine_mode mode)
12430 rtx reg = gen_reg_rtx (mode);
12432 /* "Cast" the pointers to the correct mode. */
12433 *src = adjust_address (*src, mode, 0);
12434 *dst = adjust_address (*dst, mode, 0);
12435 /* Emit the memcpy. */
12436 emit_move_insn (reg, *src);
12437 emit_move_insn (*dst, reg);
12438 /* Move the pointers forward. */
12439 *src = aarch64_progress_pointer (*src);
12440 *dst = aarch64_progress_pointer (*dst);
12443 /* Expand movmem, as if from a __builtin_memcpy. Return true if
12444 we succeed, otherwise return false. */
12446 bool
12447 aarch64_expand_movmem (rtx *operands)
12449 unsigned int n;
12450 rtx dst = operands[0];
12451 rtx src = operands[1];
12452 rtx base;
12453 bool speed_p = !optimize_function_for_size_p (cfun);
12455 /* When optimizing for size, give a better estimate of the length of a
12456 memcpy call, but use the default otherwise. */
12457 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12459 /* We can't do anything smart if the amount to copy is not constant. */
12460 if (!CONST_INT_P (operands[2]))
12461 return false;
12463 n = UINTVAL (operands[2]);
12465 /* Try to keep the number of instructions low. For cases below 16 bytes we
12466 need to make at most two moves. For cases above 16 bytes it will be one
12467 move for each 16 byte chunk, then at most two additional moves. */
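/* For example, a 35-byte copy is estimated as 35/16 + 2 = 4 moves:
   two 16-byte chunks plus at most two moves for the 3-byte tail.  */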
12468 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12469 return false;
12471 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12472 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12474 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12475 src = adjust_automodify_address (src, VOIDmode, base, 0);
12477 /* Simple cases. Copy 0-3 bytes: first (if applicable) a 2-byte chunk,
12478 then a 1-byte chunk. */
12479 if (n < 4)
12481 if (n >= 2)
12483 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12484 n -= 2;
12487 if (n == 1)
12488 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12490 return true;
12493 /* Copy 4-7 bytes. First a 4-byte chunk, then (if applicable) a second
12494 4-byte chunk, partially overlapping with the previously copied chunk. */
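/* For example, a 7-byte copy is done as bytes 0-3 followed by bytes 3-6,
   overlapping by one byte.  */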
12495 if (n < 8)
12497 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12498 n -= 4;
12499 if (n > 0)
12501 int move = n - 4;
12503 src = aarch64_move_pointer (src, move);
12504 dst = aarch64_move_pointer (dst, move);
12505 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12507 return true;
12510 /* Copy 8 or more bytes. Copy chunks of 16 bytes until we run out of
12511 them, then (if applicable) an 8-byte chunk. */
12512 while (n >= 8)
12514 if (n / 16)
12516 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
12517 n -= 16;
12519 else
12521 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
12522 n -= 8;
12526 /* Finish the final bytes of the copy. We can always do this in one
12527 instruction. We either copy the exact amount we need, or partially
12528 overlap with the previous chunk we copied and copy 8 bytes. */
12529 if (n == 0)
12530 return true;
12531 else if (n == 1)
12532 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
12533 else if (n == 2)
12534 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12535 else if (n == 4)
12536 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12537 else
12539 if (n == 3)
12541 src = aarch64_move_pointer (src, -1);
12542 dst = aarch64_move_pointer (dst, -1);
12543 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
12545 else
12547 int move = n - 8;
12549 src = aarch64_move_pointer (src, move);
12550 dst = aarch64_move_pointer (dst, move);
12551 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
12555 return true;
12558 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
12560 static unsigned HOST_WIDE_INT
12561 aarch64_asan_shadow_offset (void)
12563 return (HOST_WIDE_INT_1 << 36);
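/* Implement TARGET_USE_BY_PIECES_INFRASTRUCTURE_P.  */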
12566 static bool
12567 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
12568 unsigned int align,
12569 enum by_pieces_operation op,
12570 bool speed_p)
12572 /* STORE_BY_PIECES can be used when copying a constant string, but
12573 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
12574 For now we always fail this and let the move_by_pieces code copy
12575 the string from read-only memory. */
12576 if (op == STORE_BY_PIECES)
12577 return false;
12579 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
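/* Return the CC_D mode that a conditional compare for comparison CODE
   should produce, or CCmode if CODE has no dedicated CC_D mode (in which
   case the callers below give up).  */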
12582 static enum machine_mode
12583 aarch64_code_to_ccmode (enum rtx_code code)
12585 switch (code)
12587 case NE:
12588 return CC_DNEmode;
12590 case EQ:
12591 return CC_DEQmode;
12593 case LE:
12594 return CC_DLEmode;
12596 case LT:
12597 return CC_DLTmode;
12599 case GE:
12600 return CC_DGEmode;
12602 case GT:
12603 return CC_DGTmode;
12605 case LEU:
12606 return CC_DLEUmode;
12608 case LTU:
12609 return CC_DLTUmode;
12611 case GEU:
12612 return CC_DGEUmode;
12614 case GTU:
12615 return CC_DGTUmode;
12617 default:
12618 return CCmode;
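/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first compare of a
   conditional-compare chain for CODE applied to TREEOP0 and TREEOP1,
   storing the preparation and compare sequences in *PREP_SEQ and
   *GEN_SEQ.  Return the CC register on success, NULL_RTX otherwise.  */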
12622 static rtx
12623 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
12624 int code, tree treeop0, tree treeop1)
12626 enum machine_mode op_mode, cmp_mode, cc_mode;
12627 rtx op0, op1, cmp, target;
12628 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
12629 enum insn_code icode;
12630 struct expand_operand ops[4];
12632 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) code);
12633 if (cc_mode == CCmode)
12634 return NULL_RTX;
12636 start_sequence ();
12637 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
12639 op_mode = GET_MODE (op0);
12640 if (op_mode == VOIDmode)
12641 op_mode = GET_MODE (op1);
12643 switch (op_mode)
12645 case QImode:
12646 case HImode:
12647 case SImode:
12648 cmp_mode = SImode;
12649 icode = CODE_FOR_cmpsi;
12650 break;
12652 case DImode:
12653 cmp_mode = DImode;
12654 icode = CODE_FOR_cmpdi;
12655 break;
12657 default:
12658 end_sequence ();
12659 return NULL_RTX;
12662 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
12663 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
12664 if (!op0 || !op1)
12666 end_sequence ();
12667 return NULL_RTX;
12669 *prep_seq = get_insns ();
12670 end_sequence ();
12672 cmp = gen_rtx_fmt_ee ((enum rtx_code) code, cmp_mode, op0, op1);
12673 target = gen_rtx_REG (CCmode, CC_REGNUM);
12675 create_output_operand (&ops[0], target, CCmode);
12676 create_fixed_operand (&ops[1], cmp);
12677 create_fixed_operand (&ops[2], op0);
12678 create_fixed_operand (&ops[3], op1);
12680 start_sequence ();
12681 if (!maybe_expand_insn (icode, 4, ops))
12683 end_sequence ();
12684 return NULL_RTX;
12686 *gen_seq = get_insns ();
12687 end_sequence ();
12689 return gen_rtx_REG (cc_mode, CC_REGNUM);
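/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent compare in the
   chain, combining the CC result PREV with the comparison CMP_CODE of
   TREEOP0 and TREEOP1 under BIT_CODE (AND or IOR).  Return the CC
   register on success, NULL_RTX otherwise.  */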
12692 static rtx
12693 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
12694 tree treeop0, tree treeop1, int bit_code)
12696 rtx op0, op1, cmp0, cmp1, target;
12697 enum machine_mode op_mode, cmp_mode, cc_mode;
12698 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
12699 enum insn_code icode = CODE_FOR_ccmp_andsi;
12700 struct expand_operand ops[6];
12702 cc_mode = aarch64_code_to_ccmode ((enum rtx_code) cmp_code);
12703 if (cc_mode == CCmode)
12704 return NULL_RTX;
12706 push_to_sequence ((rtx_insn*) *prep_seq);
12707 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
12709 op_mode = GET_MODE (op0);
12710 if (op_mode == VOIDmode)
12711 op_mode = GET_MODE (op1);
12713 switch (op_mode)
12715 case QImode:
12716 case HImode:
12717 case SImode:
12718 cmp_mode = SImode;
12719 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_andsi
12720 : CODE_FOR_ccmp_iorsi;
12721 break;
12723 case DImode:
12724 cmp_mode = DImode;
12725 icode = (enum rtx_code) bit_code == AND ? CODE_FOR_ccmp_anddi
12726 : CODE_FOR_ccmp_iordi;
12727 break;
12729 default:
12730 end_sequence ();
12731 return NULL_RTX;
12734 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
12735 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
12736 if (!op0 || !op1)
12738 end_sequence ();
12739 return NULL_RTX;
12741 *prep_seq = get_insns ();
12742 end_sequence ();
12744 target = gen_rtx_REG (cc_mode, CC_REGNUM);
12745 cmp1 = gen_rtx_fmt_ee ((enum rtx_code) cmp_code, cmp_mode, op0, op1);
12746 cmp0 = gen_rtx_fmt_ee (NE, cmp_mode, prev, const0_rtx);
12748 create_fixed_operand (&ops[0], prev);
12749 create_fixed_operand (&ops[1], target);
12750 create_fixed_operand (&ops[2], op0);
12751 create_fixed_operand (&ops[3], op1);
12752 create_fixed_operand (&ops[4], cmp0);
12753 create_fixed_operand (&ops[5], cmp1);
12755 push_to_sequence ((rtx_insn*) *gen_seq);
12756 if (!maybe_expand_insn (icode, 6, ops))
12758 end_sequence ();
12759 return NULL_RTX;
12762 *gen_seq = get_insns ();
12763 end_sequence ();
12765 return target;
12768 #undef TARGET_GEN_CCMP_FIRST
12769 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
12771 #undef TARGET_GEN_CCMP_NEXT
12772 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
12774 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
12775 instruction fusion of some sort. */
12777 static bool
12778 aarch64_macro_fusion_p (void)
12780 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
12784 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
12785 should be kept together during scheduling. */
12787 static bool
12788 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
12790 rtx set_dest;
12791 rtx prev_set = single_set (prev);
12792 rtx curr_set = single_set (curr);
12793 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
12794 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
12796 if (!aarch64_macro_fusion_p ())
12797 return false;
12799 if (simple_sets_p
12800 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOV_MOVK))
12802 /* We are trying to match:
12803 prev (mov) == (set (reg r0) (const_int imm16))
12804 curr (movk) == (set (zero_extract (reg r0)
12805 (const_int 16)
12806 (const_int 16))
12807 (const_int imm16_1)) */
12809 set_dest = SET_DEST (curr_set);
12811 if (GET_CODE (set_dest) == ZERO_EXTRACT
12812 && CONST_INT_P (SET_SRC (curr_set))
12813 && CONST_INT_P (SET_SRC (prev_set))
12814 && CONST_INT_P (XEXP (set_dest, 2))
12815 && INTVAL (XEXP (set_dest, 2)) == 16
12816 && REG_P (XEXP (set_dest, 0))
12817 && REG_P (SET_DEST (prev_set))
12818 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
12820 return true;
12824 if (simple_sets_p
12825 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_ADD))
12828 /* We're trying to match:
12829 prev (adrp) == (set (reg r1)
12830 (high (symbol_ref ("SYM"))))
12831 curr (add) == (set (reg r0)
12832 (lo_sum (reg r1)
12833 (symbol_ref ("SYM"))))
12834 Note that r0 need not necessarily be the same as r1, especially
12835 during pre-regalloc scheduling. */
12837 if (satisfies_constraint_Ush (SET_SRC (prev_set))
12838 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
12840 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
12841 && REG_P (XEXP (SET_SRC (curr_set), 0))
12842 && REGNO (XEXP (SET_SRC (curr_set), 0))
12843 == REGNO (SET_DEST (prev_set))
12844 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
12845 XEXP (SET_SRC (curr_set), 1)))
12846 return true;
12850 if (simple_sets_p
12851 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_MOVK_MOVK))
12854 /* We're trying to match:
12855 prev (movk) == (set (zero_extract (reg r0)
12856 (const_int 16)
12857 (const_int 32))
12858 (const_int imm16_1))
12859 curr (movk) == (set (zero_extract (reg r0)
12860 (const_int 16)
12861 (const_int 48))
12862 (const_int imm16_2)) */
12864 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
12865 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
12866 && REG_P (XEXP (SET_DEST (prev_set), 0))
12867 && REG_P (XEXP (SET_DEST (curr_set), 0))
12868 && REGNO (XEXP (SET_DEST (prev_set), 0))
12869 == REGNO (XEXP (SET_DEST (curr_set), 0))
12870 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
12871 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
12872 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
12873 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
12874 && CONST_INT_P (SET_SRC (prev_set))
12875 && CONST_INT_P (SET_SRC (curr_set)))
12876 return true;
12879 if (simple_sets_p
12880 && (aarch64_tune_params.fusible_ops & AARCH64_FUSE_ADRP_LDR))
12882 /* We're trying to match:
12883 prev (adrp) == (set (reg r0)
12884 (high (symbol_ref ("SYM"))))
12885 curr (ldr) == (set (reg r1)
12886 (mem (lo_sum (reg r0)
12887 (symbol_ref ("SYM")))))
12889 curr (ldr) == (set (reg r1)
12890 (zero_extend (mem
12891 (lo_sum (reg r0)
12892 (symbol_ref ("SYM")))))) */
12893 if (satisfies_constraint_Ush (SET_SRC (prev_set))
12894 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
12896 rtx curr_src = SET_SRC (curr_set);
12898 if (GET_CODE (curr_src) == ZERO_EXTEND)
12899 curr_src = XEXP (curr_src, 0);
12901 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
12902 && REG_P (XEXP (XEXP (curr_src, 0), 0))
12903 && REGNO (XEXP (XEXP (curr_src, 0), 0))
12904 == REGNO (SET_DEST (prev_set))
12905 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
12906 XEXP (SET_SRC (prev_set), 0)))
12907 return true;
12911 if ((aarch64_tune_params.fusible_ops & AARCH64_FUSE_CMP_BRANCH)
12912 && any_condjump_p (curr))
12914 enum attr_type prev_type = get_attr_type (prev);
12916 /* FIXME: this misses some instructions which ThunderX considers simple
12917 arithmetic; simple shifts, for example, are missed here. */
12918 if (prev_type == TYPE_ALUS_SREG
12919 || prev_type == TYPE_ALUS_IMM
12920 || prev_type == TYPE_LOGICS_REG
12921 || prev_type == TYPE_LOGICS_IMM)
12922 return true;
12925 return false;
12928 /* If MEM is in the form of [base+offset], extract the two parts
12929 of the address into BASE and OFFSET; otherwise return false
12930 after clearing BASE and OFFSET. */
12932 bool
12933 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
12935 rtx addr;
12937 gcc_assert (MEM_P (mem));
12939 addr = XEXP (mem, 0);
12941 if (REG_P (addr))
12943 *base = addr;
12944 *offset = const0_rtx;
12945 return true;
12948 if (GET_CODE (addr) == PLUS
12949 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
12951 *base = XEXP (addr, 0);
12952 *offset = XEXP (addr, 1);
12953 return true;
12956 *base = NULL_RTX;
12957 *offset = NULL_RTX;
12959 return false;
12962 /* Types for scheduling fusion. */
12963 enum sched_fusion_type
12965 SCHED_FUSION_NONE = 0,
12966 SCHED_FUSION_LD_SIGN_EXTEND,
12967 SCHED_FUSION_LD_ZERO_EXTEND,
12968 SCHED_FUSION_LD,
12969 SCHED_FUSION_ST,
12970 SCHED_FUSION_NUM
12973 /* If INSN is a load or store whose address is in the form [base+offset],
12974 extract the two parts into BASE and OFFSET. Return the scheduling
12975 fusion type of this INSN. */
12977 static enum sched_fusion_type
12978 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
12980 rtx x, dest, src;
12981 enum sched_fusion_type fusion = SCHED_FUSION_LD;
12983 gcc_assert (INSN_P (insn));
12984 x = PATTERN (insn);
12985 if (GET_CODE (x) != SET)
12986 return SCHED_FUSION_NONE;
12988 src = SET_SRC (x);
12989 dest = SET_DEST (x);
12991 machine_mode dest_mode = GET_MODE (dest);
12993 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
12994 return SCHED_FUSION_NONE;
12996 if (GET_CODE (src) == SIGN_EXTEND)
12998 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
12999 src = XEXP (src, 0);
13000 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13001 return SCHED_FUSION_NONE;
13003 else if (GET_CODE (src) == ZERO_EXTEND)
13005 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13006 src = XEXP (src, 0);
13007 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13008 return SCHED_FUSION_NONE;
13011 if (GET_CODE (src) == MEM && REG_P (dest))
13012 extract_base_offset_in_addr (src, base, offset);
13013 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13015 fusion = SCHED_FUSION_ST;
13016 extract_base_offset_in_addr (dest, base, offset);
13018 else
13019 return SCHED_FUSION_NONE;
13021 if (*base == NULL_RTX || *offset == NULL_RTX)
13022 fusion = SCHED_FUSION_NONE;
13024 return fusion;
13027 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13029 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13030 and PRI are only calculated for these instructions. For other instructions,
13031 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
13032 types of instruction fusion can be added by returning different priorities.
13034 It's important that irrelevant instructions get the largest FUSION_PRI. */
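/* For example, two qualifying loads from the same base register get
   equal FUSION_PRI values, and the one with the smaller offset gets the
   larger PRI, so they tend to be scheduled next to each other, which
   helps the ldp/stp peepholes find them.  */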
13036 static void
13037 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13038 int *fusion_pri, int *pri)
13040 int tmp, off_val;
13041 rtx base, offset;
13042 enum sched_fusion_type fusion;
13044 gcc_assert (INSN_P (insn));
13046 tmp = max_pri - 1;
13047 fusion = fusion_load_store (insn, &base, &offset);
13048 if (fusion == SCHED_FUSION_NONE)
13050 *pri = tmp;
13051 *fusion_pri = tmp;
13052 return;
13055 /* Set FUSION_PRI according to fusion type and base register. */
13056 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13058 /* Calculate PRI. */
13059 tmp /= 2;
13061 /* INSN with smaller offset goes first. */
13062 off_val = (int)(INTVAL (offset));
13063 if (off_val >= 0)
13064 tmp -= (off_val & 0xfffff);
13065 else
13066 tmp += ((- off_val) & 0xfffff);
13068 *pri = tmp;
13069 return;
13072 /* Given OPERANDS of consecutive load/store instructions, check if we
13073 can merge them into an ldp/stp. LOAD is true if they are load
13074 instructions. MODE is the mode of the memory operands. */
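/* For example, this returns true for a pair of SImode loads

     ldr w0, [x2]
     ldr w1, [x2, 4]

   which can then be emitted as a single

     ldp w0, w1, [x2]  */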
13076 bool
13077 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13078 enum machine_mode mode)
13080 HOST_WIDE_INT offval_1, offval_2, msize;
13081 enum reg_class rclass_1, rclass_2;
13082 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13084 if (load)
13086 mem_1 = operands[1];
13087 mem_2 = operands[3];
13088 reg_1 = operands[0];
13089 reg_2 = operands[2];
13090 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13091 if (REGNO (reg_1) == REGNO (reg_2))
13092 return false;
13094 else
13096 mem_1 = operands[0];
13097 mem_2 = operands[2];
13098 reg_1 = operands[1];
13099 reg_2 = operands[3];
13102 /* The mems cannot be volatile. */
13103 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13104 return false;
13106 /* Check if the addresses are in the form of [base+offset]. */
13107 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13108 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13109 return false;
13110 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13111 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13112 return false;
13114 /* Check if the bases are the same. */
13115 if (!rtx_equal_p (base_1, base_2))
13116 return false;
13118 offval_1 = INTVAL (offset_1);
13119 offval_2 = INTVAL (offset_2);
13120 msize = GET_MODE_SIZE (mode);
13121 /* Check if the offsets are consecutive. */
13122 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13123 return false;
13125 /* Check if the addresses are clobbered by load. */
13126 if (load)
13128 if (reg_mentioned_p (reg_1, mem_1))
13129 return false;
13131 /* In increasing order, the last load can clobber the address. */
13132 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13133 return false;
13136 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13137 rclass_1 = FP_REGS;
13138 else
13139 rclass_1 = GENERAL_REGS;
13141 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13142 rclass_2 = FP_REGS;
13143 else
13144 rclass_2 = GENERAL_REGS;
13146 /* Check if the registers are of the same class. */
13147 if (rclass_1 != rclass_2)
13148 return false;
13150 return true;
13153 /* Given OPERANDS of consecutive load/store instructions, check if we
13154 can merge them into an ldp/stp by adjusting the offset. LOAD is true
13155 if they are load instructions. MODE is the mode of the memory operands.
13157 Given the following consecutive stores:
13159 str w1, [xb, 0x100]
13160 str w1, [xb, 0x104]
13161 str w1, [xb, 0x108]
13162 str w1, [xb, 0x10c]
13164 Though the offsets are out of the range supported by stp, we can
13165 still pair them after adjusting the offset, like:
13167 add scratch, xb, 0x100
13168 stp w1, w1, [scratch]
13169 stp w1, w1, [scratch, 0x8]
13171 The peephole patterns detecting this opportunity should guarantee
13172 the scratch register is available. */
13174 bool
13175 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13176 enum machine_mode mode)
13178 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13179 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13180 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13181 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13183 if (load)
13185 reg_1 = operands[0];
13186 mem_1 = operands[1];
13187 reg_2 = operands[2];
13188 mem_2 = operands[3];
13189 reg_3 = operands[4];
13190 mem_3 = operands[5];
13191 reg_4 = operands[6];
13192 mem_4 = operands[7];
13193 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13194 && REG_P (reg_3) && REG_P (reg_4));
13195 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13196 return false;
13198 else
13200 mem_1 = operands[0];
13201 reg_1 = operands[1];
13202 mem_2 = operands[2];
13203 reg_2 = operands[3];
13204 mem_3 = operands[4];
13205 reg_3 = operands[5];
13206 mem_4 = operands[6];
13207 reg_4 = operands[7];
13209 /* Skip if the memory operand is by itself valid for ldp/stp. */
13210 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13211 return false;
13213 /* The mems cannot be volatile. */
13214 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13215 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13216 return false;
13218 /* Check if the addresses are in the form of [base+offset]. */
13219 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13220 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13221 return false;
13222 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13223 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13224 return false;
13225 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13226 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13227 return false;
13228 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13229 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13230 return false;
13232 /* Check if the bases are the same. */
13233 if (!rtx_equal_p (base_1, base_2)
13234 || !rtx_equal_p (base_2, base_3)
13235 || !rtx_equal_p (base_3, base_4))
13236 return false;
13238 offval_1 = INTVAL (offset_1);
13239 offval_2 = INTVAL (offset_2);
13240 offval_3 = INTVAL (offset_3);
13241 offval_4 = INTVAL (offset_4);
13242 msize = GET_MODE_SIZE (mode);
13243 /* Check if the offsets are consecutive. */
13244 if ((offval_1 != (offval_2 + msize)
13245 || offval_1 != (offval_3 + msize * 2)
13246 || offval_1 != (offval_4 + msize * 3))
13247 && (offval_4 != (offval_3 + msize)
13248 || offval_4 != (offval_2 + msize * 2)
13249 || offval_4 != (offval_1 + msize * 3)))
13250 return false;
13252 /* Check if the addresses are clobbered by load. */
13253 if (load)
13255 if (reg_mentioned_p (reg_1, mem_1)
13256 || reg_mentioned_p (reg_2, mem_2)
13257 || reg_mentioned_p (reg_3, mem_3))
13258 return false;
13260 /* In increasing order, the last load can clobber the address. */
13261 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13262 return false;
13265 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13266 rclass_1 = FP_REGS;
13267 else
13268 rclass_1 = GENERAL_REGS;
13270 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13271 rclass_2 = FP_REGS;
13272 else
13273 rclass_2 = GENERAL_REGS;
13275 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13276 rclass_3 = FP_REGS;
13277 else
13278 rclass_3 = GENERAL_REGS;
13280 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13281 rclass_4 = FP_REGS;
13282 else
13283 rclass_4 = GENERAL_REGS;
13285 /* Check if the registers are of the same class. */
13286 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13287 return false;
13289 return true;
13292 /* Given OPERANDS of consecutive load/store instructions, this function
13293 pairs them into ldp/stp after adjusting the offset. It depends on the
13294 fact that the addresses of the load/store instructions are in increasing
13295 order. MODE is the mode of the memory operands. CODE is the rtl operator
13296 which should be applied to all memory operands; it is SIGN_EXTEND,
13297 ZERO_EXTEND or UNKNOWN. */
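/* For example, for the SImode stores at offsets 0x100 to 0x10c shown
   earlier: msize is 4, so stp_off_limit is 0x100; adj_off becomes 0x100
   and new_off 0, so the scratch base is set to base + 0x100 and the
   stores pair up at offsets 0/4 and 8/12.  */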
13299 bool
13300 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13301 enum machine_mode mode, RTX_CODE code)
13303 rtx base, offset, t1, t2;
13304 rtx mem_1, mem_2, mem_3, mem_4;
13305 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13307 if (load)
13309 mem_1 = operands[1];
13310 mem_2 = operands[3];
13311 mem_3 = operands[5];
13312 mem_4 = operands[7];
13314 else
13316 mem_1 = operands[0];
13317 mem_2 = operands[2];
13318 mem_3 = operands[4];
13319 mem_4 = operands[6];
13320 gcc_assert (code == UNKNOWN);
13323 extract_base_offset_in_addr (mem_1, &base, &offset);
13324 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13326 /* Adjust the offset so that it fits in an ldp/stp instruction. */
13327 msize = GET_MODE_SIZE (mode);
13328 stp_off_limit = msize * 0x40;
13329 off_val = INTVAL (offset);
13330 abs_off = (off_val < 0) ? -off_val : off_val;
13331 new_off = abs_off % stp_off_limit;
13332 adj_off = abs_off - new_off;
13334 /* Further adjust to make sure all offsets are OK. */
13335 if ((new_off + msize * 2) >= stp_off_limit)
13337 adj_off += stp_off_limit;
13338 new_off -= stp_off_limit;
13341 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13342 if (adj_off >= 0x1000)
13343 return false;
13345 if (off_val < 0)
13347 adj_off = -adj_off;
13348 new_off = -new_off;
13351 /* Create new memory references. */
13352 mem_1 = change_address (mem_1, VOIDmode,
13353 plus_constant (DImode, operands[8], new_off));
13355 /* Check if the adjusted address is OK for ldp/stp. */
13356 if (!aarch64_mem_pair_operand (mem_1, mode))
13357 return false;
13359 msize = GET_MODE_SIZE (mode);
13360 mem_2 = change_address (mem_2, VOIDmode,
13361 plus_constant (DImode,
13362 operands[8],
13363 new_off + msize));
13364 mem_3 = change_address (mem_3, VOIDmode,
13365 plus_constant (DImode,
13366 operands[8],
13367 new_off + msize * 2));
13368 mem_4 = change_address (mem_4, VOIDmode,
13369 plus_constant (DImode,
13370 operands[8],
13371 new_off + msize * 3));
13373 if (code == ZERO_EXTEND)
13375 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13376 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13377 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13378 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13380 else if (code == SIGN_EXTEND)
13382 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13383 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13384 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13385 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13388 if (load)
13390 operands[1] = mem_1;
13391 operands[3] = mem_2;
13392 operands[5] = mem_3;
13393 operands[7] = mem_4;
13395 else
13397 operands[0] = mem_1;
13398 operands[2] = mem_2;
13399 operands[4] = mem_3;
13400 operands[6] = mem_4;
13403 /* Emit adjusting instruction. */
13404 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13405 /* Emit ldp/stp instructions. */
13406 t1 = gen_rtx_SET (operands[0], operands[1]);
13407 t2 = gen_rtx_SET (operands[2], operands[3]);
13408 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13409 t1 = gen_rtx_SET (operands[4], operands[5]);
13410 t2 = gen_rtx_SET (operands[6], operands[7]);
13411 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13412 return true;
13415 /* Return true if a pseudo register should be created and used to hold
13416 the GOT address for PIC code. */
13418 bool
13419 aarch64_use_pseudo_pic_reg (void)
13421 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
13424 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
13426 static int
13427 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
13429 switch (XINT (x, 1))
13431 case UNSPEC_GOTSMALLPIC:
13432 case UNSPEC_GOTSMALLPIC28K:
13433 case UNSPEC_GOTTINYPIC:
13434 return 0;
13435 default:
13436 break;
13439 return default_unspec_may_trap_p (x, flags);
13443 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
13444 return the log2 of that value. Otherwise return -1. */
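/* For example, 4.0 yields 2 and 1.0 yields 0, while 3.0, 0.5 and -2.0
   all yield -1.  */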
13446 int
13447 aarch64_fpconst_pow_of_2 (rtx x)
13449 const REAL_VALUE_TYPE *r;
13451 if (!CONST_DOUBLE_P (x))
13452 return -1;
13454 r = CONST_DOUBLE_REAL_VALUE (x);
13456 if (REAL_VALUE_NEGATIVE (*r)
13457 || REAL_VALUE_ISNAN (*r)
13458 || REAL_VALUE_ISINF (*r)
13459 || !real_isinteger (r, DFmode))
13460 return -1;
13462 return exact_log2 (real_to_integer (r));
13465 /* If X is a vector of equal CONST_DOUBLE values and that value is
13466 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
13468 int
13469 aarch64_vec_fpconst_pow_of_2 (rtx x)
13471 if (GET_CODE (x) != CONST_VECTOR)
13472 return -1;
13474 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
13475 return -1;
13477 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
13478 if (firstval <= 0)
13479 return -1;
13481 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
13482 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
13483 return -1;
13485 return firstval;
13488 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float. */
13489 static tree
13490 aarch64_promoted_type (const_tree t)
13492 if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
13493 return float_type_node;
13494 return NULL_TREE;
13496 #undef TARGET_ADDRESS_COST
13497 #define TARGET_ADDRESS_COST aarch64_address_cost
13499 /* This hook determines whether unnamed bitfields affect the alignment
13500 of the containing structure. The hook returns true if the structure
13501 should inherit the alignment requirements of an unnamed bitfield's
13502 type. */
13503 #undef TARGET_ALIGN_ANON_BITFIELD
13504 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
13506 #undef TARGET_ASM_ALIGNED_DI_OP
13507 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
13509 #undef TARGET_ASM_ALIGNED_HI_OP
13510 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
13512 #undef TARGET_ASM_ALIGNED_SI_OP
13513 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
13515 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
13516 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
13517 hook_bool_const_tree_hwi_hwi_const_tree_true
13519 #undef TARGET_ASM_OUTPUT_MI_THUNK
13520 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
13522 #undef TARGET_ASM_SELECT_RTX_SECTION
13523 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
13525 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
13526 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
13528 #undef TARGET_BUILD_BUILTIN_VA_LIST
13529 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
13531 #undef TARGET_CALLEE_COPIES
13532 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
13534 #undef TARGET_CAN_ELIMINATE
13535 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
13537 #undef TARGET_CAN_INLINE_P
13538 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
13540 #undef TARGET_CANNOT_FORCE_CONST_MEM
13541 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
13543 #undef TARGET_CONDITIONAL_REGISTER_USAGE
13544 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
13546 /* Only the least significant bit is used for initialization guard
13547 variables. */
13548 #undef TARGET_CXX_GUARD_MASK_BIT
13549 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
13551 #undef TARGET_C_MODE_FOR_SUFFIX
13552 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
13554 #ifdef TARGET_BIG_ENDIAN_DEFAULT
13555 #undef TARGET_DEFAULT_TARGET_FLAGS
13556 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
13557 #endif
13559 #undef TARGET_CLASS_MAX_NREGS
13560 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
13562 #undef TARGET_BUILTIN_DECL
13563 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
13565 #undef TARGET_BUILTIN_RECIPROCAL
13566 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
13568 #undef TARGET_EXPAND_BUILTIN
13569 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
13571 #undef TARGET_EXPAND_BUILTIN_VA_START
13572 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
13574 #undef TARGET_FOLD_BUILTIN
13575 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
13577 #undef TARGET_FUNCTION_ARG
13578 #define TARGET_FUNCTION_ARG aarch64_function_arg
13580 #undef TARGET_FUNCTION_ARG_ADVANCE
13581 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
13583 #undef TARGET_FUNCTION_ARG_BOUNDARY
13584 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
13586 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
13587 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
13589 #undef TARGET_FUNCTION_VALUE
13590 #define TARGET_FUNCTION_VALUE aarch64_function_value
13592 #undef TARGET_FUNCTION_VALUE_REGNO_P
13593 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
13595 #undef TARGET_FRAME_POINTER_REQUIRED
13596 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
13598 #undef TARGET_GIMPLE_FOLD_BUILTIN
13599 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
13601 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
13602 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
13604 #undef TARGET_INIT_BUILTINS
13605 #define TARGET_INIT_BUILTINS aarch64_init_builtins
13607 #undef TARGET_LEGITIMATE_ADDRESS_P
13608 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
13610 #undef TARGET_LEGITIMATE_CONSTANT_P
13611 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
13613 #undef TARGET_LIBGCC_CMP_RETURN_MODE
13614 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
13616 #undef TARGET_LRA_P
13617 #define TARGET_LRA_P hook_bool_void_true
13619 #undef TARGET_MANGLE_TYPE
13620 #define TARGET_MANGLE_TYPE aarch64_mangle_type
13622 #undef TARGET_MEMORY_MOVE_COST
13623 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
13625 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
13626 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
13628 #undef TARGET_MUST_PASS_IN_STACK
13629 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
13631 /* This target hook should return true if accesses to volatile bitfields
13632 should use the narrowest mode possible. It should return false if these
13633 accesses should use the bitfield container type. */
13634 #undef TARGET_NARROW_VOLATILE_BITFIELD
13635 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
13637 #undef TARGET_OPTION_OVERRIDE
13638 #define TARGET_OPTION_OVERRIDE aarch64_override_options
13640 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
13641 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
13642 aarch64_override_options_after_change
13644 #undef TARGET_OPTION_SAVE
13645 #define TARGET_OPTION_SAVE aarch64_option_save
13647 #undef TARGET_OPTION_RESTORE
13648 #define TARGET_OPTION_RESTORE aarch64_option_restore
13650 #undef TARGET_OPTION_PRINT
13651 #define TARGET_OPTION_PRINT aarch64_option_print
13653 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
13654 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
13656 #undef TARGET_SET_CURRENT_FUNCTION
13657 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
13659 #undef TARGET_PASS_BY_REFERENCE
13660 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
13662 #undef TARGET_PREFERRED_RELOAD_CLASS
13663 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
13665 #undef TARGET_SCHED_REASSOCIATION_WIDTH
13666 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
13668 #undef TARGET_PROMOTED_TYPE
13669 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
13671 #undef TARGET_SECONDARY_RELOAD
13672 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
13674 #undef TARGET_SHIFT_TRUNCATION_MASK
13675 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
13677 #undef TARGET_SETUP_INCOMING_VARARGS
13678 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
13680 #undef TARGET_STRUCT_VALUE_RTX
13681 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
13683 #undef TARGET_REGISTER_MOVE_COST
13684 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
13686 #undef TARGET_RETURN_IN_MEMORY
13687 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
13689 #undef TARGET_RETURN_IN_MSB
13690 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
13692 #undef TARGET_RTX_COSTS
13693 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
13695 #undef TARGET_SCHED_ISSUE_RATE
13696 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
13698 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
13699 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
13700 aarch64_sched_first_cycle_multipass_dfa_lookahead
13702 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
13703 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
13704 aarch64_first_cycle_multipass_dfa_lookahead_guard
13706 #undef TARGET_TRAMPOLINE_INIT
13707 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
13709 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
13710 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
13712 #undef TARGET_VECTOR_MODE_SUPPORTED_P
13713 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
13715 #undef TARGET_ARRAY_MODE_SUPPORTED_P
13716 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
13718 #undef TARGET_VECTORIZE_ADD_STMT_COST
13719 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
13721 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
13722 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
13723 aarch64_builtin_vectorization_cost
13725 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
13726 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
13728 #undef TARGET_VECTORIZE_BUILTINS
13729 #define TARGET_VECTORIZE_BUILTINS
13731 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
13732 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
13733 aarch64_builtin_vectorized_function
13735 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
13736 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
13737 aarch64_autovectorize_vector_sizes
13739 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
13740 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
13741 aarch64_atomic_assign_expand_fenv
13743 /* Section anchor support. */
13745 #undef TARGET_MIN_ANCHOR_OFFSET
13746 #define TARGET_MIN_ANCHOR_OFFSET -256
13748 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
13749 byte offset; we can do much more for larger data types, but have no way
13750 to determine the size of the access. We assume accesses are aligned. */
13751 #undef TARGET_MAX_ANCHOR_OFFSET
13752 #define TARGET_MAX_ANCHOR_OFFSET 4095
13754 #undef TARGET_VECTOR_ALIGNMENT
13755 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
13757 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
13758 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
13759 aarch64_simd_vector_alignment_reachable
13761 /* vec_perm support. */
13763 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
13764 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
13765 aarch64_vectorize_vec_perm_const_ok
13767 #undef TARGET_INIT_LIBFUNCS
13768 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
13770 #undef TARGET_FIXED_CONDITION_CODE_REGS
13771 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
13773 #undef TARGET_FLAGS_REGNUM
13774 #define TARGET_FLAGS_REGNUM CC_REGNUM
13776 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
13777 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
13779 #undef TARGET_ASAN_SHADOW_OFFSET
13780 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
13782 #undef TARGET_LEGITIMIZE_ADDRESS
13783 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
13785 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
13786 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
13787 aarch64_use_by_pieces_infrastructure_p
13789 #undef TARGET_CAN_USE_DOLOOP_P
13790 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
13792 #undef TARGET_SCHED_MACRO_FUSION_P
13793 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
13795 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
13796 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
13798 #undef TARGET_SCHED_FUSION_PRIORITY
13799 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
13801 #undef TARGET_UNSPEC_MAY_TRAP_P
13802 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
13804 #undef TARGET_USE_PSEUDO_PIC_REG
13805 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
13807 #undef TARGET_PRINT_OPERAND
13808 #define TARGET_PRINT_OPERAND aarch64_print_operand
13810 #undef TARGET_PRINT_OPERAND_ADDRESS
13811 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
13813 struct gcc_target targetm = TARGET_INITIALIZER;
13815 #include "gt-aarch64.h"