[AArch64] Separate shrink wrapping hooks implementation
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "flags.h"
49 #include "explow.h"
50 #include "expr.h"
51 #include "reload.h"
52 #include "langhooks.h"
53 #include "opts.h"
54 #include "params.h"
55 #include "gimplify.h"
56 #include "dwarf2.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
60 #include "dumpfile.h"
61 #include "builtins.h"
62 #include "rtl-iter.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 /* This file should be included last. */
69 #include "target-def.h"
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
74 /* Classifies an address.
76 ADDRESS_REG_IMM
77 A simple base register plus immediate offset.
79 ADDRESS_REG_WB
80 A base register indexed by immediate offset with writeback.
82 ADDRESS_REG_REG
83 A base register indexed by (optionally scaled) register.
85 ADDRESS_REG_UXTW
86 A base register indexed by (optionally scaled) zero-extended register.
88 ADDRESS_REG_SXTW
89 A base register indexed by (optionally scaled) sign-extended register.
91 ADDRESS_LO_SUM
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
94 ADDRESS_SYMBOLIC:
95 A constant symbolic address, in pc-relative literal pool. */
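/* Purely for illustration (these are assumed typical assembly spellings,
   not anything checked by the enum below), the classes roughly correspond
   to the following addressing forms:

     ADDRESS_REG_IMM    [x0, #16]
     ADDRESS_REG_WB     [x0, #16]!  or  [x0], #16
     ADDRESS_REG_REG    [x0, x1, lsl #3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw #2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw #2]
     ADDRESS_LO_SUM     [x0, #:lo12:symbol]
     ADDRESS_SYMBOLIC   a pc-relative literal-pool reference.  */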
97 enum aarch64_address_type {
98 ADDRESS_REG_IMM,
99 ADDRESS_REG_WB,
100 ADDRESS_REG_REG,
101 ADDRESS_REG_UXTW,
102 ADDRESS_REG_SXTW,
103 ADDRESS_LO_SUM,
104 ADDRESS_SYMBOLIC
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
109 rtx base;
110 rtx offset;
111 int shift;
112 enum aarch64_symbol_type symbol_type;
115 struct simd_immediate_info
117 rtx value;
118 int shift;
119 int element_width;
120 bool mvn;
121 bool msl;
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
127 #ifdef HAVE_AS_TLS
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
130 #endif
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
134 const_tree,
135 machine_mode *, int *,
136 bool *);
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
154 /* Global flag for PC relative loads. */
155 bool aarch64_pcrelative_literal_loads;
157 /* Support for command line parsing of boolean flags in the tuning
158 structures. */
159 struct aarch64_flag_desc
161 const char* name;
162 unsigned int flag;
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
175 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
176 { name, AARCH64_EXTRA_TUNE_##internal_name },
177 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
179 { "none", AARCH64_EXTRA_TUNE_NONE },
180 #include "aarch64-tuning-flags.def"
181 { "all", AARCH64_EXTRA_TUNE_ALL },
182 { NULL, AARCH64_EXTRA_TUNE_NONE }
185 /* Tuning parameters. */
187 static const struct cpu_addrcost_table generic_addrcost_table =
190 0, /* hi */
191 0, /* si */
192 0, /* di */
193 0, /* ti */
195 0, /* pre_modify */
196 0, /* post_modify */
197 0, /* register_offset */
198 0, /* register_sextend */
199 0, /* register_zextend */
200 0 /* imm_offset */
203 static const struct cpu_addrcost_table cortexa57_addrcost_table =
206 1, /* hi */
207 0, /* si */
208 0, /* di */
209 1, /* ti */
211 0, /* pre_modify */
212 0, /* post_modify */
213 0, /* register_offset */
214 0, /* register_sextend */
215 0, /* register_zextend */
216 0, /* imm_offset */
219 static const struct cpu_addrcost_table exynosm1_addrcost_table =
222 0, /* hi */
223 0, /* si */
224 0, /* di */
225 2, /* ti */
227 0, /* pre_modify */
228 0, /* post_modify */
229 1, /* register_offset */
230 1, /* register_sextend */
231 2, /* register_zextend */
232 0, /* imm_offset */
235 static const struct cpu_addrcost_table xgene1_addrcost_table =
238 1, /* hi */
239 0, /* si */
240 0, /* di */
241 1, /* ti */
243 1, /* pre_modify */
244 0, /* post_modify */
245 0, /* register_offset */
246 1, /* register_sextend */
247 1, /* register_zextend */
248 0, /* imm_offset */
251 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
254 1, /* hi */
255 0, /* si */
256 0, /* di */
257 1, /* ti */
259 0, /* pre_modify */
260 0, /* post_modify */
261 0, /* register_offset */
262 0, /* register_sextend */
263 0, /* register_zextend */
264 0 /* imm_offset */
267 static const struct cpu_addrcost_table vulcan_addrcost_table =
270 0, /* hi */
271 0, /* si */
272 0, /* di */
273 2, /* ti */
275 0, /* pre_modify */
276 0, /* post_modify */
277 2, /* register_offset */
278 3, /* register_sextend */
279 3, /* register_zextend */
280 0, /* imm_offset */
283 static const struct cpu_regmove_cost generic_regmove_cost =
285 1, /* GP2GP */
286 /* Avoid the use of slow int<->fp moves for spilling by setting
287 their cost higher than memmov_cost. */
288 5, /* GP2FP */
289 5, /* FP2GP */
290 2 /* FP2FP */
293 static const struct cpu_regmove_cost cortexa57_regmove_cost =
295 1, /* GP2GP */
296 /* Avoid the use of slow int<->fp moves for spilling by setting
297 their cost higher than memmov_cost. */
298 5, /* GP2FP */
299 5, /* FP2GP */
300 2 /* FP2FP */
303 static const struct cpu_regmove_cost cortexa53_regmove_cost =
305 1, /* GP2GP */
306 /* Avoid the use of slow int<->fp moves for spilling by setting
307 their cost higher than memmov_cost. */
308 5, /* GP2FP */
309 5, /* FP2GP */
310 2 /* FP2FP */
313 static const struct cpu_regmove_cost exynosm1_regmove_cost =
315 1, /* GP2GP */
316 /* Avoid the use of slow int<->fp moves for spilling by setting
317 their cost higher than memmov_cost (the actual costs are 4 and 9). */
318 9, /* GP2FP */
319 9, /* FP2GP */
320 1 /* FP2FP */
323 static const struct cpu_regmove_cost thunderx_regmove_cost =
325 2, /* GP2GP */
326 2, /* GP2FP */
327 6, /* FP2GP */
328 4 /* FP2FP */
331 static const struct cpu_regmove_cost xgene1_regmove_cost =
333 1, /* GP2GP */
334 /* Avoid the use of slow int<->fp moves for spilling by setting
335 their cost higher than memmov_cost. */
336 8, /* GP2FP */
337 8, /* FP2GP */
338 2 /* FP2FP */
341 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
343 2, /* GP2GP */
344 /* Avoid the use of int<->fp moves for spilling. */
345 6, /* GP2FP */
346 6, /* FP2GP */
347 4 /* FP2FP */
350 static const struct cpu_regmove_cost vulcan_regmove_cost =
352 1, /* GP2GP */
353 /* Avoid the use of int<->fp moves for spilling. */
354 8, /* GP2FP */
355 8, /* FP2GP */
356 4 /* FP2FP */
359 /* Generic costs for vector insn classes. */
360 static const struct cpu_vector_cost generic_vector_cost =
362 1, /* scalar_stmt_cost */
363 1, /* scalar_load_cost */
364 1, /* scalar_store_cost */
365 1, /* vec_stmt_cost */
366 2, /* vec_permute_cost */
367 1, /* vec_to_scalar_cost */
368 1, /* scalar_to_vec_cost */
369 1, /* vec_align_load_cost */
370 1, /* vec_unalign_load_cost */
371 1, /* vec_unalign_store_cost */
372 1, /* vec_store_cost */
373 3, /* cond_taken_branch_cost */
374 1 /* cond_not_taken_branch_cost */
377 /* ThunderX costs for vector insn classes. */
378 static const struct cpu_vector_cost thunderx_vector_cost =
380 1, /* scalar_stmt_cost */
381 3, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 4, /* vec_stmt_cost */
384 4, /* vec_permute_cost */
385 2, /* vec_to_scalar_cost */
386 2, /* scalar_to_vec_cost */
387 3, /* vec_align_load_cost */
388 10, /* vec_unalign_load_cost */
389 10, /* vec_unalign_store_cost */
390 1, /* vec_store_cost */
391 3, /* cond_taken_branch_cost */
392 3 /* cond_not_taken_branch_cost */
395 /* Generic costs for vector insn classes. */
396 static const struct cpu_vector_cost cortexa57_vector_cost =
398 1, /* scalar_stmt_cost */
399 4, /* scalar_load_cost */
400 1, /* scalar_store_cost */
401 2, /* vec_stmt_cost */
402 3, /* vec_permute_cost */
403 8, /* vec_to_scalar_cost */
404 8, /* scalar_to_vec_cost */
405 4, /* vec_align_load_cost */
406 4, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 1, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 static const struct cpu_vector_cost exynosm1_vector_cost =
415 1, /* scalar_stmt_cost */
416 5, /* scalar_load_cost */
417 1, /* scalar_store_cost */
418 3, /* vec_stmt_cost */
419 3, /* vec_permute_cost */
420 3, /* vec_to_scalar_cost */
421 3, /* scalar_to_vec_cost */
422 5, /* vec_align_load_cost */
423 5, /* vec_unalign_load_cost */
424 1, /* vec_unalign_store_cost */
425 1, /* vec_store_cost */
426 1, /* cond_taken_branch_cost */
427 1 /* cond_not_taken_branch_cost */
430 /* Generic costs for vector insn classes. */
431 static const struct cpu_vector_cost xgene1_vector_cost =
433 1, /* scalar_stmt_cost */
434 5, /* scalar_load_cost */
435 1, /* scalar_store_cost */
436 2, /* vec_stmt_cost */
437 2, /* vec_permute_cost */
438 4, /* vec_to_scalar_cost */
439 4, /* scalar_to_vec_cost */
440 10, /* vec_align_load_cost */
441 10, /* vec_unalign_load_cost */
442 2, /* vec_unalign_store_cost */
443 2, /* vec_store_cost */
444 2, /* cond_taken_branch_cost */
445 1 /* cond_not_taken_branch_cost */
448 /* Costs for vector insn classes for Vulcan. */
449 static const struct cpu_vector_cost vulcan_vector_cost =
451 6, /* scalar_stmt_cost */
452 4, /* scalar_load_cost */
453 1, /* scalar_store_cost */
454 6, /* vec_stmt_cost */
455 3, /* vec_permute_cost */
456 6, /* vec_to_scalar_cost */
457 5, /* scalar_to_vec_cost */
458 8, /* vec_align_load_cost */
459 8, /* vec_unalign_load_cost */
460 4, /* vec_unalign_store_cost */
461 4, /* vec_store_cost */
462 2, /* cond_taken_branch_cost */
463 1 /* cond_not_taken_branch_cost */
466 /* Generic costs for branch instructions. */
467 static const struct cpu_branch_cost generic_branch_cost =
469 2, /* Predictable. */
470 2 /* Unpredictable. */
473 /* Branch costs for Cortex-A57. */
474 static const struct cpu_branch_cost cortexa57_branch_cost =
476 1, /* Predictable. */
477 3 /* Unpredictable. */
480 /* Branch costs for Vulcan. */
481 static const struct cpu_branch_cost vulcan_branch_cost =
483 1, /* Predictable. */
484 3 /* Unpredictable. */
487 /* Generic approximation modes. */
488 static const cpu_approx_modes generic_approx_modes =
490 AARCH64_APPROX_NONE, /* division */
491 AARCH64_APPROX_NONE, /* sqrt */
492 AARCH64_APPROX_NONE /* recip_sqrt */
495 /* Approximation modes for Exynos M1. */
496 static const cpu_approx_modes exynosm1_approx_modes =
498 AARCH64_APPROX_NONE, /* division */
499 AARCH64_APPROX_ALL, /* sqrt */
500 AARCH64_APPROX_ALL /* recip_sqrt */
503 /* Approximation modes for X-Gene 1. */
504 static const cpu_approx_modes xgene1_approx_modes =
506 AARCH64_APPROX_NONE, /* division */
507 AARCH64_APPROX_NONE, /* sqrt */
508 AARCH64_APPROX_ALL /* recip_sqrt */
511 static const struct tune_params generic_tunings =
513 &cortexa57_extra_costs,
514 &generic_addrcost_table,
515 &generic_regmove_cost,
516 &generic_vector_cost,
517 &generic_branch_cost,
518 &generic_approx_modes,
519 4, /* memmov_cost */
520 2, /* issue_rate */
521 AARCH64_FUSE_NOTHING, /* fusible_ops */
522 8, /* function_align. */
523 8, /* jump_align. */
524 4, /* loop_align. */
525 2, /* int_reassoc_width. */
526 4, /* fp_reassoc_width. */
527 1, /* vec_reassoc_width. */
528 2, /* min_div_recip_mul_sf. */
529 2, /* min_div_recip_mul_df. */
530 0, /* max_case_values. */
531 0, /* cache_line_size. */
532 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
533 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
536 static const struct tune_params cortexa35_tunings =
538 &cortexa53_extra_costs,
539 &generic_addrcost_table,
540 &cortexa53_regmove_cost,
541 &generic_vector_cost,
542 &cortexa57_branch_cost,
543 &generic_approx_modes,
544 4, /* memmov_cost */
545 1, /* issue_rate */
546 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
547 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
548 16, /* function_align. */
549 8, /* jump_align. */
550 8, /* loop_align. */
551 2, /* int_reassoc_width. */
552 4, /* fp_reassoc_width. */
553 1, /* vec_reassoc_width. */
554 2, /* min_div_recip_mul_sf. */
555 2, /* min_div_recip_mul_df. */
556 0, /* max_case_values. */
557 0, /* cache_line_size. */
558 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
559 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
562 static const struct tune_params cortexa53_tunings =
564 &cortexa53_extra_costs,
565 &generic_addrcost_table,
566 &cortexa53_regmove_cost,
567 &generic_vector_cost,
568 &cortexa57_branch_cost,
569 &generic_approx_modes,
570 4, /* memmov_cost */
571 2, /* issue_rate */
572 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
573 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
574 16, /* function_align. */
575 8, /* jump_align. */
576 8, /* loop_align. */
577 2, /* int_reassoc_width. */
578 4, /* fp_reassoc_width. */
579 1, /* vec_reassoc_width. */
580 2, /* min_div_recip_mul_sf. */
581 2, /* min_div_recip_mul_df. */
582 0, /* max_case_values. */
583 0, /* cache_line_size. */
584 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
585 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
588 static const struct tune_params cortexa57_tunings =
590 &cortexa57_extra_costs,
591 &cortexa57_addrcost_table,
592 &cortexa57_regmove_cost,
593 &cortexa57_vector_cost,
594 &cortexa57_branch_cost,
595 &generic_approx_modes,
596 4, /* memmov_cost */
597 3, /* issue_rate */
598 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
599 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
600 16, /* function_align. */
601 8, /* jump_align. */
602 8, /* loop_align. */
603 2, /* int_reassoc_width. */
604 4, /* fp_reassoc_width. */
605 1, /* vec_reassoc_width. */
606 2, /* min_div_recip_mul_sf. */
607 2, /* min_div_recip_mul_df. */
608 0, /* max_case_values. */
609 0, /* cache_line_size. */
610 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
611 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
614 static const struct tune_params cortexa72_tunings =
616 &cortexa57_extra_costs,
617 &cortexa57_addrcost_table,
618 &cortexa57_regmove_cost,
619 &cortexa57_vector_cost,
620 &cortexa57_branch_cost,
621 &generic_approx_modes,
622 4, /* memmov_cost */
623 3, /* issue_rate */
624 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
625 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
626 16, /* function_align. */
627 8, /* jump_align. */
628 8, /* loop_align. */
629 2, /* int_reassoc_width. */
630 4, /* fp_reassoc_width. */
631 1, /* vec_reassoc_width. */
632 2, /* min_div_recip_mul_sf. */
633 2, /* min_div_recip_mul_df. */
634 0, /* max_case_values. */
635 0, /* cache_line_size. */
636 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
637 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
640 static const struct tune_params cortexa73_tunings =
642 &cortexa57_extra_costs,
643 &cortexa57_addrcost_table,
644 &cortexa57_regmove_cost,
645 &cortexa57_vector_cost,
646 &cortexa57_branch_cost,
647 &generic_approx_modes,
648 4, /* memmov_cost. */
649 2, /* issue_rate. */
650 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
651 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
652 16, /* function_align. */
653 8, /* jump_align. */
654 8, /* loop_align. */
655 2, /* int_reassoc_width. */
656 4, /* fp_reassoc_width. */
657 1, /* vec_reassoc_width. */
658 2, /* min_div_recip_mul_sf. */
659 2, /* min_div_recip_mul_df. */
660 0, /* max_case_values. */
661 0, /* cache_line_size. */
662 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
663 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
666 static const struct tune_params exynosm1_tunings =
668 &exynosm1_extra_costs,
669 &exynosm1_addrcost_table,
670 &exynosm1_regmove_cost,
671 &exynosm1_vector_cost,
672 &generic_branch_cost,
673 &exynosm1_approx_modes,
674 4, /* memmov_cost */
675 3, /* issue_rate */
676 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
677 4, /* function_align. */
678 4, /* jump_align. */
679 4, /* loop_align. */
680 2, /* int_reassoc_width. */
681 4, /* fp_reassoc_width. */
682 1, /* vec_reassoc_width. */
683 2, /* min_div_recip_mul_sf. */
684 2, /* min_div_recip_mul_df. */
685 48, /* max_case_values. */
686 64, /* cache_line_size. */
687 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
688 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
691 static const struct tune_params thunderx_tunings =
693 &thunderx_extra_costs,
694 &generic_addrcost_table,
695 &thunderx_regmove_cost,
696 &thunderx_vector_cost,
697 &generic_branch_cost,
698 &generic_approx_modes,
699 6, /* memmov_cost */
700 2, /* issue_rate */
701 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
702 8, /* function_align. */
703 8, /* jump_align. */
704 8, /* loop_align. */
705 2, /* int_reassoc_width. */
706 4, /* fp_reassoc_width. */
707 1, /* vec_reassoc_width. */
708 2, /* min_div_recip_mul_sf. */
709 2, /* min_div_recip_mul_df. */
710 0, /* max_case_values. */
711 0, /* cache_line_size. */
712 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
713 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
716 static const struct tune_params xgene1_tunings =
718 &xgene1_extra_costs,
719 &xgene1_addrcost_table,
720 &xgene1_regmove_cost,
721 &xgene1_vector_cost,
722 &generic_branch_cost,
723 &xgene1_approx_modes,
724 6, /* memmov_cost */
725 4, /* issue_rate */
726 AARCH64_FUSE_NOTHING, /* fusible_ops */
727 16, /* function_align. */
728 8, /* jump_align. */
729 16, /* loop_align. */
730 2, /* int_reassoc_width. */
731 4, /* fp_reassoc_width. */
732 1, /* vec_reassoc_width. */
733 2, /* min_div_recip_mul_sf. */
734 2, /* min_div_recip_mul_df. */
735 0, /* max_case_values. */
736 0, /* cache_line_size. */
737 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
738 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
741 static const struct tune_params qdf24xx_tunings =
743 &qdf24xx_extra_costs,
744 &qdf24xx_addrcost_table,
745 &qdf24xx_regmove_cost,
746 &generic_vector_cost,
747 &generic_branch_cost,
748 &generic_approx_modes,
749 4, /* memmov_cost */
750 4, /* issue_rate */
751 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
752 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
753 16, /* function_align. */
754 8, /* jump_align. */
755 16, /* loop_align. */
756 2, /* int_reassoc_width. */
757 4, /* fp_reassoc_width. */
758 1, /* vec_reassoc_width. */
759 2, /* min_div_recip_mul_sf. */
760 2, /* min_div_recip_mul_df. */
761 0, /* max_case_values. */
762 64, /* cache_line_size. */
763 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
764 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
767 static const struct tune_params vulcan_tunings =
769 &vulcan_extra_costs,
770 &vulcan_addrcost_table,
771 &vulcan_regmove_cost,
772 &vulcan_vector_cost,
773 &vulcan_branch_cost,
774 &generic_approx_modes,
775 4, /* memmov_cost. */
776 4, /* issue_rate. */
777 AARCH64_FUSE_NOTHING, /* fusible_ops. */
778 16, /* function_align. */
779 8, /* jump_align. */
780 16, /* loop_align. */
781 3, /* int_reassoc_width. */
782 2, /* fp_reassoc_width. */
783 2, /* vec_reassoc_width. */
784 2, /* min_div_recip_mul_sf. */
785 2, /* min_div_recip_mul_df. */
786 0, /* max_case_values. */
787 64, /* cache_line_size. */
788 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
789 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
792 /* Support for fine-grained override of the tuning structures. */
793 struct aarch64_tuning_override_function
795 const char* name;
796 void (*parse_override)(const char*, struct tune_params*);
799 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
800 static void aarch64_parse_tune_string (const char*, struct tune_params*);
802 static const struct aarch64_tuning_override_function
803 aarch64_tuning_override_functions[] =
805 { "fuse", aarch64_parse_fuse_string },
806 { "tune", aarch64_parse_tune_string },
807 { NULL, NULL }
810 /* A processor implementing AArch64. */
811 struct processor
813 const char *const name;
814 enum aarch64_processor ident;
815 enum aarch64_processor sched_core;
816 enum aarch64_arch arch;
817 unsigned architecture_version;
818 const unsigned long flags;
819 const struct tune_params *const tune;
822 /* Architectures implementing AArch64. */
823 static const struct processor all_architectures[] =
825 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
826 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
827 #include "aarch64-arches.def"
828 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
831 /* Processor cores implementing AArch64. */
832 static const struct processor all_cores[] =
834 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
835 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
836 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
837 FLAGS, &COSTS##_tunings},
838 #include "aarch64-cores.def"
839 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
840 AARCH64_FL_FOR_ARCH8, &generic_tunings},
841 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
845 /* Target specification. These are populated by the -march, -mtune, -mcpu
846 handling code or by target attributes. */
847 static const struct processor *selected_arch;
848 static const struct processor *selected_cpu;
849 static const struct processor *selected_tune;
851 /* The current tuning set. */
852 struct tune_params aarch64_tune_params = generic_tunings;
854 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
856 /* An ISA extension in the co-processor and main instruction set space. */
857 struct aarch64_option_extension
859 const char *const name;
860 const unsigned long flags_on;
861 const unsigned long flags_off;
864 typedef enum aarch64_cond_code
866 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
867 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
868 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
870 aarch64_cc;
872 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
874 /* The condition codes of the processor, and the inverse function. */
875 static const char * const aarch64_condition_codes[] =
877 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
878 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
881 /* Generate code to enable conditional branches in functions over 1 MiB. */
882 const char *
883 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
884 const char * branch_format)
886 rtx_code_label * tmp_label = gen_label_rtx ();
887 char label_buf[256];
888 char buffer[128];
889 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
890 CODE_LABEL_NUMBER (tmp_label));
891 const char *label_ptr = targetm.strip_name_encoding (label_buf);
892 rtx dest_label = operands[pos_label];
893 operands[pos_label] = tmp_label;
895 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
896 output_asm_insn (buffer, operands);
898 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
899 operands[pos_label] = dest_label;
900 output_asm_insn (buffer, operands);
901 return "";
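/* Illustrative sketch only (the label names here are invented; the real
   local label comes from ASM_GENERATE_INTERNAL_LABEL).  A conditional
   branch such as

     cbz  x0, .Lfar_target

   whose target lies outside the +/-1 MiB conditional-branch range can be
   rewritten, with the caller passing the inverted condition as
   BRANCH_FORMAT, as

     cbnz x0, .Lbcond0
     b    .Lfar_target
   .Lbcond0:

   where the unconditional B has a +/-128 MiB range.  */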
904 void
905 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
907 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
908 if (TARGET_GENERAL_REGS_ONLY)
909 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
910 else
911 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
914 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
915 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
916 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
917 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
918 cost (in this case the best class is the lowest cost one). Using ALL_REGS
919 irrespective of its cost results in bad allocations with many redundant
920 int<->FP moves which are expensive on various cores.
921 To avoid this we don't allow ALL_REGS as the allocno class, but force a
922 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
923 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
924 Otherwise set the allocno class depending on the mode.
925 The result of this is that it is no longer inefficient to have a higher
926 memory move cost than the register move cost.
929 static reg_class_t
930 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
931 reg_class_t best_class)
933 enum machine_mode mode;
935 if (allocno_class != ALL_REGS)
936 return allocno_class;
938 if (best_class != ALL_REGS)
939 return best_class;
941 mode = PSEUDO_REGNO_MODE (regno);
942 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
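/* For example: a DFmode or V4SImode pseudo whose allocno class and best
   class both come back as ALL_REGS is forced into FP_REGS by the code
   above, while an SImode pseudo in the same situation is forced into
   GENERAL_REGS, so ALL_REGS is never used as an allocno class.  */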
945 static unsigned int
946 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
948 if (GET_MODE_UNIT_SIZE (mode) == 4)
949 return aarch64_tune_params.min_div_recip_mul_sf;
950 return aarch64_tune_params.min_div_recip_mul_df;
953 static int
954 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
955 enum machine_mode mode)
957 if (VECTOR_MODE_P (mode))
958 return aarch64_tune_params.vec_reassoc_width;
959 if (INTEGRAL_MODE_P (mode))
960 return aarch64_tune_params.int_reassoc_width;
961 if (FLOAT_MODE_P (mode))
962 return aarch64_tune_params.fp_reassoc_width;
963 return 1;
966 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
967 unsigned
968 aarch64_dbx_register_number (unsigned regno)
970 if (GP_REGNUM_P (regno))
971 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
972 else if (regno == SP_REGNUM)
973 return AARCH64_DWARF_SP;
974 else if (FP_REGNUM_P (regno))
975 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
977 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
978 equivalent DWARF register. */
979 return DWARF_FRAME_REGISTERS;
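/* A sketch of the resulting mapping, assuming the usual AArch64 DWARF
   numbering (AARCH64_DWARF_R0 == 0, AARCH64_DWARF_SP == 31 and
   AARCH64_DWARF_V0 == 64):

     x0..x30 -> 0..30,  sp -> 31,  v0..v31 -> 64..95,

   and every other register maps to DWARF_FRAME_REGISTERS, meaning "no
   equivalent DWARF register".  */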
982 /* Return TRUE if MODE is any of the large INT modes. */
983 static bool
984 aarch64_vect_struct_mode_p (machine_mode mode)
986 return mode == OImode || mode == CImode || mode == XImode;
989 /* Return TRUE if MODE is any of the vector modes. */
990 static bool
991 aarch64_vector_mode_p (machine_mode mode)
993 return aarch64_vector_mode_supported_p (mode)
994 || aarch64_vect_struct_mode_p (mode);
997 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
998 static bool
999 aarch64_array_mode_supported_p (machine_mode mode,
1000 unsigned HOST_WIDE_INT nelems)
1002 if (TARGET_SIMD
1003 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1004 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1005 && (nelems >= 2 && nelems <= 4))
1006 return true;
1008 return false;
1011 /* Implement HARD_REGNO_NREGS. */
1014 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1016 switch (aarch64_regno_regclass (regno))
1018 case FP_REGS:
1019 case FP_LO_REGS:
1020 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1021 default:
1022 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1024 gcc_unreachable ();
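/* Illustrative numbers only, assuming UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16: a TImode value (16 bytes) occupies two general
   registers but a single vector register, while an OImode vector-struct
   value (32 bytes) occupies two vector registers.  */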
1027 /* Implement HARD_REGNO_MODE_OK. */
1030 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1032 if (GET_MODE_CLASS (mode) == MODE_CC)
1033 return regno == CC_REGNUM;
1035 if (regno == SP_REGNUM)
1036 /* The purpose of comparing with ptr_mode is to support the
1037 global register variable associated with the stack pointer
1038 register via the syntax of asm ("wsp") in ILP32. */
1039 return mode == Pmode || mode == ptr_mode;
1041 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1042 return mode == Pmode;
1044 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1045 return 1;
1047 if (FP_REGNUM_P (regno))
1049 if (aarch64_vect_struct_mode_p (mode))
1050 return
1051 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1052 else
1053 return 1;
1056 return 0;
1059 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1060 machine_mode
1061 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1062 machine_mode mode)
1064 /* Handle modes that fit within single registers. */
1065 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1067 if (GET_MODE_SIZE (mode) >= 4)
1068 return mode;
1069 else
1070 return SImode;
1072 /* Fall back to generic for multi-reg and very large modes. */
1073 else
1074 return choose_hard_reg_mode (regno, nregs, false);
1077 /* Return true if calls to DECL should be treated as
1078 long-calls (ie called via a register). */
1079 static bool
1080 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1082 return false;
1085 /* Return true if calls to symbol-ref SYM should be treated as
1086 long-calls (ie called via a register). */
1087 bool
1088 aarch64_is_long_call_p (rtx sym)
1090 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1093 /* Return true if calls to symbol-ref SYM should not go through
1094 plt stubs. */
1096 bool
1097 aarch64_is_noplt_call_p (rtx sym)
1099 const_tree decl = SYMBOL_REF_DECL (sym);
1101 if (flag_pic
1102 && decl
1103 && (!flag_plt
1104 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1105 && !targetm.binds_local_p (decl))
1106 return true;
1108 return false;
1111 /* Return true if the offsets to a zero/sign-extract operation
1112 represent an expression that matches an extend operation. The
1113 operands represent the parameters from
1115 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1116 bool
1117 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1118 rtx extract_imm)
1120 HOST_WIDE_INT mult_val, extract_val;
1122 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1123 return false;
1125 mult_val = INTVAL (mult_imm);
1126 extract_val = INTVAL (extract_imm);
1128 if (extract_val > 8
1129 && extract_val < GET_MODE_BITSIZE (mode)
1130 && exact_log2 (extract_val & ~7) > 0
1131 && (extract_val & 7) <= 4
1132 && mult_val == (1 << (extract_val & 7)))
1133 return true;
1135 return false;
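/* Worked example (illustrative): with MODE == DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34 this returns true, since 34 > 8, 34 & ~7 == 32 is a
   power of two, 34 & 7 == 2 <= 4, and MULT_IMM == 1 << 2.  Such an
   extract behaves like an extension of the low 32 bits scaled by 4,
   i.e. an extend followed by a left shift by 2.  */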
1138 /* Emit an insn that's a simple single-set. Both the operands must be
1139 known to be valid. */
1140 inline static rtx_insn *
1141 emit_set_insn (rtx x, rtx y)
1143 return emit_insn (gen_rtx_SET (x, y));
1146 /* X and Y are two things to compare using CODE. Emit the compare insn and
1147 return the rtx for register 0 in the proper mode. */
1149 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1151 machine_mode mode = SELECT_CC_MODE (code, x, y);
1152 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1154 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1155 return cc_reg;
1158 /* Build the SYMBOL_REF for __tls_get_addr. */
1160 static GTY(()) rtx tls_get_addr_libfunc;
1163 aarch64_tls_get_addr (void)
1165 if (!tls_get_addr_libfunc)
1166 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1167 return tls_get_addr_libfunc;
1170 /* Return the TLS model to use for ADDR. */
1172 static enum tls_model
1173 tls_symbolic_operand_type (rtx addr)
1175 enum tls_model tls_kind = TLS_MODEL_NONE;
1176 rtx sym, addend;
1178 if (GET_CODE (addr) == CONST)
1180 split_const (addr, &sym, &addend);
1181 if (GET_CODE (sym) == SYMBOL_REF)
1182 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1184 else if (GET_CODE (addr) == SYMBOL_REF)
1185 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1187 return tls_kind;
1190 /* We allow lo_sum expressions in our legitimate addresses
1191 so that combine can take care of combining addresses where
1192 necessary, but for generation purposes we generate the address
1193 as follows:
1194 RTL Absolute
1195 tmp = hi (symbol_ref); adrp x1, foo
1196 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1199 PIC TLS
1200 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1201 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1202 bl __tls_get_addr
1205 Load TLS symbol, depending on TLS mechanism and TLS access model.
1207 Global Dynamic - Traditional TLS:
1208 adrp tmp, :tlsgd:imm
1209 add dest, tmp, #:tlsgd_lo12:imm
1210 bl __tls_get_addr
1212 Global Dynamic - TLS Descriptors:
1213 adrp dest, :tlsdesc:imm
1214 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1215 add dest, dest, #:tlsdesc_lo12:imm
1216 blr tmp
1217 mrs tp, tpidr_el0
1218 add dest, dest, tp
1220 Initial Exec:
1221 mrs tp, tpidr_el0
1222 adrp tmp, :gottprel:imm
1223 ldr dest, [tmp, #:gottprel_lo12:imm]
1224 add dest, dest, tp
1226 Local Exec:
1227 mrs tp, tpidr_el0
1228 add t0, tp, #:tprel_hi12:imm, lsl #12
1229 add t0, t0, #:tprel_lo12_nc:imm
1232 static void
1233 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1234 enum aarch64_symbol_type type)
1236 switch (type)
1238 case SYMBOL_SMALL_ABSOLUTE:
1240 /* In ILP32, the mode of dest can be either SImode or DImode. */
1241 rtx tmp_reg = dest;
1242 machine_mode mode = GET_MODE (dest);
1244 gcc_assert (mode == Pmode || mode == ptr_mode);
1246 if (can_create_pseudo_p ())
1247 tmp_reg = gen_reg_rtx (mode);
1249 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1250 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1251 return;
1254 case SYMBOL_TINY_ABSOLUTE:
1255 emit_insn (gen_rtx_SET (dest, imm));
1256 return;
1258 case SYMBOL_SMALL_GOT_28K:
1260 machine_mode mode = GET_MODE (dest);
1261 rtx gp_rtx = pic_offset_table_rtx;
1262 rtx insn;
1263 rtx mem;
1265 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1266 here before RTL expansion. The tree IVOPTs pass generates RTL
1267 patterns to decide rtx costs, in which case pic_offset_table_rtx is
1268 not initialized. In that case there is no need to generate the first
1269 adrp instruction, as the final cost of a global variable access is
1270 one instruction. */
1271 if (gp_rtx != NULL)
1273 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1274 use the page base as the GOT base, the first page may be wasted;
1275 in the worst case only 28K of space is left for the GOT).
1277 The generated instruction sequence for accessing a global variable is
1280 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1282 Only one instruction is needed. But we must initialize
1283 pic_offset_table_rtx properly. We generate the initialization insn for
1284 every global access, and let CSE remove all redundant copies.
1286 The final instruction sequence will look like the following
1287 when accessing multiple global variables.
1289 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1291 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1292 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1293 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1294 ... */
1296 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1297 crtl->uses_pic_offset_table = 1;
1298 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1300 if (mode != GET_MODE (gp_rtx))
1301 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1304 if (mode == ptr_mode)
1306 if (mode == DImode)
1307 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1308 else
1309 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1311 mem = XVECEXP (SET_SRC (insn), 0, 0);
1313 else
1315 gcc_assert (mode == Pmode);
1317 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1318 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1321 /* The operand is expected to be MEM. Whenever the related insn
1322 pattern is changed, the code above which calculates MEM should be
1323 updated. */
1324 gcc_assert (GET_CODE (mem) == MEM);
1325 MEM_READONLY_P (mem) = 1;
1326 MEM_NOTRAP_P (mem) = 1;
1327 emit_insn (insn);
1328 return;
1331 case SYMBOL_SMALL_GOT_4G:
1333 /* In ILP32, the mode of dest can be either SImode or DImode,
1334 while the got entry is always of SImode size. The mode of
1335 dest depends on how dest is used: if dest is assigned to a
1336 pointer (e.g. stored in memory), it has SImode; it may have
1337 DImode if dest is dereferenced to access memory.
1338 This is why we have to handle three different ldr_got_small
1339 patterns here (two patterns for ILP32). */
1341 rtx insn;
1342 rtx mem;
1343 rtx tmp_reg = dest;
1344 machine_mode mode = GET_MODE (dest);
1346 if (can_create_pseudo_p ())
1347 tmp_reg = gen_reg_rtx (mode);
1349 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1350 if (mode == ptr_mode)
1352 if (mode == DImode)
1353 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1354 else
1355 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1357 mem = XVECEXP (SET_SRC (insn), 0, 0);
1359 else
1361 gcc_assert (mode == Pmode);
1363 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1364 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1367 gcc_assert (GET_CODE (mem) == MEM);
1368 MEM_READONLY_P (mem) = 1;
1369 MEM_NOTRAP_P (mem) = 1;
1370 emit_insn (insn);
1371 return;
1374 case SYMBOL_SMALL_TLSGD:
1376 rtx_insn *insns;
1377 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1379 start_sequence ();
1380 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1381 insns = get_insns ();
1382 end_sequence ();
1384 RTL_CONST_CALL_P (insns) = 1;
1385 emit_libcall_block (insns, dest, result, imm);
1386 return;
1389 case SYMBOL_SMALL_TLSDESC:
1391 machine_mode mode = GET_MODE (dest);
1392 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1393 rtx tp;
1395 gcc_assert (mode == Pmode || mode == ptr_mode);
1397 /* In ILP32, the got entry is always of SImode size. Unlike
1398 small GOT, the dest is fixed at reg 0. */
1399 if (TARGET_ILP32)
1400 emit_insn (gen_tlsdesc_small_si (imm));
1401 else
1402 emit_insn (gen_tlsdesc_small_di (imm));
1403 tp = aarch64_load_tp (NULL);
1405 if (mode != Pmode)
1406 tp = gen_lowpart (mode, tp);
1408 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1409 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1410 return;
1413 case SYMBOL_SMALL_TLSIE:
1415 /* In ILP32, the mode of dest can be either SImode or DImode,
1416 while the got entry is always of SImode size. The mode of
1417 dest depends on how dest is used: if dest is assigned to a
1418 pointer (e.g. stored in memory), it has SImode; it may have
1419 DImode if dest is dereferenced to access memory.
1420 This is why we have to handle three different tlsie_small
1421 patterns here (two patterns for ILP32). */
1422 machine_mode mode = GET_MODE (dest);
1423 rtx tmp_reg = gen_reg_rtx (mode);
1424 rtx tp = aarch64_load_tp (NULL);
1426 if (mode == ptr_mode)
1428 if (mode == DImode)
1429 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1430 else
1432 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1433 tp = gen_lowpart (mode, tp);
1436 else
1438 gcc_assert (mode == Pmode);
1439 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1442 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1443 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1444 return;
1447 case SYMBOL_TLSLE12:
1448 case SYMBOL_TLSLE24:
1449 case SYMBOL_TLSLE32:
1450 case SYMBOL_TLSLE48:
1452 machine_mode mode = GET_MODE (dest);
1453 rtx tp = aarch64_load_tp (NULL);
1455 if (mode != Pmode)
1456 tp = gen_lowpart (mode, tp);
1458 switch (type)
1460 case SYMBOL_TLSLE12:
1461 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1462 (dest, tp, imm));
1463 break;
1464 case SYMBOL_TLSLE24:
1465 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1466 (dest, tp, imm));
1467 break;
1468 case SYMBOL_TLSLE32:
1469 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1470 (dest, imm));
1471 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1472 (dest, dest, tp));
1473 break;
1474 case SYMBOL_TLSLE48:
1475 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1476 (dest, imm));
1477 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1478 (dest, dest, tp));
1479 break;
1480 default:
1481 gcc_unreachable ();
1484 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1485 return;
1488 case SYMBOL_TINY_GOT:
1489 emit_insn (gen_ldr_got_tiny (dest, imm));
1490 return;
1492 case SYMBOL_TINY_TLSIE:
1494 machine_mode mode = GET_MODE (dest);
1495 rtx tp = aarch64_load_tp (NULL);
1497 if (mode == ptr_mode)
1499 if (mode == DImode)
1500 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1501 else
1503 tp = gen_lowpart (mode, tp);
1504 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1507 else
1509 gcc_assert (mode == Pmode);
1510 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1513 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1514 return;
1517 default:
1518 gcc_unreachable ();
1522 /* Emit a move from SRC to DEST. Assume that the move expanders can
1523 handle all moves if !can_create_pseudo_p (). The distinction is
1524 important because, unlike emit_move_insn, the move expanders know
1525 how to force Pmode objects into the constant pool even when the
1526 constant pool address is not itself legitimate. */
1527 static rtx
1528 aarch64_emit_move (rtx dest, rtx src)
1530 return (can_create_pseudo_p ()
1531 ? emit_move_insn (dest, src)
1532 : emit_move_insn_1 (dest, src));
1535 /* Split a 128-bit move operation into two 64-bit move operations,
1536 taking care to handle partial overlap of register to register
1537 copies. Special cases are needed when moving between GP regs and
1538 FP regs. SRC can be a register, constant or memory; DST a register
1539 or memory. If either operand is memory it must not have any side
1540 effects. */
1541 void
1542 aarch64_split_128bit_move (rtx dst, rtx src)
1544 rtx dst_lo, dst_hi;
1545 rtx src_lo, src_hi;
1547 machine_mode mode = GET_MODE (dst);
1549 gcc_assert (mode == TImode || mode == TFmode);
1550 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1551 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1553 if (REG_P (dst) && REG_P (src))
1555 int src_regno = REGNO (src);
1556 int dst_regno = REGNO (dst);
1558 /* Handle FP <-> GP regs. */
1559 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1561 src_lo = gen_lowpart (word_mode, src);
1562 src_hi = gen_highpart (word_mode, src);
1564 if (mode == TImode)
1566 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1567 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1569 else
1571 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1572 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1574 return;
1576 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1578 dst_lo = gen_lowpart (word_mode, dst);
1579 dst_hi = gen_highpart (word_mode, dst);
1581 if (mode == TImode)
1583 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1584 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1586 else
1588 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1589 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1591 return;
1595 dst_lo = gen_lowpart (word_mode, dst);
1596 dst_hi = gen_highpart (word_mode, dst);
1597 src_lo = gen_lowpart (word_mode, src);
1598 src_hi = gen_highpart_mode (word_mode, mode, src);
1600 /* At most one pairing may overlap. */
1601 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1603 aarch64_emit_move (dst_hi, src_hi);
1604 aarch64_emit_move (dst_lo, src_lo);
1606 else
1608 aarch64_emit_move (dst_lo, src_lo);
1609 aarch64_emit_move (dst_hi, src_hi);
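/* Purely as an illustration of the overlap handling above: splitting a
   TImode copy whose destination pair is {x1, x2} and whose source pair is
   {x0, x1} must move the high halves first (x2 <- x1, then x1 <- x0),
   whereas copying {x1, x2} into {x0, x1} must move the low halves first,
   otherwise one source half would be clobbered before it is read.  */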
1613 bool
1614 aarch64_split_128bit_move_p (rtx dst, rtx src)
1616 return (! REG_P (src)
1617 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1620 /* Split a complex SIMD combine. */
1622 void
1623 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1625 machine_mode src_mode = GET_MODE (src1);
1626 machine_mode dst_mode = GET_MODE (dst);
1628 gcc_assert (VECTOR_MODE_P (dst_mode));
1630 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1632 rtx (*gen) (rtx, rtx, rtx);
1634 switch (src_mode)
1636 case V8QImode:
1637 gen = gen_aarch64_simd_combinev8qi;
1638 break;
1639 case V4HImode:
1640 gen = gen_aarch64_simd_combinev4hi;
1641 break;
1642 case V2SImode:
1643 gen = gen_aarch64_simd_combinev2si;
1644 break;
1645 case V4HFmode:
1646 gen = gen_aarch64_simd_combinev4hf;
1647 break;
1648 case V2SFmode:
1649 gen = gen_aarch64_simd_combinev2sf;
1650 break;
1651 case DImode:
1652 gen = gen_aarch64_simd_combinedi;
1653 break;
1654 case DFmode:
1655 gen = gen_aarch64_simd_combinedf;
1656 break;
1657 default:
1658 gcc_unreachable ();
1661 emit_insn (gen (dst, src1, src2));
1662 return;
1666 /* Split a complex SIMD move. */
1668 void
1669 aarch64_split_simd_move (rtx dst, rtx src)
1671 machine_mode src_mode = GET_MODE (src);
1672 machine_mode dst_mode = GET_MODE (dst);
1674 gcc_assert (VECTOR_MODE_P (dst_mode));
1676 if (REG_P (dst) && REG_P (src))
1678 rtx (*gen) (rtx, rtx);
1680 gcc_assert (VECTOR_MODE_P (src_mode));
1682 switch (src_mode)
1684 case V16QImode:
1685 gen = gen_aarch64_split_simd_movv16qi;
1686 break;
1687 case V8HImode:
1688 gen = gen_aarch64_split_simd_movv8hi;
1689 break;
1690 case V4SImode:
1691 gen = gen_aarch64_split_simd_movv4si;
1692 break;
1693 case V2DImode:
1694 gen = gen_aarch64_split_simd_movv2di;
1695 break;
1696 case V8HFmode:
1697 gen = gen_aarch64_split_simd_movv8hf;
1698 break;
1699 case V4SFmode:
1700 gen = gen_aarch64_split_simd_movv4sf;
1701 break;
1702 case V2DFmode:
1703 gen = gen_aarch64_split_simd_movv2df;
1704 break;
1705 default:
1706 gcc_unreachable ();
1709 emit_insn (gen (dst, src));
1710 return;
1714 bool
1715 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1716 machine_mode ymode, rtx y)
1718 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1719 gcc_assert (r != NULL);
1720 return rtx_equal_p (x, r);
1724 static rtx
1725 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1727 if (can_create_pseudo_p ())
1728 return force_reg (mode, value);
1729 else
1731 x = aarch64_emit_move (x, value);
1732 return x;
1737 static rtx
1738 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1740 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1742 rtx high;
1743 /* Load the full offset into a register. This
1744 might be improvable in the future. */
1745 high = GEN_INT (offset);
1746 offset = 0;
1747 high = aarch64_force_temporary (mode, temp, high);
1748 reg = aarch64_force_temporary (mode, temp,
1749 gen_rtx_PLUS (mode, high, reg));
1751 return plus_constant (mode, reg, offset);
1754 static int
1755 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1756 machine_mode mode)
1758 int i;
1759 unsigned HOST_WIDE_INT val, val2, mask;
1760 int one_match, zero_match;
1761 int num_insns;
1763 val = INTVAL (imm);
1765 if (aarch64_move_imm (val, mode))
1767 if (generate)
1768 emit_insn (gen_rtx_SET (dest, imm));
1769 return 1;
1772 if ((val >> 32) == 0 || mode == SImode)
1774 if (generate)
1776 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1777 if (mode == SImode)
1778 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1779 GEN_INT ((val >> 16) & 0xffff)));
1780 else
1781 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1782 GEN_INT ((val >> 16) & 0xffff)));
1784 return 2;
1787 /* Remaining cases are all for DImode. */
1789 mask = 0xffff;
1790 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1791 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1792 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1793 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1795 if (zero_match != 2 && one_match != 2)
1797 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1798 For a 64-bit bitmask try whether changing 16 bits to all ones or
1799 zeroes creates a valid bitmask. To check any repeated bitmask,
1800 try using 16 bits from the other 32-bit half of val. */
1802 for (i = 0; i < 64; i += 16, mask <<= 16)
1804 val2 = val & ~mask;
1805 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1806 break;
1807 val2 = val | mask;
1808 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1809 break;
1810 val2 = val2 & ~mask;
1811 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1812 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1813 break;
1815 if (i != 64)
1817 if (generate)
1819 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1820 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1821 GEN_INT ((val >> i) & 0xffff)));
1823 return 2;
1827 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1828 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1829 otherwise skip zero bits. */
1831 num_insns = 1;
1832 mask = 0xffff;
1833 val2 = one_match > zero_match ? ~val : val;
1834 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1836 if (generate)
1837 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1838 ? (val | ~(mask << i))
1839 : (val & (mask << i)))));
1840 for (i += 16; i < 64; i += 16)
1842 if ((val2 & (mask << i)) == 0)
1843 continue;
1844 if (generate)
1845 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1846 GEN_INT ((val >> i) & 0xffff)));
1847 num_insns ++;
1850 return num_insns;
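/* Two illustrative expansions (a sketch, not an exhaustive list of the
   cases handled above):

     0x0000000000001234    mov  dest, #0x1234                  1 insn
     0x0000000012345678    mov  dest, #0x5678
                           movk dest, #0x1234, lsl #16         2 insns

   Wider DImode constants are built from one leading mov, with the skipped
   16-bit chunks forced to all-zeroes or all-ones, followed by a movk for
   each remaining chunk.  */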
1854 void
1855 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1857 machine_mode mode = GET_MODE (dest);
1859 gcc_assert (mode == SImode || mode == DImode);
1861 /* Check on what type of symbol it is. */
1862 if (GET_CODE (imm) == SYMBOL_REF
1863 || GET_CODE (imm) == LABEL_REF
1864 || GET_CODE (imm) == CONST)
1866 rtx mem, base, offset;
1867 enum aarch64_symbol_type sty;
1869 /* If we have (const (plus symbol offset)), separate out the offset
1870 before we start classifying the symbol. */
1871 split_const (imm, &base, &offset);
1873 sty = aarch64_classify_symbol (base, offset);
1874 switch (sty)
1876 case SYMBOL_FORCE_TO_MEM:
1877 if (offset != const0_rtx
1878 && targetm.cannot_force_const_mem (mode, imm))
1880 gcc_assert (can_create_pseudo_p ());
1881 base = aarch64_force_temporary (mode, dest, base);
1882 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1883 aarch64_emit_move (dest, base);
1884 return;
1887 mem = force_const_mem (ptr_mode, imm);
1888 gcc_assert (mem);
1890 /* If we aren't generating PC relative literals, then
1891 we need to expand the literal pool access carefully.
1892 This is something that needs to be done in a number
1893 of places, so could well live as a separate function. */
1894 if (!aarch64_pcrelative_literal_loads)
1896 gcc_assert (can_create_pseudo_p ());
1897 base = gen_reg_rtx (ptr_mode);
1898 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1899 mem = gen_rtx_MEM (ptr_mode, base);
1902 if (mode != ptr_mode)
1903 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1905 emit_insn (gen_rtx_SET (dest, mem));
1907 return;
1909 case SYMBOL_SMALL_TLSGD:
1910 case SYMBOL_SMALL_TLSDESC:
1911 case SYMBOL_SMALL_TLSIE:
1912 case SYMBOL_SMALL_GOT_28K:
1913 case SYMBOL_SMALL_GOT_4G:
1914 case SYMBOL_TINY_GOT:
1915 case SYMBOL_TINY_TLSIE:
1916 if (offset != const0_rtx)
1918 gcc_assert(can_create_pseudo_p ());
1919 base = aarch64_force_temporary (mode, dest, base);
1920 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1921 aarch64_emit_move (dest, base);
1922 return;
1924 /* FALLTHRU */
1926 case SYMBOL_SMALL_ABSOLUTE:
1927 case SYMBOL_TINY_ABSOLUTE:
1928 case SYMBOL_TLSLE12:
1929 case SYMBOL_TLSLE24:
1930 case SYMBOL_TLSLE32:
1931 case SYMBOL_TLSLE48:
1932 aarch64_load_symref_appropriately (dest, imm, sty);
1933 return;
1935 default:
1936 gcc_unreachable ();
1940 if (!CONST_INT_P (imm))
1942 if (GET_CODE (imm) == HIGH)
1943 emit_insn (gen_rtx_SET (dest, imm));
1944 else
1946 rtx mem = force_const_mem (mode, imm);
1947 gcc_assert (mem);
1948 emit_insn (gen_rtx_SET (dest, mem));
1951 return;
1954 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1957 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1958 temporary value if necessary. FRAME_RELATED_P should be true if
1959 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1960 to the generated instructions. If SCRATCHREG is known to hold
1961 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1962 immediate again.
1964 Since this function may be used to adjust the stack pointer, we must
1965 ensure that it cannot cause transient stack deallocation (for example
1966 by first incrementing SP and then decrementing when adjusting by a
1967 large immediate). */
1969 static void
1970 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1971 HOST_WIDE_INT delta, bool frame_related_p,
1972 bool emit_move_imm)
1974 HOST_WIDE_INT mdelta = abs_hwi (delta);
1975 rtx this_rtx = gen_rtx_REG (mode, regnum);
1976 rtx_insn *insn;
1978 if (!mdelta)
1979 return;
1981 /* Single instruction adjustment. */
1982 if (aarch64_uimm12_shift (mdelta))
1984 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1985 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1986 return;
1989 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
1990 Only do this if mdelta is not a 16-bit move immediate; when it is,
1991 adjusting using a move is better. */
1992 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
1994 HOST_WIDE_INT low_off = mdelta & 0xfff;
1996 low_off = delta < 0 ? -low_off : low_off;
1997 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
1998 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1999 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2000 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2001 return;
2004 /* Emit a move immediate if required and an addition/subtraction. */
2005 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2006 if (emit_move_imm)
2007 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2008 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2009 : gen_add2_insn (this_rtx, scratch_rtx));
2010 if (frame_related_p)
2012 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2013 rtx adj = plus_constant (mode, this_rtx, delta);
2014 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
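/* Illustrative only: an adjustment by 0x123456 is not a single add/sub
   immediate and not a single move immediate, but fits in 24 bits, so the
   middle case above emits

     add  reg, reg, #0x456
     add  reg, reg, #0x123, lsl #12

   while a larger delta such as 0x2000010 falls through to loading the
   absolute delta into the scratch register and then emitting a single
   add or sub of that register.  */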
2018 static inline void
2019 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2020 HOST_WIDE_INT delta)
2022 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2025 static inline void
2026 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2028 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2029 true, emit_move_imm);
2032 static inline void
2033 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2035 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2036 frame_related_p, true);
2039 static bool
2040 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2041 tree exp ATTRIBUTE_UNUSED)
2043 /* Currently, always true. */
2044 return true;
2047 /* Implement TARGET_PASS_BY_REFERENCE. */
2049 static bool
2050 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2051 machine_mode mode,
2052 const_tree type,
2053 bool named ATTRIBUTE_UNUSED)
2055 HOST_WIDE_INT size;
2056 machine_mode dummymode;
2057 int nregs;
2059 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2060 size = (mode == BLKmode && type)
2061 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2063 /* Aggregates are passed by reference based on their size. */
2064 if (type && AGGREGATE_TYPE_P (type))
2066 size = int_size_in_bytes (type);
2069 /* Variable sized arguments are always returned by reference. */
2070 if (size < 0)
2071 return true;
2073 /* Can this be a candidate to be passed in fp/simd register(s)? */
2074 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2075 &dummymode, &nregs,
2076 NULL))
2077 return false;
2079 /* Arguments which are variable sized or larger than 2 registers are
2080 passed by reference unless they are a homogeneous floating-point
2081 aggregate. */
2082 return size > 2 * UNITS_PER_WORD;
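/* Editorial sketch, not part of the original source: the decision above
   reduced to plain C, with the HFA/HVA candidate test (the call to
   aarch64_vfp_is_call_or_return_candidate) abstracted into a flag and
   2 * UNITS_PER_WORD written out as 16 bytes.  */

static int
aapcs64_pass_by_reference_p (long long size_in_bytes, int is_hfa_or_hva)
{
  if (size_in_bytes < 0)       /* variable-sized: always by reference */
    return 1;
  if (is_hfa_or_hva)           /* candidate for fp/simd registers */
    return 0;
  return size_in_bytes > 16;   /* larger than two 8-byte registers */
}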
2085 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2086 static bool
2087 aarch64_return_in_msb (const_tree valtype)
2089 machine_mode dummy_mode;
2090 int dummy_int;
2092 /* Never happens in little-endian mode. */
2093 if (!BYTES_BIG_ENDIAN)
2094 return false;
2096 /* Only composite types smaller than or equal to 16 bytes can
2097 be potentially returned in registers. */
2098 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2099 || int_size_in_bytes (valtype) <= 0
2100 || int_size_in_bytes (valtype) > 16)
2101 return false;
2103 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2104 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2105 is always passed/returned in the least significant bits of fp/simd
2106 register(s). */
2107 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2108 &dummy_mode, &dummy_int, NULL))
2109 return false;
2111 return true;
2114 /* Implement TARGET_FUNCTION_VALUE.
2115 Define how to find the value returned by a function. */
2117 static rtx
2118 aarch64_function_value (const_tree type, const_tree func,
2119 bool outgoing ATTRIBUTE_UNUSED)
2121 machine_mode mode;
2122 int unsignedp;
2123 int count;
2124 machine_mode ag_mode;
2126 mode = TYPE_MODE (type);
2127 if (INTEGRAL_TYPE_P (type))
2128 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2130 if (aarch64_return_in_msb (type))
2132 HOST_WIDE_INT size = int_size_in_bytes (type);
2134 if (size % UNITS_PER_WORD != 0)
2136 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2137 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2141 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2142 &ag_mode, &count, NULL))
2144 if (!aarch64_composite_type_p (type, mode))
2146 gcc_assert (count == 1 && mode == ag_mode);
2147 return gen_rtx_REG (mode, V0_REGNUM);
2149 else
2151 int i;
2152 rtx par;
2154 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2155 for (i = 0; i < count; i++)
2157 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2158 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2159 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2160 XVECEXP (par, 0, i) = tmp;
2162 return par;
2165 else
2166 return gen_rtx_REG (mode, R0_REGNUM);
2169 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2170 Return true if REGNO is the number of a hard register in which the values
2171 of called function may come back. */
2173 static bool
2174 aarch64_function_value_regno_p (const unsigned int regno)
2176 /* Maximum of 16 bytes can be returned in the general registers. Examples
2177 of 16-byte return values are: 128-bit integers and 16-byte small
2178 structures (excluding homogeneous floating-point aggregates). */
2179 if (regno == R0_REGNUM || regno == R1_REGNUM)
2180 return true;
2182 /* Up to four fp/simd registers can return a function value, e.g. a
2183 homogeneous floating-point aggregate having four members. */
2184 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2185 return TARGET_FLOAT;
2187 return false;
2190 /* Implement TARGET_RETURN_IN_MEMORY.
2192 If the type T of the result of a function is such that
2193 void func (T arg)
2194 would require that arg be passed as a value in a register (or set of
2195 registers) according to the parameter passing rules, then the result
2196 is returned in the same registers as would be used for such an
2197 argument. */
2199 static bool
2200 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2202 HOST_WIDE_INT size;
2203 machine_mode ag_mode;
2204 int count;
2206 if (!AGGREGATE_TYPE_P (type)
2207 && TREE_CODE (type) != COMPLEX_TYPE
2208 && TREE_CODE (type) != VECTOR_TYPE)
2209 /* Simple scalar types are always returned in registers. */
2210 return false;
2212 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2213 type,
2214 &ag_mode,
2215 &count,
2216 NULL))
2217 return false;
2219 /* Types larger than 2 registers are returned in memory. */
2220 size = int_size_in_bytes (type);
2221 return (size < 0 || size > 2 * UNITS_PER_WORD);
2224 static bool
2225 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2226 const_tree type, int *nregs)
2228 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2229 return aarch64_vfp_is_call_or_return_candidate (mode,
2230 type,
2231 &pcum->aapcs_vfp_rmode,
2232 nregs,
2233 NULL);
2236 /* Given MODE and TYPE of a function argument, return the alignment in
2237 bits. The idea is to suppress any stronger alignment requested by
2238 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2239 This is a helper function for local use only. */
2241 static unsigned int
2242 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2244 if (!type)
2245 return GET_MODE_ALIGNMENT (mode);
2246 if (integer_zerop (TYPE_SIZE (type)))
2247 return 0;
2249 gcc_assert (TYPE_MODE (type) == mode);
2251 if (!AGGREGATE_TYPE_P (type))
2252 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2254 if (TREE_CODE (type) == ARRAY_TYPE)
2255 return TYPE_ALIGN (TREE_TYPE (type));
2257 unsigned int alignment = 0;
2259 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2260 alignment = std::max (alignment, DECL_ALIGN (field));
2262 return alignment;
2265 /* Layout a function argument according to the AAPCS64 rules. The rule
2266 numbers refer to the rule numbers in the AAPCS64. */
2268 static void
2269 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2270 const_tree type,
2271 bool named ATTRIBUTE_UNUSED)
2273 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2274 int ncrn, nvrn, nregs;
2275 bool allocate_ncrn, allocate_nvrn;
2276 HOST_WIDE_INT size;
2278 /* We need to do this once per argument. */
2279 if (pcum->aapcs_arg_processed)
2280 return;
2282 pcum->aapcs_arg_processed = true;
2284 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2285 size
2286 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2287 UNITS_PER_WORD);
2289 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2290 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2291 mode,
2292 type,
2293 &nregs);
2295 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2296 The following code thus handles passing by SIMD/FP registers first. */
2298 nvrn = pcum->aapcs_nvrn;
2300 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2301 and homogeneous short-vector aggregates (HVA). */
2302 if (allocate_nvrn)
2304 if (!TARGET_FLOAT)
2305 aarch64_err_no_fpadvsimd (mode, "argument");
2307 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2309 pcum->aapcs_nextnvrn = nvrn + nregs;
2310 if (!aarch64_composite_type_p (type, mode))
2312 gcc_assert (nregs == 1);
2313 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2315 else
2317 rtx par;
2318 int i;
2319 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2320 for (i = 0; i < nregs; i++)
2322 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2323 V0_REGNUM + nvrn + i);
2324 tmp = gen_rtx_EXPR_LIST
2325 (VOIDmode, tmp,
2326 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2327 XVECEXP (par, 0, i) = tmp;
2329 pcum->aapcs_reg = par;
2331 return;
2333 else
2335 /* C.3 NSRN is set to 8. */
2336 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2337 goto on_stack;
2341 ncrn = pcum->aapcs_ncrn;
2342 nregs = size / UNITS_PER_WORD;
2344 /* C6 - C9, though the sign and zero extension semantics are
2345 handled elsewhere. This is the case where the argument fits
2346 entirely in general registers. */
2347 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2349 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2351 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2353 /* C.8 if the argument has an alignment of 16 then the NGRN is
2354 rounded up to the next even number. */
2355 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2357 ++ncrn;
2358 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2360 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2361 A reg is still generated for it, but the caller should be smart
2362 enough not to use it. */
2363 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2365 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2367 else
2369 rtx par;
2370 int i;
2372 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2373 for (i = 0; i < nregs; i++)
2375 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2376 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2377 GEN_INT (i * UNITS_PER_WORD));
2378 XVECEXP (par, 0, i) = tmp;
2380 pcum->aapcs_reg = par;
2383 pcum->aapcs_nextncrn = ncrn + nregs;
2384 return;
2387 /* C.11 */
2388 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2390 /* The argument is passed on stack; record the needed number of words for
2391 this argument and align the total size if necessary. */
2392 on_stack:
2393 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2394 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2395 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2396 16 / UNITS_PER_WORD);
2397 return;
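/* Editorial sketch, not part of the original source: rule C.8 above in
   isolation.  A two-register argument with 16-byte alignment must start in
   an even-numbered core register, so an odd NGRN is rounded up and the
   skipped register goes unused.  */

static int
round_ngrn_for_alignment (int ncrn, int nregs, unsigned int alignment_bits)
{
  if (nregs == 2 && alignment_bits == 16 * 8 && (ncrn % 2) != 0)
    ncrn++;
  return ncrn;
}

/* For example, an __int128 argument arriving at NGRN == 3 is passed in
   x4/x5, leaving x3 unused.  */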
2400 /* Implement TARGET_FUNCTION_ARG. */
2402 static rtx
2403 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2404 const_tree type, bool named)
2406 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2407 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2409 if (mode == VOIDmode)
2410 return NULL_RTX;
2412 aarch64_layout_arg (pcum_v, mode, type, named);
2413 return pcum->aapcs_reg;
2416 void
2417 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2418 const_tree fntype ATTRIBUTE_UNUSED,
2419 rtx libname ATTRIBUTE_UNUSED,
2420 const_tree fndecl ATTRIBUTE_UNUSED,
2421 unsigned n_named ATTRIBUTE_UNUSED)
2423 pcum->aapcs_ncrn = 0;
2424 pcum->aapcs_nvrn = 0;
2425 pcum->aapcs_nextncrn = 0;
2426 pcum->aapcs_nextnvrn = 0;
2427 pcum->pcs_variant = ARM_PCS_AAPCS64;
2428 pcum->aapcs_reg = NULL_RTX;
2429 pcum->aapcs_arg_processed = false;
2430 pcum->aapcs_stack_words = 0;
2431 pcum->aapcs_stack_size = 0;
2433 if (!TARGET_FLOAT
2434 && fndecl && TREE_PUBLIC (fndecl)
2435 && fntype && fntype != error_mark_node)
2437 const_tree type = TREE_TYPE (fntype);
2438 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2439 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2440 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2441 &mode, &nregs, NULL))
2442 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2444 return;
2447 static void
2448 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2449 machine_mode mode,
2450 const_tree type,
2451 bool named)
2453 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2454 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2456 aarch64_layout_arg (pcum_v, mode, type, named);
2457 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2458 != (pcum->aapcs_stack_words != 0));
2459 pcum->aapcs_arg_processed = false;
2460 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2461 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2462 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2463 pcum->aapcs_stack_words = 0;
2464 pcum->aapcs_reg = NULL_RTX;
2468 bool
2469 aarch64_function_arg_regno_p (unsigned regno)
2471 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2472 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2475 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2476 PARM_BOUNDARY bits of alignment, but will be given anything up
2477 to STACK_BOUNDARY bits if the type requires it. This makes sure
2478 that both before and after the layout of each argument, the Next
2479 Stacked Argument Address (NSAA) will have a minimum alignment of
2480 8 bytes. */
2482 static unsigned int
2483 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2485 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2487 if (alignment < PARM_BOUNDARY)
2488 alignment = PARM_BOUNDARY;
2489 if (alignment > STACK_BOUNDARY)
2490 alignment = STACK_BOUNDARY;
2491 return alignment;
2494 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2496 Return true if an argument passed on the stack should be padded upwards,
2497 i.e. if the least-significant byte of the stack slot has useful data.
2499 Small aggregate types are placed in the lowest memory address.
2501 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2503 bool
2504 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2506 /* On little-endian targets, the least significant byte of every stack
2507 argument is passed at the lowest byte address of the stack slot. */
2508 if (!BYTES_BIG_ENDIAN)
2509 return true;
2511 /* Otherwise, integral, floating-point and pointer types are padded downward:
2512 the least significant byte of a stack argument is passed at the highest
2513 byte address of the stack slot. */
2514 if (type
2515 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2516 || POINTER_TYPE_P (type))
2517 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2518 return false;
2520 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2521 return true;
2524 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2526 It specifies padding for the last (may also be the only)
2527 element of a block move between registers and memory. If the
2528 block is assumed to be in memory, padding upward means that
2529 the last element is padded after its most significant byte,
2530 while in downward padding, the last element is padded at
2531 its least significant byte side.
2533 Small aggregates and small complex types are always padded
2534 upwards.
2536 We don't need to worry about homogeneous floating-point or
2537 short-vector aggregates; their move is not affected by the
2538 padding direction determined here. Regardless of endianness,
2539 each element of such an aggregate is put in the least
2540 significant bits of a fp/simd register.
2542 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2543 register has useful data, and return the opposite if the most
2544 significant byte does. */
2546 bool
2547 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2548 bool first ATTRIBUTE_UNUSED)
2551 /* Small composite types are always padded upward. */
2552 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2554 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2555 : GET_MODE_SIZE (mode));
2556 if (size < 2 * UNITS_PER_WORD)
2557 return true;
2560 /* Otherwise, use the default padding. */
2561 return !BYTES_BIG_ENDIAN;
2564 static machine_mode
2565 aarch64_libgcc_cmp_return_mode (void)
2567 return SImode;
2570 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2572 /* We use the 12-bit shifted immediate arithmetic instructions so values
2573 must be multiple of (1 << 12), i.e. 4096. */
2574 #define ARITH_FACTOR 4096
2576 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2577 #error Cannot use simple address calculation for stack probing
2578 #endif
2580 /* The pair of scratch registers used for stack probing. */
2581 #define PROBE_STACK_FIRST_REG 9
2582 #define PROBE_STACK_SECOND_REG 10
2584 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2585 inclusive. These are offsets from the current stack pointer. */
2587 static void
2588 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2590 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2592 /* See the same assertion on PROBE_INTERVAL above. */
2593 gcc_assert ((first % ARITH_FACTOR) == 0);
2595 /* See if we have a constant small number of probes to generate. If so,
2596 that's the easy case. */
2597 if (size <= PROBE_INTERVAL)
2599 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2601 emit_set_insn (reg1,
2602 plus_constant (ptr_mode,
2603 stack_pointer_rtx, -(first + base)));
2604 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2607 /* The run-time loop is made up of 8 insns in the generic case while the
2608 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2609 else if (size <= 4 * PROBE_INTERVAL)
2611 HOST_WIDE_INT i, rem;
2613 emit_set_insn (reg1,
2614 plus_constant (ptr_mode,
2615 stack_pointer_rtx,
2616 -(first + PROBE_INTERVAL)));
2617 emit_stack_probe (reg1);
2619 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2620 it exceeds SIZE. If only two probes are needed, this will not
2621 generate any code. Then probe at FIRST + SIZE. */
2622 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2624 emit_set_insn (reg1,
2625 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2626 emit_stack_probe (reg1);
2629 rem = size - (i - PROBE_INTERVAL);
2630 if (rem > 256)
2632 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2634 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2635 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2637 else
2638 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2641 /* Otherwise, do the same as above, but in a loop. Note that we must be
2642 extra careful with variables wrapping around because we might be at
2643 the very top (or the very bottom) of the address space and we have
2644 to be able to handle this case properly; in particular, we use an
2645 equality test for the loop condition. */
2646 else
2648 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2650 /* Step 1: round SIZE to the previous multiple of the interval. */
2652 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2655 /* Step 2: compute initial and final value of the loop counter. */
2657 /* TEST_ADDR = SP + FIRST. */
2658 emit_set_insn (reg1,
2659 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2661 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2662 emit_set_insn (reg2,
2663 plus_constant (ptr_mode, stack_pointer_rtx,
2664 -(first + rounded_size)));
2667 /* Step 3: the loop
2671 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2672 probe at TEST_ADDR
2674 while (TEST_ADDR != LAST_ADDR)
2676 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2677 until it is equal to ROUNDED_SIZE. */
2679 if (ptr_mode == DImode)
2680 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2681 else
2682 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2685 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2686 that SIZE is equal to ROUNDED_SIZE. */
2688 if (size != rounded_size)
2690 HOST_WIDE_INT rem = size - rounded_size;
2692 if (rem > 256)
2694 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2696 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2697 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2699 else
2700 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2704 /* Make sure nothing is scheduled before we are done. */
2705 emit_insn (gen_blockage ());
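/* Editorial sketch, not part of the original source: the bookkeeping behind
   the loop case above, assuming the default 4096-byte probe interval.  The
   loop covers SIZE rounded down to a multiple of the interval; any residual
   is probed separately at FIRST + SIZE.  */

static void
probe_plan (long long size, long long *rounded_size, long long *residual)
{
  const long long interval = 4096;
  *rounded_size = size & -interval;   /* round down to a multiple */
  *residual = size - *rounded_size;
}

/* For example, size = 10000 gives rounded_size = 8192 (loop probes at 4096
   and 8192) plus one residual probe at FIRST + 10000.  */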
2708 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2709 absolute addresses. */
2711 const char *
2712 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2714 static int labelno = 0;
2715 char loop_lab[32];
2716 rtx xops[2];
2718 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2720 /* Loop. */
2721 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2723 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2724 xops[0] = reg1;
2725 xops[1] = GEN_INT (PROBE_INTERVAL);
2726 output_asm_insn ("sub\t%0, %0, %1", xops);
2728 /* Probe at TEST_ADDR. */
2729 output_asm_insn ("str\txzr, [%0]", xops);
2731 /* Test if TEST_ADDR == LAST_ADDR. */
2732 xops[1] = reg2;
2733 output_asm_insn ("cmp\t%0, %1", xops);
2735 /* Branch. */
2736 fputs ("\tb.ne\t", asm_out_file);
2737 assemble_name_raw (asm_out_file, loop_lab);
2738 fputc ('\n', asm_out_file);
2740 return "";
2743 static bool
2744 aarch64_frame_pointer_required (void)
2746 /* In aarch64_override_options_after_change
2747 flag_omit_leaf_frame_pointer turns off the frame pointer by
2748 default. Turn it back on now if we've not got a leaf
2749 function. */
2750 if (flag_omit_leaf_frame_pointer
2751 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2752 return true;
2754 return false;
2757 /* Mark the registers that need to be saved by the callee and calculate
2758 the size of the callee-saved registers area and frame record (both FP
2759 and LR may be omitted). */
2760 static void
2761 aarch64_layout_frame (void)
2763 HOST_WIDE_INT offset = 0;
2764 int regno, last_fp_reg = INVALID_REGNUM;
2766 if (reload_completed && cfun->machine->frame.laid_out)
2767 return;
2769 #define SLOT_NOT_REQUIRED (-2)
2770 #define SLOT_REQUIRED (-1)
2772 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2773 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2775 /* First mark all the registers that really need to be saved... */
2776 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2777 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2779 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2780 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2782 /* ... that includes the eh data registers (if needed)... */
2783 if (crtl->calls_eh_return)
2784 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2785 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2786 = SLOT_REQUIRED;
2788 /* ... and any callee saved register that dataflow says is live. */
2789 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2790 if (df_regs_ever_live_p (regno)
2791 && (regno == R30_REGNUM
2792 || !call_used_regs[regno]))
2793 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2795 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2796 if (df_regs_ever_live_p (regno)
2797 && !call_used_regs[regno])
2799 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2800 last_fp_reg = regno;
2803 if (frame_pointer_needed)
2805 /* FP and LR are placed in the linkage record. */
2806 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2807 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2808 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2809 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2810 offset += 2 * UNITS_PER_WORD;
2813 /* Now assign stack slots for them. */
2814 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2815 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2817 cfun->machine->frame.reg_offset[regno] = offset;
2818 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2819 cfun->machine->frame.wb_candidate1 = regno;
2820 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2821 cfun->machine->frame.wb_candidate2 = regno;
2822 offset += UNITS_PER_WORD;
2825 HOST_WIDE_INT max_int_offset = offset;
2826 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2827 bool has_align_gap = offset != max_int_offset;
2829 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2830 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2832 /* If there is an alignment gap between integer and fp callee-saves,
2833 allocate the last fp register to it if possible. */
2834 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2836 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2837 break;
2840 cfun->machine->frame.reg_offset[regno] = offset;
2841 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2842 cfun->machine->frame.wb_candidate1 = regno;
2843 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2844 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2845 cfun->machine->frame.wb_candidate2 = regno;
2846 offset += UNITS_PER_WORD;
2849 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2851 cfun->machine->frame.saved_regs_size = offset;
2853 HOST_WIDE_INT varargs_and_saved_regs_size
2854 = offset + cfun->machine->frame.saved_varargs_size;
2856 cfun->machine->frame.hard_fp_offset
2857 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2858 STACK_BOUNDARY / BITS_PER_UNIT);
2860 cfun->machine->frame.frame_size
2861 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2862 + crtl->outgoing_args_size,
2863 STACK_BOUNDARY / BITS_PER_UNIT);
2865 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2867 cfun->machine->frame.initial_adjust = 0;
2868 cfun->machine->frame.final_adjust = 0;
2869 cfun->machine->frame.callee_adjust = 0;
2870 cfun->machine->frame.callee_offset = 0;
2872 HOST_WIDE_INT max_push_offset = 0;
2873 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2874 max_push_offset = 512;
2875 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2876 max_push_offset = 256;
2878 if (cfun->machine->frame.frame_size < max_push_offset
2879 && crtl->outgoing_args_size == 0)
2881 /* Simple, small frame with no outgoing arguments:
2882 stp reg1, reg2, [sp, -frame_size]!
2883 stp reg3, reg4, [sp, 16] */
2884 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2886 else if ((crtl->outgoing_args_size
2887 + cfun->machine->frame.saved_regs_size < 512)
2888 && !(cfun->calls_alloca
2889 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2891 /* Frame with small outgoing arguments:
2892 sub sp, sp, frame_size
2893 stp reg1, reg2, [sp, outgoing_args_size]
2894 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2895 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2896 cfun->machine->frame.callee_offset
2897 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2899 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2901 /* Frame with large outgoing arguments but a small local area:
2902 stp reg1, reg2, [sp, -hard_fp_offset]!
2903 stp reg3, reg4, [sp, 16]
2904 sub sp, sp, outgoing_args_size */
2905 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2906 cfun->machine->frame.final_adjust
2907 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2909 else if (!frame_pointer_needed
2910 && varargs_and_saved_regs_size < max_push_offset)
2912 /* Frame with large local area and outgoing arguments (this pushes the
2913 callee-saves first, followed by the locals and outgoing area):
2914 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2915 stp reg3, reg4, [sp, 16]
2916 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2917 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2918 cfun->machine->frame.final_adjust
2919 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2920 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2921 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2923 else
2925 /* Frame with large local area and outgoing arguments using frame pointer:
2926 sub sp, sp, hard_fp_offset
2927 stp x29, x30, [sp, 0]
2928 add x29, sp, 0
2929 stp reg3, reg4, [sp, 16]
2930 sub sp, sp, outgoing_args_size */
2931 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2932 cfun->machine->frame.final_adjust
2933 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2936 cfun->machine->frame.laid_out = true;
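/* Editorial sketch, not part of the original source: the strategy selection
   above reduced to its tests.  Sizes are in bytes; the enumerators are
   illustrative names for the commented code shapes, not GCC identifiers.  */

enum frame_shape
{
  SHAPE_PUSH_ALL,             /* stp reg1, reg2, [sp, -frame_size]!  */
  SHAPE_SUB_THEN_SAVE,        /* sub sp; stp at small offsets  */
  SHAPE_PUSH_THEN_SUB,        /* stp ..., [sp, -hard_fp_offset]!; sub sp  */
  SHAPE_PUSH_SAVES_THEN_SUB,  /* push callee-saves, then sub sp  */
  SHAPE_FRAME_POINTER         /* sub sp; set up x29; sub sp  */
};

static enum frame_shape
choose_frame_shape (long long frame_size, long long outgoing_args,
		    long long saved_regs, long long hard_fp_offset,
		    long long varargs_and_saves, int calls_alloca,
		    int fp_needed, long long max_push_offset)
{
  if (frame_size < max_push_offset && outgoing_args == 0)
    return SHAPE_PUSH_ALL;
  if (outgoing_args + saved_regs < 512
      && !(calls_alloca && hard_fp_offset < max_push_offset))
    return SHAPE_SUB_THEN_SAVE;
  if (hard_fp_offset < max_push_offset)
    return SHAPE_PUSH_THEN_SUB;
  if (!fp_needed && varargs_and_saves < max_push_offset)
    return SHAPE_PUSH_SAVES_THEN_SUB;
  return SHAPE_FRAME_POINTER;
}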
2939 /* Return true if the register REGNO is saved on entry to
2940 the current function. */
2942 static bool
2943 aarch64_register_saved_on_entry (int regno)
2945 return cfun->machine->frame.reg_offset[regno] >= 0;
2948 /* Return the next register, from REGNO up to LIMIT, that the callee needs
2949 to save. */
2951 static unsigned
2952 aarch64_next_callee_save (unsigned regno, unsigned limit)
2954 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2955 regno ++;
2956 return regno;
2959 /* Push the register number REGNO of mode MODE to the stack with write-back
2960 adjusting the stack by ADJUSTMENT. */
2962 static void
2963 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2964 HOST_WIDE_INT adjustment)
2966 rtx base_rtx = stack_pointer_rtx;
2967 rtx insn, reg, mem;
2969 reg = gen_rtx_REG (mode, regno);
2970 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2971 plus_constant (Pmode, base_rtx, -adjustment));
2972 mem = gen_rtx_MEM (mode, mem);
2974 insn = emit_move_insn (mem, reg);
2975 RTX_FRAME_RELATED_P (insn) = 1;
2978 /* Generate and return an instruction to store the pair of registers
2979 REG and REG2 of mode MODE to location BASE with write-back adjusting
2980 the stack location BASE by ADJUSTMENT. */
2982 static rtx
2983 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2984 HOST_WIDE_INT adjustment)
2986 switch (mode)
2988 case DImode:
2989 return gen_storewb_pairdi_di (base, base, reg, reg2,
2990 GEN_INT (-adjustment),
2991 GEN_INT (UNITS_PER_WORD - adjustment));
2992 case DFmode:
2993 return gen_storewb_pairdf_di (base, base, reg, reg2,
2994 GEN_INT (-adjustment),
2995 GEN_INT (UNITS_PER_WORD - adjustment));
2996 default:
2997 gcc_unreachable ();
3001 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3002 stack pointer by ADJUSTMENT. */
3004 static void
3005 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3007 rtx_insn *insn;
3008 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3010 if (regno2 == INVALID_REGNUM)
3011 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3013 rtx reg1 = gen_rtx_REG (mode, regno1);
3014 rtx reg2 = gen_rtx_REG (mode, regno2);
3016 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3017 reg2, adjustment));
3018 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3019 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3020 RTX_FRAME_RELATED_P (insn) = 1;
3023 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3024 adjusting it by ADJUSTMENT afterwards. */
3026 static rtx
3027 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3028 HOST_WIDE_INT adjustment)
3030 switch (mode)
3032 case DImode:
3033 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3034 GEN_INT (UNITS_PER_WORD));
3035 case DFmode:
3036 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3037 GEN_INT (UNITS_PER_WORD));
3038 default:
3039 gcc_unreachable ();
3043 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3044 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3045 into CFI_OPS. */
3047 static void
3048 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3049 rtx *cfi_ops)
3051 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3052 rtx reg1 = gen_rtx_REG (mode, regno1);
3054 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3056 if (regno2 == INVALID_REGNUM)
3058 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3059 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3060 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3062 else
3064 rtx reg2 = gen_rtx_REG (mode, regno2);
3065 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3066 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3067 reg2, adjustment));
3071 /* Generate and return a store pair instruction of mode MODE to store
3072 register REG1 to MEM1 and register REG2 to MEM2. */
3074 static rtx
3075 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3076 rtx reg2)
3078 switch (mode)
3080 case DImode:
3081 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3083 case DFmode:
3084 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3086 default:
3087 gcc_unreachable ();
3091 /* Generate and return a load pair instruction of mode MODE to load register
3092 REG1 from MEM1 and register REG2 from MEM2. */
3094 static rtx
3095 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3096 rtx mem2)
3098 switch (mode)
3100 case DImode:
3101 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3103 case DFmode:
3104 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3106 default:
3107 gcc_unreachable ();
3111 /* Emit code to save the callee-saved registers from register number START
3112 to LIMIT to the stack at the location starting at offset START_OFFSET,
3113 skipping any write-back candidates if SKIP_WB is true. */
3115 static void
3116 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3117 unsigned start, unsigned limit, bool skip_wb)
3119 rtx_insn *insn;
3120 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3121 ? gen_frame_mem : gen_rtx_MEM);
3122 unsigned regno;
3123 unsigned regno2;
3125 for (regno = aarch64_next_callee_save (start, limit);
3126 regno <= limit;
3127 regno = aarch64_next_callee_save (regno + 1, limit))
3129 rtx reg, mem;
3130 HOST_WIDE_INT offset;
3132 if (skip_wb
3133 && (regno == cfun->machine->frame.wb_candidate1
3134 || regno == cfun->machine->frame.wb_candidate2))
3135 continue;
3137 if (cfun->machine->reg_is_wrapped_separately[regno])
3138 continue;
3140 reg = gen_rtx_REG (mode, regno);
3141 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3142 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3143 offset));
3145 regno2 = aarch64_next_callee_save (regno + 1, limit);
3147 if (regno2 <= limit
3148 && !cfun->machine->reg_is_wrapped_separately[regno2]
3149 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3150 == cfun->machine->frame.reg_offset[regno2]))
3153 rtx reg2 = gen_rtx_REG (mode, regno2);
3154 rtx mem2;
3156 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3157 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3158 offset));
3159 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3160 reg2));
3162 /* The first part of a frame-related parallel insn is
3163 always assumed to be relevant to the frame
3164 calculations; subsequent parts are only
3165 frame-related if explicitly marked. */
3166 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3167 regno = regno2;
3169 else
3170 insn = emit_move_insn (mem, reg);
3172 RTX_FRAME_RELATED_P (insn) = 1;
3176 /* Emit code to restore the callee registers of mode MODE from register
3177 number START up to and including LIMIT. Restore from the stack offset
3178 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3179 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3181 static void
3182 aarch64_restore_callee_saves (machine_mode mode,
3183 HOST_WIDE_INT start_offset, unsigned start,
3184 unsigned limit, bool skip_wb, rtx *cfi_ops)
3186 rtx base_rtx = stack_pointer_rtx;
3187 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3188 ? gen_frame_mem : gen_rtx_MEM);
3189 unsigned regno;
3190 unsigned regno2;
3191 HOST_WIDE_INT offset;
3193 for (regno = aarch64_next_callee_save (start, limit);
3194 regno <= limit;
3195 regno = aarch64_next_callee_save (regno + 1, limit))
3197 if (cfun->machine->reg_is_wrapped_separately[regno])
3198 continue;
3200 rtx reg, mem;
3202 if (skip_wb
3203 && (regno == cfun->machine->frame.wb_candidate1
3204 || regno == cfun->machine->frame.wb_candidate2))
3205 continue;
3207 reg = gen_rtx_REG (mode, regno);
3208 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3209 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3211 regno2 = aarch64_next_callee_save (regno + 1, limit);
3213 if (regno2 <= limit
3214 && !cfun->machine->reg_is_wrapped_separately[regno2]
3215 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3216 == cfun->machine->frame.reg_offset[regno2]))
3218 rtx reg2 = gen_rtx_REG (mode, regno2);
3219 rtx mem2;
3221 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3222 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3223 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3225 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3226 regno = regno2;
3228 else
3229 emit_move_insn (reg, mem);
3230 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3234 static inline bool
3235 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3236 HOST_WIDE_INT offset)
3238 return offset >= -256 && offset < 256;
3241 static inline bool
3242 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3244 return (offset >= 0
3245 && offset < 4096 * GET_MODE_SIZE (mode)
3246 && offset % GET_MODE_SIZE (mode) == 0);
3249 bool
3250 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3252 return (offset >= -64 * GET_MODE_SIZE (mode)
3253 && offset < 64 * GET_MODE_SIZE (mode)
3254 && offset % GET_MODE_SIZE (mode) == 0);
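/* Editorial sketch, not part of the original source: the three predicates
   above match the three scalar addressing ranges.  For an 8-byte access:

     9-bit signed unscaled    offsets in [-256, 255]
     12-bit unsigned scaled   offsets in [0, 4095 * 8], multiples of 8
     7-bit signed scaled      LDP/STP offsets in [-64 * 8, 63 * 8], multiples of 8

   A standalone copy of the LDP/STP test, parameterised by the access size:  */

static int
ldp_stp_offset_p (long long offset, long long mode_size)
{
  return (offset >= -64 * mode_size
	  && offset < 64 * mode_size
	  && offset % mode_size == 0);
}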
3257 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3259 static sbitmap
3260 aarch64_get_separate_components (void)
3262 aarch64_layout_frame ();
3264 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3265 bitmap_clear (components);
3267 /* The registers we need saved to the frame. */
3268 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3269 if (aarch64_register_saved_on_entry (regno))
3271 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3272 if (!frame_pointer_needed)
3273 offset += cfun->machine->frame.frame_size
3274 - cfun->machine->frame.hard_fp_offset;
3275 /* Check that we can access the stack slot of the register with one
3276 direct load with no adjustments needed. */
3277 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3278 bitmap_set_bit (components, regno);
3281 /* Don't mess with the hard frame pointer. */
3282 if (frame_pointer_needed)
3283 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3285 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3286 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3287 /* If aarch64_layout_frame has chosen registers to store/restore with
3288 writeback, don't interfere with them to avoid having to output explicit
3289 stack adjustment instructions. */
3290 if (reg2 != INVALID_REGNUM)
3291 bitmap_clear_bit (components, reg2);
3292 if (reg1 != INVALID_REGNUM)
3293 bitmap_clear_bit (components, reg1);
3295 bitmap_clear_bit (components, LR_REGNUM);
3296 bitmap_clear_bit (components, SP_REGNUM);
3298 return components;
3301 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3303 static sbitmap
3304 aarch64_components_for_bb (basic_block bb)
3306 bitmap in = DF_LIVE_IN (bb);
3307 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3308 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3310 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3311 bitmap_clear (components);
3313 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3314 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3315 if ((!call_used_regs[regno])
3316 && (bitmap_bit_p (in, regno)
3317 || bitmap_bit_p (gen, regno)
3318 || bitmap_bit_p (kill, regno)))
3319 bitmap_set_bit (components, regno);
3321 return components;
3324 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3325 Nothing to do for aarch64. */
3327 static void
3328 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3332 /* Return the next set bit in BMP from START onwards. Return the total number
3333 of bits in BMP if no set bit is found at or after START. */
3335 static unsigned int
3336 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3338 unsigned int nbits = SBITMAP_SIZE (bmp);
3339 if (start == nbits)
3340 return start;
3342 gcc_assert (start < nbits);
3343 for (unsigned int i = start; i < nbits; i++)
3344 if (bitmap_bit_p (bmp, i))
3345 return i;
3347 return nbits;
3350 /* Do the work for aarch64_emit_prologue_components and
3351 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3352 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3353 for these components or the epilogue sequence. That is, it determines
3354 whether we should emit stores or loads and what kind of CFA notes to attach
3355 to the insns. Otherwise the logic for the two sequences is very
3356 similar. */
3358 static void
3359 aarch64_process_components (sbitmap components, bool prologue_p)
3361 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3362 ? HARD_FRAME_POINTER_REGNUM
3363 : STACK_POINTER_REGNUM);
3365 unsigned last_regno = SBITMAP_SIZE (components);
3366 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3367 rtx_insn *insn = NULL;
3369 while (regno != last_regno)
3371 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3372 so DFmode for the vector registers is enough. */
3373 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3374 rtx reg = gen_rtx_REG (mode, regno);
3375 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3376 if (!frame_pointer_needed)
3377 offset += cfun->machine->frame.frame_size
3378 - cfun->machine->frame.hard_fp_offset;
3379 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3380 rtx mem = gen_frame_mem (mode, addr);
3382 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3383 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3384 /* No more registers to handle after REGNO.
3385 Emit a single save/restore and exit. */
3386 if (regno2 == last_regno)
3388 insn = emit_insn (set);
3389 RTX_FRAME_RELATED_P (insn) = 1;
3390 if (prologue_p)
3391 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3392 else
3393 add_reg_note (insn, REG_CFA_RESTORE, reg);
3394 break;
3397 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3398 /* The next register is not of the same class or its offset is not
3399 mergeable with the current one into a pair. */
3400 if (!satisfies_constraint_Ump (mem)
3401 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3402 || (offset2 - cfun->machine->frame.reg_offset[regno])
3403 != GET_MODE_SIZE (mode))
3405 insn = emit_insn (set);
3406 RTX_FRAME_RELATED_P (insn) = 1;
3407 if (prologue_p)
3408 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3409 else
3410 add_reg_note (insn, REG_CFA_RESTORE, reg);
3412 regno = regno2;
3413 continue;
3416 /* REGNO2 can be saved/restored in a pair with REGNO. */
3417 rtx reg2 = gen_rtx_REG (mode, regno2);
3418 if (!frame_pointer_needed)
3419 offset2 += cfun->machine->frame.frame_size
3420 - cfun->machine->frame.hard_fp_offset;
3421 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3422 rtx mem2 = gen_frame_mem (mode, addr2);
3423 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3424 : gen_rtx_SET (reg2, mem2);
3426 if (prologue_p)
3427 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3428 else
3429 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3431 RTX_FRAME_RELATED_P (insn) = 1;
3432 if (prologue_p)
3434 add_reg_note (insn, REG_CFA_OFFSET, set);
3435 add_reg_note (insn, REG_CFA_OFFSET, set2);
3437 else
3439 add_reg_note (insn, REG_CFA_RESTORE, reg);
3440 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3443 regno = aarch64_get_next_set_bit (components, regno2 + 1);
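/* Editorial sketch, not part of the original source: the pairing test used
   above, in isolation.  Two component saves become one STP/LDP only when
   both registers are in the same class and their slots are exactly one
   register apart; the real code additionally checks that the address is
   valid for a pair access (constraint Ump).  */

static int
components_pairable_p (int first_is_gp, int second_is_gp,
		       long long first_offset, long long second_offset,
		       long long reg_size)
{
  return (first_is_gp == second_is_gp
	  && second_offset - first_offset == reg_size);
}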
3447 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3449 static void
3450 aarch64_emit_prologue_components (sbitmap components)
3452 aarch64_process_components (components, true);
3455 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3457 static void
3458 aarch64_emit_epilogue_components (sbitmap components)
3460 aarch64_process_components (components, false);
3463 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3465 static void
3466 aarch64_set_handled_components (sbitmap components)
3468 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3469 if (bitmap_bit_p (components, regno))
3470 cfun->machine->reg_is_wrapped_separately[regno] = true;
3473 /* AArch64 stack frames generated by this compiler look like:
3475 +-------------------------------+
3477 | incoming stack arguments |
3479 +-------------------------------+
3480 | | <-- incoming stack pointer (aligned)
3481 | callee-allocated save area |
3482 | for register varargs |
3484 +-------------------------------+
3485 | local variables | <-- frame_pointer_rtx
3487 +-------------------------------+
3488 | padding0 | \
3489 +-------------------------------+ |
3490 | callee-saved registers | | frame.saved_regs_size
3491 +-------------------------------+ |
3492 | LR' | |
3493 +-------------------------------+ |
3494 | FP' | / <- hard_frame_pointer_rtx (aligned)
3495 +-------------------------------+
3496 | dynamic allocation |
3497 +-------------------------------+
3498 | padding |
3499 +-------------------------------+
3500 | outgoing stack arguments | <-- arg_pointer
3502 +-------------------------------+
3503 | | <-- stack_pointer_rtx (aligned)
3505 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3506 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3507 unchanged. */
3509 /* Generate the prologue instructions for entry into a function.
3510 Establish the stack frame by decreasing the stack pointer with a
3511 properly calculated size and, if necessary, create a frame record
3512 filled with the values of LR and previous frame pointer. The
3513 current FP is also set up if it is in use. */
3515 void
3516 aarch64_expand_prologue (void)
3518 aarch64_layout_frame ();
3520 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3521 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3522 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3523 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3524 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3525 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3526 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3527 rtx_insn *insn;
3529 if (flag_stack_usage_info)
3530 current_function_static_stack_size = frame_size;
3532 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3534 if (crtl->is_leaf && !cfun->calls_alloca)
3536 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3537 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3538 frame_size - STACK_CHECK_PROTECT);
3540 else if (frame_size > 0)
3541 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3544 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3546 if (callee_adjust != 0)
3547 aarch64_push_regs (reg1, reg2, callee_adjust);
3549 if (frame_pointer_needed)
3551 if (callee_adjust == 0)
3552 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3553 R30_REGNUM, false);
3554 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3555 stack_pointer_rtx,
3556 GEN_INT (callee_offset)));
3557 RTX_FRAME_RELATED_P (insn) = 1;
3558 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3561 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3562 callee_adjust != 0 || frame_pointer_needed);
3563 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3564 callee_adjust != 0 || frame_pointer_needed);
3565 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3568 /* Return TRUE if we can use a simple_return insn.
3570 This function checks whether the callee-saved stack is empty, which
3571 means no restore actions are needed. The pro_and_epilogue pass will use
3572 this to check whether the shrink-wrapping optimization is feasible. */
3574 bool
3575 aarch64_use_return_insn_p (void)
3577 if (!reload_completed)
3578 return false;
3580 if (crtl->profile)
3581 return false;
3583 aarch64_layout_frame ();
3585 return cfun->machine->frame.frame_size == 0;
3588 /* Generate the epilogue instructions for returning from a function.
3589 This is almost exactly the reverse of the prolog sequence, except
3590 that we need to insert barriers to avoid scheduling loads that read
3591 from a deallocated stack, and we optimize the unwind records by
3592 emitting them all together if possible. */
3593 void
3594 aarch64_expand_epilogue (bool for_sibcall)
3596 aarch64_layout_frame ();
3598 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3599 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3600 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3601 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3602 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3603 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3604 rtx cfi_ops = NULL;
3605 rtx_insn *insn;
3607 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3608 bool need_barrier_p = (get_frame_size ()
3609 + cfun->machine->frame.saved_varargs_size) != 0;
3611 /* Emit a barrier to prevent loads from a deallocated stack. */
3612 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca)
3614 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3615 need_barrier_p = false;
3618 /* Restore the stack pointer from the frame pointer if it may not
3619 be the same as the stack pointer. */
3620 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3622 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3623 hard_frame_pointer_rtx,
3624 GEN_INT (-callee_offset)));
3625 /* If writeback is used when restoring callee-saves, the CFA
3626 is restored on the instruction doing the writeback. */
3627 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3629 else
3630 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3632 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3633 callee_adjust != 0, &cfi_ops);
3634 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3635 callee_adjust != 0, &cfi_ops);
3637 if (need_barrier_p)
3638 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3640 if (callee_adjust != 0)
3641 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3643 if (callee_adjust != 0 || initial_adjust > 65536)
3645 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3646 insn = get_last_insn ();
3647 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3648 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3649 RTX_FRAME_RELATED_P (insn) = 1;
3650 cfi_ops = NULL;
3653 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3655 if (cfi_ops)
3657 /* Emit delayed restores and reset the CFA to be SP. */
3658 insn = get_last_insn ();
3659 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3660 REG_NOTES (insn) = cfi_ops;
3661 RTX_FRAME_RELATED_P (insn) = 1;
3664 /* Stack adjustment for exception handler. */
3665 if (crtl->calls_eh_return)
3667 /* We need to unwind the stack by the offset computed by
3668 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3669 to be SP; letting the CFA move during this adjustment
3670 is just as correct as retaining the CFA from the body
3671 of the function. Therefore, do nothing special. */
3672 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3675 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3676 if (!for_sibcall)
3677 emit_jump_insn (ret_rtx);
3680 /* Return the place to copy the exception unwinding return address to.
3681 This will probably be a stack slot, but could (in theory) be the
3682 return register). */
3683 rtx
3684 aarch64_final_eh_return_addr (void)
3686 HOST_WIDE_INT fp_offset;
3688 aarch64_layout_frame ();
3690 fp_offset = cfun->machine->frame.frame_size
3691 - cfun->machine->frame.hard_fp_offset;
3693 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3694 return gen_rtx_REG (DImode, LR_REGNUM);
3696 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3697 result in a store to save LR introduced by builtin_eh_return () being
3698 incorrectly deleted because the alias is not detected.
3699 So in the calculation of the address to copy the exception unwinding
3700 return address to, we note 2 cases.
3701 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3702 we return a SP-relative location since all the addresses are SP-relative
3703 in this case. This prevents the store from being optimized away.
3704 If the fp_offset is not 0, then the addresses will be FP-relative and
3705 therefore we return a FP-relative location. */
3707 if (frame_pointer_needed)
3709 if (fp_offset)
3710 return gen_frame_mem (DImode,
3711 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3712 else
3713 return gen_frame_mem (DImode,
3714 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3717 /* If FP is not needed, we calculate the location of LR, which would be
3718 at the top of the saved registers block. */
3720 return gen_frame_mem (DImode,
3721 plus_constant (Pmode,
3722 stack_pointer_rtx,
3723 fp_offset
3724 + cfun->machine->frame.saved_regs_size
3725 - 2 * UNITS_PER_WORD));
3728 /* Output code to add DELTA to the first argument, and then jump
3729 to FUNCTION. Used for C++ multiple inheritance. */
3730 static void
3731 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3732 HOST_WIDE_INT delta,
3733 HOST_WIDE_INT vcall_offset,
3734 tree function)
3736 /* The this pointer is always in x0. Note that this differs from
3737 Arm where the this pointer may be bumped to r1 if r0 is required
3738 to return a pointer to an aggregate. On AArch64 a result value
3739 pointer will be in x8. */
3740 int this_regno = R0_REGNUM;
3741 rtx this_rtx, temp0, temp1, addr, funexp;
3742 rtx_insn *insn;
3744 reload_completed = 1;
3745 emit_note (NOTE_INSN_PROLOGUE_END);
3747 if (vcall_offset == 0)
3748 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3749 else
3751 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3753 this_rtx = gen_rtx_REG (Pmode, this_regno);
3754 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3755 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3757 addr = this_rtx;
3758 if (delta != 0)
3760 if (delta >= -256 && delta < 256)
3761 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3762 plus_constant (Pmode, this_rtx, delta));
3763 else
3764 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3767 if (Pmode == ptr_mode)
3768 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3769 else
3770 aarch64_emit_move (temp0,
3771 gen_rtx_ZERO_EXTEND (Pmode,
3772 gen_rtx_MEM (ptr_mode, addr)));
3774 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3775 addr = plus_constant (Pmode, temp0, vcall_offset);
3776 else
3778 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3779 Pmode);
3780 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3783 if (Pmode == ptr_mode)
3784 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3785 else
3786 aarch64_emit_move (temp1,
3787 gen_rtx_SIGN_EXTEND (Pmode,
3788 gen_rtx_MEM (ptr_mode, addr)));
3790 emit_insn (gen_add2_insn (this_rtx, temp1));
3793 /* Generate a tail call to the target function. */
3794 if (!TREE_USED (function))
3796 assemble_external (function);
3797 TREE_USED (function) = 1;
3799 funexp = XEXP (DECL_RTL (function), 0);
3800 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3801 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3802 SIBLING_CALL_P (insn) = 1;
3804 insn = get_insns ();
3805 shorten_branches (insn);
3806 final_start_function (insn, file, 1);
3807 final (insn, file, 1);
3808 final_end_function ();
3810 /* Stop pretending to be a post-reload pass. */
3811 reload_completed = 0;
3814 static bool
3815 aarch64_tls_referenced_p (rtx x)
3817 if (!TARGET_HAVE_TLS)
3818 return false;
3819 subrtx_iterator::array_type array;
3820 FOR_EACH_SUBRTX (iter, array, x, ALL)
3822 const_rtx x = *iter;
3823 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3824 return true;
3825 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3826 TLS offsets, not real symbol references. */
3827 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3828 iter.skip_subrtxes ();
3830 return false;
3834 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3835 a left shift of 0 or 12 bits. */
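/* For example, 0xabc and (0xabc << 12) satisfy this, but 0xabc0 does not,
   since its set bits straddle the two 12-bit fields. */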
3836 bool
3837 aarch64_uimm12_shift (HOST_WIDE_INT val)
3839 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3840 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3845 /* Return true if val is an immediate that can be loaded into a
3846 register by a MOVZ instruction. */
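/* For example, 0x2a and 0x12340000 can be loaded with a single MOVZ,
   whereas 0x12345 cannot. */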
3847 static bool
3848 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3850 if (GET_MODE_SIZE (mode) > 4)
3852 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3853 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3854 return 1;
3856 else
3858 /* Ignore sign extension. */
3859 val &= (HOST_WIDE_INT) 0xffffffff;
3861 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3862 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3865 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3867 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3869 0x0000000100000001ull,
3870 0x0001000100010001ull,
3871 0x0101010101010101ull,
3872 0x1111111111111111ull,
3873 0x5555555555555555ull,
3877 /* Return true if val is a valid bitmask immediate. */
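/* For example, 0x00ff00ff00ff00ff (a run of eight ones replicated every
   16 bits) is a valid bitmask immediate, whereas 0x12345678 is not. */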
3879 bool
3880 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3882 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3883 int bits;
3885 /* Check for a single sequence of one bits and return quickly if so.
3886 The special cases of all ones and all zeroes return false. */
3887 val = (unsigned HOST_WIDE_INT) val_in;
3888 tmp = val + (val & -val);
3890 if (tmp == (tmp & -tmp))
3891 return (val + 1) > 1;
3893 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3894 if (mode == SImode)
3895 val = (val << 32) | (val & 0xffffffff);
3897 /* Invert if the immediate doesn't start with a zero bit - this means we
3898 only need to search for sequences of one bits. */
3899 if (val & 1)
3900 val = ~val;
3902 /* Find the first set bit and set tmp to val with the first sequence of one
3903 bits removed. Return success if there is a single sequence of ones. */
3904 first_one = val & -val;
3905 tmp = val & (val + first_one);
3907 if (tmp == 0)
3908 return true;
3910 /* Find the next set bit and compute the difference in bit position. */
3911 next_one = tmp & -tmp;
3912 bits = clz_hwi (first_one) - clz_hwi (next_one);
3913 mask = val ^ tmp;
3915 /* Check the bit position difference is a power of 2, and that the first
3916 sequence of one bits fits within 'bits' bits. */
3917 if ((mask >> bits) != 0 || bits != (bits & -bits))
3918 return false;
3920 /* Check the sequence of one bits is repeated 64/bits times. */
3921 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
3924 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
3925 Assumed precondition: VAL_IN is not zero. */
3927 unsigned HOST_WIDE_INT
3928 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
3930 int lowest_bit_set = ctz_hwi (val_in);
3931 int highest_bit_set = floor_log2 (val_in);
3932 gcc_assert (val_in != 0);
3934 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
3935 (HOST_WIDE_INT_1U << lowest_bit_set));
3938 /* Create a constant equal to VAL_IN, but with every bit outside the range
3939 from its lowest set bit to its highest set bit also set to 1. */
3941 unsigned HOST_WIDE_INT
3942 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
3944 return val_in | ~aarch64_and_split_imm1 (val_in);
3947 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
3949 bool
3950 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
3952 if (aarch64_bitmask_imm (val_in, mode))
3953 return false;
3955 if (aarch64_move_imm (val_in, mode))
3956 return false;
3958 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
3960 return aarch64_bitmask_imm (imm2, mode);
3963 /* Return true if val is an immediate that can be loaded into a
3964 register in a single instruction. */
3965 bool
3966 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3968 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3969 return 1;
3970 return aarch64_bitmask_imm (val, mode);
3973 static bool
3974 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3976 rtx base, offset;
3978 if (GET_CODE (x) == HIGH)
3979 return true;
3981 split_const (x, &base, &offset);
3982 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3984 if (aarch64_classify_symbol (base, offset)
3985 != SYMBOL_FORCE_TO_MEM)
3986 return true;
3987 else
3988 /* Avoid generating a 64-bit relocation in ILP32; leave
3989 it to aarch64_expand_mov_immediate to handle it properly. */
3990 return mode != ptr_mode;
3993 return aarch64_tls_referenced_p (x);
3996 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3997 The expansion for a table switch is quite expensive due to the number
3998 of instructions, the table lookup and the hard-to-predict indirect jump.
3999 When optimizing for speed with -O3 enabled, use the per-core tuning if
4000 set, otherwise use tables for > 16 cases as a tradeoff between size and
4001 performance. When optimizing for size, use the default setting. */
4003 static unsigned int
4004 aarch64_case_values_threshold (void)
4006 /* Use the specified limit for the number of cases before using jump
4007 tables at higher optimization levels. */
4008 if (optimize > 2
4009 && selected_cpu->tune->max_case_values != 0)
4010 return selected_cpu->tune->max_case_values;
4011 else
4012 return optimize_size ? default_case_values_threshold () : 17;
4015 /* Return true if register REGNO is a valid index register.
4016 STRICT_P is true if REG_OK_STRICT is in effect. */
4018 bool
4019 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4021 if (!HARD_REGISTER_NUM_P (regno))
4023 if (!strict_p)
4024 return true;
4026 if (!reg_renumber)
4027 return false;
4029 regno = reg_renumber[regno];
4031 return GP_REGNUM_P (regno);
4034 /* Return true if register REGNO is a valid base register.
4035 STRICT_P is true if REG_OK_STRICT is in effect. */
4037 bool
4038 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4040 if (!HARD_REGISTER_NUM_P (regno))
4042 if (!strict_p)
4043 return true;
4045 if (!reg_renumber)
4046 return false;
4048 regno = reg_renumber[regno];
4051 /* The fake registers will be eliminated to either the stack or
4052 hard frame pointer, both of which are usually valid base registers.
4053 Reload deals with the cases where the eliminated form isn't valid. */
4054 return (GP_REGNUM_P (regno)
4055 || regno == SP_REGNUM
4056 || regno == FRAME_POINTER_REGNUM
4057 || regno == ARG_POINTER_REGNUM);
4060 /* Return true if X is a valid base register.
4061 STRICT_P is true if REG_OK_STRICT is in effect. */
4063 static bool
4064 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4066 if (!strict_p && GET_CODE (x) == SUBREG)
4067 x = SUBREG_REG (x);
4069 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4072 /* Return true if address offset is a valid index. If it is, fill in INFO
4073 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4075 static bool
4076 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4077 machine_mode mode, bool strict_p)
4079 enum aarch64_address_type type;
4080 rtx index;
4081 int shift;
4083 /* (reg:P) */
4084 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4085 && GET_MODE (x) == Pmode)
4087 type = ADDRESS_REG_REG;
4088 index = x;
4089 shift = 0;
4091 /* (sign_extend:DI (reg:SI)) */
4092 else if ((GET_CODE (x) == SIGN_EXTEND
4093 || GET_CODE (x) == ZERO_EXTEND)
4094 && GET_MODE (x) == DImode
4095 && GET_MODE (XEXP (x, 0)) == SImode)
4097 type = (GET_CODE (x) == SIGN_EXTEND)
4098 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4099 index = XEXP (x, 0);
4100 shift = 0;
4102 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4103 else if (GET_CODE (x) == MULT
4104 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4105 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4106 && GET_MODE (XEXP (x, 0)) == DImode
4107 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4108 && CONST_INT_P (XEXP (x, 1)))
4110 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4111 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4112 index = XEXP (XEXP (x, 0), 0);
4113 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4115 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4116 else if (GET_CODE (x) == ASHIFT
4117 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4118 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4119 && GET_MODE (XEXP (x, 0)) == DImode
4120 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4121 && CONST_INT_P (XEXP (x, 1)))
4123 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4124 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4125 index = XEXP (XEXP (x, 0), 0);
4126 shift = INTVAL (XEXP (x, 1));
4128 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4129 else if ((GET_CODE (x) == SIGN_EXTRACT
4130 || GET_CODE (x) == ZERO_EXTRACT)
4131 && GET_MODE (x) == DImode
4132 && GET_CODE (XEXP (x, 0)) == MULT
4133 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4134 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4136 type = (GET_CODE (x) == SIGN_EXTRACT)
4137 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4138 index = XEXP (XEXP (x, 0), 0);
4139 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4140 if (INTVAL (XEXP (x, 1)) != 32 + shift
4141 || INTVAL (XEXP (x, 2)) != 0)
4142 shift = -1;
4144 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4145 (const_int 0xffffffff<<shift)) */
4146 else if (GET_CODE (x) == AND
4147 && GET_MODE (x) == DImode
4148 && GET_CODE (XEXP (x, 0)) == MULT
4149 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4150 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4151 && CONST_INT_P (XEXP (x, 1)))
4153 type = ADDRESS_REG_UXTW;
4154 index = XEXP (XEXP (x, 0), 0);
4155 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4156 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4157 shift = -1;
4159 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4160 else if ((GET_CODE (x) == SIGN_EXTRACT
4161 || GET_CODE (x) == ZERO_EXTRACT)
4162 && GET_MODE (x) == DImode
4163 && GET_CODE (XEXP (x, 0)) == ASHIFT
4164 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4165 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4167 type = (GET_CODE (x) == SIGN_EXTRACT)
4168 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4169 index = XEXP (XEXP (x, 0), 0);
4170 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4171 if (INTVAL (XEXP (x, 1)) != 32 + shift
4172 || INTVAL (XEXP (x, 2)) != 0)
4173 shift = -1;
4175 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4176 (const_int 0xffffffff<<shift)) */
4177 else if (GET_CODE (x) == AND
4178 && GET_MODE (x) == DImode
4179 && GET_CODE (XEXP (x, 0)) == ASHIFT
4180 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4181 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4182 && CONST_INT_P (XEXP (x, 1)))
4184 type = ADDRESS_REG_UXTW;
4185 index = XEXP (XEXP (x, 0), 0);
4186 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4187 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4188 shift = -1;
4190 /* (mult:P (reg:P) (const_int scale)) */
4191 else if (GET_CODE (x) == MULT
4192 && GET_MODE (x) == Pmode
4193 && GET_MODE (XEXP (x, 0)) == Pmode
4194 && CONST_INT_P (XEXP (x, 1)))
4196 type = ADDRESS_REG_REG;
4197 index = XEXP (x, 0);
4198 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4200 /* (ashift:P (reg:P) (const_int shift)) */
4201 else if (GET_CODE (x) == ASHIFT
4202 && GET_MODE (x) == Pmode
4203 && GET_MODE (XEXP (x, 0)) == Pmode
4204 && CONST_INT_P (XEXP (x, 1)))
4206 type = ADDRESS_REG_REG;
4207 index = XEXP (x, 0);
4208 shift = INTVAL (XEXP (x, 1));
4210 else
4211 return false;
4213 if (GET_CODE (index) == SUBREG)
4214 index = SUBREG_REG (index);
4216 if ((shift == 0 ||
4217 (shift > 0 && shift <= 3
4218 && (1 << shift) == GET_MODE_SIZE (mode)))
4219 && REG_P (index)
4220 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4222 info->type = type;
4223 info->offset = index;
4224 info->shift = shift;
4225 return true;
4228 return false;
4231 /* Return true if MODE is one of the modes for which we
4232 support LDP/STP operations. */
4234 static bool
4235 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4237 return mode == SImode || mode == DImode
4238 || mode == SFmode || mode == DFmode
4239 || (aarch64_vector_mode_supported_p (mode)
4240 && GET_MODE_SIZE (mode) == 8);
4243 /* Return true if REGNO is a virtual pointer register, or an eliminable
4244 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4245 include stack_pointer or hard_frame_pointer. */
4246 static bool
4247 virt_or_elim_regno_p (unsigned regno)
4249 return ((regno >= FIRST_VIRTUAL_REGISTER
4250 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4251 || regno == FRAME_POINTER_REGNUM
4252 || regno == ARG_POINTER_REGNUM);
4255 /* Return true if X is a valid address for machine mode MODE. If it is,
4256 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4257 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4259 static bool
4260 aarch64_classify_address (struct aarch64_address_info *info,
4261 rtx x, machine_mode mode,
4262 RTX_CODE outer_code, bool strict_p)
4264 enum rtx_code code = GET_CODE (x);
4265 rtx op0, op1;
4267 /* On BE, we use load/store pair for all large int mode load/stores. */
4268 bool load_store_pair_p = (outer_code == PARALLEL
4269 || (BYTES_BIG_ENDIAN
4270 && aarch64_vect_struct_mode_p (mode)));
4272 bool allow_reg_index_p =
4273 !load_store_pair_p
4274 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4275 && !aarch64_vect_struct_mode_p (mode);
4277 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4278 REG addressing. */
4279 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4280 && (code != POST_INC && code != REG))
4281 return false;
4283 switch (code)
4285 case REG:
4286 case SUBREG:
4287 info->type = ADDRESS_REG_IMM;
4288 info->base = x;
4289 info->offset = const0_rtx;
4290 return aarch64_base_register_rtx_p (x, strict_p);
4292 case PLUS:
4293 op0 = XEXP (x, 0);
4294 op1 = XEXP (x, 1);
4296 if (! strict_p
4297 && REG_P (op0)
4298 && virt_or_elim_regno_p (REGNO (op0))
4299 && CONST_INT_P (op1))
4301 info->type = ADDRESS_REG_IMM;
4302 info->base = op0;
4303 info->offset = op1;
4305 return true;
4308 if (GET_MODE_SIZE (mode) != 0
4309 && CONST_INT_P (op1)
4310 && aarch64_base_register_rtx_p (op0, strict_p))
4312 HOST_WIDE_INT offset = INTVAL (op1);
4314 info->type = ADDRESS_REG_IMM;
4315 info->base = op0;
4316 info->offset = op1;
4318 /* TImode and TFmode values are allowed in both pairs of X
4319 registers and individual Q registers. The available
4320 address modes are:
4321 X,X: 7-bit signed scaled offset
4322 Q: 9-bit signed offset
4323 We conservatively require an offset representable in either mode.
4324 When performing the check for pairs of X registers i.e. LDP/STP
4325 pass down DImode since that is the natural size of the LDP/STP
4326 instruction memory accesses. */
4327 if (mode == TImode || mode == TFmode)
4328 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4329 && offset_9bit_signed_unscaled_p (mode, offset));
4331 /* A 7-bit offset check because OImode will emit an ldp/stp
4332 instruction (only big endian will get here).
4333 For ldp/stp instructions, the offset is scaled for the size of a
4334 single element of the pair. */
4335 if (mode == OImode)
4336 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4338 /* Three 9/12-bit offset checks because CImode will emit three
4339 ldr/str instructions (only big endian will get here). */
4340 if (mode == CImode)
4341 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4342 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4343 || offset_12bit_unsigned_scaled_p (V16QImode,
4344 offset + 32)));
4346 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4347 instructions (only big endian will get here). */
4348 if (mode == XImode)
4349 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4350 && aarch64_offset_7bit_signed_scaled_p (TImode,
4351 offset + 32));
4353 if (load_store_pair_p)
4354 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4355 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4356 else
4357 return (offset_9bit_signed_unscaled_p (mode, offset)
4358 || offset_12bit_unsigned_scaled_p (mode, offset));
4361 if (allow_reg_index_p)
4363 /* Look for base + (scaled/extended) index register. */
4364 if (aarch64_base_register_rtx_p (op0, strict_p)
4365 && aarch64_classify_index (info, op1, mode, strict_p))
4367 info->base = op0;
4368 return true;
4370 if (aarch64_base_register_rtx_p (op1, strict_p)
4371 && aarch64_classify_index (info, op0, mode, strict_p))
4373 info->base = op1;
4374 return true;
4378 return false;
4380 case POST_INC:
4381 case POST_DEC:
4382 case PRE_INC:
4383 case PRE_DEC:
4384 info->type = ADDRESS_REG_WB;
4385 info->base = XEXP (x, 0);
4386 info->offset = NULL_RTX;
4387 return aarch64_base_register_rtx_p (info->base, strict_p);
4389 case POST_MODIFY:
4390 case PRE_MODIFY:
4391 info->type = ADDRESS_REG_WB;
4392 info->base = XEXP (x, 0);
4393 if (GET_CODE (XEXP (x, 1)) == PLUS
4394 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4395 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4396 && aarch64_base_register_rtx_p (info->base, strict_p))
4398 HOST_WIDE_INT offset;
4399 info->offset = XEXP (XEXP (x, 1), 1);
4400 offset = INTVAL (info->offset);
4402 /* TImode and TFmode values are allowed in both pairs of X
4403 registers and individual Q registers. The available
4404 address modes are:
4405 X,X: 7-bit signed scaled offset
4406 Q: 9-bit signed offset
4407 We conservatively require an offset representable in either mode. */
4409 if (mode == TImode || mode == TFmode)
4410 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4411 && offset_9bit_signed_unscaled_p (mode, offset));
4413 if (load_store_pair_p)
4414 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4415 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4416 else
4417 return offset_9bit_signed_unscaled_p (mode, offset);
4419 return false;
4421 case CONST:
4422 case SYMBOL_REF:
4423 case LABEL_REF:
4424 /* load literal: pc-relative constant pool entry. Only supported
4425 for SI mode or larger. */
4426 info->type = ADDRESS_SYMBOLIC;
4428 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4430 rtx sym, addend;
4432 split_const (x, &sym, &addend);
4433 return ((GET_CODE (sym) == LABEL_REF
4434 || (GET_CODE (sym) == SYMBOL_REF
4435 && CONSTANT_POOL_ADDRESS_P (sym)
4436 && aarch64_pcrelative_literal_loads)));
4438 return false;
4440 case LO_SUM:
4441 info->type = ADDRESS_LO_SUM;
4442 info->base = XEXP (x, 0);
4443 info->offset = XEXP (x, 1);
4444 if (allow_reg_index_p
4445 && aarch64_base_register_rtx_p (info->base, strict_p))
4447 rtx sym, offs;
4448 split_const (info->offset, &sym, &offs);
4449 if (GET_CODE (sym) == SYMBOL_REF
4450 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4452 /* The symbol and offset must be aligned to the access size. */
4453 unsigned int align;
4454 unsigned int ref_size;
4456 if (CONSTANT_POOL_ADDRESS_P (sym))
4457 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4458 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4460 tree exp = SYMBOL_REF_DECL (sym);
4461 align = TYPE_ALIGN (TREE_TYPE (exp));
4462 align = CONSTANT_ALIGNMENT (exp, align);
4464 else if (SYMBOL_REF_DECL (sym))
4465 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4466 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4467 && SYMBOL_REF_BLOCK (sym) != NULL)
4468 align = SYMBOL_REF_BLOCK (sym)->alignment;
4469 else
4470 align = BITS_PER_UNIT;
4472 ref_size = GET_MODE_SIZE (mode);
4473 if (ref_size == 0)
4474 ref_size = GET_MODE_SIZE (DImode);
4476 return ((INTVAL (offs) & (ref_size - 1)) == 0
4477 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4480 return false;
4482 default:
4483 return false;
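/* Return true if X is a symbolic or label address, possibly plus a
   constant offset. */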
4487 bool
4488 aarch64_symbolic_address_p (rtx x)
4490 rtx offset;
4492 split_const (x, &x, &offset);
4493 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4496 /* Classify the base of symbolic expression X. */
4498 enum aarch64_symbol_type
4499 aarch64_classify_symbolic_expression (rtx x)
4501 rtx offset;
4503 split_const (x, &x, &offset);
4504 return aarch64_classify_symbol (x, offset);
4508 /* Return TRUE if X is a legitimate address for accessing memory in
4509 mode MODE. */
4510 static bool
4511 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4513 struct aarch64_address_info addr;
4515 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4518 /* Return TRUE if X is a legitimate address for accessing memory in
4519 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4520 pair operation. */
4521 bool
4522 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4523 RTX_CODE outer_code, bool strict_p)
4525 struct aarch64_address_info addr;
4527 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4530 /* Split an out-of-range address displacement into a base and offset.
4531 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4532 to increase opportunities for sharing the base address between accesses of different sizes.
4533 For TI/TFmode and unaligned accesses use a 256-byte range. */
4534 static bool
4535 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4537 HOST_WIDE_INT mask = GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3fff;
4539 if (mode == TImode || mode == TFmode ||
4540 (INTVAL (*disp) & (GET_MODE_SIZE (mode) - 1)) != 0)
4541 mask = 0xff;
4543 *off = GEN_INT (INTVAL (*disp) & ~mask);
4544 *disp = GEN_INT (INTVAL (*disp) & mask);
4545 return true;
4548 /* Return TRUE if rtx X is immediate constant 0.0 */
4549 bool
4550 aarch64_float_const_zero_rtx_p (rtx x)
4552 if (GET_MODE (x) == VOIDmode)
4553 return false;
4555 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4556 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4557 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4560 /* Return the fixed registers used for condition codes. */
4562 static bool
4563 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4565 *p1 = CC_REGNUM;
4566 *p2 = INVALID_REGNUM;
4567 return true;
4570 /* Emit call insn with PAT and do aarch64-specific handling. */
4572 void
4573 aarch64_emit_call_insn (rtx pat)
4575 rtx insn = emit_call_insn (pat);
4577 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4578 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4579 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
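/* Choose the condition code mode used for the comparison of X and Y
   with rtx code CODE. */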
4582 machine_mode
4583 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4585 /* Floating point compares return CCFPE for the ordered LT, LE, GT and GE
4586 comparisons, and CCFP for everything else. */
4587 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4589 switch (code)
4591 case EQ:
4592 case NE:
4593 case UNORDERED:
4594 case ORDERED:
4595 case UNLT:
4596 case UNLE:
4597 case UNGT:
4598 case UNGE:
4599 case UNEQ:
4600 case LTGT:
4601 return CCFPmode;
4603 case LT:
4604 case LE:
4605 case GT:
4606 case GE:
4607 return CCFPEmode;
4609 default:
4610 gcc_unreachable ();
4614 /* Equality comparisons of short modes against zero can be performed
4615 using the TST instruction with the appropriate bitmask. */
4616 if (y == const0_rtx && REG_P (x)
4617 && (code == EQ || code == NE)
4618 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4619 return CC_NZmode;
4621 /* Similarly, comparisons of zero_extends from shorter modes can
4622 be performed using an ANDS with an immediate mask. */
4623 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4624 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4625 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4626 && (code == EQ || code == NE))
4627 return CC_NZmode;
4629 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4630 && y == const0_rtx
4631 && (code == EQ || code == NE || code == LT || code == GE)
4632 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4633 || GET_CODE (x) == NEG
4634 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4635 && CONST_INT_P (XEXP (x, 2)))))
4636 return CC_NZmode;
4638 /* A compare with a shifted operand. Because of canonicalization,
4639 the comparison will have to be swapped when we emit the assembly
4640 code. */
4641 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4642 && (REG_P (y) || GET_CODE (y) == SUBREG)
4643 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4644 || GET_CODE (x) == LSHIFTRT
4645 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4646 return CC_SWPmode;
4648 /* Similarly for a negated operand, but we can only do this for
4649 equalities. */
4650 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4651 && (REG_P (y) || GET_CODE (y) == SUBREG)
4652 && (code == EQ || code == NE)
4653 && GET_CODE (x) == NEG)
4654 return CC_Zmode;
4656 /* A test for unsigned overflow. */
4657 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4658 && code == NE
4659 && GET_CODE (x) == PLUS
4660 && GET_CODE (y) == ZERO_EXTEND)
4661 return CC_Cmode;
4663 /* For everything else, return CCmode. */
4664 return CCmode;
4667 static int
4668 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
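/* Return the AARCH64_* condition code for comparison rtx X, or -1 if
   there is no direct mapping. */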
4671 aarch64_get_condition_code (rtx x)
4673 machine_mode mode = GET_MODE (XEXP (x, 0));
4674 enum rtx_code comp_code = GET_CODE (x);
4676 if (GET_MODE_CLASS (mode) != MODE_CC)
4677 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4678 return aarch64_get_condition_code_1 (mode, comp_code);
4681 static int
4682 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4684 switch (mode)
4686 case CCFPmode:
4687 case CCFPEmode:
4688 switch (comp_code)
4690 case GE: return AARCH64_GE;
4691 case GT: return AARCH64_GT;
4692 case LE: return AARCH64_LS;
4693 case LT: return AARCH64_MI;
4694 case NE: return AARCH64_NE;
4695 case EQ: return AARCH64_EQ;
4696 case ORDERED: return AARCH64_VC;
4697 case UNORDERED: return AARCH64_VS;
4698 case UNLT: return AARCH64_LT;
4699 case UNLE: return AARCH64_LE;
4700 case UNGT: return AARCH64_HI;
4701 case UNGE: return AARCH64_PL;
4702 default: return -1;
4704 break;
4706 case CCmode:
4707 switch (comp_code)
4709 case NE: return AARCH64_NE;
4710 case EQ: return AARCH64_EQ;
4711 case GE: return AARCH64_GE;
4712 case GT: return AARCH64_GT;
4713 case LE: return AARCH64_LE;
4714 case LT: return AARCH64_LT;
4715 case GEU: return AARCH64_CS;
4716 case GTU: return AARCH64_HI;
4717 case LEU: return AARCH64_LS;
4718 case LTU: return AARCH64_CC;
4719 default: return -1;
4721 break;
4723 case CC_SWPmode:
4724 switch (comp_code)
4726 case NE: return AARCH64_NE;
4727 case EQ: return AARCH64_EQ;
4728 case GE: return AARCH64_LE;
4729 case GT: return AARCH64_LT;
4730 case LE: return AARCH64_GE;
4731 case LT: return AARCH64_GT;
4732 case GEU: return AARCH64_LS;
4733 case GTU: return AARCH64_CC;
4734 case LEU: return AARCH64_CS;
4735 case LTU: return AARCH64_HI;
4736 default: return -1;
4738 break;
4740 case CC_NZmode:
4741 switch (comp_code)
4743 case NE: return AARCH64_NE;
4744 case EQ: return AARCH64_EQ;
4745 case GE: return AARCH64_PL;
4746 case LT: return AARCH64_MI;
4747 default: return -1;
4749 break;
4751 case CC_Zmode:
4752 switch (comp_code)
4754 case NE: return AARCH64_NE;
4755 case EQ: return AARCH64_EQ;
4756 default: return -1;
4758 break;
4760 case CC_Cmode:
4761 switch (comp_code)
4763 case NE: return AARCH64_CS;
4764 case EQ: return AARCH64_CC;
4765 default: return -1;
4767 break;
4769 default:
4770 return -1;
4773 return -1;
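/* Return true if X is a CONST_VECTOR of integers whose elements are all
   equal and lie within [MINVAL, MAXVAL]. */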
4776 bool
4777 aarch64_const_vec_all_same_in_range_p (rtx x,
4778 HOST_WIDE_INT minval,
4779 HOST_WIDE_INT maxval)
4781 HOST_WIDE_INT firstval;
4782 int count, i;
4784 if (GET_CODE (x) != CONST_VECTOR
4785 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4786 return false;
4788 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4789 if (firstval < minval || firstval > maxval)
4790 return false;
4792 count = CONST_VECTOR_NUNITS (x);
4793 for (i = 1; i < count; i++)
4794 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4795 return false;
4797 return true;
4800 bool
4801 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4803 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4807 /* N Z C V. */
4808 #define AARCH64_CC_V 1
4809 #define AARCH64_CC_C (1 << 1)
4810 #define AARCH64_CC_Z (1 << 2)
4811 #define AARCH64_CC_N (1 << 3)
4813 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4814 static const int aarch64_nzcv_codes[] =
4816 0, /* EQ, Z == 1. */
4817 AARCH64_CC_Z, /* NE, Z == 0. */
4818 0, /* CS, C == 1. */
4819 AARCH64_CC_C, /* CC, C == 0. */
4820 0, /* MI, N == 1. */
4821 AARCH64_CC_N, /* PL, N == 0. */
4822 0, /* VS, V == 1. */
4823 AARCH64_CC_V, /* VC, V == 0. */
4824 0, /* HI, C == 1 && Z == 0. */
4825 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4826 AARCH64_CC_V, /* GE, N == V. */
4827 0, /* LT, N != V. */
4828 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4829 0, /* LE, !(Z == 0 && N == V). */
4830 0, /* AL, Any. */
4831 0 /* NV, Any. */
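/* Print operand X to file F, applying the output modifier character CODE
   (zero for a plain operand). */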
4834 static void
4835 aarch64_print_operand (FILE *f, rtx x, int code)
4837 switch (code)
4839 /* An integer or symbol address without a preceding # sign. */
4840 case 'c':
4841 switch (GET_CODE (x))
4843 case CONST_INT:
4844 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4845 break;
4847 case SYMBOL_REF:
4848 output_addr_const (f, x);
4849 break;
4851 case CONST:
4852 if (GET_CODE (XEXP (x, 0)) == PLUS
4853 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4855 output_addr_const (f, x);
4856 break;
4858 /* Fall through. */
4860 default:
4861 output_operand_lossage ("Unsupported operand for code '%c'", code);
4863 break;
4865 case 'e':
4866 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4868 int n;
4870 if (!CONST_INT_P (x)
4871 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4873 output_operand_lossage ("invalid operand for '%%%c'", code);
4874 return;
4877 switch (n)
4879 case 3:
4880 fputc ('b', f);
4881 break;
4882 case 4:
4883 fputc ('h', f);
4884 break;
4885 case 5:
4886 fputc ('w', f);
4887 break;
4888 default:
4889 output_operand_lossage ("invalid operand for '%%%c'", code);
4890 return;
4893 break;
4895 case 'p':
4897 int n;
4899 /* Print N such that 2^N == X. */
4900 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4902 output_operand_lossage ("invalid operand for '%%%c'", code);
4903 return;
4906 asm_fprintf (f, "%d", n);
4908 break;
4910 case 'P':
4911 /* Print the number of non-zero bits in X (a const_int). */
4912 if (!CONST_INT_P (x))
4914 output_operand_lossage ("invalid operand for '%%%c'", code);
4915 return;
4918 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4919 break;
4921 case 'H':
4922 /* Print the higher numbered register of a pair (TImode) of regs. */
4923 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4925 output_operand_lossage ("invalid operand for '%%%c'", code);
4926 return;
4929 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4930 break;
4932 case 'M':
4933 case 'm':
4935 int cond_code;
4936 /* Print a condition (eq, ne, etc) or its inverse. */
4938 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4939 if (x == const_true_rtx)
4941 if (code == 'M')
4942 fputs ("nv", f);
4943 return;
4946 if (!COMPARISON_P (x))
4948 output_operand_lossage ("invalid operand for '%%%c'", code);
4949 return;
4952 cond_code = aarch64_get_condition_code (x);
4953 gcc_assert (cond_code >= 0);
4954 if (code == 'M')
4955 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4956 fputs (aarch64_condition_codes[cond_code], f);
4958 break;
4960 case 'b':
4961 case 'h':
4962 case 's':
4963 case 'd':
4964 case 'q':
4965 /* Print a scalar FP/SIMD register name. */
4966 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4968 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4969 return;
4971 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4972 break;
4974 case 'S':
4975 case 'T':
4976 case 'U':
4977 case 'V':
4978 /* Print the first FP/SIMD register name in a list. */
4979 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4981 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4982 return;
4984 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4985 break;
4987 case 'R':
4988 /* Print a scalar FP/SIMD register name + 1. */
4989 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4991 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4992 return;
4994 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4995 break;
4997 case 'X':
4998 /* Print bottom 16 bits of integer constant in hex. */
4999 if (!CONST_INT_P (x))
5001 output_operand_lossage ("invalid operand for '%%%c'", code);
5002 return;
5004 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5005 break;
5007 case 'w':
5008 case 'x':
5009 /* Print a general register name or the zero register (32-bit or
5010 64-bit). */
5011 if (x == const0_rtx
5012 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5014 asm_fprintf (f, "%czr", code);
5015 break;
5018 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5020 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5021 break;
5024 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5026 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5027 break;
5030 /* Fall through */
5032 case 0:
5033 /* Print a normal operand; if it's a general register, then we
5034 assume DImode. */
5035 if (x == NULL)
5037 output_operand_lossage ("missing operand");
5038 return;
5041 switch (GET_CODE (x))
5043 case REG:
5044 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5045 break;
5047 case MEM:
5048 output_address (GET_MODE (x), XEXP (x, 0));
5049 break;
5051 case CONST:
5052 case LABEL_REF:
5053 case SYMBOL_REF:
5054 output_addr_const (asm_out_file, x);
5055 break;
5057 case CONST_INT:
5058 asm_fprintf (f, "%wd", INTVAL (x));
5059 break;
5061 case CONST_VECTOR:
5062 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5064 gcc_assert (
5065 aarch64_const_vec_all_same_in_range_p (x,
5066 HOST_WIDE_INT_MIN,
5067 HOST_WIDE_INT_MAX));
5068 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5070 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5072 fputc ('0', f);
5074 else
5075 gcc_unreachable ();
5076 break;
5078 case CONST_DOUBLE:
5079 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5080 be getting CONST_DOUBLEs holding integers. */
5081 gcc_assert (GET_MODE (x) != VOIDmode);
5082 if (aarch64_float_const_zero_rtx_p (x))
5084 fputc ('0', f);
5085 break;
5087 else if (aarch64_float_const_representable_p (x))
5089 #define buf_size 20
5090 char float_buf[buf_size] = {'\0'};
5091 real_to_decimal_for_mode (float_buf,
5092 CONST_DOUBLE_REAL_VALUE (x),
5093 buf_size, buf_size,
5094 1, GET_MODE (x));
5095 asm_fprintf (asm_out_file, "%s", float_buf);
5096 break;
5097 #undef buf_size
5099 output_operand_lossage ("invalid constant");
5100 return;
5101 default:
5102 output_operand_lossage ("invalid operand");
5103 return;
5105 break;
5107 case 'A':
5108 if (GET_CODE (x) == HIGH)
5109 x = XEXP (x, 0);
5111 switch (aarch64_classify_symbolic_expression (x))
5113 case SYMBOL_SMALL_GOT_4G:
5114 asm_fprintf (asm_out_file, ":got:");
5115 break;
5117 case SYMBOL_SMALL_TLSGD:
5118 asm_fprintf (asm_out_file, ":tlsgd:");
5119 break;
5121 case SYMBOL_SMALL_TLSDESC:
5122 asm_fprintf (asm_out_file, ":tlsdesc:");
5123 break;
5125 case SYMBOL_SMALL_TLSIE:
5126 asm_fprintf (asm_out_file, ":gottprel:");
5127 break;
5129 case SYMBOL_TLSLE24:
5130 asm_fprintf (asm_out_file, ":tprel:");
5131 break;
5133 case SYMBOL_TINY_GOT:
5134 gcc_unreachable ();
5135 break;
5137 default:
5138 break;
5140 output_addr_const (asm_out_file, x);
5141 break;
5143 case 'L':
5144 switch (aarch64_classify_symbolic_expression (x))
5146 case SYMBOL_SMALL_GOT_4G:
5147 asm_fprintf (asm_out_file, ":lo12:");
5148 break;
5150 case SYMBOL_SMALL_TLSGD:
5151 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5152 break;
5154 case SYMBOL_SMALL_TLSDESC:
5155 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5156 break;
5158 case SYMBOL_SMALL_TLSIE:
5159 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5160 break;
5162 case SYMBOL_TLSLE12:
5163 asm_fprintf (asm_out_file, ":tprel_lo12:");
5164 break;
5166 case SYMBOL_TLSLE24:
5167 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5168 break;
5170 case SYMBOL_TINY_GOT:
5171 asm_fprintf (asm_out_file, ":got:");
5172 break;
5174 case SYMBOL_TINY_TLSIE:
5175 asm_fprintf (asm_out_file, ":gottprel:");
5176 break;
5178 default:
5179 break;
5181 output_addr_const (asm_out_file, x);
5182 break;
5184 case 'G':
5186 switch (aarch64_classify_symbolic_expression (x))
5188 case SYMBOL_TLSLE24:
5189 asm_fprintf (asm_out_file, ":tprel_hi12:");
5190 break;
5191 default:
5192 break;
5194 output_addr_const (asm_out_file, x);
5195 break;
5197 case 'k':
5199 HOST_WIDE_INT cond_code;
5200 /* Print nzcv. */
5202 if (!CONST_INT_P (x))
5204 output_operand_lossage ("invalid operand for '%%%c'", code);
5205 return;
5208 cond_code = INTVAL (x);
5209 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5210 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5212 break;
5214 default:
5215 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5216 return;
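/* Print memory address X of mode MODE to file F using AArch64 assembly
   syntax. */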
5220 static void
5221 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5223 struct aarch64_address_info addr;
5225 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5226 switch (addr.type)
5228 case ADDRESS_REG_IMM:
5229 if (addr.offset == const0_rtx)
5230 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5231 else
5232 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5233 INTVAL (addr.offset));
5234 return;
5236 case ADDRESS_REG_REG:
5237 if (addr.shift == 0)
5238 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5239 reg_names [REGNO (addr.offset)]);
5240 else
5241 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5242 reg_names [REGNO (addr.offset)], addr.shift);
5243 return;
5245 case ADDRESS_REG_UXTW:
5246 if (addr.shift == 0)
5247 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5248 REGNO (addr.offset) - R0_REGNUM);
5249 else
5250 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5251 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5252 return;
5254 case ADDRESS_REG_SXTW:
5255 if (addr.shift == 0)
5256 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5257 REGNO (addr.offset) - R0_REGNUM);
5258 else
5259 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5260 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5261 return;
5263 case ADDRESS_REG_WB:
5264 switch (GET_CODE (x))
5266 case PRE_INC:
5267 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5268 GET_MODE_SIZE (mode));
5269 return;
5270 case POST_INC:
5271 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5272 GET_MODE_SIZE (mode));
5273 return;
5274 case PRE_DEC:
5275 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5276 GET_MODE_SIZE (mode));
5277 return;
5278 case POST_DEC:
5279 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5280 GET_MODE_SIZE (mode));
5281 return;
5282 case PRE_MODIFY:
5283 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5284 INTVAL (addr.offset));
5285 return;
5286 case POST_MODIFY:
5287 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5288 INTVAL (addr.offset));
5289 return;
5290 default:
5291 break;
5293 break;
5295 case ADDRESS_LO_SUM:
5296 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5297 output_addr_const (f, addr.offset);
5298 asm_fprintf (f, "]");
5299 return;
5301 case ADDRESS_SYMBOLIC:
5302 break;
5305 output_addr_const (f, x);
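/* Return true if X mentions a label, ignoring the label references that
   appear inside UNSPEC_TLS. */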
5308 bool
5309 aarch64_label_mentioned_p (rtx x)
5311 const char *fmt;
5312 int i;
5314 if (GET_CODE (x) == LABEL_REF)
5315 return true;
5317 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5318 referencing instruction, but they are constant offsets, not
5319 symbols. */
5320 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5321 return false;
5323 fmt = GET_RTX_FORMAT (GET_CODE (x));
5324 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5326 if (fmt[i] == 'E')
5328 int j;
5330 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5331 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5332 return 1;
5334 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5335 return 1;
5338 return 0;
5341 /* Implement REGNO_REG_CLASS. */
5343 enum reg_class
5344 aarch64_regno_regclass (unsigned regno)
5346 if (GP_REGNUM_P (regno))
5347 return GENERAL_REGS;
5349 if (regno == SP_REGNUM)
5350 return STACK_REG;
5352 if (regno == FRAME_POINTER_REGNUM
5353 || regno == ARG_POINTER_REGNUM)
5354 return POINTER_REGS;
5356 if (FP_REGNUM_P (regno))
5357 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5359 return NO_REGS;
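/* Attempt to rewrite the non-legitimate address X for mode MODE into a
   legitimate form; return X itself if no improvement is found. */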
5362 static rtx
5363 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5365 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5366 where mask is selected by alignment and size of the offset.
5367 We try to pick as large a range for the offset as possible to
5368 maximize the chance of a CSE. However, for aligned addresses
5369 we limit the range to 4k so that structures with different sized
5370 elements are likely to use the same base. We need to be careful
5371 not to split a CONST for some forms of address expression, otherwise
5372 it will generate sub-optimal code. */
5374 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5376 rtx base = XEXP (x, 0);
5377 rtx offset_rtx = XEXP (x, 1);
5378 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5380 if (GET_CODE (base) == PLUS)
5382 rtx op0 = XEXP (base, 0);
5383 rtx op1 = XEXP (base, 1);
5385 /* Force any scaling into a temp for CSE. */
5386 op0 = force_reg (Pmode, op0);
5387 op1 = force_reg (Pmode, op1);
5389 /* Let the pointer register be in op0. */
5390 if (REG_POINTER (op1))
5391 std::swap (op0, op1);
5393 /* If the pointer is virtual or frame related, then we know that
5394 virtual register instantiation or register elimination is going
5395 to apply a second constant. We want the two constants folded
5396 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5397 if (virt_or_elim_regno_p (REGNO (op0)))
5399 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5400 NULL_RTX, true, OPTAB_DIRECT);
5401 return gen_rtx_PLUS (Pmode, base, op1);
5404 /* Otherwise, in order to encourage CSE (and thence loop strength
5405 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
5406 base = expand_binop (Pmode, add_optab, op0, op1,
5407 NULL_RTX, true, OPTAB_DIRECT);
5408 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5411 /* Does it look like we'll need a load/store-pair operation? */
5412 HOST_WIDE_INT base_offset;
5413 if (GET_MODE_SIZE (mode) > 16
5414 || mode == TImode)
5415 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5416 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5417 /* For offsets that aren't a multiple of the access size, the limit is
5418 -256...255. */
5419 else if (offset & (GET_MODE_SIZE (mode) - 1))
5421 base_offset = (offset + 0x100) & ~0x1ff;
5423 /* BLKmode typically uses LDP of X-registers. */
5424 if (mode == BLKmode)
5425 base_offset = (offset + 512) & ~0x3ff;
5427 /* Small negative offsets are supported. */
5428 else if (IN_RANGE (offset, -256, 0))
5429 base_offset = 0;
5430 /* Use a 12-bit offset scaled by the access size. */
5431 else
5432 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5434 if (base_offset != 0)
5436 base = plus_constant (Pmode, base, base_offset);
5437 base = force_operand (base, NULL_RTX);
5438 return plus_constant (Pmode, base, offset - base_offset);
5442 return x;
5445 /* Return the reload icode required for a constant pool in mode. */
5446 static enum insn_code
5447 aarch64_constant_pool_reload_icode (machine_mode mode)
5449 switch (mode)
5451 case SFmode:
5452 return CODE_FOR_aarch64_reload_movcpsfdi;
5454 case DFmode:
5455 return CODE_FOR_aarch64_reload_movcpdfdi;
5457 case TFmode:
5458 return CODE_FOR_aarch64_reload_movcptfdi;
5460 case V8QImode:
5461 return CODE_FOR_aarch64_reload_movcpv8qidi;
5463 case V16QImode:
5464 return CODE_FOR_aarch64_reload_movcpv16qidi;
5466 case V4HImode:
5467 return CODE_FOR_aarch64_reload_movcpv4hidi;
5469 case V8HImode:
5470 return CODE_FOR_aarch64_reload_movcpv8hidi;
5472 case V2SImode:
5473 return CODE_FOR_aarch64_reload_movcpv2sidi;
5475 case V4SImode:
5476 return CODE_FOR_aarch64_reload_movcpv4sidi;
5478 case V2DImode:
5479 return CODE_FOR_aarch64_reload_movcpv2didi;
5481 case V2DFmode:
5482 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5484 default:
5485 gcc_unreachable ();
5488 gcc_unreachable ();
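/* Decide whether moving X of mode MODE into a register of class RCLASS
   requires a secondary reload; if so, record the reload pattern in SRI.
   Return the class of any intermediate register needed, or NO_REGS. */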
5490 static reg_class_t
5491 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5492 reg_class_t rclass,
5493 machine_mode mode,
5494 secondary_reload_info *sri)
5497 /* If we have to disable direct literal pool loads and stores because the
5498 function is too big, then we need a scratch register. */
5499 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5500 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5501 || targetm.vector_mode_supported_p (GET_MODE (x)))
5502 && !aarch64_pcrelative_literal_loads)
5504 sri->icode = aarch64_constant_pool_reload_icode (mode);
5505 return NO_REGS;
5508 /* Without the TARGET_SIMD instructions we cannot move a Q register
5509 to a Q register directly. We need a scratch. */
5510 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5511 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5512 && reg_class_subset_p (rclass, FP_REGS))
5514 if (mode == TFmode)
5515 sri->icode = CODE_FOR_aarch64_reload_movtf;
5516 else if (mode == TImode)
5517 sri->icode = CODE_FOR_aarch64_reload_movti;
5518 return NO_REGS;
5521 /* A TFmode or TImode memory access should be handled via an FP register
5522 because AArch64 has richer addressing modes for LDR/STR instructions
5523 than LDP/STP instructions. */
5524 if (TARGET_FLOAT && rclass == GENERAL_REGS
5525 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5526 return FP_REGS;
5528 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5529 return GENERAL_REGS;
5531 return NO_REGS;
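/* Return true if eliminating register FROM in favour of register TO is
   currently allowed. */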
5534 static bool
5535 aarch64_can_eliminate (const int from, const int to)
5537 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5538 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5540 if (frame_pointer_needed)
5542 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5543 return true;
5544 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5545 return false;
5546 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5547 && !cfun->calls_alloca)
5548 return true;
5549 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5550 return true;
5552 return false;
5554 else
5556 /* If we decided that we didn't need a leaf frame pointer but then used
5557 LR in the function, then we'll want a frame pointer after all, so
5558 prevent this elimination to ensure a frame pointer is used. */
5559 if (to == STACK_POINTER_REGNUM
5560 && flag_omit_leaf_frame_pointer
5561 && df_regs_ever_live_p (LR_REGNUM))
5562 return false;
5565 return true;
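/* Return the byte offset between eliminable registers FROM and TO,
   laying out the frame first if necessary. */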
5568 HOST_WIDE_INT
5569 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5571 aarch64_layout_frame ();
5573 if (to == HARD_FRAME_POINTER_REGNUM)
5575 if (from == ARG_POINTER_REGNUM)
5576 return cfun->machine->frame.hard_fp_offset;
5578 if (from == FRAME_POINTER_REGNUM)
5579 return cfun->machine->frame.hard_fp_offset
5580 - cfun->machine->frame.locals_offset;
5583 if (to == STACK_POINTER_REGNUM)
5585 if (from == FRAME_POINTER_REGNUM)
5586 return cfun->machine->frame.frame_size
5587 - cfun->machine->frame.locals_offset;
5590 return cfun->machine->frame.frame_size;
5593 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5594 previous frame. */
5597 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5599 if (count != 0)
5600 return const0_rtx;
5601 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5605 static void
5606 aarch64_asm_trampoline_template (FILE *f)
5608 if (TARGET_ILP32)
5610 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5611 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5613 else
5615 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5616 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5618 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5619 assemble_aligned_integer (4, const0_rtx);
5620 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5621 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
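/* Initialise the trampoline M_TRAMP so that it transfers control to FNDECL
   with the static chain set to CHAIN_VALUE. */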
5624 static void
5625 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5627 rtx fnaddr, mem, a_tramp;
5628 const int tramp_code_sz = 16;
5630 /* Don't need to copy the trailing D-words, we fill those in below. */
5631 emit_block_move (m_tramp, assemble_trampoline_template (),
5632 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5633 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5634 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5635 if (GET_MODE (fnaddr) != ptr_mode)
5636 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5637 emit_move_insn (mem, fnaddr);
5639 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5640 emit_move_insn (mem, chain_value);
5642 /* XXX We should really define a "clear_cache" pattern and use
5643 gen_clear_cache(). */
5644 a_tramp = XEXP (m_tramp, 0);
5645 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5646 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5647 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5648 ptr_mode);
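/* Return the maximum number of hard registers of class REGCLASS needed to
   hold a value of mode MODE. */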
5651 static unsigned char
5652 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5654 switch (regclass)
5656 case CALLER_SAVE_REGS:
5657 case POINTER_REGS:
5658 case GENERAL_REGS:
5659 case ALL_REGS:
5660 case FP_REGS:
5661 case FP_LO_REGS:
5662 return
5663 aarch64_vector_mode_p (mode)
5664 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5665 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5666 case STACK_REG:
5667 return 1;
5669 case NO_REGS:
5670 return 0;
5672 default:
5673 break;
5675 gcc_unreachable ();
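/* Return the preferred class for reloading X when the requested class is
   REGCLASS; NO_REGS means REGCLASS cannot be used. */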
5678 static reg_class_t
5679 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5681 if (regclass == POINTER_REGS)
5682 return GENERAL_REGS;
5684 if (regclass == STACK_REG)
5686 if (REG_P(x)
5687 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5688 return regclass;
5690 return NO_REGS;
5693 /* If it's an integer immediate that MOVI can't handle, then
5694 FP_REGS is not an option, so we return NO_REGS instead. */
5695 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5696 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5697 return NO_REGS;
5699 /* Register elimination can result in a request for
5700 SP+constant->FP_REGS. We cannot support such operations, which
5701 use SP as source and an FP_REG as destination, so reject them
5702 outright. */
5703 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5705 rtx lhs = XEXP (x, 0);
5707 /* Look through a possible SUBREG introduced by ILP32. */
5708 if (GET_CODE (lhs) == SUBREG)
5709 lhs = SUBREG_REG (lhs);
5711 gcc_assert (REG_P (lhs));
5712 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5713 POINTER_REGS));
5714 return NO_REGS;
5717 return regclass;
5720 void
5721 aarch64_asm_output_labelref (FILE* f, const char *name)
5723 asm_fprintf (f, "%U%s", name);
5726 static void
5727 aarch64_elf_asm_constructor (rtx symbol, int priority)
5729 if (priority == DEFAULT_INIT_PRIORITY)
5730 default_ctor_section_asm_out_constructor (symbol, priority);
5731 else
5733 section *s;
5734 char buf[18];
5735 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5736 s = get_section (buf, SECTION_WRITE, NULL);
5737 switch_to_section (s);
5738 assemble_align (POINTER_SIZE);
5739 assemble_aligned_integer (POINTER_BYTES, symbol);
5743 static void
5744 aarch64_elf_asm_destructor (rtx symbol, int priority)
5746 if (priority == DEFAULT_INIT_PRIORITY)
5747 default_dtor_section_asm_out_destructor (symbol, priority);
5748 else
5750 section *s;
5751 char buf[18];
5752 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5753 s = get_section (buf, SECTION_WRITE, NULL);
5754 switch_to_section (s);
5755 assemble_align (POINTER_SIZE);
5756 assemble_aligned_integer (POINTER_BYTES, symbol);
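/* Output the assembly for the casesi dispatch sequence described by
   OPERANDS; the returned template is empty because the code is emitted
   directly. */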
5760 const char*
5761 aarch64_output_casesi (rtx *operands)
5763 char buf[100];
5764 char label[100];
5765 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5766 int index;
5767 static const char *const patterns[4][2] =
5770 "ldrb\t%w3, [%0,%w1,uxtw]",
5771 "add\t%3, %4, %w3, sxtb #2"
5774 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5775 "add\t%3, %4, %w3, sxth #2"
5778 "ldr\t%w3, [%0,%w1,uxtw #2]",
5779 "add\t%3, %4, %w3, sxtw #2"
5781 /* We assume that DImode is only generated when not optimizing and
5782 that we don't really need 64-bit address offsets. That would
5783 imply an object file with 8GB of code in a single function! */
5785 "ldr\t%w3, [%0,%w1,uxtw #2]",
5786 "add\t%3, %4, %w3, sxtw #2"
5790 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5792 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5794 gcc_assert (index >= 0 && index <= 3);
5796 /* Need to implement table size reduction, by changing the code below. */
5797 output_asm_insn (patterns[index][0], operands);
5798 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5799 snprintf (buf, sizeof (buf),
5800 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5801 output_asm_insn (buf, operands);
5802 output_asm_insn (patterns[index][1], operands);
5803 output_asm_insn ("br\t%3", operands);
5804 assemble_label (asm_out_file, label);
5805 return "";
5809 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5810 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5811 operator. */
5814 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5816 if (shift >= 0 && shift <= 3)
5818 int size;
5819 for (size = 8; size <= 32; size *= 2)
5821 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5822 if (mask == bits << shift)
5823 return size;
5826 return 0;
5829 /* Constant pools are per-function only when PC-relative
5830 literal loads are enabled or we are in the large memory
5831 model. */
5833 static inline bool
5834 aarch64_can_use_per_function_literal_pools_p (void)
5836 return (aarch64_pcrelative_literal_loads
5837 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5840 static bool
5841 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5843 /* FIXME: In an ideal world this would work similarly
5844 to the logic in aarch64_select_rtx_section, but this
5845 breaks bootstrap in gccgo. For now we work around
5846 this by returning false here. */
5847 return false;
5850 /* Select appropriate section for constants depending
5851 on where we place literal pools. */
5853 static section *
5854 aarch64_select_rtx_section (machine_mode mode,
5855 rtx x,
5856 unsigned HOST_WIDE_INT align)
5858 if (aarch64_can_use_per_function_literal_pools_p ())
5859 return function_section (current_function_decl);
5861 return default_elf_select_rtx_section (mode, x, align);
5864 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5865 void
5866 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5867 HOST_WIDE_INT offset)
5869 /* When using per-function literal pools, we must ensure that any code
5870 section is aligned to the minimal instruction length, lest we get
5871 errors from the assembler about "unaligned instructions". */
5872 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5873 ASM_OUTPUT_ALIGN (f, 2);
5876 /* Costs. */
5878 /* Helper function for rtx cost calculation. Strip a shift expression
5879 from X. Returns the inner operand if successful, or the original
5880 expression on failure. */
5881 static rtx
5882 aarch64_strip_shift (rtx x)
5884 rtx op = x;
5886 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5887 we can convert both to ROR during final output. */
5888 if ((GET_CODE (op) == ASHIFT
5889 || GET_CODE (op) == ASHIFTRT
5890 || GET_CODE (op) == LSHIFTRT
5891 || GET_CODE (op) == ROTATERT
5892 || GET_CODE (op) == ROTATE)
5893 && CONST_INT_P (XEXP (op, 1)))
5894 return XEXP (op, 0);
5896 if (GET_CODE (op) == MULT
5897 && CONST_INT_P (XEXP (op, 1))
5898 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5899 return XEXP (op, 0);
5901 return x;
5904 /* Helper function for rtx cost calculation. Strip an extend
5905 expression from X. Returns the inner operand if successful, or the
5906 original expression on failure. We deal with a number of possible
5907 canonicalization variations here. */
5908 static rtx
5909 aarch64_strip_extend (rtx x)
5911 rtx op = x;
5913 /* Zero and sign extraction of a widened value. */
5914 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5915 && XEXP (op, 2) == const0_rtx
5916 && GET_CODE (XEXP (op, 0)) == MULT
5917 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5918 XEXP (op, 1)))
5919 return XEXP (XEXP (op, 0), 0);
5921 /* It can also be represented (for zero-extend) as an AND with an
5922 immediate. */
5923 if (GET_CODE (op) == AND
5924 && GET_CODE (XEXP (op, 0)) == MULT
5925 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5926 && CONST_INT_P (XEXP (op, 1))
5927 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5928 INTVAL (XEXP (op, 1))) != 0)
5929 return XEXP (XEXP (op, 0), 0);
5931 /* Now handle extended register, as this may also have an optional
5932 left shift by 1..4. */
5933 if (GET_CODE (op) == ASHIFT
5934 && CONST_INT_P (XEXP (op, 1))
5935 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5936 op = XEXP (op, 0);
5938 if (GET_CODE (op) == ZERO_EXTEND
5939 || GET_CODE (op) == SIGN_EXTEND)
5940 op = XEXP (op, 0);
5942 if (op != x)
5943 return op;
5945 return x;
5948 /* Return true iff CODE is a shift supported in combination
5949 with arithmetic instructions. */
5951 static bool
5952 aarch64_shift_p (enum rtx_code code)
5954 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5957 /* Helper function for rtx cost calculation. Calculate the cost of
5958 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5959 Return the calculated cost of the expression, recursing manually in to
5960 operands where needed. */
5962 static int
5963 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5965 rtx op0, op1;
5966 const struct cpu_cost_table *extra_cost
5967 = aarch64_tune_params.insn_extra_cost;
5968 int cost = 0;
5969 bool compound_p = (outer == PLUS || outer == MINUS);
5970 machine_mode mode = GET_MODE (x);
5972 gcc_checking_assert (code == MULT);
5974 op0 = XEXP (x, 0);
5975 op1 = XEXP (x, 1);
5977 if (VECTOR_MODE_P (mode))
5978 mode = GET_MODE_INNER (mode);
5980 /* Integer multiply/fma. */
5981 if (GET_MODE_CLASS (mode) == MODE_INT)
5983 /* The multiply will be canonicalized as a shift, so cost it as such. */
5984 if (aarch64_shift_p (GET_CODE (x))
5985 || (CONST_INT_P (op1)
5986 && exact_log2 (INTVAL (op1)) > 0))
5988 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5989 || GET_CODE (op0) == SIGN_EXTEND;
5990 if (speed)
5992 if (compound_p)
5994 if (REG_P (op1))
5995 /* ARITH + shift-by-register. */
5996 cost += extra_cost->alu.arith_shift_reg;
5997 else if (is_extend)
5998 /* ARITH + extended register. We don't have a cost field
5999 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6000 cost += extra_cost->alu.extend_arith;
6001 else
6002 /* ARITH + shift-by-immediate. */
6003 cost += extra_cost->alu.arith_shift;
6005 else
6006 /* LSL (immediate). */
6007 cost += extra_cost->alu.shift;
6010 /* Strip extends as we will have costed them in the case above. */
6011 if (is_extend)
6012 op0 = aarch64_strip_extend (op0);
6014 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6016 return cost;
6019 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6020 compound and let the below cases handle it. After all, MNEG is a
6021 special-case alias of MSUB. */
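/* For example, MNEG Xd, Xn, Xm is the architectural alias of
   MSUB Xd, Xn, Xm, XZR, so stripping the NEG and treating the
   expression as compound costs it like the MSUB it will become.  */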
6022 if (GET_CODE (op0) == NEG)
6024 op0 = XEXP (op0, 0);
6025 compound_p = true;
6028 /* Integer multiplies or FMAs have zero/sign extending variants. */
6029 if ((GET_CODE (op0) == ZERO_EXTEND
6030 && GET_CODE (op1) == ZERO_EXTEND)
6031 || (GET_CODE (op0) == SIGN_EXTEND
6032 && GET_CODE (op1) == SIGN_EXTEND))
6034 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6035 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6037 if (speed)
6039 if (compound_p)
6040 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6041 cost += extra_cost->mult[0].extend_add;
6042 else
6043 /* MUL/SMULL/UMULL. */
6044 cost += extra_cost->mult[0].extend;
6047 return cost;
6050 /* This is either an integer multiply or a MADD. In both cases
6051 we want to recurse and cost the operands. */
6052 cost += rtx_cost (op0, mode, MULT, 0, speed);
6053 cost += rtx_cost (op1, mode, MULT, 1, speed);
6055 if (speed)
6057 if (compound_p)
6058 /* MADD/MSUB. */
6059 cost += extra_cost->mult[mode == DImode].add;
6060 else
6061 /* MUL. */
6062 cost += extra_cost->mult[mode == DImode].simple;
6065 return cost;
6067 else
6069 if (speed)
6071 /* Floating-point FMA/FMUL can also support negations of the
6072 operands, unless the rounding mode is upward or downward in
6073 which case FNMUL is different from FMUL with operand negation. */
6074 bool neg0 = GET_CODE (op0) == NEG;
6075 bool neg1 = GET_CODE (op1) == NEG;
6076 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6078 if (neg0)
6079 op0 = XEXP (op0, 0);
6080 if (neg1)
6081 op1 = XEXP (op1, 0);
6084 if (compound_p)
6085 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6086 cost += extra_cost->fp[mode == DFmode].fma;
6087 else
6088 /* FMUL/FNMUL. */
6089 cost += extra_cost->fp[mode == DFmode].mult;
6092 cost += rtx_cost (op0, mode, MULT, 0, speed);
6093 cost += rtx_cost (op1, mode, MULT, 1, speed);
6094 return cost;
6098 static int
6099 aarch64_address_cost (rtx x,
6100 machine_mode mode,
6101 addr_space_t as ATTRIBUTE_UNUSED,
6102 bool speed)
6104 enum rtx_code c = GET_CODE (x);
6105 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6106 struct aarch64_address_info info;
6107 int cost = 0;
6108 info.shift = 0;
6110 if (!aarch64_classify_address (&info, x, mode, c, false))
6112 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6114 /* This is a CONST or SYMBOL ref which will be split
6115 in a different way depending on the code model in use.
6116 Cost it through the generic infrastructure. */
6117 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6118 /* Divide through by the cost of one instruction to
6119 bring it to the same units as the address costs. */
6120 cost_symbol_ref /= COSTS_N_INSNS (1);
6121 /* The cost is then the cost of preparing the address,
6122 followed by an immediate (possibly 0) offset. */
6123 return cost_symbol_ref + addr_cost->imm_offset;
6125 else
6127 /* This is most likely a jump table from a case
6128 statement. */
6129 return addr_cost->register_offset;
6133 switch (info.type)
6135 case ADDRESS_LO_SUM:
6136 case ADDRESS_SYMBOLIC:
6137 case ADDRESS_REG_IMM:
6138 cost += addr_cost->imm_offset;
6139 break;
6141 case ADDRESS_REG_WB:
6142 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6143 cost += addr_cost->pre_modify;
6144 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6145 cost += addr_cost->post_modify;
6146 else
6147 gcc_unreachable ();
6149 break;
6151 case ADDRESS_REG_REG:
6152 cost += addr_cost->register_offset;
6153 break;
6155 case ADDRESS_REG_SXTW:
6156 cost += addr_cost->register_sextend;
6157 break;
6159 case ADDRESS_REG_UXTW:
6160 cost += addr_cost->register_zextend;
6161 break;
6163 default:
6164 gcc_unreachable ();
6168 if (info.shift > 0)
6170 /* For the sake of calculating the cost of the shifted register
6171 component, we can treat same-sized modes in the same way. */
6172 switch (GET_MODE_BITSIZE (mode))
6174 case 16:
6175 cost += addr_cost->addr_scale_costs.hi;
6176 break;
6178 case 32:
6179 cost += addr_cost->addr_scale_costs.si;
6180 break;
6182 case 64:
6183 cost += addr_cost->addr_scale_costs.di;
6184 break;
6186 /* We can't tell, or this is a 128-bit vector. */
6187 default:
6188 cost += addr_cost->addr_scale_costs.ti;
6189 break;
6193 return cost;
6196 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6197 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6198 to be taken. */
6201 aarch64_branch_cost (bool speed_p, bool predictable_p)
6203 /* When optimizing for speed, use the cost of unpredictable branches. */
6204 const struct cpu_branch_cost *branch_costs =
6205 aarch64_tune_params.branch_costs;
6207 if (!speed_p || predictable_p)
6208 return branch_costs->predictable;
6209 else
6210 return branch_costs->unpredictable;
6213 /* Return true if the RTX X in mode MODE is a zero or sign extract
6214 usable in an ADD or SUB (extended register) instruction. */
6215 static bool
6216 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6218 /* Catch add with a sign extract.
6219 This is add_<optab><mode>_multp2. */
6220 if (GET_CODE (x) == SIGN_EXTRACT
6221 || GET_CODE (x) == ZERO_EXTRACT)
6223 rtx op0 = XEXP (x, 0);
6224 rtx op1 = XEXP (x, 1);
6225 rtx op2 = XEXP (x, 2);
6227 if (GET_CODE (op0) == MULT
6228 && CONST_INT_P (op1)
6229 && op2 == const0_rtx
6230 && CONST_INT_P (XEXP (op0, 1))
6231 && aarch64_is_extend_from_extract (mode,
6232 XEXP (op0, 1),
6233 op1))
6235 return true;
6238 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6239 No shift. */
6240 else if (GET_CODE (x) == SIGN_EXTEND
6241 || GET_CODE (x) == ZERO_EXTEND)
6242 return REG_P (XEXP (x, 0));
6244 return false;
6247 static bool
6248 aarch64_frint_unspec_p (unsigned int u)
6250 switch (u)
6252 case UNSPEC_FRINTZ:
6253 case UNSPEC_FRINTP:
6254 case UNSPEC_FRINTM:
6255 case UNSPEC_FRINTA:
6256 case UNSPEC_FRINTN:
6257 case UNSPEC_FRINTX:
6258 case UNSPEC_FRINTI:
6259 return true;
6261 default:
6262 return false;
6266 /* Return true iff X is an rtx that will match an extr instruction
6267 i.e. as described in the *extr<mode>5_insn family of patterns.
6268 OP0 and OP1 will be set to the operands of the shifts involved
6269 on success and will be NULL_RTX otherwise. */
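/* Illustrative example: in DImode,
   (ior (ashift (reg x) (const_int 10)) (lshiftrt (reg y) (const_int 54)))
   matches because 10 + 54 == 64, and corresponds roughly to a single
   "extr xd, x, y, #54" as emitted by the *extr<mode>5_insn pattern.  */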
6271 static bool
6272 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6274 rtx op0, op1;
6275 machine_mode mode = GET_MODE (x);
6277 *res_op0 = NULL_RTX;
6278 *res_op1 = NULL_RTX;
6280 if (GET_CODE (x) != IOR)
6281 return false;
6283 op0 = XEXP (x, 0);
6284 op1 = XEXP (x, 1);
6286 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6287 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6289 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6290 if (GET_CODE (op1) == ASHIFT)
6291 std::swap (op0, op1);
6293 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6294 return false;
6296 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6297 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6299 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6300 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6302 *res_op0 = XEXP (op0, 0);
6303 *res_op1 = XEXP (op1, 0);
6304 return true;
6308 return false;
6311 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6312 storing it in *COST. Result is true if the total cost of the operation
6313 has now been calculated. */
6314 static bool
6315 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6317 rtx inner;
6318 rtx comparator;
6319 enum rtx_code cmpcode;
6321 if (COMPARISON_P (op0))
6323 inner = XEXP (op0, 0);
6324 comparator = XEXP (op0, 1);
6325 cmpcode = GET_CODE (op0);
6327 else
6329 inner = op0;
6330 comparator = const0_rtx;
6331 cmpcode = NE;
6334 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6336 /* Conditional branch. */
6337 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6338 return true;
6339 else
6341 if (cmpcode == NE || cmpcode == EQ)
6343 if (comparator == const0_rtx)
6345 /* TBZ/TBNZ/CBZ/CBNZ. */
6346 if (GET_CODE (inner) == ZERO_EXTRACT)
6347 /* TBZ/TBNZ. */
6348 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6349 ZERO_EXTRACT, 0, speed);
6350 else
6351 /* CBZ/CBNZ. */
6352 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6354 return true;
6357 else if (cmpcode == LT || cmpcode == GE)
6359 /* TBZ/TBNZ. */
6360 if (comparator == const0_rtx)
6361 return true;
6365 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6367 /* CCMP. */
6368 if (GET_CODE (op1) == COMPARE)
6370 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6371 if (XEXP (op1, 1) == const0_rtx)
6372 *cost += 1;
6373 if (speed)
6375 machine_mode mode = GET_MODE (XEXP (op1, 0));
6376 const struct cpu_cost_table *extra_cost
6377 = aarch64_tune_params.insn_extra_cost;
6379 if (GET_MODE_CLASS (mode) == MODE_INT)
6380 *cost += extra_cost->alu.arith;
6381 else
6382 *cost += extra_cost->fp[mode == DFmode].compare;
6384 return true;
6387 /* It's a conditional operation based on the status flags,
6388 so it must be some flavor of CSEL. */
6390 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6391 if (GET_CODE (op1) == NEG
6392 || GET_CODE (op1) == NOT
6393 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6394 op1 = XEXP (op1, 0);
6395 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6397 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6398 op1 = XEXP (op1, 0);
6399 op2 = XEXP (op2, 0);
6402 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6403 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6404 return true;
6407 /* We don't know what this is, so cost all operands. */
6408 return false;
6411 /* Check whether X is a bitfield operation of the form shift + extend that
6412 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6413 operand to which the bitfield operation is applied. Otherwise return
6414 NULL_RTX. */
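/* Rough examples, for illustration:
   (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3))) corresponds to a
   UBFX, while (sign_extend:SI (ashift:QI (reg:QI x) (const_int 2)))
   corresponds to an SBFIZ.  */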
6416 static rtx
6417 aarch64_extend_bitfield_pattern_p (rtx x)
6419 rtx_code outer_code = GET_CODE (x);
6420 machine_mode outer_mode = GET_MODE (x);
6422 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6423 && outer_mode != SImode && outer_mode != DImode)
6424 return NULL_RTX;
6426 rtx inner = XEXP (x, 0);
6427 rtx_code inner_code = GET_CODE (inner);
6428 machine_mode inner_mode = GET_MODE (inner);
6429 rtx op = NULL_RTX;
6431 switch (inner_code)
6433 case ASHIFT:
6434 if (CONST_INT_P (XEXP (inner, 1))
6435 && (inner_mode == QImode || inner_mode == HImode))
6436 op = XEXP (inner, 0);
6437 break;
6438 case LSHIFTRT:
6439 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6440 && (inner_mode == QImode || inner_mode == HImode))
6441 op = XEXP (inner, 0);
6442 break;
6443 case ASHIFTRT:
6444 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6445 && (inner_mode == QImode || inner_mode == HImode))
6446 op = XEXP (inner, 0);
6447 break;
6448 default:
6449 break;
6452 return op;
6455 /* Return true if the mask and a shift amount from an RTX of the form
6456 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6457 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
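/* Worked example, for illustration: in SImode, MASK == 0xf0 and
   SHFT_AMNT == 4 are accepted, since 0xf0 >> 4 == 0xf, 0xf + 1 is a power
   of two, and the low four bits of the mask are clear; (x << 4) & 0xf0 can
   therefore be implemented as a single UBFIZ.  */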
6459 bool
6460 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6462 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6463 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6464 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6465 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6468 /* Calculate the cost of calculating X, storing it in *COST. Result
6469 is true if the total cost of the operation has now been calculated. */
6470 static bool
6471 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6472 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6474 rtx op0, op1, op2;
6475 const struct cpu_cost_table *extra_cost
6476 = aarch64_tune_params.insn_extra_cost;
6477 int code = GET_CODE (x);
6479 /* By default, assume that everything has equivalent cost to the
6480 cheapest instruction. Any additional costs are applied as a delta
6481 above this default. */
6482 *cost = COSTS_N_INSNS (1);
6484 switch (code)
6486 case SET:
6487 /* The cost depends entirely on the operands to SET. */
6488 *cost = 0;
6489 op0 = SET_DEST (x);
6490 op1 = SET_SRC (x);
6492 switch (GET_CODE (op0))
6494 case MEM:
6495 if (speed)
6497 rtx address = XEXP (op0, 0);
6498 if (VECTOR_MODE_P (mode))
6499 *cost += extra_cost->ldst.storev;
6500 else if (GET_MODE_CLASS (mode) == MODE_INT)
6501 *cost += extra_cost->ldst.store;
6502 else if (mode == SFmode)
6503 *cost += extra_cost->ldst.storef;
6504 else if (mode == DFmode)
6505 *cost += extra_cost->ldst.stored;
6507 *cost +=
6508 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6509 0, speed));
6512 *cost += rtx_cost (op1, mode, SET, 1, speed);
6513 return true;
6515 case SUBREG:
6516 if (! REG_P (SUBREG_REG (op0)))
6517 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6519 /* Fall through. */
6520 case REG:
6521 /* The cost is one per vector-register copied. */
6522 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6524 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6525 / GET_MODE_SIZE (V4SImode);
6526 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6528 /* const0_rtx is in general free, but we will use an
6529 instruction to set a register to 0. */
6530 else if (REG_P (op1) || op1 == const0_rtx)
6532 /* The cost is 1 per register copied. */
6533 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6534 / UNITS_PER_WORD;
6535 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6537 else
6538 /* Cost is just the cost of the RHS of the set. */
6539 *cost += rtx_cost (op1, mode, SET, 1, speed);
6540 return true;
6542 case ZERO_EXTRACT:
6543 case SIGN_EXTRACT:
6544 /* Bit-field insertion. Strip any redundant widening of
6545 the RHS to meet the width of the target. */
6546 if (GET_CODE (op1) == SUBREG)
6547 op1 = SUBREG_REG (op1);
6548 if ((GET_CODE (op1) == ZERO_EXTEND
6549 || GET_CODE (op1) == SIGN_EXTEND)
6550 && CONST_INT_P (XEXP (op0, 1))
6551 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6552 >= INTVAL (XEXP (op0, 1))))
6553 op1 = XEXP (op1, 0);
6555 if (CONST_INT_P (op1))
6557 /* MOV immediate is assumed to always be cheap. */
6558 *cost = COSTS_N_INSNS (1);
6560 else
6562 /* BFM. */
6563 if (speed)
6564 *cost += extra_cost->alu.bfi;
6565 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6568 return true;
6570 default:
6571 /* We can't make sense of this; assume default cost. */
6572 *cost = COSTS_N_INSNS (1);
6573 return false;
6575 return false;
6577 case CONST_INT:
6578 /* If an instruction can incorporate a constant within the
6579 instruction, the instruction's expression avoids calling
6580 rtx_cost() on the constant. If rtx_cost() is called on a
6581 constant, then it is usually because the constant must be
6582 moved into a register by one or more instructions.
6584 The exception is constant 0, which can be expressed
6585 as XZR/WZR and is therefore free. The exception to this is
6586 if we have (set (reg) (const0_rtx)) in which case we must cost
6587 the move. However, we can catch that when we cost the SET, so
6588 we don't need to consider that here. */
6589 if (x == const0_rtx)
6590 *cost = 0;
6591 else
6593 /* To an approximation, building any other constant is
6594 proportionally expensive to the number of instructions
6595 required to build that constant. This is true whether we
6596 are compiling for SPEED or otherwise. */
6597 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6598 (NULL_RTX, x, false, mode));
6600 return true;
6602 case CONST_DOUBLE:
6603 if (speed)
6605 /* mov[df,sf]_aarch64. */
6606 if (aarch64_float_const_representable_p (x))
6607 /* FMOV (scalar immediate). */
6608 *cost += extra_cost->fp[mode == DFmode].fpconst;
6609 else if (!aarch64_float_const_zero_rtx_p (x))
6611 /* This will be a load from memory. */
6612 if (mode == DFmode)
6613 *cost += extra_cost->ldst.loadd;
6614 else
6615 *cost += extra_cost->ldst.loadf;
6617 else
6618 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6619 or MOV v0.s[0], wzr - neither of which is modeled by the
6620 cost tables. Just use the default cost. */
6625 return true;
6627 case MEM:
6628 if (speed)
6630 /* For loads we want the base cost of a load, plus an
6631 approximation for the additional cost of the addressing
6632 mode. */
6633 rtx address = XEXP (x, 0);
6634 if (VECTOR_MODE_P (mode))
6635 *cost += extra_cost->ldst.loadv;
6636 else if (GET_MODE_CLASS (mode) == MODE_INT)
6637 *cost += extra_cost->ldst.load;
6638 else if (mode == SFmode)
6639 *cost += extra_cost->ldst.loadf;
6640 else if (mode == DFmode)
6641 *cost += extra_cost->ldst.loadd;
6643 *cost +=
6644 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6645 0, speed));
6648 return true;
6650 case NEG:
6651 op0 = XEXP (x, 0);
6653 if (VECTOR_MODE_P (mode))
6655 if (speed)
6657 /* FNEG. */
6658 *cost += extra_cost->vect.alu;
6660 return false;
6663 if (GET_MODE_CLASS (mode) == MODE_INT)
6665 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6666 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6668 /* CSETM. */
6669 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6670 return true;
6673 /* Cost this as SUB wzr, X. */
6674 op0 = CONST0_RTX (mode);
6675 op1 = XEXP (x, 0);
6676 goto cost_minus;
6679 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6681 /* Support (neg(fma...)) as a single instruction only if
6682 sign of zeros is unimportant. This matches the decision
6683 making in aarch64.md. */
6684 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6686 /* FNMADD. */
6687 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6688 return true;
6690 if (GET_CODE (op0) == MULT)
6692 /* FNMUL. */
6693 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6694 return true;
6696 if (speed)
6697 /* FNEG. */
6698 *cost += extra_cost->fp[mode == DFmode].neg;
6699 return false;
6702 return false;
6704 case CLRSB:
6705 case CLZ:
6706 if (speed)
6708 if (VECTOR_MODE_P (mode))
6709 *cost += extra_cost->vect.alu;
6710 else
6711 *cost += extra_cost->alu.clz;
6714 return false;
6716 case COMPARE:
6717 op0 = XEXP (x, 0);
6718 op1 = XEXP (x, 1);
6720 if (op1 == const0_rtx
6721 && GET_CODE (op0) == AND)
6723 x = op0;
6724 mode = GET_MODE (op0);
6725 goto cost_logic;
6728 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6730 /* TODO: A write to the CC flags possibly costs extra; this
6731 needs encoding in the cost tables. */
6733 mode = GET_MODE (op0);
6734 /* ANDS. */
6735 if (GET_CODE (op0) == AND)
6737 x = op0;
6738 goto cost_logic;
6741 if (GET_CODE (op0) == PLUS)
6743 /* ADDS (and CMN alias). */
6744 x = op0;
6745 goto cost_plus;
6748 if (GET_CODE (op0) == MINUS)
6750 /* SUBS. */
6751 x = op0;
6752 goto cost_minus;
6755 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6756 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6757 && CONST_INT_P (XEXP (op0, 2)))
6759 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6760 Handle it here directly rather than going to cost_logic
6761 since we know the immediate generated for the TST is valid
6762 so we can avoid creating an intermediate rtx for it only
6763 for costing purposes. */
6764 if (speed)
6765 *cost += extra_cost->alu.logical;
6767 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6768 ZERO_EXTRACT, 0, speed);
6769 return true;
6772 if (GET_CODE (op1) == NEG)
6774 /* CMN. */
6775 if (speed)
6776 *cost += extra_cost->alu.arith;
6778 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6779 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6780 return true;
6783 /* CMP.
6785 Compare can freely swap the order of operands, and
6786 canonicalization puts the more complex operation first.
6787 But the integer MINUS logic expects the shift/extend
6788 operation in op1. */
6789 if (! (REG_P (op0)
6790 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6792 op0 = XEXP (x, 1);
6793 op1 = XEXP (x, 0);
6795 goto cost_minus;
6798 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6800 /* FCMP. */
6801 if (speed)
6802 *cost += extra_cost->fp[mode == DFmode].compare;
6804 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6806 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6807 /* FCMP supports constant 0.0 for no extra cost. */
6808 return true;
6810 return false;
6813 if (VECTOR_MODE_P (mode))
6815 /* Vector compare. */
6816 if (speed)
6817 *cost += extra_cost->vect.alu;
6819 if (aarch64_float_const_zero_rtx_p (op1))
6821 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6822 cost. */
6823 return true;
6825 return false;
6827 return false;
6829 case MINUS:
6831 op0 = XEXP (x, 0);
6832 op1 = XEXP (x, 1);
6834 cost_minus:
6835 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6837 /* Detect valid immediates. */
6838 if ((GET_MODE_CLASS (mode) == MODE_INT
6839 || (GET_MODE_CLASS (mode) == MODE_CC
6840 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6841 && CONST_INT_P (op1)
6842 && aarch64_uimm12_shift (INTVAL (op1)))
6844 if (speed)
6845 /* SUB(S) (immediate). */
6846 *cost += extra_cost->alu.arith;
6847 return true;
6850 /* Look for SUB (extended register). */
6851 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6853 if (speed)
6854 *cost += extra_cost->alu.extend_arith;
6856 op1 = aarch64_strip_extend (op1);
6857 *cost += rtx_cost (op1, VOIDmode,
6858 (enum rtx_code) GET_CODE (op1), 0, speed);
6859 return true;
6862 rtx new_op1 = aarch64_strip_extend (op1);
6864 /* Cost this as an FMA-alike operation. */
6865 if ((GET_CODE (new_op1) == MULT
6866 || aarch64_shift_p (GET_CODE (new_op1)))
6867 && code != COMPARE)
6869 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6870 (enum rtx_code) code,
6871 speed);
6872 return true;
6875 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6877 if (speed)
6879 if (VECTOR_MODE_P (mode))
6881 /* Vector SUB. */
6882 *cost += extra_cost->vect.alu;
6884 else if (GET_MODE_CLASS (mode) == MODE_INT)
6886 /* SUB(S). */
6887 *cost += extra_cost->alu.arith;
6889 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6891 /* FSUB. */
6892 *cost += extra_cost->fp[mode == DFmode].addsub;
6895 return true;
6898 case PLUS:
6900 rtx new_op0;
6902 op0 = XEXP (x, 0);
6903 op1 = XEXP (x, 1);
6905 cost_plus:
6906 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6907 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6909 /* CSINC. */
6910 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6911 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6912 return true;
6915 if (GET_MODE_CLASS (mode) == MODE_INT
6916 && CONST_INT_P (op1)
6917 && aarch64_uimm12_shift (INTVAL (op1)))
6919 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6921 if (speed)
6922 /* ADD (immediate). */
6923 *cost += extra_cost->alu.arith;
6924 return true;
6927 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6929 /* Look for ADD (extended register). */
6930 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6932 if (speed)
6933 *cost += extra_cost->alu.extend_arith;
6935 op0 = aarch64_strip_extend (op0);
6936 *cost += rtx_cost (op0, VOIDmode,
6937 (enum rtx_code) GET_CODE (op0), 0, speed);
6938 return true;
6941 /* Strip any extend, leave shifts behind as we will
6942 cost them through mult_cost. */
6943 new_op0 = aarch64_strip_extend (op0);
6945 if (GET_CODE (new_op0) == MULT
6946 || aarch64_shift_p (GET_CODE (new_op0)))
6948 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6949 speed);
6950 return true;
6953 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6955 if (speed)
6957 if (VECTOR_MODE_P (mode))
6959 /* Vector ADD. */
6960 *cost += extra_cost->vect.alu;
6962 else if (GET_MODE_CLASS (mode) == MODE_INT)
6964 /* ADD. */
6965 *cost += extra_cost->alu.arith;
6967 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6969 /* FADD. */
6970 *cost += extra_cost->fp[mode == DFmode].addsub;
6973 return true;
6976 case BSWAP:
6977 *cost = COSTS_N_INSNS (1);
6979 if (speed)
6981 if (VECTOR_MODE_P (mode))
6982 *cost += extra_cost->vect.alu;
6983 else
6984 *cost += extra_cost->alu.rev;
6986 return false;
6988 case IOR:
6989 if (aarch_rev16_p (x))
6991 *cost = COSTS_N_INSNS (1);
6993 if (speed)
6995 if (VECTOR_MODE_P (mode))
6996 *cost += extra_cost->vect.alu;
6997 else
6998 *cost += extra_cost->alu.rev;
7000 return true;
7003 if (aarch64_extr_rtx_p (x, &op0, &op1))
7005 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7006 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7007 if (speed)
7008 *cost += extra_cost->alu.shift;
7010 return true;
7012 /* Fall through. */
7013 case XOR:
7014 case AND:
7015 cost_logic:
7016 op0 = XEXP (x, 0);
7017 op1 = XEXP (x, 1);
7019 if (VECTOR_MODE_P (mode))
7021 if (speed)
7022 *cost += extra_cost->vect.alu;
7023 return true;
7026 if (code == AND
7027 && GET_CODE (op0) == MULT
7028 && CONST_INT_P (XEXP (op0, 1))
7029 && CONST_INT_P (op1)
7030 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7031 INTVAL (op1)) != 0)
7033 /* This is a UBFM/SBFM. */
7034 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7035 if (speed)
7036 *cost += extra_cost->alu.bfx;
7037 return true;
7040 if (GET_MODE_CLASS (mode) == MODE_INT)
7042 if (CONST_INT_P (op1))
7044 /* We have a mask + shift version of a UBFIZ
7045 i.e. the *andim_ashift<mode>_bfiz pattern. */
7046 if (GET_CODE (op0) == ASHIFT
7047 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7048 XEXP (op0, 1)))
7050 *cost += rtx_cost (XEXP (op0, 0), mode,
7051 (enum rtx_code) code, 0, speed);
7052 if (speed)
7053 *cost += extra_cost->alu.bfx;
7055 return true;
7057 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7059 /* We possibly get the immediate for free, this is not
7060 modelled. */
7061 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7062 if (speed)
7063 *cost += extra_cost->alu.logical;
7065 return true;
7068 else
7070 rtx new_op0 = op0;
7072 /* Handle ORN, EON, or BIC. */
7073 if (GET_CODE (op0) == NOT)
7074 op0 = XEXP (op0, 0);
7076 new_op0 = aarch64_strip_shift (op0);
7078 /* If we had a shift on op0 then this is a logical-shift-
7079 by-register/immediate operation. Otherwise, this is just
7080 a logical operation. */
7081 if (speed)
7083 if (new_op0 != op0)
7085 /* Shift by immediate. */
7086 if (CONST_INT_P (XEXP (op0, 1)))
7087 *cost += extra_cost->alu.log_shift;
7088 else
7089 *cost += extra_cost->alu.log_shift_reg;
7091 else
7092 *cost += extra_cost->alu.logical;
7095 /* In both cases we want to cost both operands. */
7096 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7097 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7099 return true;
7102 return false;
7104 case NOT:
7105 x = XEXP (x, 0);
7106 op0 = aarch64_strip_shift (x);
7108 if (VECTOR_MODE_P (mode))
7110 /* Vector NOT. */
7111 *cost += extra_cost->vect.alu;
7112 return false;
7115 /* MVN-shifted-reg. */
7116 if (op0 != x)
7118 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7120 if (speed)
7121 *cost += extra_cost->alu.log_shift;
7123 return true;
7125 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7126 Handle the second form here taking care that 'a' in the above can
7127 be a shift. */
7128 else if (GET_CODE (op0) == XOR)
7130 rtx newop0 = XEXP (op0, 0);
7131 rtx newop1 = XEXP (op0, 1);
7132 rtx op0_stripped = aarch64_strip_shift (newop0);
7134 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7135 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7137 if (speed)
7139 if (op0_stripped != newop0)
7140 *cost += extra_cost->alu.log_shift;
7141 else
7142 *cost += extra_cost->alu.logical;
7145 return true;
7147 /* MVN. */
7148 if (speed)
7149 *cost += extra_cost->alu.logical;
7151 return false;
7153 case ZERO_EXTEND:
7155 op0 = XEXP (x, 0);
7156 /* If a value is written in SI mode, then zero extended to DI
7157 mode, the operation will in general be free as a write to
7158 a 'w' register implicitly zeroes the upper bits of an 'x'
7159 register. However, if this is
7161 (set (reg) (zero_extend (reg)))
7163 we must cost the explicit register move. */
7164 if (mode == DImode
7165 && GET_MODE (op0) == SImode
7166 && outer == SET)
7168 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7170 /* If OP_COST is non-zero, then the cost of the zero extend
7171 is effectively the cost of the inner operation. Otherwise
7172 we have a MOV instruction and we take the cost from the MOV
7173 itself. This is true independently of whether we are
7174 optimizing for space or time. */
7175 if (op_cost)
7176 *cost = op_cost;
7178 return true;
7180 else if (MEM_P (op0))
7182 /* All loads can zero extend to any size for free. */
7183 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7184 return true;
7187 op0 = aarch64_extend_bitfield_pattern_p (x);
7188 if (op0)
7190 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7191 if (speed)
7192 *cost += extra_cost->alu.bfx;
7193 return true;
7196 if (speed)
7198 if (VECTOR_MODE_P (mode))
7200 /* UMOV. */
7201 *cost += extra_cost->vect.alu;
7203 else
7205 /* We generate an AND instead of UXTB/UXTH. */
7206 *cost += extra_cost->alu.logical;
7209 return false;
7211 case SIGN_EXTEND:
7212 if (MEM_P (XEXP (x, 0)))
7214 /* LDRSH. */
7215 if (speed)
7217 rtx address = XEXP (XEXP (x, 0), 0);
7218 *cost += extra_cost->ldst.load_sign_extend;
7220 *cost +=
7221 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7222 0, speed));
7224 return true;
7227 op0 = aarch64_extend_bitfield_pattern_p (x);
7228 if (op0)
7230 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7231 if (speed)
7232 *cost += extra_cost->alu.bfx;
7233 return true;
7236 if (speed)
7238 if (VECTOR_MODE_P (mode))
7239 *cost += extra_cost->vect.alu;
7240 else
7241 *cost += extra_cost->alu.extend;
7243 return false;
7245 case ASHIFT:
7246 op0 = XEXP (x, 0);
7247 op1 = XEXP (x, 1);
7249 if (CONST_INT_P (op1))
7251 if (speed)
7253 if (VECTOR_MODE_P (mode))
7255 /* Vector shift (immediate). */
7256 *cost += extra_cost->vect.alu;
7258 else
7260 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7261 aliases. */
7262 *cost += extra_cost->alu.shift;
7266 /* We can incorporate zero/sign extend for free. */
7267 if (GET_CODE (op0) == ZERO_EXTEND
7268 || GET_CODE (op0) == SIGN_EXTEND)
7269 op0 = XEXP (op0, 0);
7271 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7272 return true;
7274 else
7276 if (speed)
7278 if (VECTOR_MODE_P (mode))
7280 /* Vector shift (register). */
7281 *cost += extra_cost->vect.alu;
7283 else
7285 /* LSLV. */
7286 *cost += extra_cost->alu.shift_reg;
7289 return false; /* All arguments need to be in registers. */
7292 case ROTATE:
7293 case ROTATERT:
7294 case LSHIFTRT:
7295 case ASHIFTRT:
7296 op0 = XEXP (x, 0);
7297 op1 = XEXP (x, 1);
7299 if (CONST_INT_P (op1))
7301 /* ASR (immediate) and friends. */
7302 if (speed)
7304 if (VECTOR_MODE_P (mode))
7305 *cost += extra_cost->vect.alu;
7306 else
7307 *cost += extra_cost->alu.shift;
7310 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7311 return true;
7313 else
7316 /* ASR (register) and friends. */
7317 if (speed)
7319 if (VECTOR_MODE_P (mode))
7320 *cost += extra_cost->vect.alu;
7321 else
7322 *cost += extra_cost->alu.shift_reg;
7324 return false; /* All arguments need to be in registers. */
7327 case SYMBOL_REF:
7329 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7330 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7332 /* LDR. */
7333 if (speed)
7334 *cost += extra_cost->ldst.load;
7336 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7337 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7339 /* ADRP, followed by ADD. */
7340 *cost += COSTS_N_INSNS (1);
7341 if (speed)
7342 *cost += 2 * extra_cost->alu.arith;
7344 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7345 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7347 /* ADR. */
7348 if (speed)
7349 *cost += extra_cost->alu.arith;
7352 if (flag_pic)
7354 /* One extra load instruction, after accessing the GOT. */
7355 *cost += COSTS_N_INSNS (1);
7356 if (speed)
7357 *cost += extra_cost->ldst.load;
7359 return true;
7361 case HIGH:
7362 case LO_SUM:
7363 /* ADRP/ADD (immediate). */
7364 if (speed)
7365 *cost += extra_cost->alu.arith;
7366 return true;
7368 case ZERO_EXTRACT:
7369 case SIGN_EXTRACT:
7370 /* UBFX/SBFX. */
7371 if (speed)
7373 if (VECTOR_MODE_P (mode))
7374 *cost += extra_cost->vect.alu;
7375 else
7376 *cost += extra_cost->alu.bfx;
7379 /* We can trust that the immediates used will be correct (there
7380 are no by-register forms), so we need only cost op0. */
7381 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7382 return true;
7384 case MULT:
7385 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7386 /* aarch64_rtx_mult_cost always handles recursion to its
7387 operands. */
7388 return true;
7390 case MOD:
7391 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7392 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7393 an unconditional negate. This case should only ever be reached through
7394 the set_smod_pow2_cheap check in expmed.c. */
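/* For illustration (register choices hypothetical), x % 8 in SImode
   expands along the lines of:
       negs  w1, w0
       and   w0, w0, 7
       and   w1, w1, 7
       csneg w0, w0, w1, mi
   hence the four-instruction baseline below.  */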
7395 if (CONST_INT_P (XEXP (x, 1))
7396 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7397 && (mode == SImode || mode == DImode))
7399 /* We expand to 4 instructions. Reset the baseline. */
7400 *cost = COSTS_N_INSNS (4);
7402 if (speed)
7403 *cost += 2 * extra_cost->alu.logical
7404 + 2 * extra_cost->alu.arith;
7406 return true;
7409 /* Fall-through. */
7410 case UMOD:
7411 if (speed)
7413 if (VECTOR_MODE_P (mode))
7414 *cost += extra_cost->vect.alu;
7415 else if (GET_MODE_CLASS (mode) == MODE_INT)
7416 *cost += (extra_cost->mult[mode == DImode].add
7417 + extra_cost->mult[mode == DImode].idiv);
7418 else if (mode == DFmode)
7419 *cost += (extra_cost->fp[1].mult
7420 + extra_cost->fp[1].div);
7421 else if (mode == SFmode)
7422 *cost += (extra_cost->fp[0].mult
7423 + extra_cost->fp[0].div);
7425 return false; /* All arguments need to be in registers. */
7427 case DIV:
7428 case UDIV:
7429 case SQRT:
7430 if (speed)
7432 if (VECTOR_MODE_P (mode))
7433 *cost += extra_cost->vect.alu;
7434 else if (GET_MODE_CLASS (mode) == MODE_INT)
7435 /* There is no integer SQRT, so only DIV and UDIV can get
7436 here. */
7437 *cost += extra_cost->mult[mode == DImode].idiv;
7438 else
7439 *cost += extra_cost->fp[mode == DFmode].div;
7441 return false; /* All arguments need to be in registers. */
7443 case IF_THEN_ELSE:
7444 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7445 XEXP (x, 2), cost, speed);
7447 case EQ:
7448 case NE:
7449 case GT:
7450 case GTU:
7451 case LT:
7452 case LTU:
7453 case GE:
7454 case GEU:
7455 case LE:
7456 case LEU:
7458 return false; /* All arguments must be in registers. */
7460 case FMA:
7461 op0 = XEXP (x, 0);
7462 op1 = XEXP (x, 1);
7463 op2 = XEXP (x, 2);
7465 if (speed)
7467 if (VECTOR_MODE_P (mode))
7468 *cost += extra_cost->vect.alu;
7469 else
7470 *cost += extra_cost->fp[mode == DFmode].fma;
7473 /* FMSUB, FNMADD, and FNMSUB are free. */
7474 if (GET_CODE (op0) == NEG)
7475 op0 = XEXP (op0, 0);
7477 if (GET_CODE (op2) == NEG)
7478 op2 = XEXP (op2, 0);
7480 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7481 and the by-element operand as operand 0. */
7482 if (GET_CODE (op1) == NEG)
7483 op1 = XEXP (op1, 0);
7485 /* Catch vector-by-element operations. The by-element operand can
7486 either be (vec_duplicate (vec_select (x))) or just
7487 (vec_select (x)), depending on whether we are multiplying by
7488 a vector or a scalar.
7490 Canonicalization is not very good in these cases: FMA4 will put the
7491 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7492 if (GET_CODE (op0) == VEC_DUPLICATE)
7493 op0 = XEXP (op0, 0);
7494 else if (GET_CODE (op1) == VEC_DUPLICATE)
7495 op1 = XEXP (op1, 0);
7497 if (GET_CODE (op0) == VEC_SELECT)
7498 op0 = XEXP (op0, 0);
7499 else if (GET_CODE (op1) == VEC_SELECT)
7500 op1 = XEXP (op1, 0);
7502 /* If the remaining parameters are not registers,
7503 get the cost to put them into registers. */
7504 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7505 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7506 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7507 return true;
7509 case FLOAT:
7510 case UNSIGNED_FLOAT:
7511 if (speed)
7512 *cost += extra_cost->fp[mode == DFmode].fromint;
7513 return false;
7515 case FLOAT_EXTEND:
7516 if (speed)
7518 if (VECTOR_MODE_P (mode))
7520 /* Vector widening conversion. */
7521 *cost += extra_cost->vect.alu;
7523 else
7524 *cost += extra_cost->fp[mode == DFmode].widen;
7526 return false;
7528 case FLOAT_TRUNCATE:
7529 if (speed)
7531 if (VECTOR_MODE_P (mode))
7533 /* Vector narrowing conversion. */
7534 *cost += extra_cost->vect.alu;
7536 else
7537 *cost += extra_cost->fp[mode == DFmode].narrow;
7539 return false;
7541 case FIX:
7542 case UNSIGNED_FIX:
7543 x = XEXP (x, 0);
7544 /* Strip the rounding part. They will all be implemented
7545 by the fcvt* family of instructions anyway. */
7546 if (GET_CODE (x) == UNSPEC)
7548 unsigned int uns_code = XINT (x, 1);
7550 if (uns_code == UNSPEC_FRINTA
7551 || uns_code == UNSPEC_FRINTM
7552 || uns_code == UNSPEC_FRINTN
7553 || uns_code == UNSPEC_FRINTP
7554 || uns_code == UNSPEC_FRINTZ)
7555 x = XVECEXP (x, 0, 0);
7558 if (speed)
7560 if (VECTOR_MODE_P (mode))
7561 *cost += extra_cost->vect.alu;
7562 else
7563 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7566 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7567 fixed-point fcvt. */
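/* For a rough example, (fix:SI (mult:SF (reg:SF x) (const_double 65536.0)))
   can be emitted as a single fixed-point conversion such as
   "fcvtzs w0, s0, #16" instead of an FMUL followed by an FCVTZS.  */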
7568 if (GET_CODE (x) == MULT
7569 && ((VECTOR_MODE_P (mode)
7570 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7571 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7573 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7574 0, speed);
7575 return true;
7578 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7579 return true;
7581 case ABS:
7582 if (VECTOR_MODE_P (mode))
7584 /* ABS (vector). */
7585 if (speed)
7586 *cost += extra_cost->vect.alu;
7588 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7590 op0 = XEXP (x, 0);
7592 /* FABD, which is analogous to FADD. */
7593 if (GET_CODE (op0) == MINUS)
7595 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7596 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7597 if (speed)
7598 *cost += extra_cost->fp[mode == DFmode].addsub;
7600 return true;
7602 /* Simple FABS is analogous to FNEG. */
7603 if (speed)
7604 *cost += extra_cost->fp[mode == DFmode].neg;
7606 else
7608 /* Integer ABS will either be split to
7609 two arithmetic instructions, or will be an ABS
7610 (scalar), which we don't model. */
7611 *cost = COSTS_N_INSNS (2);
7612 if (speed)
7613 *cost += 2 * extra_cost->alu.arith;
7615 return false;
7617 case SMAX:
7618 case SMIN:
7619 if (speed)
7621 if (VECTOR_MODE_P (mode))
7622 *cost += extra_cost->vect.alu;
7623 else
7625 /* FMAXNM/FMINNM/FMAX/FMIN.
7626 TODO: This may not be accurate for all implementations, but
7627 we do not model this in the cost tables. */
7628 *cost += extra_cost->fp[mode == DFmode].addsub;
7631 return false;
7633 case UNSPEC:
7634 /* The floating point round to integer frint* instructions. */
7635 if (aarch64_frint_unspec_p (XINT (x, 1)))
7637 if (speed)
7638 *cost += extra_cost->fp[mode == DFmode].roundint;
7640 return false;
7643 if (XINT (x, 1) == UNSPEC_RBIT)
7645 if (speed)
7646 *cost += extra_cost->alu.rev;
7648 return false;
7650 break;
7652 case TRUNCATE:
7654 /* Decompose <su>muldi3_highpart. */
7655 if (/* (truncate:DI */
7656 mode == DImode
7657 /* (lshiftrt:TI */
7658 && GET_MODE (XEXP (x, 0)) == TImode
7659 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7660 /* (mult:TI */
7661 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7662 /* (ANY_EXTEND:TI (reg:DI))
7663 (ANY_EXTEND:TI (reg:DI))) */
7664 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7665 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7666 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7667 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7668 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7669 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7670 /* (const_int 64) */
7671 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7672 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7674 /* UMULH/SMULH. */
7675 if (speed)
7676 *cost += extra_cost->mult[mode == DImode].extend;
7677 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7678 mode, MULT, 0, speed);
7679 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7680 mode, MULT, 1, speed);
7681 return true;
7684 /* Fall through. */
7685 default:
7686 break;
7689 if (dump_file
7690 && flag_aarch64_verbose_cost)
7691 fprintf (dump_file,
7692 "\nFailed to cost RTX. Assuming default cost.\n");
7694 return true;
7697 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7698 calculated for X. This cost is stored in *COST. Returns true
7699 if the total cost of X was calculated. */
7700 static bool
7701 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7702 int param, int *cost, bool speed)
7704 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7706 if (dump_file
7707 && flag_aarch64_verbose_cost)
7709 print_rtl_single (dump_file, x);
7710 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7711 speed ? "Hot" : "Cold",
7712 *cost, result ? "final" : "partial");
7715 return result;
7718 static int
7719 aarch64_register_move_cost (machine_mode mode,
7720 reg_class_t from_i, reg_class_t to_i)
7722 enum reg_class from = (enum reg_class) from_i;
7723 enum reg_class to = (enum reg_class) to_i;
7724 const struct cpu_regmove_cost *regmove_cost
7725 = aarch64_tune_params.regmove_cost;
7727 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7728 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7729 to = GENERAL_REGS;
7731 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7732 from = GENERAL_REGS;
7734 /* Moving between a GPR and the stack costs the same as GP2GP. */
7735 if ((from == GENERAL_REGS && to == STACK_REG)
7736 || (to == GENERAL_REGS && from == STACK_REG))
7737 return regmove_cost->GP2GP;
7739 /* To/From the stack register, we move via the gprs. */
7740 if (to == STACK_REG || from == STACK_REG)
7741 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7742 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7744 if (GET_MODE_SIZE (mode) == 16)
7746 /* 128-bit operations on general registers require 2 instructions. */
7747 if (from == GENERAL_REGS && to == GENERAL_REGS)
7748 return regmove_cost->GP2GP * 2;
7749 else if (from == GENERAL_REGS)
7750 return regmove_cost->GP2FP * 2;
7751 else if (to == GENERAL_REGS)
7752 return regmove_cost->FP2GP * 2;
7754 /* When AdvSIMD instructions are disabled it is not possible to move
7755 a 128-bit value directly between Q registers. This is handled in
7756 secondary reload. A general register is used as a scratch to move
7757 the upper DI value and the lower DI value is moved directly,
7758 hence the cost is the sum of three moves. */
7759 if (! TARGET_SIMD)
7760 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7762 return regmove_cost->FP2FP;
7765 if (from == GENERAL_REGS && to == GENERAL_REGS)
7766 return regmove_cost->GP2GP;
7767 else if (from == GENERAL_REGS)
7768 return regmove_cost->GP2FP;
7769 else if (to == GENERAL_REGS)
7770 return regmove_cost->FP2GP;
7772 return regmove_cost->FP2FP;
7775 static int
7776 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7777 reg_class_t rclass ATTRIBUTE_UNUSED,
7778 bool in ATTRIBUTE_UNUSED)
7780 return aarch64_tune_params.memmov_cost;
7783 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7784 to optimize 1.0/sqrt. */
7786 static bool
7787 use_rsqrt_p (machine_mode mode)
7789 return (!flag_trapping_math
7790 && flag_unsafe_math_optimizations
7791 && ((aarch64_tune_params.approx_modes->recip_sqrt
7792 & AARCH64_APPROX_MODE (mode))
7793 || flag_mrecip_low_precision_sqrt));
7796 /* Function to decide when to use the approximate reciprocal square root
7797 builtin. */
7799 static tree
7800 aarch64_builtin_reciprocal (tree fndecl)
7802 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7804 if (!use_rsqrt_p (mode))
7805 return NULL_TREE;
7806 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7809 typedef rtx (*rsqrte_type) (rtx, rtx);
7811 /* Select reciprocal square root initial estimate insn depending on machine
7812 mode. */
7814 static rsqrte_type
7815 get_rsqrte_type (machine_mode mode)
7817 switch (mode)
7819 case DFmode: return gen_aarch64_rsqrtedf;
7820 case SFmode: return gen_aarch64_rsqrtesf;
7821 case V2DFmode: return gen_aarch64_rsqrtev2df;
7822 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7823 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7824 default: gcc_unreachable ();
7828 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7830 /* Select reciprocal square root series step insn depending on machine mode. */
7832 static rsqrts_type
7833 get_rsqrts_type (machine_mode mode)
7835 switch (mode)
7837 case DFmode: return gen_aarch64_rsqrtsdf;
7838 case SFmode: return gen_aarch64_rsqrtssf;
7839 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7840 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7841 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7842 default: gcc_unreachable ();
7846 /* Emit instruction sequence to compute either the approximate square root
7847 or its approximate reciprocal, depending on the flag RECP, and return
7848 whether the sequence was emitted or not. */
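/* A sketch of the math behind the sequence below: FRSQRTE provides an
   initial estimate x of 1/sqrt(d), and each FRSQRTS step computes
   (3 - a*b) / 2, so the loop refines x as x := x * (3 - d*x*x) / 2,
   the Newton-Raphson iteration for the reciprocal square root; the
   SF/DF iteration counts reflect how quickly this converges.  */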
7850 bool
7851 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7853 machine_mode mode = GET_MODE (dst);
7855 if (GET_MODE_INNER (mode) == HFmode)
7856 return false;
7858 machine_mode mmsk = mode_for_vector
7859 (int_mode_for_mode (GET_MODE_INNER (mode)),
7860 GET_MODE_NUNITS (mode));
7861 bool use_approx_sqrt_p = (!recp
7862 && (flag_mlow_precision_sqrt
7863 || (aarch64_tune_params.approx_modes->sqrt
7864 & AARCH64_APPROX_MODE (mode))));
7865 bool use_approx_rsqrt_p = (recp
7866 && (flag_mrecip_low_precision_sqrt
7867 || (aarch64_tune_params.approx_modes->recip_sqrt
7868 & AARCH64_APPROX_MODE (mode))));
7870 if (!flag_finite_math_only
7871 || flag_trapping_math
7872 || !flag_unsafe_math_optimizations
7873 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7874 || optimize_function_for_size_p (cfun))
7875 return false;
7877 rtx xmsk = gen_reg_rtx (mmsk);
7878 if (!recp)
7879 /* When calculating the approximate square root, compare the argument with
7880 0.0 and create a mask. */
7881 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7882 CONST0_RTX (mode)))));
7884 /* Estimate the approximate reciprocal square root. */
7885 rtx xdst = gen_reg_rtx (mode);
7886 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7888 /* Iterate over the series twice for SF and thrice for DF. */
7889 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7891 /* Optionally iterate over the series once less for faster performance
7892 while sacrificing some accuracy. */
7893 if ((recp && flag_mrecip_low_precision_sqrt)
7894 || (!recp && flag_mlow_precision_sqrt))
7895 iterations--;
7897 /* Iterate over the series to calculate the approximate reciprocal square
7898 root. */
7899 rtx x1 = gen_reg_rtx (mode);
7900 while (iterations--)
7902 rtx x2 = gen_reg_rtx (mode);
7903 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7905 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7907 if (iterations > 0)
7908 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7911 if (!recp)
7913 /* Qualify the approximate reciprocal square root when the argument is
7914 0.0 by squashing the intermediate result to 0.0. */
7915 rtx xtmp = gen_reg_rtx (mmsk);
7916 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7917 gen_rtx_SUBREG (mmsk, xdst, 0)));
7918 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7920 /* Calculate the approximate square root. */
7921 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7924 /* Finalize the approximation. */
7925 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7927 return true;
7930 typedef rtx (*recpe_type) (rtx, rtx);
7932 /* Select reciprocal initial estimate insn depending on machine mode. */
7934 static recpe_type
7935 get_recpe_type (machine_mode mode)
7937 switch (mode)
7939 case SFmode: return (gen_aarch64_frecpesf);
7940 case V2SFmode: return (gen_aarch64_frecpev2sf);
7941 case V4SFmode: return (gen_aarch64_frecpev4sf);
7942 case DFmode: return (gen_aarch64_frecpedf);
7943 case V2DFmode: return (gen_aarch64_frecpev2df);
7944 default: gcc_unreachable ();
7948 typedef rtx (*recps_type) (rtx, rtx, rtx);
7950 /* Select reciprocal series step insn depending on machine mode. */
7952 static recps_type
7953 get_recps_type (machine_mode mode)
7955 switch (mode)
7957 case SFmode: return (gen_aarch64_frecpssf);
7958 case V2SFmode: return (gen_aarch64_frecpsv2sf);
7959 case V4SFmode: return (gen_aarch64_frecpsv4sf);
7960 case DFmode: return (gen_aarch64_frecpsdf);
7961 case V2DFmode: return (gen_aarch64_frecpsv2df);
7962 default: gcc_unreachable ();
7966 /* Emit the instruction sequence to compute the approximation for the division
7967 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
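/* A sketch of the math behind the sequence below: FRECPE provides an
   initial estimate x of 1/DEN, and each FRECPS step computes 2 - a*b,
   so the loop refines x as x := x * (2 - DEN*x), the Newton-Raphson
   iteration for the reciprocal; the quotient is then NUM times that
   reciprocal.  */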
7969 bool
7970 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7972 machine_mode mode = GET_MODE (quo);
7974 if (GET_MODE_INNER (mode) == HFmode)
7975 return false;
7977 bool use_approx_division_p = (flag_mlow_precision_div
7978 || (aarch64_tune_params.approx_modes->division
7979 & AARCH64_APPROX_MODE (mode)));
7981 if (!flag_finite_math_only
7982 || flag_trapping_math
7983 || !flag_unsafe_math_optimizations
7984 || optimize_function_for_size_p (cfun)
7985 || !use_approx_division_p)
7986 return false;
7988 /* Estimate the approximate reciprocal. */
7989 rtx xrcp = gen_reg_rtx (mode);
7990 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
7992 /* Iterate over the series twice for SF and thrice for DF. */
7993 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7995 /* Optionally iterate over the series once less for faster performance,
7996 while sacrificing some accuracy. */
7997 if (flag_mlow_precision_div)
7998 iterations--;
8000 /* Iterate over the series to calculate the approximate reciprocal. */
8001 rtx xtmp = gen_reg_rtx (mode);
8002 while (iterations--)
8004 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8006 if (iterations > 0)
8007 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8010 if (num != CONST1_RTX (mode))
8012 /* As the approximate reciprocal of DEN is already calculated, only
8013 calculate the approximate division when NUM is not 1.0. */
8014 rtx xnum = force_reg (mode, num);
8015 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8018 /* Finalize the approximation. */
8019 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8020 return true;
8023 /* Return the number of instructions that can be issued per cycle. */
8024 static int
8025 aarch64_sched_issue_rate (void)
8027 return aarch64_tune_params.issue_rate;
8030 static int
8031 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8033 int issue_rate = aarch64_sched_issue_rate ();
8035 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8039 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8040 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8041 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8043 static int
8044 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8045 int ready_index)
8047 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8051 /* Vectorizer cost model target hooks. */
8053 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8054 static int
8055 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8056 tree vectype,
8057 int misalign ATTRIBUTE_UNUSED)
8059 unsigned elements;
8061 switch (type_of_cost)
8063 case scalar_stmt:
8064 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
8066 case scalar_load:
8067 return aarch64_tune_params.vec_costs->scalar_load_cost;
8069 case scalar_store:
8070 return aarch64_tune_params.vec_costs->scalar_store_cost;
8072 case vector_stmt:
8073 return aarch64_tune_params.vec_costs->vec_stmt_cost;
8075 case vector_load:
8076 return aarch64_tune_params.vec_costs->vec_align_load_cost;
8078 case vector_store:
8079 return aarch64_tune_params.vec_costs->vec_store_cost;
8081 case vec_to_scalar:
8082 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
8084 case scalar_to_vec:
8085 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
8087 case unaligned_load:
8088 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
8090 case unaligned_store:
8091 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
8093 case cond_branch_taken:
8094 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
8096 case cond_branch_not_taken:
8097 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
8099 case vec_perm:
8100 return aarch64_tune_params.vec_costs->vec_permute_cost;
8102 case vec_promote_demote:
8103 return aarch64_tune_params.vec_costs->vec_stmt_cost;
8105 case vec_construct:
8106 elements = TYPE_VECTOR_SUBPARTS (vectype);
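/* For example, constructing a V4SF vector from scalars is costed as
   4 / 2 + 1 == 3 under the heuristic below.  */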
8107 return elements / 2 + 1;
8109 default:
8110 gcc_unreachable ();
8114 /* Implement targetm.vectorize.add_stmt_cost. */
8115 static unsigned
8116 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8117 struct _stmt_vec_info *stmt_info, int misalign,
8118 enum vect_cost_model_location where)
8120 unsigned *cost = (unsigned *) data;
8121 unsigned retval = 0;
8123 if (flag_vect_cost_model)
8125 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8126 int stmt_cost =
8127 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8129 /* Statements in an inner loop relative to the loop being
8130 vectorized are weighted more heavily. The value here is
8131 arbitrary and could potentially be improved with analysis. */
8132 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8133 count *= 50; /* FIXME */
8135 retval = (unsigned) (count * stmt_cost);
8136 cost[where] += retval;
8139 return retval;
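/* For illustration, assuming a tuning structure whose vec_stmt_cost is 1:
   a vector statement counted twice inside a nested inner loop contributes
   2 * 50 * 1 = 100 to the vect_body bucket, whereas the same statement
   outside an inner loop contributes only 2 * 1 = 2.  */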
8142 static void initialize_aarch64_code_model (struct gcc_options *);
8144 /* Parse the TO_PARSE string and put the architecture struct that it
8145 selects into RES and the architectural features into ISA_FLAGS.
8146 Return an aarch64_parse_opt_result describing the parse result.
8147 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8149 static enum aarch64_parse_opt_result
8150 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8151 unsigned long *isa_flags)
8153 char *ext;
8154 const struct processor *arch;
8155 char *str = (char *) alloca (strlen (to_parse) + 1);
8156 size_t len;
8158 strcpy (str, to_parse);
8160 ext = strchr (str, '+');
8162 if (ext != NULL)
8163 len = ext - str;
8164 else
8165 len = strlen (str);
8167 if (len == 0)
8168 return AARCH64_PARSE_MISSING_ARG;
8171 /* Loop through the list of supported ARCHes to find a match. */
8172 for (arch = all_architectures; arch->name != NULL; arch++)
8174 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8176 unsigned long isa_temp = arch->flags;
8178 if (ext != NULL)
8180 /* TO_PARSE string contains at least one extension. */
8181 enum aarch64_parse_opt_result ext_res
8182 = aarch64_parse_extension (ext, &isa_temp);
8184 if (ext_res != AARCH64_PARSE_OK)
8185 return ext_res;
8187 /* Extension parsing was successful. Confirm the result
8188 arch and ISA flags. */
8189 *res = arch;
8190 *isa_flags = isa_temp;
8191 return AARCH64_PARSE_OK;
8195 /* ARCH name not found in list. */
8196 return AARCH64_PARSE_INVALID_ARG;
8199 /* Parse the TO_PARSE string and put the CPU that it selects into RES
8200 and the architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8201 describing the parse result. If there is an error parsing, RES and
8202 ISA_FLAGS are left unchanged. */
8204 static enum aarch64_parse_opt_result
8205 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8206 unsigned long *isa_flags)
8208 char *ext;
8209 const struct processor *cpu;
8210 char *str = (char *) alloca (strlen (to_parse) + 1);
8211 size_t len;
8213 strcpy (str, to_parse);
8215 ext = strchr (str, '+');
8217 if (ext != NULL)
8218 len = ext - str;
8219 else
8220 len = strlen (str);
8222 if (len == 0)
8223 return AARCH64_PARSE_MISSING_ARG;
8226 /* Loop through the list of supported CPUs to find a match. */
8227 for (cpu = all_cores; cpu->name != NULL; cpu++)
8229 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8231 unsigned long isa_temp = cpu->flags;
8234 if (ext != NULL)
8236 /* TO_PARSE string contains at least one extension. */
8237 enum aarch64_parse_opt_result ext_res
8238 = aarch64_parse_extension (ext, &isa_temp);
8240 if (ext_res != AARCH64_PARSE_OK)
8241 return ext_res;
8243 /* Extension parsing was successful. Confirm the result
8244 cpu and ISA flags. */
8245 *res = cpu;
8246 *isa_flags = isa_temp;
8247 return AARCH64_PARSE_OK;
8251 /* CPU name not found in list. */
8252 return AARCH64_PARSE_INVALID_ARG;
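/* For illustration (the names below are examples, not an exhaustive
   list): for -mcpu=cortex-a57+crypto, TO_PARSE is "cortex-a57+crypto",
   EXT points at "+crypto" and LEN covers "cortex-a57", so the core is
   looked up by name in all_cores and "+crypto" is handed to
   aarch64_parse_extension to adjust ISA_TEMP before both results are
   stored through RES and ISA_FLAGS.  */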
8255 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8256 Return an aarch64_parse_opt_result describing the parse result.
8257 If the parsing fails, RES does not change. */
8259 static enum aarch64_parse_opt_result
8260 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8262 const struct processor *cpu;
8263 char *str = (char *) alloca (strlen (to_parse) + 1);
8265 strcpy (str, to_parse);
8267 /* Loop through the list of supported CPUs to find a match. */
8268 for (cpu = all_cores; cpu->name != NULL; cpu++)
8270 if (strcmp (cpu->name, str) == 0)
8272 *res = cpu;
8273 return AARCH64_PARSE_OK;
8277 /* CPU name not found in list. */
8278 return AARCH64_PARSE_INVALID_ARG;
8281 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8282 described in FLAG. If it is, return the index bit for that fusion type.
8283 If not, error (printing OPTION_NAME) and return zero. */
8285 static unsigned int
8286 aarch64_parse_one_option_token (const char *token,
8287 size_t length,
8288 const struct aarch64_flag_desc *flag,
8289 const char *option_name)
8291 for (; flag->name != NULL; flag++)
8293 if (length == strlen (flag->name)
8294 && !strncmp (flag->name, token, length))
8295 return flag->flag;
8298 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8299 return 0;
8302 /* Parse OPTION which is a comma-separated list of flags to enable.
8303 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8304 default state we inherit from the CPU tuning structures. OPTION_NAME
8305 gives the top-level option we are parsing in the -moverride string,
8306 for use in error messages. */
8308 static unsigned int
8309 aarch64_parse_boolean_options (const char *option,
8310 const struct aarch64_flag_desc *flags,
8311 unsigned int initial_state,
8312 const char *option_name)
8314 const char separator = '.';
8315 const char* specs = option;
8316 const char* ntoken = option;
8317 unsigned int found_flags = initial_state;
8319 while ((ntoken = strchr (specs, separator)))
8321 size_t token_length = ntoken - specs;
8322 unsigned token_ops = aarch64_parse_one_option_token (specs,
8323 token_length,
8324 flags,
8325 option_name);
8326 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8327 in the token stream, reset the supported operations. So:
8329 adrp+add.cmp+branch.none.adrp+add
8331 would have the result of turning on only adrp+add fusion. */
8332 if (!token_ops)
8333 found_flags = 0;
8335 found_flags |= token_ops;
8336 specs = ++ntoken;
8339 /* The string ended with a trailing separator; complain. */
8340 if (!(*specs))
8342 error ("%s string ill-formed\n", option_name);
8343 return 0;
8346 /* We still have one more token to parse. */
8347 size_t token_length = strlen (specs);
8348 unsigned token_ops = aarch64_parse_one_option_token (specs,
8349 token_length,
8350 flags,
8351 option_name);
8352 if (!token_ops)
8353 found_flags = 0;
8355 found_flags |= token_ops;
8356 return found_flags;
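/* Worked example of the parse above: for the string
   "adrp+add.cmp+branch.none.adrp+add" the tokens are processed in order,
   first accumulating the adrp+add and cmp+branch bits; "none" maps to no
   bits, so FOUND_FLAGS is reset; and the final token leaves only the
   adrp+add fusion bit set, as the comment inside the loop describes.  */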
8359 /* Support for overriding instruction fusion. */
8361 static void
8362 aarch64_parse_fuse_string (const char *fuse_string,
8363 struct tune_params *tune)
8365 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8366 aarch64_fusible_pairs,
8367 tune->fusible_ops,
8368 "fuse=");
8371 /* Support for overriding other tuning flags. */
8373 static void
8374 aarch64_parse_tune_string (const char *tune_string,
8375 struct tune_params *tune)
8377 tune->extra_tuning_flags
8378 = aarch64_parse_boolean_options (tune_string,
8379 aarch64_tuning_flags,
8380 tune->extra_tuning_flags,
8381 "tune=");
8384 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8385 we understand. If it is, extract the option string and hand it off to
8386 the appropriate function. */
8388 void
8389 aarch64_parse_one_override_token (const char* token,
8390 size_t length,
8391 struct tune_params *tune)
8393 const struct aarch64_tuning_override_function *fn
8394 = aarch64_tuning_override_functions;
8396 const char *option_part = strchr (token, '=');
8397 if (!option_part)
8399 error ("tuning string missing in option (%s)", token);
8400 return;
8403 /* Get the length of the option name. */
8404 length = option_part - token;
8405 /* Skip the '=' to get to the option string. */
8406 option_part++;
8408 for (; fn->name != NULL; fn++)
8410 if (!strncmp (fn->name, token, length))
8412 fn->parse_override (option_part, tune);
8413 return;
8417 error ("unknown tuning option (%s)",token);
8418 return;
8421 /* A checking mechanism for the implementation of the TLS size. */
8423 static void
8424 initialize_aarch64_tls_size (struct gcc_options *opts)
8426 if (aarch64_tls_size == 0)
8427 aarch64_tls_size = 24;
8429 switch (opts->x_aarch64_cmodel_var)
8431 case AARCH64_CMODEL_TINY:
8432 /* Both the default and maximum TLS size allowed under tiny are 1M,
8433 which needs two instructions to address, so we clamp the size to 24. */
8434 if (aarch64_tls_size > 24)
8435 aarch64_tls_size = 24;
8436 break;
8437 case AARCH64_CMODEL_SMALL:
8438 /* The maximum TLS size allowed under small is 4G. */
8439 if (aarch64_tls_size > 32)
8440 aarch64_tls_size = 32;
8441 break;
8442 case AARCH64_CMODEL_LARGE:
8443 /* The maximum TLS size allowed under large is 16E.
8444 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset for now. */
8445 if (aarch64_tls_size > 48)
8446 aarch64_tls_size = 48;
8447 break;
8448 default:
8449 gcc_unreachable ();
8452 return;
8455 /* Parse STRING looking for options in the format:
8456 string :: option:string
8457 option :: name=substring
8458 name :: {a-z}
8459 substring :: defined by option. */
8461 static void
8462 aarch64_parse_override_string (const char* input_string,
8463 struct tune_params* tune)
8465 const char separator = ':';
8466 size_t string_length = strlen (input_string) + 1;
8467 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8468 char *string = string_root;
8469 strncpy (string, input_string, string_length);
8470 string[string_length - 1] = '\0';
8472 char* ntoken = string;
8474 while ((ntoken = strchr (string, separator)))
8476 size_t token_length = ntoken - string;
8477 /* Make this substring look like a string. */
8478 *ntoken = '\0';
8479 aarch64_parse_one_override_token (string, token_length, tune);
8480 string = ++ntoken;
8483 /* One last option to parse. */
8484 aarch64_parse_one_override_token (string, strlen (string), tune);
8485 free (string_root);
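/* For illustration (the tuning flag name below is hypothetical): an
   option such as -moverride=fuse=adrp+add.cmp+branch:tune=some_flag is
   first split on ':' into "fuse=adrp+add.cmp+branch" and
   "tune=some_flag", and each token is then dispatched by
   aarch64_parse_one_override_token to the matching entry in
   aarch64_tuning_override_functions.  */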
8489 static void
8490 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8492 /* The logic here is that if we are disabling all frame pointer generation
8493 then we do not need to disable leaf frame pointer generation as a
8494 separate operation. But if we are *only* disabling leaf frame pointer
8495 generation then we set flag_omit_frame_pointer to true, but in
8496 aarch64_frame_pointer_required we return false only for leaf functions.
8498 PR 70044: We have to be careful about being called multiple times for the
8499 same function. Once we have decided to set flag_omit_frame_pointer just
8500 so that we can omit leaf frame pointers, we must then not interpret a
8501 second call as meaning that all frame pointer generation should be
8502 omitted. We do this by setting flag_omit_frame_pointer to a special,
8503 non-zero value. */
8504 if (opts->x_flag_omit_frame_pointer == 2)
8505 opts->x_flag_omit_frame_pointer = 0;
8507 if (opts->x_flag_omit_frame_pointer)
8508 opts->x_flag_omit_leaf_frame_pointer = false;
8509 else if (opts->x_flag_omit_leaf_frame_pointer)
8510 opts->x_flag_omit_frame_pointer = 2;
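/* Summary of the encoding just above: x_flag_omit_frame_pointer == 1
   means the user asked for all frame pointers to be omitted (so leaf
   frame pointers go too), while the special value 2 records that the
   flag was only set internally to honour -momit-leaf-frame-pointer, so
   a later call resets it to 0 rather than treating it as a request to
   omit every frame pointer.  */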
8512 /* If not optimizing for size, set the default
8513 alignment to what the target wants. */
8514 if (!opts->x_optimize_size)
8516 if (opts->x_align_loops <= 0)
8517 opts->x_align_loops = aarch64_tune_params.loop_align;
8518 if (opts->x_align_jumps <= 0)
8519 opts->x_align_jumps = aarch64_tune_params.jump_align;
8520 if (opts->x_align_functions <= 0)
8521 opts->x_align_functions = aarch64_tune_params.function_align;
8524 /* We default to no pc-relative literal loads. */
8526 aarch64_pcrelative_literal_loads = false;
8528 /* If -mpc-relative-literal-loads is set on the command line, this
8529 implies that the user asked for PC relative literal loads. */
8530 if (opts->x_pcrelative_literal_loads == 1)
8531 aarch64_pcrelative_literal_loads = true;
8533 /* This is PR70113. When building the Linux kernel with
8534 CONFIG_ARM64_ERRATUM_843419, support for relocations
8535 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8536 removed from the kernel to avoid loading objects with possibly
8537 offending sequences. Without -mpc-relative-literal-loads we would
8538 generate such relocations, preventing the kernel build from
8539 succeeding. */
8540 if (opts->x_pcrelative_literal_loads == 2
8541 && TARGET_FIX_ERR_A53_843419)
8542 aarch64_pcrelative_literal_loads = true;
8544 /* In the tiny memory model it makes no sense to disallow PC relative
8545 literal pool loads. */
8546 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8547 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8548 aarch64_pcrelative_literal_loads = true;
8550 /* When enabling the lower precision Newton series for the square root, also
8551 enable it for the reciprocal square root, since the latter is an
8552 intermediary step for the former. */
8553 if (flag_mlow_precision_sqrt)
8554 flag_mrecip_low_precision_sqrt = true;
8557 /* 'Unpack' the internal tuning structs and update the options
8558 in OPTS. The caller must have set up selected_tune and selected_arch
8559 as all the other target-specific codegen decisions are
8560 derived from them. */
8562 void
8563 aarch64_override_options_internal (struct gcc_options *opts)
8565 aarch64_tune_flags = selected_tune->flags;
8566 aarch64_tune = selected_tune->sched_core;
8567 /* Make a copy of the tuning parameters attached to the core, which
8568 we may later overwrite. */
8569 aarch64_tune_params = *(selected_tune->tune);
8570 aarch64_architecture_version = selected_arch->architecture_version;
8572 if (opts->x_aarch64_override_tune_string)
8573 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8574 &aarch64_tune_params);
8576 /* This target defaults to strict volatile bitfields. */
8577 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8578 opts->x_flag_strict_volatile_bitfields = 1;
8580 initialize_aarch64_code_model (opts);
8581 initialize_aarch64_tls_size (opts);
8583 int queue_depth = 0;
8584 switch (aarch64_tune_params.autoprefetcher_model)
8586 case tune_params::AUTOPREFETCHER_OFF:
8587 queue_depth = -1;
8588 break;
8589 case tune_params::AUTOPREFETCHER_WEAK:
8590 queue_depth = 0;
8591 break;
8592 case tune_params::AUTOPREFETCHER_STRONG:
8593 queue_depth = max_insn_queue_index + 1;
8594 break;
8595 default:
8596 gcc_unreachable ();
8599 /* We don't mind passing in global_options_set here as we don't use
8600 the *options_set structs anyway. */
8601 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8602 queue_depth,
8603 opts->x_param_values,
8604 global_options_set.x_param_values);
8606 /* Set the L1 cache line size. */
8607 if (selected_cpu->tune->cache_line_size != 0)
8608 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8609 selected_cpu->tune->cache_line_size,
8610 opts->x_param_values,
8611 global_options_set.x_param_values);
8613 aarch64_override_options_after_change_1 (opts);
8616 /* Print a hint with a suggestion for a core or architecture name that
8617 most closely resembles what the user passed in STR. ARCH is true if
8618 the user is asking for an architecture name. ARCH is false if the user
8619 is asking for a core name. */
8621 static void
8622 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8624 auto_vec<const char *> candidates;
8625 const struct processor *entry = arch ? all_architectures : all_cores;
8626 for (; entry->name != NULL; entry++)
8627 candidates.safe_push (entry->name);
8628 char *s;
8629 const char *hint = candidates_list_and_hint (str, s, candidates);
8630 if (hint)
8631 inform (input_location, "valid arguments are: %s;"
8632 " did you mean %qs?", s, hint);
8633 XDELETEVEC (s);
8636 /* Print a hint with a suggestion for a core name that most closely resembles
8637 what the user passed in STR. */
8639 inline static void
8640 aarch64_print_hint_for_core (const char *str)
8642 aarch64_print_hint_for_core_or_arch (str, false);
8645 /* Print a hint with a suggestion for an architecture name that most closely
8646 resembles what the user passed in STR. */
8648 inline static void
8649 aarch64_print_hint_for_arch (const char *str)
8651 aarch64_print_hint_for_core_or_arch (str, true);
8654 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8655 specified in STR and throw errors if appropriate. Put the results,
8656 if they are valid, in RES and ISA_FLAGS. Return whether the option is
8657 valid. */
8659 static bool
8660 aarch64_validate_mcpu (const char *str, const struct processor **res,
8661 unsigned long *isa_flags)
8663 enum aarch64_parse_opt_result parse_res
8664 = aarch64_parse_cpu (str, res, isa_flags);
8666 if (parse_res == AARCH64_PARSE_OK)
8667 return true;
8669 switch (parse_res)
8671 case AARCH64_PARSE_MISSING_ARG:
8672 error ("missing cpu name in -mcpu=%qs", str);
8673 break;
8674 case AARCH64_PARSE_INVALID_ARG:
8675 error ("unknown value %qs for -mcpu", str);
8676 aarch64_print_hint_for_core (str);
8677 break;
8678 case AARCH64_PARSE_INVALID_FEATURE:
8679 error ("invalid feature modifier in -mcpu=%qs", str);
8680 break;
8681 default:
8682 gcc_unreachable ();
8685 return false;
8688 /* Validate a command-line -march option. Parse the arch and extensions
8689 (if any) specified in STR and throw errors if appropriate. Put the
8690 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8691 option is valid. */
8693 static bool
8694 aarch64_validate_march (const char *str, const struct processor **res,
8695 unsigned long *isa_flags)
8697 enum aarch64_parse_opt_result parse_res
8698 = aarch64_parse_arch (str, res, isa_flags);
8700 if (parse_res == AARCH64_PARSE_OK)
8701 return true;
8703 switch (parse_res)
8705 case AARCH64_PARSE_MISSING_ARG:
8706 error ("missing arch name in -march=%qs", str);
8707 break;
8708 case AARCH64_PARSE_INVALID_ARG:
8709 error ("unknown value %qs for -march", str);
8710 aarch64_print_hint_for_arch (str);
8711 break;
8712 case AARCH64_PARSE_INVALID_FEATURE:
8713 error ("invalid feature modifier in -march=%qs", str);
8714 break;
8715 default:
8716 gcc_unreachable ();
8719 return false;
8722 /* Validate a command-line -mtune option. Parse the cpu
8723 specified in STR and throw errors if appropriate. Put the
8724 result, if it is valid, in RES. Return whether the option is
8725 valid. */
8727 static bool
8728 aarch64_validate_mtune (const char *str, const struct processor **res)
8730 enum aarch64_parse_opt_result parse_res
8731 = aarch64_parse_tune (str, res);
8733 if (parse_res == AARCH64_PARSE_OK)
8734 return true;
8736 switch (parse_res)
8738 case AARCH64_PARSE_MISSING_ARG:
8739 error ("missing cpu name in -mtune=%qs", str);
8740 break;
8741 case AARCH64_PARSE_INVALID_ARG:
8742 error ("unknown value %qs for -mtune", str);
8743 aarch64_print_hint_for_core (str);
8744 break;
8745 default:
8746 gcc_unreachable ();
8748 return false;
8751 /* Return the CPU corresponding to the enum CPU.
8752 If it doesn't specify a cpu, return the default. */
8754 static const struct processor *
8755 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8757 if (cpu != aarch64_none)
8758 return &all_cores[cpu];
8760 /* The & 0x3f is to extract the bottom 6 bits that encode the
8761 default cpu as selected by the --with-cpu GCC configure option
8762 in config.gcc.
8763 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8764 flags mechanism should be reworked to make it more sane. */
8765 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
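/* For illustration of the encoding: TARGET_CPU_DEFAULT packs the
   configure-time default into one word, with the core identifier in the
   low 6 bits (extracted here with & 0x3f) and the corresponding default
   ISA flags in the bits above (extracted elsewhere in this file with
   TARGET_CPU_DEFAULT >> 6).  */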
8768 /* Return the architecture corresponding to the enum ARCH.
8769 If it doesn't specify a valid architecture, return the default. */
8771 static const struct processor *
8772 aarch64_get_arch (enum aarch64_arch arch)
8774 if (arch != aarch64_no_arch)
8775 return &all_architectures[arch];
8777 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8779 return &all_architectures[cpu->arch];
8782 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8783 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8784 tuning structs. In particular it must set selected_tune and
8785 aarch64_isa_flags that define the available ISA features and tuning
8786 decisions. It must also set selected_arch as this will be used to
8787 output the .arch asm tags for each function. */
8789 static void
8790 aarch64_override_options (void)
8792 unsigned long cpu_isa = 0;
8793 unsigned long arch_isa = 0;
8794 aarch64_isa_flags = 0;
8796 bool valid_cpu = true;
8797 bool valid_tune = true;
8798 bool valid_arch = true;
8800 selected_cpu = NULL;
8801 selected_arch = NULL;
8802 selected_tune = NULL;
8804 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8805 If either of -march or -mtune is given, they override their
8806 respective component of -mcpu. */
8807 if (aarch64_cpu_string)
8808 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8809 &cpu_isa);
8811 if (aarch64_arch_string)
8812 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8813 &arch_isa);
8815 if (aarch64_tune_string)
8816 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8818 /* If the user did not specify a processor, choose the default
8819 one for them. This will be the CPU set during configuration using
8820 --with-cpu, otherwise it is "generic". */
8821 if (!selected_cpu)
8823 if (selected_arch)
8825 selected_cpu = &all_cores[selected_arch->ident];
8826 aarch64_isa_flags = arch_isa;
8827 explicit_arch = selected_arch->arch;
8829 else
8831 /* Get default configure-time CPU. */
8832 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8833 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8836 if (selected_tune)
8837 explicit_tune_core = selected_tune->ident;
8839 /* If both -mcpu and -march are specified check that they are architecturally
8840 compatible, warn if they're not and prefer the -march ISA flags. */
8841 else if (selected_arch)
8843 if (selected_arch->arch != selected_cpu->arch)
8845 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8846 all_architectures[selected_cpu->arch].name,
8847 selected_arch->name);
8849 aarch64_isa_flags = arch_isa;
8850 explicit_arch = selected_arch->arch;
8851 explicit_tune_core = selected_tune ? selected_tune->ident
8852 : selected_cpu->ident;
8854 else
8856 /* -mcpu but no -march. */
8857 aarch64_isa_flags = cpu_isa;
8858 explicit_tune_core = selected_tune ? selected_tune->ident
8859 : selected_cpu->ident;
8860 gcc_assert (selected_cpu);
8861 selected_arch = &all_architectures[selected_cpu->arch];
8862 explicit_arch = selected_arch->arch;
8865 /* Set the arch as well, as we will need it when outputting
8866 the .arch directive in assembly. */
8867 if (!selected_arch)
8869 gcc_assert (selected_cpu);
8870 selected_arch = &all_architectures[selected_cpu->arch];
8873 if (!selected_tune)
8874 selected_tune = selected_cpu;
8876 #ifndef HAVE_AS_MABI_OPTION
8877 /* The compiler may have been configured with 2.23.* binutils, which does
8878 not have support for ILP32. */
8879 if (TARGET_ILP32)
8880 error ("Assembler does not support -mabi=ilp32");
8881 #endif
8883 /* Make sure we properly set up the explicit options. */
8884 if ((aarch64_cpu_string && valid_cpu)
8885 || (aarch64_tune_string && valid_tune))
8886 gcc_assert (explicit_tune_core != aarch64_none);
8888 if ((aarch64_cpu_string && valid_cpu)
8889 || (aarch64_arch_string && valid_arch))
8890 gcc_assert (explicit_arch != aarch64_no_arch);
8892 aarch64_override_options_internal (&global_options);
8894 /* Save these options as the default ones in case we push and pop them later
8895 while processing functions with potential target attributes. */
8896 target_option_default_node = target_option_current_node
8897 = build_target_option_node (&global_options);
8900 /* Implement targetm.override_options_after_change. */
8902 static void
8903 aarch64_override_options_after_change (void)
8905 aarch64_override_options_after_change_1 (&global_options);
8908 static struct machine_function *
8909 aarch64_init_machine_status (void)
8911 struct machine_function *machine;
8912 machine = ggc_cleared_alloc<machine_function> ();
8913 return machine;
8916 void
8917 aarch64_init_expanders (void)
8919 init_machine_status = aarch64_init_machine_status;
8922 /* A checking mechanism for the implementation of the various code models. */
8923 static void
8924 initialize_aarch64_code_model (struct gcc_options *opts)
8926 if (opts->x_flag_pic)
8928 switch (opts->x_aarch64_cmodel_var)
8930 case AARCH64_CMODEL_TINY:
8931 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8932 break;
8933 case AARCH64_CMODEL_SMALL:
8934 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8935 aarch64_cmodel = (flag_pic == 2
8936 ? AARCH64_CMODEL_SMALL_PIC
8937 : AARCH64_CMODEL_SMALL_SPIC);
8938 #else
8939 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8940 #endif
8941 break;
8942 case AARCH64_CMODEL_LARGE:
8943 sorry ("code model %qs with -f%s", "large",
8944 opts->x_flag_pic > 1 ? "PIC" : "pic");
8945 break;
8946 default:
8947 gcc_unreachable ();
8950 else
8951 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8954 /* Implement TARGET_OPTION_SAVE. */
8956 static void
8957 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8959 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8962 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8963 using the information saved in PTR. */
8965 static void
8966 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8968 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8969 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8970 opts->x_explicit_arch = ptr->x_explicit_arch;
8971 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8972 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8974 aarch64_override_options_internal (opts);
8977 /* Implement TARGET_OPTION_PRINT. */
8979 static void
8980 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8982 const struct processor *cpu
8983 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8984 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8985 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8986 std::string extension
8987 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8989 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8990 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8991 arch->name, extension.c_str ());
8994 static GTY(()) tree aarch64_previous_fndecl;
8996 void
8997 aarch64_reset_previous_fndecl (void)
8999 aarch64_previous_fndecl = NULL;
9002 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9003 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9004 make sure optab availability predicates are recomputed when necessary. */
9006 void
9007 aarch64_save_restore_target_globals (tree new_tree)
9009 if (TREE_TARGET_GLOBALS (new_tree))
9010 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9011 else if (new_tree == target_option_default_node)
9012 restore_target_globals (&default_target_globals);
9013 else
9014 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9017 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9018 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9019 of the function, if such exists. This function may be called multiple
9020 times on a single function so use aarch64_previous_fndecl to avoid
9021 setting up identical state. */
9023 static void
9024 aarch64_set_current_function (tree fndecl)
9026 if (!fndecl || fndecl == aarch64_previous_fndecl)
9027 return;
9029 tree old_tree = (aarch64_previous_fndecl
9030 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9031 : NULL_TREE);
9033 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9035 /* If current function has no attributes but the previous one did,
9036 use the default node. */
9037 if (!new_tree && old_tree)
9038 new_tree = target_option_default_node;
9040 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9041 the default have been handled by aarch64_save_restore_target_globals from
9042 aarch64_pragma_target_parse. */
9043 if (old_tree == new_tree)
9044 return;
9046 aarch64_previous_fndecl = fndecl;
9048 /* First set the target options. */
9049 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9051 aarch64_save_restore_target_globals (new_tree);
9054 /* Enum describing the various ways we can handle attributes.
9055 In many cases we can reuse the generic option handling machinery. */
9057 enum aarch64_attr_opt_type
9059 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9060 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9061 aarch64_attr_enum, /* Attribute sets an enum variable. */
9062 aarch64_attr_custom /* Attribute requires a custom handling function. */
9065 /* All the information needed to handle a target attribute.
9066 NAME is the name of the attribute.
9067 ATTR_TYPE specifies the type of behavior of the attribute as described
9068 in the definition of enum aarch64_attr_opt_type.
9069 ALLOW_NEG is true if the attribute supports a "no-" form.
9070 HANDLER is the function that takes the attribute string and whether
9071 it is a pragma or attribute and handles the option. It is needed only
9072 when the ATTR_TYPE is aarch64_attr_custom.
9073 OPT_NUM is the enum specifying the option that the attribute modifies.
9074 This is needed for attributes that mirror the behavior of a command-line
9075 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9076 aarch64_attr_enum. */
9078 struct aarch64_attribute_info
9080 const char *name;
9081 enum aarch64_attr_opt_type attr_type;
9082 bool allow_neg;
9083 bool (*handler) (const char *, const char *);
9084 enum opt_code opt_num;
9087 /* Handle the ARCH_STR argument to the arch= target attribute.
9088 PRAGMA_OR_ATTR is used in potential error messages. */
9090 static bool
9091 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9093 const struct processor *tmp_arch = NULL;
9094 enum aarch64_parse_opt_result parse_res
9095 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9097 if (parse_res == AARCH64_PARSE_OK)
9099 gcc_assert (tmp_arch);
9100 selected_arch = tmp_arch;
9101 explicit_arch = selected_arch->arch;
9102 return true;
9105 switch (parse_res)
9107 case AARCH64_PARSE_MISSING_ARG:
9108 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9109 break;
9110 case AARCH64_PARSE_INVALID_ARG:
9111 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9112 aarch64_print_hint_for_arch (str);
9113 break;
9114 case AARCH64_PARSE_INVALID_FEATURE:
9115 error ("invalid feature modifier %qs for 'arch' target %s",
9116 str, pragma_or_attr);
9117 break;
9118 default:
9119 gcc_unreachable ();
9122 return false;
9125 /* Handle the argument CPU_STR to the cpu= target attribute.
9126 PRAGMA_OR_ATTR is used in potential error messages. */
9128 static bool
9129 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9131 const struct processor *tmp_cpu = NULL;
9132 enum aarch64_parse_opt_result parse_res
9133 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9135 if (parse_res == AARCH64_PARSE_OK)
9137 gcc_assert (tmp_cpu);
9138 selected_tune = tmp_cpu;
9139 explicit_tune_core = selected_tune->ident;
9141 selected_arch = &all_architectures[tmp_cpu->arch];
9142 explicit_arch = selected_arch->arch;
9143 return true;
9146 switch (parse_res)
9148 case AARCH64_PARSE_MISSING_ARG:
9149 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9150 break;
9151 case AARCH64_PARSE_INVALID_ARG:
9152 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9153 aarch64_print_hint_for_core (str);
9154 break;
9155 case AARCH64_PARSE_INVALID_FEATURE:
9156 error ("invalid feature modifier %qs for 'cpu' target %s",
9157 str, pragma_or_attr);
9158 break;
9159 default:
9160 gcc_unreachable ();
9163 return false;
9166 /* Handle the argument STR to the tune= target attribute.
9167 PRAGMA_OR_ATTR is used in potential error messages. */
9169 static bool
9170 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9172 const struct processor *tmp_tune = NULL;
9173 enum aarch64_parse_opt_result parse_res
9174 = aarch64_parse_tune (str, &tmp_tune);
9176 if (parse_res == AARCH64_PARSE_OK)
9178 gcc_assert (tmp_tune);
9179 selected_tune = tmp_tune;
9180 explicit_tune_core = selected_tune->ident;
9181 return true;
9184 switch (parse_res)
9186 case AARCH64_PARSE_INVALID_ARG:
9187 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9188 aarch64_print_hint_for_core (str);
9189 break;
9190 default:
9191 gcc_unreachable ();
9194 return false;
9197 /* Parse an architecture extensions target attribute string specified in STR.
9198 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9199 if successful. Update aarch64_isa_flags to reflect the ISA features
9200 modified.
9201 PRAGMA_OR_ATTR is used in potential error messages. */
9203 static bool
9204 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9206 enum aarch64_parse_opt_result parse_res;
9207 unsigned long isa_flags = aarch64_isa_flags;
9209 /* We allow "+nothing" in the beginning to clear out all architectural
9210 features if the user wants to handpick specific features. */
9211 if (strncmp ("+nothing", str, 8) == 0)
9213 isa_flags = 0;
9214 str += 8;
9217 parse_res = aarch64_parse_extension (str, &isa_flags);
9219 if (parse_res == AARCH64_PARSE_OK)
9221 aarch64_isa_flags = isa_flags;
9222 return true;
9225 switch (parse_res)
9227 case AARCH64_PARSE_MISSING_ARG:
9228 error ("missing feature modifier in target %s %qs",
9229 pragma_or_attr, str);
9230 break;
9232 case AARCH64_PARSE_INVALID_FEATURE:
9233 error ("invalid feature modifier in target %s %qs",
9234 pragma_or_attr, str);
9235 break;
9237 default:
9238 gcc_unreachable ();
9241 return false;
9244 /* The target attributes that we support. On top of these we also support just
9245 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9246 handled explicitly in aarch64_process_one_target_attr. */
9248 static const struct aarch64_attribute_info aarch64_attributes[] =
9250 { "general-regs-only", aarch64_attr_mask, false, NULL,
9251 OPT_mgeneral_regs_only },
9252 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9253 OPT_mfix_cortex_a53_835769 },
9254 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9255 OPT_mfix_cortex_a53_843419 },
9256 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9257 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9258 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9259 OPT_momit_leaf_frame_pointer },
9260 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9261 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9262 OPT_march_ },
9263 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9264 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9265 OPT_mtune_ },
9266 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
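/* For illustration, typical uses of these attributes in user code (the
   architecture, extension and core names are only examples):

     __attribute__ ((target ("arch=armv8-a+crc"))) void f1 (void);
     __attribute__ ((target ("cpu=cortex-a57+nocrypto"))) void f2 (void);
     __attribute__ ((target ("no-omit-leaf-frame-pointer"))) void f3 (void);
     __attribute__ ((target ("+nothing+simd"))) void f4 (void);

   The first three are matched against the table above; the last form,
   beginning with '+', is the bare ISA-extension case that
   aarch64_process_one_target_attr handles directly.  */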
9269 /* Parse ARG_STR which contains the definition of one target attribute.
9270 Show appropriate errors if any or return true if the attribute is valid.
9271 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9272 we're processing a target attribute or pragma. */
9274 static bool
9275 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9277 bool invert = false;
9279 size_t len = strlen (arg_str);
9281 if (len == 0)
9283 error ("malformed target %s", pragma_or_attr);
9284 return false;
9287 char *str_to_check = (char *) alloca (len + 1);
9288 strcpy (str_to_check, arg_str);
9290 /* Skip leading whitespace. */
9291 while (*str_to_check == ' ' || *str_to_check == '\t')
9292 str_to_check++;
9294 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9295 It is easier to detect and handle it explicitly here rather than going
9296 through the machinery for the rest of the target attributes in this
9297 function. */
9298 if (*str_to_check == '+')
9299 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9301 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9303 invert = true;
9304 str_to_check += 3;
9306 char *arg = strchr (str_to_check, '=');
9308 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9309 and point ARG to "foo". */
9310 if (arg)
9312 *arg = '\0';
9313 arg++;
9315 const struct aarch64_attribute_info *p_attr;
9316 bool found = false;
9317 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9319 /* If the names don't match up, or the user has given an argument
9320 to an attribute that doesn't accept one, or didn't give an argument
9321 to an attribute that expects one, fail to match. */
9322 if (strcmp (str_to_check, p_attr->name) != 0)
9323 continue;
9325 found = true;
9326 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9327 || p_attr->attr_type == aarch64_attr_enum;
9329 if (attr_need_arg_p ^ (arg != NULL))
9331 error ("target %s %qs does not accept an argument",
9332 pragma_or_attr, str_to_check);
9333 return false;
9336 /* If the name matches but the attribute does not allow "no-" versions
9337 then we can't match. */
9338 if (invert && !p_attr->allow_neg)
9340 error ("target %s %qs does not allow a negated form",
9341 pragma_or_attr, str_to_check);
9342 return false;
9345 switch (p_attr->attr_type)
9347 /* Has a custom handler registered.
9348 For example, cpu=, arch=, tune=. */
9349 case aarch64_attr_custom:
9350 gcc_assert (p_attr->handler);
9351 if (!p_attr->handler (arg, pragma_or_attr))
9352 return false;
9353 break;
9355 /* Either set or unset a boolean option. */
9356 case aarch64_attr_bool:
9358 struct cl_decoded_option decoded;
9360 generate_option (p_attr->opt_num, NULL, !invert,
9361 CL_TARGET, &decoded);
9362 aarch64_handle_option (&global_options, &global_options_set,
9363 &decoded, input_location);
9364 break;
9366 /* Set or unset a bit in the target_flags. aarch64_handle_option
9367 should know what mask to apply given the option number. */
9368 case aarch64_attr_mask:
9370 struct cl_decoded_option decoded;
9371 /* We only need to specify the option number.
9372 aarch64_handle_option will know which mask to apply. */
9373 decoded.opt_index = p_attr->opt_num;
9374 decoded.value = !invert;
9375 aarch64_handle_option (&global_options, &global_options_set,
9376 &decoded, input_location);
9377 break;
9379 /* Use the option setting machinery to set an option to an enum. */
9380 case aarch64_attr_enum:
9382 gcc_assert (arg);
9383 bool valid;
9384 int value;
9385 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9386 &value, CL_TARGET);
9387 if (valid)
9389 set_option (&global_options, NULL, p_attr->opt_num, value,
9390 NULL, DK_UNSPECIFIED, input_location,
9391 global_dc);
9393 else
9395 error ("target %s %s=%s is not valid",
9396 pragma_or_attr, str_to_check, arg);
9398 break;
9400 default:
9401 gcc_unreachable ();
9405 /* If we reached here we either have found an attribute and validated
9406 it or didn't match any. If we matched an attribute but its arguments
9407 were malformed we will have returned false already. */
9408 return found;
9411 /* Count how many times the character C appears in
9412 NULL-terminated string STR. */
9414 static unsigned int
9415 num_occurences_in_str (char c, char *str)
9417 unsigned int res = 0;
9418 while (*str != '\0')
9420 if (*str == c)
9421 res++;
9423 str++;
9426 return res;
9429 /* Parse the tree in ARGS that contains the target attribute information
9430 and update the global target options space. PRAGMA_OR_ATTR is a string
9431 to be used in error messages, specifying whether this is processing
9432 a target attribute or a target pragma. */
9434 bool
9435 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9437 if (TREE_CODE (args) == TREE_LIST)
9441 tree head = TREE_VALUE (args);
9442 if (head)
9444 if (!aarch64_process_target_attr (head, pragma_or_attr))
9445 return false;
9447 args = TREE_CHAIN (args);
9448 } while (args);
9450 return true;
9452 /* We expect to find a string to parse. */
9453 gcc_assert (TREE_CODE (args) == STRING_CST);
9455 size_t len = strlen (TREE_STRING_POINTER (args));
9456 char *str_to_check = (char *) alloca (len + 1);
9457 strcpy (str_to_check, TREE_STRING_POINTER (args));
9459 if (len == 0)
9461 error ("malformed target %s value", pragma_or_attr);
9462 return false;
9465 /* Used to catch empty spaces between commas, i.e.
9466 attribute ((target ("attr1,,attr2"))). */
9467 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9469 /* Handle multiple target attributes separated by ','. */
9470 char *token = strtok (str_to_check, ",");
9472 unsigned int num_attrs = 0;
9473 while (token)
9475 num_attrs++;
9476 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9478 error ("target %s %qs is invalid", pragma_or_attr, token);
9479 return false;
9482 token = strtok (NULL, ",");
9485 if (num_attrs != num_commas + 1)
9487 error ("malformed target %s list %qs",
9488 pragma_or_attr, TREE_STRING_POINTER (args));
9489 return false;
9492 return true;
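/* Worked example of the comma check above: for target ("attr1,,attr2")
   the string contains two commas but strtok returns only the two
   non-empty tokens, so num_attrs (2) != num_commas + 1 (3) and the list
   is reported as malformed instead of silently accepting the empty
   entry.  */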
9495 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9496 process attribute ((target ("..."))). */
9498 static bool
9499 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9501 struct cl_target_option cur_target;
9502 bool ret;
9503 tree old_optimize;
9504 tree new_target, new_optimize;
9505 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9507 /* If what we're processing is the current pragma string then the
9508 target option node is already stored in target_option_current_node
9509 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9510 having to re-parse the string. This is especially useful to keep
9511 arm_neon.h compile times down since that header contains a lot
9512 of intrinsics enclosed in pragmas. */
9513 if (!existing_target && args == current_target_pragma)
9515 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9516 return true;
9518 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9520 old_optimize = build_optimization_node (&global_options);
9521 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9523 /* If the function changed the optimization levels as well as setting
9524 target options, start with the optimizations specified. */
9525 if (func_optimize && func_optimize != old_optimize)
9526 cl_optimization_restore (&global_options,
9527 TREE_OPTIMIZATION (func_optimize));
9529 /* Save the current target options to restore at the end. */
9530 cl_target_option_save (&cur_target, &global_options);
9532 /* If fndecl already has some target attributes applied to it, unpack
9533 them so that we add this attribute on top of them, rather than
9534 overwriting them. */
9535 if (existing_target)
9537 struct cl_target_option *existing_options
9538 = TREE_TARGET_OPTION (existing_target);
9540 if (existing_options)
9541 cl_target_option_restore (&global_options, existing_options);
9543 else
9544 cl_target_option_restore (&global_options,
9545 TREE_TARGET_OPTION (target_option_current_node));
9548 ret = aarch64_process_target_attr (args, "attribute");
9550 /* Set up any additional state. */
9551 if (ret)
9553 aarch64_override_options_internal (&global_options);
9554 /* Initialize SIMD builtins if we haven't already.
9555 Set current_target_pragma to NULL for the duration so that
9556 the builtin initialization code doesn't try to tag the functions
9557 being built with the attributes specified by any current pragma, thus
9558 going into an infinite recursion. */
9559 if (TARGET_SIMD)
9561 tree saved_current_target_pragma = current_target_pragma;
9562 current_target_pragma = NULL;
9563 aarch64_init_simd_builtins ();
9564 current_target_pragma = saved_current_target_pragma;
9566 new_target = build_target_option_node (&global_options);
9568 else
9569 new_target = NULL;
9571 new_optimize = build_optimization_node (&global_options);
9573 if (fndecl && ret)
9575 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9577 if (old_optimize != new_optimize)
9578 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9581 cl_target_option_restore (&global_options, &cur_target);
9583 if (old_optimize != new_optimize)
9584 cl_optimization_restore (&global_options,
9585 TREE_OPTIMIZATION (old_optimize));
9586 return ret;
9589 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9590 tri-bool options (yes, no, don't care) and the default value is
9591 DEF, determine whether to reject inlining. */
9593 static bool
9594 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9595 int dont_care, int def)
9597 /* If the callee doesn't care, always allow inlining. */
9598 if (callee == dont_care)
9599 return true;
9601 /* If the caller doesn't care, always allow inlining. */
9602 if (caller == dont_care)
9603 return true;
9605 /* Otherwise, allow inlining if either the callee and caller values
9606 agree, or if the callee is using the default value. */
9607 return (callee == caller || callee == def);
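/* For illustration (the callers below pass 2 for DONT_CARE), taking
   DEF == 0: a callee that explicitly enabled the option (1) cannot be
   inlined into a caller that explicitly disabled it (0), since 1 != 0
   and 1 != DEF; a callee that never mentioned the option (2) can be
   inlined into any caller.  */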
9610 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9611 to inline CALLEE into CALLER based on target-specific info.
9612 Make sure that the caller and callee have compatible architectural
9613 features. Then go through the other possible target attributes
9614 and see if they can block inlining. Try not to reject always_inline
9615 callees unless they are incompatible architecturally. */
9617 static bool
9618 aarch64_can_inline_p (tree caller, tree callee)
9620 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9621 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9623 /* If callee has no option attributes, then it is ok to inline. */
9624 if (!callee_tree)
9625 return true;
9627 struct cl_target_option *caller_opts
9628 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9629 : target_option_default_node);
9631 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9634 /* Callee's ISA flags should be a subset of the caller's. */
9635 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9636 != callee_opts->x_aarch64_isa_flags)
9637 return false;
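/* For illustration of the subset check just above: a callee compiled for
   "+fp+simd" may be inlined into a caller compiled for "+fp+simd+crypto",
   since the callee's flags are a subset of the caller's, but not the
   other way round: the inlined body could then rely on crypto
   instructions that the caller's target does not guarantee.  */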
9639 /* Allow non-strict aligned functions inlining into strict
9640 aligned ones. */
9641 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9642 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9643 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9644 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9645 return false;
9647 bool always_inline = lookup_attribute ("always_inline",
9648 DECL_ATTRIBUTES (callee));
9650 /* If the architectural features match up and the callee is always_inline
9651 then the other attributes don't matter. */
9652 if (always_inline)
9653 return true;
9655 if (caller_opts->x_aarch64_cmodel_var
9656 != callee_opts->x_aarch64_cmodel_var)
9657 return false;
9659 if (caller_opts->x_aarch64_tls_dialect
9660 != callee_opts->x_aarch64_tls_dialect)
9661 return false;
9663 /* Honour explicit requests to workaround errata. */
9664 if (!aarch64_tribools_ok_for_inlining_p (
9665 caller_opts->x_aarch64_fix_a53_err835769,
9666 callee_opts->x_aarch64_fix_a53_err835769,
9667 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9668 return false;
9670 if (!aarch64_tribools_ok_for_inlining_p (
9671 caller_opts->x_aarch64_fix_a53_err843419,
9672 callee_opts->x_aarch64_fix_a53_err843419,
9673 2, TARGET_FIX_ERR_A53_843419))
9674 return false;
9676 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9677 caller and callee and they don't match up, reject inlining. */
9678 if (!aarch64_tribools_ok_for_inlining_p (
9679 caller_opts->x_flag_omit_leaf_frame_pointer,
9680 callee_opts->x_flag_omit_leaf_frame_pointer,
9681 2, 1))
9682 return false;
9684 /* If the callee has specific tuning overrides, respect them. */
9685 if (callee_opts->x_aarch64_override_tune_string != NULL
9686 && caller_opts->x_aarch64_override_tune_string == NULL)
9687 return false;
9689 /* If the user specified tuning override strings for the
9690 caller and callee and they don't match up, reject inlining.
9691 We just do a string compare here; we don't analyze the meaning
9692 of the string, as it would be too costly for little gain. */
9693 if (callee_opts->x_aarch64_override_tune_string
9694 && caller_opts->x_aarch64_override_tune_string
9695 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9696 caller_opts->x_aarch64_override_tune_string) != 0))
9697 return false;
9699 return true;
9702 /* Return true if SYMBOL_REF X binds locally. */
9704 static bool
9705 aarch64_symbol_binds_local_p (const_rtx x)
9707 return (SYMBOL_REF_DECL (x)
9708 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9709 : SYMBOL_REF_LOCAL_P (x));
9712 /* Return true if SYMBOL_REF X is thread local */
9713 static bool
9714 aarch64_tls_symbol_p (rtx x)
9716 if (! TARGET_HAVE_TLS)
9717 return false;
9719 if (GET_CODE (x) != SYMBOL_REF)
9720 return false;
9722 return SYMBOL_REF_TLS_MODEL (x) != 0;
9725 /* Classify a TLS symbol into one of the TLS kinds. */
9726 enum aarch64_symbol_type
9727 aarch64_classify_tls_symbol (rtx x)
9729 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9731 switch (tls_kind)
9733 case TLS_MODEL_GLOBAL_DYNAMIC:
9734 case TLS_MODEL_LOCAL_DYNAMIC:
9735 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9737 case TLS_MODEL_INITIAL_EXEC:
9738 switch (aarch64_cmodel)
9740 case AARCH64_CMODEL_TINY:
9741 case AARCH64_CMODEL_TINY_PIC:
9742 return SYMBOL_TINY_TLSIE;
9743 default:
9744 return SYMBOL_SMALL_TLSIE;
9747 case TLS_MODEL_LOCAL_EXEC:
9748 if (aarch64_tls_size == 12)
9749 return SYMBOL_TLSLE12;
9750 else if (aarch64_tls_size == 24)
9751 return SYMBOL_TLSLE24;
9752 else if (aarch64_tls_size == 32)
9753 return SYMBOL_TLSLE32;
9754 else if (aarch64_tls_size == 48)
9755 return SYMBOL_TLSLE48;
9756 else
9757 gcc_unreachable ();
9759 case TLS_MODEL_EMULATED:
9760 case TLS_MODEL_NONE:
9761 return SYMBOL_FORCE_TO_MEM;
9763 default:
9764 gcc_unreachable ();
9768 /* Return the method that should be used to access SYMBOL_REF or
9769 LABEL_REF X. */
9771 enum aarch64_symbol_type
9772 aarch64_classify_symbol (rtx x, rtx offset)
9774 if (GET_CODE (x) == LABEL_REF)
9776 switch (aarch64_cmodel)
9778 case AARCH64_CMODEL_LARGE:
9779 return SYMBOL_FORCE_TO_MEM;
9781 case AARCH64_CMODEL_TINY_PIC:
9782 case AARCH64_CMODEL_TINY:
9783 return SYMBOL_TINY_ABSOLUTE;
9785 case AARCH64_CMODEL_SMALL_SPIC:
9786 case AARCH64_CMODEL_SMALL_PIC:
9787 case AARCH64_CMODEL_SMALL:
9788 return SYMBOL_SMALL_ABSOLUTE;
9790 default:
9791 gcc_unreachable ();
9795 if (GET_CODE (x) == SYMBOL_REF)
9797 if (aarch64_tls_symbol_p (x))
9798 return aarch64_classify_tls_symbol (x);
9800 switch (aarch64_cmodel)
9802 case AARCH64_CMODEL_TINY:
9803 /* When we retrieve symbol + offset address, we have to make sure
9804 the offset does not cause overflow of the final address. But
9805 we have no way of knowing the address of symbol at compile time
9806 so we can't accurately say if the distance between the PC and
9807 symbol + offset is outside the addressable range of +/-1M in the
9808 TINY code model. So we rely on images not being greater than
9809 1M, cap the offset at 1M, and anything beyond that will have to
9810 be loaded using an alternative mechanism. Furthermore, if the
9811 symbol is a weak reference to something that isn't known to
9812 resolve to a symbol in this module, then force to memory. */
9813 if ((SYMBOL_REF_WEAK (x)
9814 && !aarch64_symbol_binds_local_p (x))
9815 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9816 return SYMBOL_FORCE_TO_MEM;
9817 return SYMBOL_TINY_ABSOLUTE;
9819 case AARCH64_CMODEL_SMALL:
9820 /* Same reasoning as the tiny code model, but the offset cap here is
9821 4G. */
9822 if ((SYMBOL_REF_WEAK (x)
9823 && !aarch64_symbol_binds_local_p (x))
9824 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9825 HOST_WIDE_INT_C (4294967264)))
9826 return SYMBOL_FORCE_TO_MEM;
9827 return SYMBOL_SMALL_ABSOLUTE;
9829 case AARCH64_CMODEL_TINY_PIC:
9830 if (!aarch64_symbol_binds_local_p (x))
9831 return SYMBOL_TINY_GOT;
9832 return SYMBOL_TINY_ABSOLUTE;
9834 case AARCH64_CMODEL_SMALL_SPIC:
9835 case AARCH64_CMODEL_SMALL_PIC:
9836 if (!aarch64_symbol_binds_local_p (x))
9837 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9838 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9839 return SYMBOL_SMALL_ABSOLUTE;
9841 case AARCH64_CMODEL_LARGE:
9842 /* This is alright even in PIC code as the constant
9843 pool reference is always PC relative and within
9844 the same translation unit. */
9845 if (CONSTANT_POOL_ADDRESS_P (x))
9846 return SYMBOL_SMALL_ABSOLUTE;
9847 else
9848 return SYMBOL_FORCE_TO_MEM;
9850 default:
9851 gcc_unreachable ();
9855 /* By default push everything into the constant pool. */
9856 return SYMBOL_FORCE_TO_MEM;
9859 bool
9860 aarch64_constant_address_p (rtx x)
9862 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9865 bool
9866 aarch64_legitimate_pic_operand_p (rtx x)
9868 if (GET_CODE (x) == SYMBOL_REF
9869 || (GET_CODE (x) == CONST
9870 && GET_CODE (XEXP (x, 0)) == PLUS
9871 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9872 return false;
9874 return true;
9877 /* Return true if X holds either a quarter-precision or
9878 floating-point +0.0 constant. */
9879 static bool
9880 aarch64_valid_floating_const (machine_mode mode, rtx x)
9882 if (!CONST_DOUBLE_P (x))
9883 return false;
9885 if (aarch64_float_const_zero_rtx_p (x))
9886 return true;
9888 /* We only handle moving 0.0 to a TFmode register. */
9889 if (!(mode == SFmode || mode == DFmode))
9890 return false;
9892 return aarch64_float_const_representable_p (x);
9895 static bool
9896 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9898 /* Do not allow vector struct mode constants. We could support
9899 0 and -1 easily, but they need support in aarch64-simd.md. */
9900 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9901 return false;
9903 /* This could probably go away because
9904 we now decompose CONST_INTs according to expand_mov_immediate. */
9905 if ((GET_CODE (x) == CONST_VECTOR
9906 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9907 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9908 return !targetm.cannot_force_const_mem (mode, x);
9910 if (GET_CODE (x) == HIGH
9911 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9912 return true;
9914 return aarch64_constant_address_p (x);
9918 aarch64_load_tp (rtx target)
9920 if (!target
9921 || GET_MODE (target) != Pmode
9922 || !register_operand (target, Pmode))
9923 target = gen_reg_rtx (Pmode);
9925 /* Can return in any reg. */
9926 emit_insn (gen_aarch64_load_tp_hard (target));
9927 return target;
9930 /* On AAPCS systems, this is the "struct __va_list". */
9931 static GTY(()) tree va_list_type;
9933 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9934 Return the type to use as __builtin_va_list.
9936 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9938 struct __va_list
9940 void *__stack;
9941 void *__gr_top;
9942 void *__vr_top;
9943 int __gr_offs;
9944 int __vr_offs;
9945 }; */
9947 static tree
9948 aarch64_build_builtin_va_list (void)
9950 tree va_list_name;
9951 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9953 /* Create the type. */
9954 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9955 /* Give it the required name. */
9956 va_list_name = build_decl (BUILTINS_LOCATION,
9957 TYPE_DECL,
9958 get_identifier ("__va_list"),
9959 va_list_type);
9960 DECL_ARTIFICIAL (va_list_name) = 1;
9961 TYPE_NAME (va_list_type) = va_list_name;
9962 TYPE_STUB_DECL (va_list_type) = va_list_name;
9964 /* Create the fields. */
9965 f_stack = build_decl (BUILTINS_LOCATION,
9966 FIELD_DECL, get_identifier ("__stack"),
9967 ptr_type_node);
9968 f_grtop = build_decl (BUILTINS_LOCATION,
9969 FIELD_DECL, get_identifier ("__gr_top"),
9970 ptr_type_node);
9971 f_vrtop = build_decl (BUILTINS_LOCATION,
9972 FIELD_DECL, get_identifier ("__vr_top"),
9973 ptr_type_node);
9974 f_groff = build_decl (BUILTINS_LOCATION,
9975 FIELD_DECL, get_identifier ("__gr_offs"),
9976 integer_type_node);
9977 f_vroff = build_decl (BUILTINS_LOCATION,
9978 FIELD_DECL, get_identifier ("__vr_offs"),
9979 integer_type_node);
9981 /* Tell tree-stdarg pass about our internal offset fields.
9982 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9983 purposes, to identify whether the code is updating the va_list internal
9984 offset fields in an irregular way. */
9985 va_list_gpr_counter_field = f_groff;
9986 va_list_fpr_counter_field = f_vroff;
9988 DECL_ARTIFICIAL (f_stack) = 1;
9989 DECL_ARTIFICIAL (f_grtop) = 1;
9990 DECL_ARTIFICIAL (f_vrtop) = 1;
9991 DECL_ARTIFICIAL (f_groff) = 1;
9992 DECL_ARTIFICIAL (f_vroff) = 1;
9994 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9995 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9996 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9997 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9998 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10000 TYPE_FIELDS (va_list_type) = f_stack;
10001 DECL_CHAIN (f_stack) = f_grtop;
10002 DECL_CHAIN (f_grtop) = f_vrtop;
10003 DECL_CHAIN (f_vrtop) = f_groff;
10004 DECL_CHAIN (f_groff) = f_vroff;
10006 /* Compute its layout. */
10007 layout_type (va_list_type);
10009 return va_list_type;
10012 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10013 static void
10014 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10016 const CUMULATIVE_ARGS *cum;
10017 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10018 tree stack, grtop, vrtop, groff, vroff;
10019 tree t;
10020 int gr_save_area_size = cfun->va_list_gpr_size;
10021 int vr_save_area_size = cfun->va_list_fpr_size;
10022 int vr_offset;
10024 cum = &crtl->args.info;
10025 if (cfun->va_list_gpr_size)
10026 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10027 cfun->va_list_gpr_size);
10028 if (cfun->va_list_fpr_size)
10029 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10030 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10032 if (!TARGET_FLOAT)
10034 gcc_assert (cum->aapcs_nvrn == 0);
10035 vr_save_area_size = 0;
10038 f_stack = TYPE_FIELDS (va_list_type_node);
10039 f_grtop = DECL_CHAIN (f_stack);
10040 f_vrtop = DECL_CHAIN (f_grtop);
10041 f_groff = DECL_CHAIN (f_vrtop);
10042 f_vroff = DECL_CHAIN (f_groff);
10044 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10045 NULL_TREE);
10046 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10047 NULL_TREE);
10048 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10049 NULL_TREE);
10050 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10051 NULL_TREE);
10052 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10053 NULL_TREE);
10055 /* Emit code to initialize STACK, which points to the next varargs stack
10056 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10057 by named arguments. STACK is 8-byte aligned. */
10058 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10059 if (cum->aapcs_stack_size > 0)
10060 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10061 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10062 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10064 /* Emit code to initialize GRTOP, the top of the GR save area.
10065 virtual_incoming_args_rtx should have been 16 byte aligned. */
10066 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10067 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10068 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10070 /* Emit code to initialize VRTOP, the top of the VR save area.
10071 This address is gr_save_area_bytes below GRTOP, rounded
10072 down to the next 16-byte boundary. */
10073 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10074 vr_offset = ROUND_UP (gr_save_area_size,
10075 STACK_BOUNDARY / BITS_PER_UNIT);
10077 if (vr_offset)
10078 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10079 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10080 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10082 /* Emit code to initialize GROFF, the offset from GRTOP of the
10083 next GPR argument. */
10084 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10085 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10086 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10088 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10089 of the next VR argument. */
10090 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10091 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10092 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
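/* Worked example (an illustrative sketch, not generated code): for

       void f (int a, double b, ...);

   the named arguments consume one core register (w0) and one FP/SIMD
   register (d0), so aapcs_ncrn == 1 and aapcs_nvrn == 1.  Assuming the
   whole va_list is needed, the code above effectively records

       gr_save_area_size = (8 - 1) * 8  = 56
       vr_save_area_size = (8 - 1) * 16 = 112

       ap.__stack   = <incoming arg pointer>   (no named stack arguments)
       ap.__gr_top  = <incoming arg pointer>
       ap.__vr_top  = ap.__gr_top - 64         (56 rounded up to 16 bytes)
       ap.__gr_offs = -56
       ap.__vr_offs = -112  */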
10095 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10097 static tree
10098 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10099 gimple_seq *post_p ATTRIBUTE_UNUSED)
10101 tree addr;
10102 bool indirect_p;
10103 bool is_ha; /* is HFA or HVA. */
10104 bool dw_align; /* double-word align. */
10105 machine_mode ag_mode = VOIDmode;
10106 int nregs;
10107 machine_mode mode;
10109 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10110 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10111 HOST_WIDE_INT size, rsize, adjust, align;
10112 tree t, u, cond1, cond2;
10114 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10115 if (indirect_p)
10116 type = build_pointer_type (type);
10118 mode = TYPE_MODE (type);
10120 f_stack = TYPE_FIELDS (va_list_type_node);
10121 f_grtop = DECL_CHAIN (f_stack);
10122 f_vrtop = DECL_CHAIN (f_grtop);
10123 f_groff = DECL_CHAIN (f_vrtop);
10124 f_vroff = DECL_CHAIN (f_groff);
10126 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10127 f_stack, NULL_TREE);
10128 size = int_size_in_bytes (type);
10129 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10131 dw_align = false;
10132 adjust = 0;
10133 if (aarch64_vfp_is_call_or_return_candidate (mode,
10134 type,
10135 &ag_mode,
10136 &nregs,
10137 &is_ha))
10139 /* TYPE passed in fp/simd registers. */
10140 if (!TARGET_FLOAT)
10141 aarch64_err_no_fpadvsimd (mode, "varargs");
10143 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10144 unshare_expr (valist), f_vrtop, NULL_TREE);
10145 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10146 unshare_expr (valist), f_vroff, NULL_TREE);
10148 rsize = nregs * UNITS_PER_VREG;
10150 if (is_ha)
10152 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10153 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10155 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10156 && size < UNITS_PER_VREG)
10158 adjust = UNITS_PER_VREG - size;
10161 else
10163 /* TYPE passed in general registers. */
10164 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10165 unshare_expr (valist), f_grtop, NULL_TREE);
10166 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10167 unshare_expr (valist), f_groff, NULL_TREE);
10168 rsize = ROUND_UP (size, UNITS_PER_WORD);
10169 nregs = rsize / UNITS_PER_WORD;
10171 if (align > 8)
10172 dw_align = true;
10174 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10175 && size < UNITS_PER_WORD)
10177 adjust = UNITS_PER_WORD - size;
10181 /* Get a local temporary for the field value. */
10182 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10184 /* Emit code to branch if off >= 0. */
10185 t = build2 (GE_EXPR, boolean_type_node, off,
10186 build_int_cst (TREE_TYPE (off), 0));
10187 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10189 if (dw_align)
10191 /* Emit: offs = (offs + 15) & -16. */
10192 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10193 build_int_cst (TREE_TYPE (off), 15));
10194 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10195 build_int_cst (TREE_TYPE (off), -16));
10196 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10198 else
10199 roundup = NULL;
10201 /* Update ap.__[g|v]r_offs */
10202 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10203 build_int_cst (TREE_TYPE (off), rsize));
10204 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10206 /* String up. */
10207 if (roundup)
10208 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10210 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10211 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10212 build_int_cst (TREE_TYPE (f_off), 0));
10213 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10215 /* String up: make sure the assignment happens before the use. */
10216 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10217 COND_EXPR_ELSE (cond1) = t;
10219 /* Prepare the trees handling the argument that is passed on the stack;
10220 the top level node will store in ON_STACK. */
10221 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10222 if (align > 8)
10224 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10225 t = fold_convert (intDI_type_node, arg);
10226 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10227 build_int_cst (TREE_TYPE (t), 15));
10228 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10229 build_int_cst (TREE_TYPE (t), -16));
10230 t = fold_convert (TREE_TYPE (arg), t);
10231 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10233 else
10234 roundup = NULL;
10235 /* Advance ap.__stack */
10236 t = fold_convert (intDI_type_node, arg);
10237 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10238 build_int_cst (TREE_TYPE (t), size + 7));
10239 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10240 build_int_cst (TREE_TYPE (t), -8));
10241 t = fold_convert (TREE_TYPE (arg), t);
10242 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10243 /* String up roundup and advance. */
10244 if (roundup)
10245 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10246 /* String up with arg */
10247 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10248 /* Big-endianness related address adjustment. */
10249 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10250 && size < UNITS_PER_WORD)
10252 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10253 size_int (UNITS_PER_WORD - size));
10254 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10257 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10258 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10260 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10261 t = off;
10262 if (adjust)
10263 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10264 build_int_cst (TREE_TYPE (off), adjust));
10266 t = fold_convert (sizetype, t);
10267 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10269 if (is_ha)
10271 /* type ha; // treat as "struct {ftype field[n];}"
10272 ... [computing offs]
10273 for (i = 0; i <nregs; ++i, offs += 16)
10274 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10275 return ha; */
10276 int i;
10277 tree tmp_ha, field_t, field_ptr_t;
10279 /* Declare a local variable. */
10280 tmp_ha = create_tmp_var_raw (type, "ha");
10281 gimple_add_tmp_var (tmp_ha);
10283 /* Establish the base type. */
10284 switch (ag_mode)
10286 case SFmode:
10287 field_t = float_type_node;
10288 field_ptr_t = float_ptr_type_node;
10289 break;
10290 case DFmode:
10291 field_t = double_type_node;
10292 field_ptr_t = double_ptr_type_node;
10293 break;
10294 case TFmode:
10295 field_t = long_double_type_node;
10296 field_ptr_t = long_double_ptr_type_node;
10297 break;
10298 case HFmode:
10299 field_t = aarch64_fp16_type_node;
10300 field_ptr_t = aarch64_fp16_ptr_type_node;
10301 break;
10302 case V2SImode:
10303 case V4SImode:
10305 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10306 field_t = build_vector_type_for_mode (innertype, ag_mode);
10307 field_ptr_t = build_pointer_type (field_t);
10309 break;
10310 default:
10311 gcc_assert (0);
10314 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10315 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10316 addr = t;
10317 t = fold_convert (field_ptr_t, addr);
10318 t = build2 (MODIFY_EXPR, field_t,
10319 build1 (INDIRECT_REF, field_t, tmp_ha),
10320 build1 (INDIRECT_REF, field_t, t));
10322 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10323 for (i = 1; i < nregs; ++i)
10325 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10326 u = fold_convert (field_ptr_t, addr);
10327 u = build2 (MODIFY_EXPR, field_t,
10328 build2 (MEM_REF, field_t, tmp_ha,
10329 build_int_cst (field_ptr_t,
10330 (i *
10331 int_size_in_bytes (field_t)))),
10332 build1 (INDIRECT_REF, field_t, u));
10333 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10336 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10337 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10340 COND_EXPR_ELSE (cond2) = t;
10341 addr = fold_convert (build_pointer_type (type), cond1);
10342 addr = build_va_arg_indirect_ref (addr);
10344 if (indirect_p)
10345 addr = build_va_arg_indirect_ref (addr);
10347 return addr;
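/* Illustrative sketch of the tree built above for an argument passed in
   general registers, ignoring the alignment round-up and big-endian
   adjustments (C-like pseudocode, not literal GIMPLE):

       off = ap.__gr_offs;
       if (off >= 0)
         goto on_stack;
       ap.__gr_offs = off + rsize;
       if (ap.__gr_offs > 0)
         goto on_stack;
       addr = ap.__gr_top + off;
       goto done;
     on_stack:
       addr = ap.__stack;
       ap.__stack = (void *) (((uintptr_t) addr + size + 7) & -8);
     done:
       result = *(type *) addr;

   with one further indirection when the argument is passed by reference.
   The FP/SIMD path is identical but uses __vr_top/__vr_offs and copies
   homogeneous aggregates field by field, as in the is_ha block above.  */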
10350 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10352 static void
10353 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10354 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10355 int no_rtl)
10357 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10358 CUMULATIVE_ARGS local_cum;
10359 int gr_saved = cfun->va_list_gpr_size;
10360 int vr_saved = cfun->va_list_fpr_size;
10362 /* The caller has advanced CUM up to, but not beyond, the last named
10363 argument. Advance a local copy of CUM past the last "real" named
10364 argument, to find out how many registers are left over. */
10365 local_cum = *cum;
10366 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10368 /* Find out how many registers we need to save.
10369 Honor the tree-stdarg analysis results. */
10370 if (cfun->va_list_gpr_size)
10371 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10372 cfun->va_list_gpr_size / UNITS_PER_WORD);
10373 if (cfun->va_list_fpr_size)
10374 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10375 cfun->va_list_fpr_size / UNITS_PER_VREG);
10377 if (!TARGET_FLOAT)
10379 gcc_assert (local_cum.aapcs_nvrn == 0);
10380 vr_saved = 0;
10383 if (!no_rtl)
10385 if (gr_saved > 0)
10387 rtx ptr, mem;
10389 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10390 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10391 - gr_saved * UNITS_PER_WORD);
10392 mem = gen_frame_mem (BLKmode, ptr);
10393 set_mem_alias_set (mem, get_varargs_alias_set ());
10395 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10396 mem, gr_saved);
10398 if (vr_saved > 0)
10400 /* We can't use move_block_from_reg, because it will use
10401 the wrong mode, storing D regs only. */
10402 machine_mode mode = TImode;
10403 int off, i, vr_start;
10405 /* Set OFF to the offset from virtual_incoming_args_rtx of
10406 the first vector register. The VR save area lies below
10407 the GR one, and is aligned to 16 bytes. */
10408 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10409 STACK_BOUNDARY / BITS_PER_UNIT);
10410 off -= vr_saved * UNITS_PER_VREG;
10412 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10413 for (i = 0; i < vr_saved; ++i)
10415 rtx ptr, mem;
10417 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10418 mem = gen_frame_mem (mode, ptr);
10419 set_mem_alias_set (mem, get_varargs_alias_set ());
10420 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10421 off += UNITS_PER_VREG;
10426 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10427 any complication of having crtl->args.pretend_args_size changed. */
10428 cfun->machine->frame.saved_varargs_size
10429 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10430 STACK_BOUNDARY / BITS_PER_UNIT)
10431 + vr_saved * UNITS_PER_VREG);
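/* For a function such as f (int, double, ...) (illustrative), gr_saved == 7
   and vr_saved == 7, so the saved varargs area computed above occupies
   ROUND_UP (7 * 8, 16) + 7 * 16 == 64 + 112 == 176 bytes below the incoming
   argument pointer.  */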
10434 static void
10435 aarch64_conditional_register_usage (void)
10437 int i;
10438 if (!TARGET_FLOAT)
10440 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10442 fixed_regs[i] = 1;
10443 call_used_regs[i] = 1;
10448 /* Walk down the type tree of TYPE counting consecutive base elements.
10449 If *MODEP is VOIDmode, then set it to the first valid floating point
10450 type. If a non-floating point type is found, or if a floating point
10451 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10452 otherwise return the count in the sub-tree. */
10453 static int
10454 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10456 machine_mode mode;
10457 HOST_WIDE_INT size;
10459 switch (TREE_CODE (type))
10461 case REAL_TYPE:
10462 mode = TYPE_MODE (type);
10463 if (mode != DFmode && mode != SFmode
10464 && mode != TFmode && mode != HFmode)
10465 return -1;
10467 if (*modep == VOIDmode)
10468 *modep = mode;
10470 if (*modep == mode)
10471 return 1;
10473 break;
10475 case COMPLEX_TYPE:
10476 mode = TYPE_MODE (TREE_TYPE (type));
10477 if (mode != DFmode && mode != SFmode
10478 && mode != TFmode && mode != HFmode)
10479 return -1;
10481 if (*modep == VOIDmode)
10482 *modep = mode;
10484 if (*modep == mode)
10485 return 2;
10487 break;
10489 case VECTOR_TYPE:
10490 /* Use V2SImode and V4SImode as representatives of all 64-bit
10491 and 128-bit vector types. */
10492 size = int_size_in_bytes (type);
10493 switch (size)
10495 case 8:
10496 mode = V2SImode;
10497 break;
10498 case 16:
10499 mode = V4SImode;
10500 break;
10501 default:
10502 return -1;
10505 if (*modep == VOIDmode)
10506 *modep = mode;
10508 /* Vector modes are considered to be opaque: two vectors are
10509 equivalent for the purposes of being homogeneous aggregates
10510 if they are the same size. */
10511 if (*modep == mode)
10512 return 1;
10514 break;
10516 case ARRAY_TYPE:
10518 int count;
10519 tree index = TYPE_DOMAIN (type);
10521 /* Can't handle incomplete types nor sizes that are not
10522 fixed. */
10523 if (!COMPLETE_TYPE_P (type)
10524 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10525 return -1;
10527 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10528 if (count == -1
10529 || !index
10530 || !TYPE_MAX_VALUE (index)
10531 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10532 || !TYPE_MIN_VALUE (index)
10533 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10534 || count < 0)
10535 return -1;
10537 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10538 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10540 /* There must be no padding. */
10541 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10542 return -1;
10544 return count;
10547 case RECORD_TYPE:
10549 int count = 0;
10550 int sub_count;
10551 tree field;
10553 /* Can't handle incomplete types nor sizes that are not
10554 fixed. */
10555 if (!COMPLETE_TYPE_P (type)
10556 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10557 return -1;
10559 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10561 if (TREE_CODE (field) != FIELD_DECL)
10562 continue;
10564 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10565 if (sub_count < 0)
10566 return -1;
10567 count += sub_count;
10570 /* There must be no padding. */
10571 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10572 return -1;
10574 return count;
10577 case UNION_TYPE:
10578 case QUAL_UNION_TYPE:
10580 /* These aren't very interesting except in a degenerate case. */
10581 int count = 0;
10582 int sub_count;
10583 tree field;
10585 /* Can't handle incomplete types nor sizes that are not
10586 fixed. */
10587 if (!COMPLETE_TYPE_P (type)
10588 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10589 return -1;
10591 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10593 if (TREE_CODE (field) != FIELD_DECL)
10594 continue;
10596 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10597 if (sub_count < 0)
10598 return -1;
10599 count = count > sub_count ? count : sub_count;
10602 /* There must be no padding. */
10603 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10604 return -1;
10606 return count;
10609 default:
10610 break;
10613 return -1;
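/* Examples of the classification above (illustrative only; int32x4_t is
   the arm_neon.h vector type):

       struct { float x, y, z; }       -> count 3, *modep == SFmode
       struct { double re, im; }       -> count 2, *modep == DFmode
       struct { int32x4_t a, b; }      -> count 2, *modep == V4SImode
       struct { float f; double d; }   -> -1 (mixed base types)
       _Complex double                 -> count 2, *modep == DFmode  */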
10616 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10617 type as described in AAPCS64 \S 4.1.2.
10619 See the comment above aarch64_composite_type_p for the notes on MODE. */
10621 static bool
10622 aarch64_short_vector_p (const_tree type,
10623 machine_mode mode)
10625 HOST_WIDE_INT size = -1;
10627 if (type && TREE_CODE (type) == VECTOR_TYPE)
10628 size = int_size_in_bytes (type);
10629 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10630 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10631 size = GET_MODE_SIZE (mode);
10633 return (size == 8 || size == 16);
10636 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10637 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10638 array types. The C99 floating-point complex types are also considered
10639 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10640 types, which are GCC extensions and out of the scope of AAPCS64, are
10641 treated as composite types here as well.
10643 Note that MODE itself is not sufficient in determining whether a type
10644 is such a composite type or not. This is because
10645 stor-layout.c:compute_record_mode may have already changed the MODE
10646 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10647 structure with only one field may have its MODE set to the mode of the
10648 field. Also an integer mode whose size matches the size of the
10649 RECORD_TYPE type may be used to substitute the original mode
10650 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10651 solely relied on. */
10653 static bool
10654 aarch64_composite_type_p (const_tree type,
10655 machine_mode mode)
10657 if (aarch64_short_vector_p (type, mode))
10658 return false;
10660 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10661 return true;
10663 if (mode == BLKmode
10664 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10665 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10666 return true;
10668 return false;
10671 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10672 shall be passed or returned in simd/fp register(s) (providing these
10673 parameter passing registers are available).
10675 Upon successful return, *COUNT returns the number of needed registers,
10676 *BASE_MODE returns the mode of the individual register and when IS_HA
10677 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10678 floating-point aggregate or a homogeneous short-vector aggregate. */
10680 static bool
10681 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10682 const_tree type,
10683 machine_mode *base_mode,
10684 int *count,
10685 bool *is_ha)
10687 machine_mode new_mode = VOIDmode;
10688 bool composite_p = aarch64_composite_type_p (type, mode);
10690 if (is_ha != NULL) *is_ha = false;
10692 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10693 || aarch64_short_vector_p (type, mode))
10695 *count = 1;
10696 new_mode = mode;
10698 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10700 if (is_ha != NULL) *is_ha = true;
10701 *count = 2;
10702 new_mode = GET_MODE_INNER (mode);
10704 else if (type && composite_p)
10706 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10708 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10710 if (is_ha != NULL) *is_ha = true;
10711 *count = ag_count;
10713 else
10714 return false;
10716 else
10717 return false;
10719 *base_mode = new_mode;
10720 return true;
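/* For example (illustrative): a plain 'double' argument gives *count == 1
   and *base_mode == DFmode; '_Complex float' gives *is_ha == true,
   *count == 2 and *base_mode == SFmode; aggregates are delegated to
   aapcs_vfp_sub_candidate above and accepted only when they form an HFA
   or HVA of at most HA_MAX_NUM_FLDS elements.  */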
10723 /* Implement TARGET_STRUCT_VALUE_RTX. */
10725 static rtx
10726 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10727 int incoming ATTRIBUTE_UNUSED)
10729 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10732 /* Implements target hook vector_mode_supported_p. */
10733 static bool
10734 aarch64_vector_mode_supported_p (machine_mode mode)
10736 if (TARGET_SIMD
10737 && (mode == V4SImode || mode == V8HImode
10738 || mode == V16QImode || mode == V2DImode
10739 || mode == V2SImode || mode == V4HImode
10740 || mode == V8QImode || mode == V2SFmode
10741 || mode == V4SFmode || mode == V2DFmode
10742 || mode == V4HFmode || mode == V8HFmode
10743 || mode == V1DFmode))
10744 return true;
10746 return false;
10749 /* Return appropriate SIMD container
10750 for MODE within a vector of WIDTH bits. */
10751 static machine_mode
10752 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10754 gcc_assert (width == 64 || width == 128);
10755 if (TARGET_SIMD)
10757 if (width == 128)
10758 switch (mode)
10760 case DFmode:
10761 return V2DFmode;
10762 case SFmode:
10763 return V4SFmode;
10764 case SImode:
10765 return V4SImode;
10766 case HImode:
10767 return V8HImode;
10768 case QImode:
10769 return V16QImode;
10770 case DImode:
10771 return V2DImode;
10772 default:
10773 break;
10775 else
10776 switch (mode)
10778 case SFmode:
10779 return V2SFmode;
10780 case SImode:
10781 return V2SImode;
10782 case HImode:
10783 return V4HImode;
10784 case QImode:
10785 return V8QImode;
10786 default:
10787 break;
10790 return word_mode;
10793 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10794 static machine_mode
10795 aarch64_preferred_simd_mode (machine_mode mode)
10797 return aarch64_simd_container_mode (mode, 128);
10800 /* Return the bitmask of possible vector sizes for the vectorizer
10801 to iterate over. */
10802 static unsigned int
10803 aarch64_autovectorize_vector_sizes (void)
10805 return (16 | 8);
10808 /* Implement TARGET_MANGLE_TYPE. */
10810 static const char *
10811 aarch64_mangle_type (const_tree type)
10813 /* The AArch64 ABI documents say that "__va_list" has to be
10814 mangled as if it is in the "std" namespace. */
10815 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10816 return "St9__va_list";
10818 /* Half-precision float. */
10819 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10820 return "Dh";
10822 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10823 builtin types. */
10824 if (TYPE_NAME (type) != NULL)
10825 return aarch64_mangle_builtin_type (type);
10827 /* Use the default mangling. */
10828 return NULL;
10832 /* Return true if the rtx_insn contains a MEM RTX somewhere
10833 in it. */
10835 static bool
10836 has_memory_op (rtx_insn *mem_insn)
10838 subrtx_iterator::array_type array;
10839 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10840 if (MEM_P (*iter))
10841 return true;
10843 return false;
10846 /* Find the first rtx_insn before insn that will generate an assembly
10847 instruction. */
10849 static rtx_insn *
10850 aarch64_prev_real_insn (rtx_insn *insn)
10852 if (!insn)
10853 return NULL;
10857 insn = prev_real_insn (insn);
10859 while (insn && recog_memoized (insn) < 0);
10861 return insn;
10864 static bool
10865 is_madd_op (enum attr_type t1)
10867 unsigned int i;
10868 /* A number of these may be AArch32 only. */
10869 enum attr_type mlatypes[] = {
10870 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10871 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10872 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10875 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10877 if (t1 == mlatypes[i])
10878 return true;
10881 return false;
10884 /* Check if there is a register dependency between a load and the insn
10885 for which we hold recog_data. */
10887 static bool
10888 dep_between_memop_and_curr (rtx memop)
10890 rtx load_reg;
10891 int opno;
10893 gcc_assert (GET_CODE (memop) == SET);
10895 if (!REG_P (SET_DEST (memop)))
10896 return false;
10898 load_reg = SET_DEST (memop);
10899 for (opno = 1; opno < recog_data.n_operands; opno++)
10901 rtx operand = recog_data.operand[opno];
10902 if (REG_P (operand)
10903 && reg_overlap_mentioned_p (load_reg, operand))
10904 return true;
10907 return false;
10911 /* When working around the Cortex-A53 erratum 835769,
10912 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10913 instruction and has a preceding memory instruction such that a NOP
10914 should be inserted between them. */
10916 bool
10917 aarch64_madd_needs_nop (rtx_insn* insn)
10919 enum attr_type attr_type;
10920 rtx_insn *prev;
10921 rtx body;
10923 if (!TARGET_FIX_ERR_A53_835769)
10924 return false;
10926 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10927 return false;
10929 attr_type = get_attr_type (insn);
10930 if (!is_madd_op (attr_type))
10931 return false;
10933 prev = aarch64_prev_real_insn (insn);
10934 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10935 Restore recog state to INSN to avoid state corruption. */
10936 extract_constrain_insn_cached (insn);
10938 if (!prev || !has_memory_op (prev))
10939 return false;
10941 body = single_set (prev);
10943 /* If the previous insn is a memory op and there is no dependency between
10944 it and the DImode madd, emit a NOP between them. If body is NULL then we
10945 have a complex memory operation, probably a load/store pair.
10946 Be conservative for now and emit a NOP. */
10947 if (GET_MODE (recog_data.operand[0]) == DImode
10948 && (!body || !dep_between_memop_and_curr (body)))
10949 return true;
10951 return false;
10956 /* Implement FINAL_PRESCAN_INSN. */
10958 void
10959 aarch64_final_prescan_insn (rtx_insn *insn)
10961 if (aarch64_madd_needs_nop (insn))
10962 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10966 /* Return the equivalent letter for size. */
10967 static char
10968 sizetochar (int size)
10970 switch (size)
10972 case 64: return 'd';
10973 case 32: return 's';
10974 case 16: return 'h';
10975 case 8 : return 'b';
10976 default: gcc_unreachable ();
10980 /* Return true iff X is a uniform vector of floating-point constants
10981 that can be represented in quarter-precision form. Note that, since
10982 aarch64_float_const_representable_p rejects both +0.0 and -0.0,
10983 those values are rejected here as well. */
10984 static bool
10985 aarch64_vect_float_const_representable_p (rtx x)
10987 rtx elt;
10988 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10989 && const_vec_duplicate_p (x, &elt)
10990 && aarch64_float_const_representable_p (elt));
10993 /* Return true for valid and false for invalid. */
10994 bool
10995 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10996 struct simd_immediate_info *info)
10998 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10999 matches = 1; \
11000 for (i = 0; i < idx; i += (STRIDE)) \
11001 if (!(TEST)) \
11002 matches = 0; \
11003 if (matches) \
11005 immtype = (CLASS); \
11006 elsize = (ELSIZE); \
11007 eshift = (SHIFT); \
11008 emvn = (NEG); \
11009 break; \
11012 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11013 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11014 unsigned char bytes[16];
11015 int immtype = -1, matches;
11016 unsigned int invmask = inverse ? 0xff : 0;
11017 int eshift, emvn;
11019 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11021 if (! (aarch64_simd_imm_zero_p (op, mode)
11022 || aarch64_vect_float_const_representable_p (op)))
11023 return false;
11025 if (info)
11027 info->value = CONST_VECTOR_ELT (op, 0);
11028 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11029 info->mvn = false;
11030 info->shift = 0;
11033 return true;
11036 /* Splat vector constant out into a byte vector. */
11037 for (i = 0; i < n_elts; i++)
11039 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11040 it must be laid out in the vector register in reverse order. */
11041 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11042 unsigned HOST_WIDE_INT elpart;
11044 gcc_assert (CONST_INT_P (el));
11045 elpart = INTVAL (el);
11047 for (unsigned int byte = 0; byte < innersize; byte++)
11049 bytes[idx++] = (elpart & 0xff) ^ invmask;
11050 elpart >>= BITS_PER_UNIT;
11055 /* Sanity check. */
11056 gcc_assert (idx == GET_MODE_SIZE (mode));
11060 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11061 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11063 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11064 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11066 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11067 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11069 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11070 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11072 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11074 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11076 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11077 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11079 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11080 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11082 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11083 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11085 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11086 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11088 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11090 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11092 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11093 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11095 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11096 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11098 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11099 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11101 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11102 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11104 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11106 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11107 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11109 while (0);
11111 if (immtype == -1)
11112 return false;
11114 if (info)
11116 info->element_width = elsize;
11117 info->mvn = emvn != 0;
11118 info->shift = eshift;
11120 unsigned HOST_WIDE_INT imm = 0;
11122 if (immtype >= 12 && immtype <= 15)
11123 info->msl = true;
11125 /* Un-invert bytes of recognized vector, if necessary. */
11126 if (invmask != 0)
11127 for (i = 0; i < idx; i++)
11128 bytes[i] ^= invmask;
11130 if (immtype == 17)
11132 /* FIXME: Broken on 32-bit H_W_I hosts. */
11133 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11135 for (i = 0; i < 8; i++)
11136 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11137 << (i * BITS_PER_UNIT);
11140 info->value = GEN_INT (imm);
11142 else
11144 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11145 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11147 /* Construct 'abcdefgh' because the assembler cannot handle
11148 generic constants. */
11149 if (info->mvn)
11150 imm = ~imm;
11151 imm = (imm >> info->shift) & 0xff;
11152 info->value = GEN_INT (imm);
11156 return true;
11157 #undef CHECK
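/* Example (illustrative): for the V4SImode constant with all elements
   equal to 0x0000ab00 the byte image is { 0x00, 0xab, 0x00, 0x00, ... },
   which matches the second CHECK above (immtype 1), so *INFO is filled
   in with element_width == 32, shift == 8, mvn == false and
   value == 0xab, i.e. the operand of "movi vN.4s, #0xab, lsl #8".  */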
11160 /* Check if immediate shift constants are within range. */
11161 bool
11162 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11164 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11165 if (left)
11166 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11167 else
11168 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11171 /* Return true if X is a uniform vector where all elements
11172 are either the floating-point constant 0.0 or the
11173 integer constant 0. */
11174 bool
11175 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11177 return x == CONST0_RTX (mode);
11181 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11182 operation of width WIDTH at bit position POS. */
11185 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11187 gcc_assert (CONST_INT_P (width));
11188 gcc_assert (CONST_INT_P (pos));
11190 unsigned HOST_WIDE_INT mask
11191 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11192 return GEN_INT (mask << UINTVAL (pos));
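/* For instance (illustrative), WIDTH == 8 and POS == 16 give the mask
   ((HOST_WIDE_INT) 0xff) << 16 == 0xff0000.  */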
11195 bool
11196 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11198 HOST_WIDE_INT imm = INTVAL (x);
11199 int i;
11201 for (i = 0; i < 8; i++)
11203 unsigned int byte = imm & 0xff;
11204 if (byte != 0xff && byte != 0)
11205 return false;
11206 imm >>= 8;
11209 return true;
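/* E.g. (illustrative) 0xff00ffff00ff0000 is accepted (every byte is 0x00
   or 0xff, so it is a valid 64-bit MOVI immediate), while 0x1234 is
   rejected because of the 0x34 and 0x12 bytes.  */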
11212 bool
11213 aarch64_mov_operand_p (rtx x, machine_mode mode)
11215 if (GET_CODE (x) == HIGH
11216 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11217 return true;
11219 if (CONST_INT_P (x))
11220 return true;
11222 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11223 return true;
11225 return aarch64_classify_symbolic_expression (x)
11226 == SYMBOL_TINY_ABSOLUTE;
11229 /* Return a CONST_VECTOR with every element set to the CONST_INT VAL. */
11231 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
11233 int nunits = GET_MODE_NUNITS (mode);
11234 rtvec v = rtvec_alloc (nunits);
11235 int i;
11237 for (i=0; i < nunits; i++)
11238 RTVEC_ELT (v, i) = GEN_INT (val);
11240 return gen_rtx_CONST_VECTOR (mode, v);
11243 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11245 bool
11246 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11248 machine_mode vmode;
11250 gcc_assert (!VECTOR_MODE_P (mode));
11251 vmode = aarch64_preferred_simd_mode (mode);
11252 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11253 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11256 /* Construct and return a PARALLEL RTX vector with elements numbering the
11257 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11258 the vector - from the perspective of the architecture. This does not
11259 line up with GCC's perspective on lane numbers, so we end up with
11260 different masks depending on our target endian-ness. The diagram
11261 below may help. We must draw the distinction when building masks
11262 which select one half of the vector. An instruction selecting
11263 architectural low-lanes for a big-endian target, must be described using
11264 a mask selecting GCC high-lanes.
11266 Big-Endian Little-Endian
11268 GCC 0 1 2 3 3 2 1 0
11269 | x | x | x | x | | x | x | x | x |
11270 Architecture 3 2 1 0 3 2 1 0
11272 Low Mask: { 2, 3 } { 0, 1 }
11273 High Mask: { 0, 1 } { 2, 3 }
11277 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11279 int nunits = GET_MODE_NUNITS (mode);
11280 rtvec v = rtvec_alloc (nunits / 2);
11281 int high_base = nunits / 2;
11282 int low_base = 0;
11283 int base;
11284 rtx t1;
11285 int i;
11287 if (BYTES_BIG_ENDIAN)
11288 base = high ? low_base : high_base;
11289 else
11290 base = high ? high_base : low_base;
11292 for (i = 0; i < nunits / 2; i++)
11293 RTVEC_ELT (v, i) = GEN_INT (base + i);
11295 t1 = gen_rtx_PARALLEL (mode, v);
11296 return t1;
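/* For example (illustrative): for V4SImode and HIGH == true this returns
   the PARALLEL (2 3) on a little-endian target but (0 1) on a big-endian
   target, as in the diagram above.  */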
11299 /* Check OP for validity as a PARALLEL RTX vector with elements
11300 numbering the lanes of either the high (HIGH == TRUE) or low
11301 (HIGH == FALSE) half of the vector, from the perspective of the
11302 architecture. See the diagram above aarch64_simd_vect_par_cnst_half. */
11304 bool
11305 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11306 bool high)
11308 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11309 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11310 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11311 int i = 0;
11313 if (!VECTOR_MODE_P (mode))
11314 return false;
11316 if (count_op != count_ideal)
11317 return false;
11319 for (i = 0; i < count_ideal; i++)
11321 rtx elt_op = XVECEXP (op, 0, i);
11322 rtx elt_ideal = XVECEXP (ideal, 0, i);
11324 if (!CONST_INT_P (elt_op)
11325 || INTVAL (elt_ideal) != INTVAL (elt_op))
11326 return false;
11328 return true;
11331 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11332 HIGH (exclusive). */
11333 void
11334 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11335 const_tree exp)
11337 HOST_WIDE_INT lane;
11338 gcc_assert (CONST_INT_P (operand));
11339 lane = INTVAL (operand);
11341 if (lane < low || lane >= high)
11343 if (exp)
11344 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11345 else
11346 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11350 /* Return TRUE if OP is a valid vector addressing mode. */
11351 bool
11352 aarch64_simd_mem_operand_p (rtx op)
11354 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11355 || REG_P (XEXP (op, 0)));
11358 /* Emit a register copy from operand to operand, taking care not to
11359 early-clobber source registers in the process.
11361 COUNT is the number of components into which the copy needs to be
11362 decomposed. */
11363 void
11364 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11365 unsigned int count)
11367 unsigned int i;
11368 int rdest = REGNO (operands[0]);
11369 int rsrc = REGNO (operands[1]);
11371 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11372 || rdest < rsrc)
11373 for (i = 0; i < count; i++)
11374 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11375 gen_rtx_REG (mode, rsrc + i));
11376 else
11377 for (i = 0; i < count; i++)
11378 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11379 gen_rtx_REG (mode, rsrc + count - i - 1));
11382 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11383 one of VSTRUCT modes: OI, CI, or XI. */
11385 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11387 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11390 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11391 alignment of a vector to 128 bits. */
11392 static HOST_WIDE_INT
11393 aarch64_simd_vector_alignment (const_tree type)
11395 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11396 return MIN (align, 128);
11399 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11400 static bool
11401 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11403 if (is_packed)
11404 return false;
11406 /* We guarantee alignment for vectors up to 128-bits. */
11407 if (tree_int_cst_compare (TYPE_SIZE (type),
11408 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11409 return false;
11411 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11412 return true;
11415 /* If VALS is a vector constant that can be loaded into a register
11416 using DUP, generate instructions to do so and return an RTX to
11417 assign to the register. Otherwise return NULL_RTX. */
11418 static rtx
11419 aarch64_simd_dup_constant (rtx vals)
11421 machine_mode mode = GET_MODE (vals);
11422 machine_mode inner_mode = GET_MODE_INNER (mode);
11423 rtx x;
11425 if (!const_vec_duplicate_p (vals, &x))
11426 return NULL_RTX;
11428 /* We can load this constant by using DUP and a constant in a
11429 single ARM register. This will be cheaper than a vector
11430 load. */
11431 x = copy_to_mode_reg (inner_mode, x);
11432 return gen_rtx_VEC_DUPLICATE (mode, x);
11436 /* Generate code to load VALS, which is a PARALLEL containing only
11437 constants (for vec_init) or CONST_VECTOR, efficiently into a
11438 register. Returns an RTX to copy into the register, or NULL_RTX
11439 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11440 static rtx
11441 aarch64_simd_make_constant (rtx vals)
11443 machine_mode mode = GET_MODE (vals);
11444 rtx const_dup;
11445 rtx const_vec = NULL_RTX;
11446 int n_elts = GET_MODE_NUNITS (mode);
11447 int n_const = 0;
11448 int i;
11450 if (GET_CODE (vals) == CONST_VECTOR)
11451 const_vec = vals;
11452 else if (GET_CODE (vals) == PARALLEL)
11454 /* A CONST_VECTOR must contain only CONST_INTs and
11455 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11456 Only store valid constants in a CONST_VECTOR. */
11457 for (i = 0; i < n_elts; ++i)
11459 rtx x = XVECEXP (vals, 0, i);
11460 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11461 n_const++;
11463 if (n_const == n_elts)
11464 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11466 else
11467 gcc_unreachable ();
11469 if (const_vec != NULL_RTX
11470 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11471 /* Load using MOVI/MVNI. */
11472 return const_vec;
11473 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11474 /* Loaded using DUP. */
11475 return const_dup;
11476 else if (const_vec != NULL_RTX)
11477 /* Load from constant pool. We can not take advantage of single-cycle
11478 LD1 because we need a PC-relative addressing mode. */
11479 return const_vec;
11480 else
11481 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11482 We can not construct an initializer. */
11483 return NULL_RTX;
11486 /* Expand a vector initialisation sequence, such that TARGET is
11487 initialised to contain VALS. */
11489 void
11490 aarch64_expand_vector_init (rtx target, rtx vals)
11492 machine_mode mode = GET_MODE (target);
11493 machine_mode inner_mode = GET_MODE_INNER (mode);
11494 /* The number of vector elements. */
11495 int n_elts = GET_MODE_NUNITS (mode);
11496 /* The number of vector elements which are not constant. */
11497 int n_var = 0;
11498 rtx any_const = NULL_RTX;
11499 /* The first element of vals. */
11500 rtx v0 = XVECEXP (vals, 0, 0);
11501 bool all_same = true;
11503 /* Count the number of variable elements to initialise. */
11504 for (int i = 0; i < n_elts; ++i)
11506 rtx x = XVECEXP (vals, 0, i);
11507 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11508 ++n_var;
11509 else
11510 any_const = x;
11512 all_same &= rtx_equal_p (x, v0);
11515 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11516 how best to handle this. */
11517 if (n_var == 0)
11519 rtx constant = aarch64_simd_make_constant (vals);
11520 if (constant != NULL_RTX)
11522 emit_move_insn (target, constant);
11523 return;
11527 /* Splat a single non-constant element if we can. */
11528 if (all_same)
11530 rtx x = copy_to_mode_reg (inner_mode, v0);
11531 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11532 return;
11535 /* Initialise a vector which is part-variable. We want to first try
11536 to build those lanes which are constant in the most efficient way we
11537 can. */
11538 if (n_var != n_elts)
11540 rtx copy = copy_rtx (vals);
11542 /* Load constant part of vector. We really don't care what goes into the
11543 parts we will overwrite, but we're more likely to be able to load the
11544 constant efficiently if it has fewer, larger, repeating parts
11545 (see aarch64_simd_valid_immediate). */
11546 for (int i = 0; i < n_elts; i++)
11548 rtx x = XVECEXP (vals, 0, i);
11549 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11550 continue;
11551 rtx subst = any_const;
11552 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11554 /* Look in the copied vector, as more elements are const. */
11555 rtx test = XVECEXP (copy, 0, i ^ bit);
11556 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11558 subst = test;
11559 break;
11562 XVECEXP (copy, 0, i) = subst;
11564 aarch64_expand_vector_init (target, copy);
11567 /* Insert the variable lanes directly. */
11569 enum insn_code icode = optab_handler (vec_set_optab, mode);
11570 gcc_assert (icode != CODE_FOR_nothing);
11572 for (int i = 0; i < n_elts; i++)
11574 rtx x = XVECEXP (vals, 0, i);
11575 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11576 continue;
11577 x = copy_to_mode_reg (inner_mode, x);
11578 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
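/* Worked example (illustrative): initialising a V4SImode vector with
   { x, 1, 2, 3 } where only x is in a register.  The copy of VALS gets
   lane 0 replaced by the nearby constant 2, giving { 2, 1, 2, 3 }; that
   constant vector is loaded by the recursive call, and lane 0 is then
   overwritten with x via the vec_set pattern.  */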
11582 static unsigned HOST_WIDE_INT
11583 aarch64_shift_truncation_mask (machine_mode mode)
11585 return
11586 (!SHIFT_COUNT_TRUNCATED
11587 || aarch64_vector_mode_supported_p (mode)
11588 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11591 /* Select a format to encode pointers in exception handling data. */
11593 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11595 int type;
11596 switch (aarch64_cmodel)
11598 case AARCH64_CMODEL_TINY:
11599 case AARCH64_CMODEL_TINY_PIC:
11600 case AARCH64_CMODEL_SMALL:
11601 case AARCH64_CMODEL_SMALL_PIC:
11602 case AARCH64_CMODEL_SMALL_SPIC:
11603 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11604 for everything. */
11605 type = DW_EH_PE_sdata4;
11606 break;
11607 default:
11608 /* No assumptions here. 8-byte relocs required. */
11609 type = DW_EH_PE_sdata8;
11610 break;
11612 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11615 /* The last .arch and .tune assembly strings that we printed. */
11616 static std::string aarch64_last_printed_arch_string;
11617 static std::string aarch64_last_printed_tune_string;
11619 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11620 by the function fndecl. */
11622 void
11623 aarch64_declare_function_name (FILE *stream, const char* name,
11624 tree fndecl)
11626 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11628 struct cl_target_option *targ_options;
11629 if (target_parts)
11630 targ_options = TREE_TARGET_OPTION (target_parts);
11631 else
11632 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11633 gcc_assert (targ_options);
11635 const struct processor *this_arch
11636 = aarch64_get_arch (targ_options->x_explicit_arch);
11638 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11639 std::string extension
11640 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11641 this_arch->flags);
11642 /* Only update the assembler .arch string if it is distinct from the last
11643 such string we printed. */
11644 std::string to_print = this_arch->name + extension;
11645 if (to_print != aarch64_last_printed_arch_string)
11647 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11648 aarch64_last_printed_arch_string = to_print;
11651 /* Print the cpu name we're tuning for in the comments; it might be
11652 useful to readers of the generated asm. Do it only when it changes
11653 from function to function and verbose assembly is requested. */
11654 const struct processor *this_tune
11655 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11657 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11659 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11660 this_tune->name);
11661 aarch64_last_printed_tune_string = this_tune->name;
11664 /* Don't forget the type directive for ELF. */
11665 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11666 ASM_OUTPUT_LABEL (stream, name);
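/* For example (an illustrative sketch of the output): a function carrying
   __attribute__ ((target ("arch=armv8-a+crc"))) in a translation unit
   otherwise built for plain armv8-a would be preceded by

       .arch armv8-a+crc

   and, when -dA is in effect, by a "// .tune <cpu>" comment whenever the
   tuning CPU changes between functions.  */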
11669 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11671 static void
11672 aarch64_start_file (void)
11674 struct cl_target_option *default_options
11675 = TREE_TARGET_OPTION (target_option_default_node);
11677 const struct processor *default_arch
11678 = aarch64_get_arch (default_options->x_explicit_arch);
11679 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11680 std::string extension
11681 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11682 default_arch->flags);
11684 aarch64_last_printed_arch_string = default_arch->name + extension;
11685 aarch64_last_printed_tune_string = "";
11686 asm_fprintf (asm_out_file, "\t.arch %s\n",
11687 aarch64_last_printed_arch_string.c_str ());
11689 default_file_start ();
11692 /* Emit load exclusive. */
11694 static void
11695 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11696 rtx mem, rtx model_rtx)
11698 rtx (*gen) (rtx, rtx, rtx);
11700 switch (mode)
11702 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11703 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11704 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11705 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11706 default:
11707 gcc_unreachable ();
11710 emit_insn (gen (rval, mem, model_rtx));
11713 /* Emit store exclusive. */
11715 static void
11716 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11717 rtx rval, rtx mem, rtx model_rtx)
11719 rtx (*gen) (rtx, rtx, rtx, rtx);
11721 switch (mode)
11723 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11724 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11725 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11726 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11727 default:
11728 gcc_unreachable ();
11731 emit_insn (gen (bval, rval, mem, model_rtx));
11734 /* Emit the jump instruction INSN and mark it as very unlikely to be taken. */
11736 static void
11737 aarch64_emit_unlikely_jump (rtx insn)
11739 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11741 rtx_insn *jump = emit_jump_insn (insn);
11742 add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
11745 /* Expand a compare and swap pattern. */
11747 void
11748 aarch64_expand_compare_and_swap (rtx operands[])
11750 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11751 machine_mode mode, cmp_mode;
11752 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11753 int idx;
11754 gen_cas_fn gen;
11755 const gen_cas_fn split_cas[] =
11757 gen_aarch64_compare_and_swapqi,
11758 gen_aarch64_compare_and_swaphi,
11759 gen_aarch64_compare_and_swapsi,
11760 gen_aarch64_compare_and_swapdi
11762 const gen_cas_fn atomic_cas[] =
11764 gen_aarch64_compare_and_swapqi_lse,
11765 gen_aarch64_compare_and_swaphi_lse,
11766 gen_aarch64_compare_and_swapsi_lse,
11767 gen_aarch64_compare_and_swapdi_lse
11770 bval = operands[0];
11771 rval = operands[1];
11772 mem = operands[2];
11773 oldval = operands[3];
11774 newval = operands[4];
11775 is_weak = operands[5];
11776 mod_s = operands[6];
11777 mod_f = operands[7];
11778 mode = GET_MODE (mem);
11779 cmp_mode = mode;
11781 /* Normally the succ memory model must be stronger than fail, but in the
11782 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11783 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11785 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11786 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11787 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11789 switch (mode)
11791 case QImode:
11792 case HImode:
11793 /* For short modes, we're going to perform the comparison in SImode,
11794 so do the zero-extension now. */
11795 cmp_mode = SImode;
11796 rval = gen_reg_rtx (SImode);
11797 oldval = convert_modes (SImode, mode, oldval, true);
11798 /* Fall through. */
11800 case SImode:
11801 case DImode:
11802 /* Force the value into a register if needed. */
11803 if (!aarch64_plus_operand (oldval, mode))
11804 oldval = force_reg (cmp_mode, oldval);
11805 break;
11807 default:
11808 gcc_unreachable ();
11811 switch (mode)
11813 case QImode: idx = 0; break;
11814 case HImode: idx = 1; break;
11815 case SImode: idx = 2; break;
11816 case DImode: idx = 3; break;
11817 default:
11818 gcc_unreachable ();
11820 if (TARGET_LSE)
11821 gen = atomic_cas[idx];
11822 else
11823 gen = split_cas[idx];
11825 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11827 if (mode == QImode || mode == HImode)
11828 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11830 x = gen_rtx_REG (CCmode, CC_REGNUM);
11831 x = gen_rtx_EQ (SImode, x, const0_rtx);
11832 emit_insn (gen_rtx_SET (bval, x));
11835 /* Test whether the target supports using an atomic load-operate
11836 instruction for operation CODE. Returns FALSE if the operation
11837 isn't supported by the architecture. */
11841 bool
11842 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11844 if (!TARGET_LSE)
11845 return false;
11847 switch (code)
11849 case SET:
11850 case AND:
11851 case IOR:
11852 case XOR:
11853 case MINUS:
11854 case PLUS:
11855 return true;
11856 default:
11857 return false;
11861 /* Emit a barrier appropriate for memory model MODEL at the end of a
11862 sequence implementing an atomic operation. */
11864 static void
11865 aarch64_emit_post_barrier (enum memmodel model)
11867 const enum memmodel base_model = memmodel_base (model);
11869 if (is_mm_sync (model)
11870 && (base_model == MEMMODEL_ACQUIRE
11871 || base_model == MEMMODEL_ACQ_REL
11872 || base_model == MEMMODEL_SEQ_CST))
11874 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11878 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11879 for the data in memory. EXPECTED is the value expected to be in memory.
11880 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11881 is the memory ordering to use. */
11883 void
11884 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11885 rtx expected, rtx desired,
11886 rtx model)
11888 rtx (*gen) (rtx, rtx, rtx, rtx);
11889 machine_mode mode;
11891 mode = GET_MODE (mem);
11893 switch (mode)
11895 case QImode: gen = gen_aarch64_atomic_casqi; break;
11896 case HImode: gen = gen_aarch64_atomic_cashi; break;
11897 case SImode: gen = gen_aarch64_atomic_cassi; break;
11898 case DImode: gen = gen_aarch64_atomic_casdi; break;
11899 default:
11900 gcc_unreachable ();
11903 /* Move the expected value into the CAS destination register. */
11904 emit_insn (gen_rtx_SET (rval, expected));
11906 /* Emit the CAS. */
11907 emit_insn (gen (rval, mem, desired, model));
11909 /* Compare the expected value with the value loaded by the CAS, to establish
11910 whether the swap was made. */
11911 aarch64_gen_compare_reg (EQ, rval, expected);
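/* Illustrative sequence (added commentary, registers are arbitrary): for an
   SImode CAS with TARGET_LSE this emits roughly

       mov    w0, w_expected            // seed the CAS destination
       casal  w0, w_desired, [x_mem]    // variant chosen by MODEL
       cmp    w0, w_expected            // set CC for the caller

   so RVAL ends up holding the value observed in memory and the condition
   flags record whether the swap happened.  */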
11914 /* Split a compare and swap pattern. */
11916 void
11917 aarch64_split_compare_and_swap (rtx operands[])
11919 rtx rval, mem, oldval, newval, scratch;
11920 machine_mode mode;
11921 bool is_weak;
11922 rtx_code_label *label1, *label2;
11923 rtx x, cond;
11924 enum memmodel model;
11925 rtx model_rtx;
11927 rval = operands[0];
11928 mem = operands[1];
11929 oldval = operands[2];
11930 newval = operands[3];
11931 is_weak = (operands[4] != const0_rtx);
11932 model_rtx = operands[5];
11933 scratch = operands[7];
11934 mode = GET_MODE (mem);
11935 model = memmodel_from_int (INTVAL (model_rtx));
11937 label1 = NULL;
11938 if (!is_weak)
11940 label1 = gen_label_rtx ();
11941 emit_label (label1);
11943 label2 = gen_label_rtx ();
11945 /* The initial load can be relaxed for a __sync operation since a final
11946 barrier will be emitted to stop code hoisting. */
11947 if (is_mm_sync (model))
11948 aarch64_emit_load_exclusive (mode, rval, mem,
11949 GEN_INT (MEMMODEL_RELAXED));
11950 else
11951 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11953 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11954 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11955 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11956 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11957 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11959 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11961 if (!is_weak)
11963 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11964 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11965 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11966 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11968 else
11970 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11971 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11972 emit_insn (gen_rtx_SET (cond, x));
11975 emit_label (label2);
11977 /* Emit any final barrier needed for a __sync operation. */
11978 if (is_mm_sync (model))
11979 aarch64_emit_post_barrier (model);
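/* Rough shape of the split sequence (added commentary, not literal output),
   for the strong (!is_weak) SImode case:

     1: ldaxr  w_rval, [x_mem]              // relaxed for __sync variants
        cmp    w_rval, w_oldval
        b.ne   2f
        stlxr  w_scratch, w_newval, [x_mem]
        cbnz   w_scratch, 1b
     2:

   The weak form drops the retry branch and instead leaves the
   store-exclusive status in the condition flags.  */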
11982 /* Emit a BIC instruction. */
11984 static void
11985 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11987 rtx shift_rtx = GEN_INT (shift);
11988 rtx (*gen) (rtx, rtx, rtx, rtx);
11990 switch (mode)
11992 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11993 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11994 default:
11995 gcc_unreachable ();
11998 emit_insn (gen (dst, s2, shift_rtx, s1));
12001 /* Emit an atomic swap. */
12003 static void
12004 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12005 rtx mem, rtx model)
12007 rtx (*gen) (rtx, rtx, rtx, rtx);
12009 switch (mode)
12011 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12012 case HImode: gen = gen_aarch64_atomic_swphi; break;
12013 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12014 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12015 default:
12016 gcc_unreachable ();
12019 emit_insn (gen (dst, mem, value, model));
12022 /* Operations supported by aarch64_emit_atomic_load_op. */
12024 enum aarch64_atomic_load_op_code
12026 AARCH64_LDOP_PLUS, /* A + B */
12027 AARCH64_LDOP_XOR, /* A ^ B */
12028 AARCH64_LDOP_OR, /* A | B */
12029 AARCH64_LDOP_BIC /* A & ~B */
12032 /* Emit an atomic load-operate. */
12034 static void
12035 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12036 machine_mode mode, rtx dst, rtx src,
12037 rtx mem, rtx model)
12039 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12040 const aarch64_atomic_load_op_fn plus[] =
12042 gen_aarch64_atomic_loadaddqi,
12043 gen_aarch64_atomic_loadaddhi,
12044 gen_aarch64_atomic_loadaddsi,
12045 gen_aarch64_atomic_loadadddi
12047 const aarch64_atomic_load_op_fn eor[] =
12049 gen_aarch64_atomic_loadeorqi,
12050 gen_aarch64_atomic_loadeorhi,
12051 gen_aarch64_atomic_loadeorsi,
12052 gen_aarch64_atomic_loadeordi
12054 const aarch64_atomic_load_op_fn ior[] =
12056 gen_aarch64_atomic_loadsetqi,
12057 gen_aarch64_atomic_loadsethi,
12058 gen_aarch64_atomic_loadsetsi,
12059 gen_aarch64_atomic_loadsetdi
12061 const aarch64_atomic_load_op_fn bic[] =
12063 gen_aarch64_atomic_loadclrqi,
12064 gen_aarch64_atomic_loadclrhi,
12065 gen_aarch64_atomic_loadclrsi,
12066 gen_aarch64_atomic_loadclrdi
12068 aarch64_atomic_load_op_fn gen;
12069 int idx = 0;
12071 switch (mode)
12073 case QImode: idx = 0; break;
12074 case HImode: idx = 1; break;
12075 case SImode: idx = 2; break;
12076 case DImode: idx = 3; break;
12077 default:
12078 gcc_unreachable ();
12081 switch (code)
12083 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12084 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12085 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12086 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12087 default:
12088 gcc_unreachable ();
12091 emit_insn (gen (dst, mem, src, model));
12094 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12095 location to store the data read from memory. OUT_RESULT is the location to
12096 store the result of the operation. MEM is the memory location to read and
12097 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12098 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12099 be NULL. */
12101 void
12102 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12103 rtx mem, rtx value, rtx model_rtx)
12105 machine_mode mode = GET_MODE (mem);
12106 machine_mode wmode = (mode == DImode ? DImode : SImode);
12107 const bool short_mode = (mode < SImode);
12108 aarch64_atomic_load_op_code ldop_code;
12109 rtx src;
12110 rtx x;
12112 if (out_data)
12113 out_data = gen_lowpart (mode, out_data);
12115 if (out_result)
12116 out_result = gen_lowpart (mode, out_result);
12118 /* Make sure the value is in a register, putting it into a destination
12119 register if it needs to be manipulated. */
12120 if (!register_operand (value, mode)
12121 || code == AND || code == MINUS)
12123 src = out_result ? out_result : out_data;
12124 emit_move_insn (src, gen_lowpart (mode, value));
12126 else
12127 src = value;
12128 gcc_assert (register_operand (src, mode));
12130 /* Preprocess the data for the operation as necessary. If the operation is
12131 a SET then emit a swap instruction and finish. */
12132 switch (code)
12134 case SET:
12135 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12136 return;
12138 case MINUS:
12139 /* Negate the value and treat it as a PLUS. */
12141 rtx neg_src;
12143 /* Resize the value if necessary. */
12144 if (short_mode)
12145 src = gen_lowpart (wmode, src);
12147 neg_src = gen_rtx_NEG (wmode, src);
12148 emit_insn (gen_rtx_SET (src, neg_src));
12150 if (short_mode)
12151 src = gen_lowpart (mode, src);
12153 /* Fall-through. */
12154 case PLUS:
12155 ldop_code = AARCH64_LDOP_PLUS;
12156 break;
12158 case IOR:
12159 ldop_code = AARCH64_LDOP_OR;
12160 break;
12162 case XOR:
12163 ldop_code = AARCH64_LDOP_XOR;
12164 break;
12166 case AND:
12168 rtx not_src;
12170 /* Resize the value if necessary. */
12171 if (short_mode)
12172 src = gen_lowpart (wmode, src);
12174 not_src = gen_rtx_NOT (wmode, src);
12175 emit_insn (gen_rtx_SET (src, not_src));
12177 if (short_mode)
12178 src = gen_lowpart (mode, src);
12180 ldop_code = AARCH64_LDOP_BIC;
12181 break;
12183 default:
12184 /* The operation can't be done with atomic instructions. */
12185 gcc_unreachable ();
12188 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12190 /* If necessary, calculate the data in memory after the update by redoing the
12191 operation from values in registers. */
12192 if (!out_result)
12193 return;
12195 if (short_mode)
12197 src = gen_lowpart (wmode, src);
12198 out_data = gen_lowpart (wmode, out_data);
12199 out_result = gen_lowpart (wmode, out_result);
12202 x = NULL_RTX;
12204 switch (code)
12206 case MINUS:
12207 case PLUS:
12208 x = gen_rtx_PLUS (wmode, out_data, src);
12209 break;
12210 case IOR:
12211 x = gen_rtx_IOR (wmode, out_data, src);
12212 break;
12213 case XOR:
12214 x = gen_rtx_XOR (wmode, out_data, src);
12215 break;
12216 case AND:
12217 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12218 return;
12219 default:
12220 gcc_unreachable ();
12223 emit_set_insn (out_result, x);
12225 return;
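/* Worked example (added commentary): __atomic_fetch_and on an int with LSE
   is handled here by first inverting the operand, because LDCLR computes
   A & ~B:

       mvn      w_tmp, w_value
       ldclral  w_tmp, w_old, [x_mem]    // w_old <- *mem; *mem &= ~w_tmp

   and, when the post-operation value is also wanted, it is recomputed from
   registers with a BIC (see aarch64_emit_bic above).  */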
12228 /* Split an atomic operation. */
12230 void
12231 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12232 rtx value, rtx model_rtx, rtx cond)
12234 machine_mode mode = GET_MODE (mem);
12235 machine_mode wmode = (mode == DImode ? DImode : SImode);
12236 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12237 const bool is_sync = is_mm_sync (model);
12238 rtx_code_label *label;
12239 rtx x;
12241 /* Split the atomic operation into a sequence. */
12242 label = gen_label_rtx ();
12243 emit_label (label);
12245 if (new_out)
12246 new_out = gen_lowpart (wmode, new_out);
12247 if (old_out)
12248 old_out = gen_lowpart (wmode, old_out);
12249 else
12250 old_out = new_out;
12251 value = simplify_gen_subreg (wmode, value, mode, 0);
12253 /* The initial load can be relaxed for a __sync operation since a final
12254 barrier will be emitted to stop code hoisting. */
12255 if (is_sync)
12256 aarch64_emit_load_exclusive (mode, old_out, mem,
12257 GEN_INT (MEMMODEL_RELAXED));
12258 else
12259 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12261 switch (code)
12263 case SET:
12264 new_out = value;
12265 break;
12267 case NOT:
12268 x = gen_rtx_AND (wmode, old_out, value);
12269 emit_insn (gen_rtx_SET (new_out, x));
12270 x = gen_rtx_NOT (wmode, new_out);
12271 emit_insn (gen_rtx_SET (new_out, x));
12272 break;
12274 case MINUS:
12275 if (CONST_INT_P (value))
12277 value = GEN_INT (-INTVAL (value));
12278 code = PLUS;
12280 /* Fall through. */
12282 default:
12283 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12284 emit_insn (gen_rtx_SET (new_out, x));
12285 break;
12288 aarch64_emit_store_exclusive (mode, cond, mem,
12289 gen_lowpart (mode, new_out), model_rtx);
12291 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12292 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12293 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12294 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12296 /* Emit any final barrier needed for a __sync operation. */
12297 if (is_sync)
12298 aarch64_emit_post_barrier (model);
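/* Rough shape of the split sequence (added commentary): an SImode
   __atomic_fetch_add without LSE becomes a retry loop along the lines of

     1: ldxr  w_old, [x_mem]             // acquire variant per MODEL_RTX
        add   w_new, w_old, w_value
        stxr  w_cond, w_new, [x_mem]     // release variant per MODEL_RTX
        cbnz  w_cond, 1b

   with a trailing barrier appended only for the __sync forms.  */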
12301 static void
12302 aarch64_init_libfuncs (void)
12304 /* Half-precision float operations. The compiler handles all operations
12305 with NULL libfuncs by converting to SFmode. */
12307 /* Conversions. */
12308 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12309 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12311 /* Arithmetic. */
12312 set_optab_libfunc (add_optab, HFmode, NULL);
12313 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12314 set_optab_libfunc (smul_optab, HFmode, NULL);
12315 set_optab_libfunc (neg_optab, HFmode, NULL);
12316 set_optab_libfunc (sub_optab, HFmode, NULL);
12318 /* Comparisons. */
12319 set_optab_libfunc (eq_optab, HFmode, NULL);
12320 set_optab_libfunc (ne_optab, HFmode, NULL);
12321 set_optab_libfunc (lt_optab, HFmode, NULL);
12322 set_optab_libfunc (le_optab, HFmode, NULL);
12323 set_optab_libfunc (ge_optab, HFmode, NULL);
12324 set_optab_libfunc (gt_optab, HFmode, NULL);
12325 set_optab_libfunc (unord_optab, HFmode, NULL);
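/* Note (added commentary): with the HFmode optabs cleared above, a __fp16
   addition is performed by widening both operands to SFmode, adding, and
   truncating the result back to HFmode.  The registered conversion helpers
   are only used when the conversion cannot be done inline (normally FCVT
   instructions handle it).  */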
12328 /* Target hook for c_mode_for_suffix. */
12329 static machine_mode
12330 aarch64_c_mode_for_suffix (char suffix)
12332 if (suffix == 'q')
12333 return TFmode;
12335 return VOIDmode;
12338 /* We can only represent floating point constants which will fit in
12339 "quarter-precision" values. These values are characterised by
12340 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by the formula
12343 (-1)^s * (n/16) * 2^r
12345 Where:
12346 's' is the sign bit.
12347 'n' is an integer in the range 16 <= n <= 31.
12348 'r' is an integer in the range -3 <= r <= 4. */
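/* Worked example (added commentary): 0.75 = (-1)^0 * (24/16) * 2^(-1), with
   n = 24 and r = -1 both in range, so it is a valid FMOV immediate; 0.0 has
   no encoding of this form and is rejected below.  */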
12350 /* Return true iff X can be represented by a quarter-precision
12351 floating point immediate operand.  Note, we cannot represent 0.0. */
12352 bool
12353 aarch64_float_const_representable_p (rtx x)
12355 /* This represents our current view of how many bits
12356 make up the mantissa. */
12357 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12358 int exponent;
12359 unsigned HOST_WIDE_INT mantissa, mask;
12360 REAL_VALUE_TYPE r, m;
12361 bool fail;
12363 if (!CONST_DOUBLE_P (x))
12364 return false;
12366 /* We don't support HFmode constants yet. */
12367 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12368 return false;
12370 r = *CONST_DOUBLE_REAL_VALUE (x);
12372 /* We cannot represent infinities, NaNs or +/-zero. We won't
12373 know if we have +zero until we analyse the mantissa, but we
12374 can reject the other invalid values. */
12375 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12376 || REAL_VALUE_MINUS_ZERO (r))
12377 return false;
12379 /* Extract exponent. */
12380 r = real_value_abs (&r);
12381 exponent = REAL_EXP (&r);
12383 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12384 highest (sign) bit, with a fixed binary point at bit point_pos.
12385 m1 holds the low part of the mantissa, m2 the high part.
12386 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12387 bits for the mantissa, this can fail (low bits will be lost). */
12388 real_ldexp (&m, &r, point_pos - exponent);
12389 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12391 /* If the low part of the mantissa has bits set we cannot represent
12392 the value. */
12393 if (w.elt (0) != 0)
12394 return false;
12395 /* We have rejected the lower HOST_WIDE_INT, so update our
12396 understanding of how many bits lie in the mantissa and
12397 look only at the high HOST_WIDE_INT. */
12398 mantissa = w.elt (1);
12399 point_pos -= HOST_BITS_PER_WIDE_INT;
12401 /* We can only represent values with a mantissa of the form 1.xxxx. */
12402 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12403 if ((mantissa & mask) != 0)
12404 return false;
12406 /* Having filtered unrepresentable values, we may now remove all
12407 but the highest 5 bits. */
12408 mantissa >>= point_pos - 5;
12410 /* We cannot represent the value 0.0, so reject it. This is handled
12411 elsewhere. */
12412 if (mantissa == 0)
12413 return false;
12415 /* Then, as bit 4 is always set, we can mask it off, leaving
12416 the mantissa in the range [0, 15]. */
12417 mantissa &= ~(1 << 4);
12418 gcc_assert (mantissa <= 15);
12420 /* GCC internally does not use IEEE754-like encoding (where normalized
12421 significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
12422 Our mantissa values are shifted 4 places to the left relative to
12423 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12424 by 5 places to correct for GCC's representation. */
12425 exponent = 5 - exponent;
12427 return (exponent >= 0 && exponent <= 7);
12430 char*
12431 aarch64_output_simd_mov_immediate (rtx const_vector,
12432 machine_mode mode,
12433 unsigned width)
12435 bool is_valid;
12436 static char templ[40];
12437 const char *mnemonic;
12438 const char *shift_op;
12439 unsigned int lane_count = 0;
12440 char element_char;
12442 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12444 /* This will return true to show const_vector is legal for use as either
12445 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
12446 also update INFO to show how the immediate should be generated. */
12447 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12448 gcc_assert (is_valid);
12450 element_char = sizetochar (info.element_width);
12451 lane_count = width / info.element_width;
12453 mode = GET_MODE_INNER (mode);
12454 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12456 gcc_assert (info.shift == 0 && ! info.mvn);
12457 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12458 move immediate path. */
12459 if (aarch64_float_const_zero_rtx_p (info.value))
12460 info.value = GEN_INT (0);
12461 else
12463 const unsigned int buf_size = 20;
12464 char float_buf[buf_size] = {'\0'};
12465 real_to_decimal_for_mode (float_buf,
12466 CONST_DOUBLE_REAL_VALUE (info.value),
12467 buf_size, buf_size, 1, mode);
12469 if (lane_count == 1)
12470 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12471 else
12472 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12473 lane_count, element_char, float_buf);
12474 return templ;
12478 mnemonic = info.mvn ? "mvni" : "movi";
12479 shift_op = info.msl ? "msl" : "lsl";
12481 gcc_assert (CONST_INT_P (info.value));
12482 if (lane_count == 1)
12483 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12484 mnemonic, UINTVAL (info.value));
12485 else if (info.shift)
12486 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12487 ", %s %d", mnemonic, lane_count, element_char,
12488 UINTVAL (info.value), shift_op, info.shift);
12489 else
12490 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12491 mnemonic, lane_count, element_char, UINTVAL (info.value));
12492 return templ;
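/* Example outputs (added commentary, following the templates above but not
   exact): a V4SI splat of 0x55 prints roughly as "movi v0.4s, 0x55", a
   shifted immediate as "movi v0.4s, 0xab, lsl 8", and a floating-point
   splat of 1.0 in V2DF as something like "fmov v0.2d, 1.0e+0".  The %0
   register is filled in later by final.  */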
12495 char*
12496 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12497 machine_mode mode)
12499 machine_mode vmode;
12501 gcc_assert (!VECTOR_MODE_P (mode));
12502 vmode = aarch64_simd_container_mode (mode, 64);
12503 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12504 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12507 /* Split operands into moves from op[1] + op[2] into op[0]. */
12509 void
12510 aarch64_split_combinev16qi (rtx operands[3])
12512 unsigned int dest = REGNO (operands[0]);
12513 unsigned int src1 = REGNO (operands[1]);
12514 unsigned int src2 = REGNO (operands[2]);
12515 machine_mode halfmode = GET_MODE (operands[1]);
12516 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12517 rtx destlo, desthi;
12519 gcc_assert (halfmode == V16QImode);
12521 if (src1 == dest && src2 == dest + halfregs)
12523 /* No-op move. Can't split to nothing; emit something. */
12524 emit_note (NOTE_INSN_DELETED);
12525 return;
12528 /* Preserve register attributes for variable tracking. */
12529 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12530 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12531 GET_MODE_SIZE (halfmode));
12533 /* Special case of reversed high/low parts. */
12534 if (reg_overlap_mentioned_p (operands[2], destlo)
12535 && reg_overlap_mentioned_p (operands[1], desthi))
12537 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12538 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12539 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12541 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12543 /* Try to avoid unnecessary moves if part of the result
12544 is in the right place already. */
12545 if (src1 != dest)
12546 emit_move_insn (destlo, operands[1]);
12547 if (src2 != dest + halfregs)
12548 emit_move_insn (desthi, operands[2]);
12550 else
12552 if (src2 != dest + halfregs)
12553 emit_move_insn (desthi, operands[2]);
12554 if (src1 != dest)
12555 emit_move_insn (destlo, operands[1]);
12559 /* vec_perm support. */
12561 #define MAX_VECT_LEN 16
12563 struct expand_vec_perm_d
12565 rtx target, op0, op1;
12566 unsigned char perm[MAX_VECT_LEN];
12567 machine_mode vmode;
12568 unsigned char nelt;
12569 bool one_vector_p;
12570 bool testing_p;
12573 /* Generate a variable permutation. */
12575 static void
12576 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12578 machine_mode vmode = GET_MODE (target);
12579 bool one_vector_p = rtx_equal_p (op0, op1);
12581 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12582 gcc_checking_assert (GET_MODE (op0) == vmode);
12583 gcc_checking_assert (GET_MODE (op1) == vmode);
12584 gcc_checking_assert (GET_MODE (sel) == vmode);
12585 gcc_checking_assert (TARGET_SIMD);
12587 if (one_vector_p)
12589 if (vmode == V8QImode)
12591 /* Expand the argument to a V16QI mode by duplicating it. */
12592 rtx pair = gen_reg_rtx (V16QImode);
12593 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12594 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12596 else
12598 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12601 else
12603 rtx pair;
12605 if (vmode == V8QImode)
12607 pair = gen_reg_rtx (V16QImode);
12608 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12609 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12611 else
12613 pair = gen_reg_rtx (OImode);
12614 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12615 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12620 void
12621 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12623 machine_mode vmode = GET_MODE (target);
12624 unsigned int nelt = GET_MODE_NUNITS (vmode);
12625 bool one_vector_p = rtx_equal_p (op0, op1);
12626 rtx mask;
12628 /* The TBL instruction does not use a modulo index, so we must take care
12629 of that ourselves. */
12630 mask = aarch64_simd_gen_const_vector_dup (vmode,
12631 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12632 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12634 /* For big-endian, we also need to reverse the index within the vector
12635 (but not which vector). */
12636 if (BYTES_BIG_ENDIAN)
12638 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12639 if (!one_vector_p)
12640 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12641 sel = expand_simple_binop (vmode, XOR, sel, mask,
12642 NULL, 0, OPTAB_LIB_WIDEN);
12644 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12647 /* Recognize patterns suitable for the TRN instructions. */
12648 static bool
12649 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12651 unsigned int i, odd, mask, nelt = d->nelt;
12652 rtx out, in0, in1, x;
12653 rtx (*gen) (rtx, rtx, rtx);
12654 machine_mode vmode = d->vmode;
12656 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12657 return false;
12659 /* Note that these are little-endian tests.
12660 We correct for big-endian later. */
12661 if (d->perm[0] == 0)
12662 odd = 0;
12663 else if (d->perm[0] == 1)
12664 odd = 1;
12665 else
12666 return false;
12667 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12669 for (i = 0; i < nelt; i += 2)
12671 if (d->perm[i] != i + odd)
12672 return false;
12673 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12674 return false;
12677 /* Success! */
12678 if (d->testing_p)
12679 return true;
12681 in0 = d->op0;
12682 in1 = d->op1;
12683 if (BYTES_BIG_ENDIAN)
12685 x = in0, in0 = in1, in1 = x;
12686 odd = !odd;
12688 out = d->target;
12690 if (odd)
12692 switch (vmode)
12694 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12695 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12696 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12697 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12698 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12699 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12700 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12701 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12702 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12703 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12704 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12705 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12706 default:
12707 return false;
12710 else
12712 switch (vmode)
12714 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12715 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12716 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12717 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12718 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12719 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12720 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12721 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12722 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12723 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12724 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12725 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12726 default:
12727 return false;
12731 emit_insn (gen (out, in0, in1));
12732 return true;
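/* Example (added commentary): on V4SI, the little-endian selector
   {0, 4, 2, 6} matches with odd == 0 and emits TRN1, while {1, 5, 3, 7}
   matches with odd == 1 and emits TRN2.  */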
12735 /* Recognize patterns suitable for the UZP instructions. */
12736 static bool
12737 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12739 unsigned int i, odd, mask, nelt = d->nelt;
12740 rtx out, in0, in1, x;
12741 rtx (*gen) (rtx, rtx, rtx);
12742 machine_mode vmode = d->vmode;
12744 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12745 return false;
12747 /* Note that these are little-endian tests.
12748 We correct for big-endian later. */
12749 if (d->perm[0] == 0)
12750 odd = 0;
12751 else if (d->perm[0] == 1)
12752 odd = 1;
12753 else
12754 return false;
12755 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12757 for (i = 0; i < nelt; i++)
12759 unsigned elt = (i * 2 + odd) & mask;
12760 if (d->perm[i] != elt)
12761 return false;
12764 /* Success! */
12765 if (d->testing_p)
12766 return true;
12768 in0 = d->op0;
12769 in1 = d->op1;
12770 if (BYTES_BIG_ENDIAN)
12772 x = in0, in0 = in1, in1 = x;
12773 odd = !odd;
12775 out = d->target;
12777 if (odd)
12779 switch (vmode)
12781 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12782 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12783 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12784 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12785 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12786 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12787 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12788 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12789 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12790 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12791 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12792 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12793 default:
12794 return false;
12797 else
12799 switch (vmode)
12801 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12802 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12803 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12804 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12805 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12806 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12807 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12808 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12809 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12810 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12811 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12812 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12813 default:
12814 return false;
12818 emit_insn (gen (out, in0, in1));
12819 return true;
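/* Example (added commentary): on V4SI, the selector {0, 2, 4, 6} (the
   even-indexed elements of the concatenated inputs) emits UZP1, and
   {1, 3, 5, 7} emits UZP2.  */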
12822 /* Recognize patterns suitable for the ZIP instructions. */
12823 static bool
12824 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12826 unsigned int i, high, mask, nelt = d->nelt;
12827 rtx out, in0, in1, x;
12828 rtx (*gen) (rtx, rtx, rtx);
12829 machine_mode vmode = d->vmode;
12831 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12832 return false;
12834 /* Note that these are little-endian tests.
12835 We correct for big-endian later. */
12836 high = nelt / 2;
12837 if (d->perm[0] == high)
12838 /* Do Nothing. */
12840 else if (d->perm[0] == 0)
12841 high = 0;
12842 else
12843 return false;
12844 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12846 for (i = 0; i < nelt / 2; i++)
12848 unsigned elt = (i + high) & mask;
12849 if (d->perm[i * 2] != elt)
12850 return false;
12851 elt = (elt + nelt) & mask;
12852 if (d->perm[i * 2 + 1] != elt)
12853 return false;
12856 /* Success! */
12857 if (d->testing_p)
12858 return true;
12860 in0 = d->op0;
12861 in1 = d->op1;
12862 if (BYTES_BIG_ENDIAN)
12864 x = in0, in0 = in1, in1 = x;
12865 high = !high;
12867 out = d->target;
12869 if (high)
12871 switch (vmode)
12873 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12874 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12875 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12876 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12877 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12878 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12879 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12880 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12881 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12882 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12883 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12884 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12885 default:
12886 return false;
12889 else
12891 switch (vmode)
12893 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12894 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12895 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12896 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12897 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12898 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12899 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12900 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12901 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12902 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12903 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12904 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12905 default:
12906 return false;
12910 emit_insn (gen (out, in0, in1));
12911 return true;
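/* Example (added commentary): on V4SI, the selector {0, 4, 1, 5} interleaves
   the low halves of the two inputs and emits ZIP1; {2, 6, 3, 7} interleaves
   the high halves and emits ZIP2.  */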
12914 /* Recognize patterns for the EXT insn. */
12916 static bool
12917 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12919 unsigned int i, nelt = d->nelt;
12920 rtx (*gen) (rtx, rtx, rtx, rtx);
12921 rtx offset;
12923 unsigned int location = d->perm[0]; /* Always < nelt. */
12925 /* Check if the extracted indices are increasing by one. */
12926 for (i = 1; i < nelt; i++)
12928 unsigned int required = location + i;
12929 if (d->one_vector_p)
12931 /* We'll pass the same vector in twice, so allow indices to wrap. */
12932 required &= (nelt - 1);
12934 if (d->perm[i] != required)
12935 return false;
12938 switch (d->vmode)
12940 case V16QImode: gen = gen_aarch64_extv16qi; break;
12941 case V8QImode: gen = gen_aarch64_extv8qi; break;
12942 case V4HImode: gen = gen_aarch64_extv4hi; break;
12943 case V8HImode: gen = gen_aarch64_extv8hi; break;
12944 case V2SImode: gen = gen_aarch64_extv2si; break;
12945 case V4SImode: gen = gen_aarch64_extv4si; break;
12946 case V4HFmode: gen = gen_aarch64_extv4hf; break;
12947 case V8HFmode: gen = gen_aarch64_extv8hf; break;
12948 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12949 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12950 case V2DImode: gen = gen_aarch64_extv2di; break;
12951 case V2DFmode: gen = gen_aarch64_extv2df; break;
12952 default:
12953 return false;
12956 /* Success! */
12957 if (d->testing_p)
12958 return true;
12960 /* The case where (location == 0) is a no-op for both big- and little-endian,
12961 and is removed by the mid-end at optimization levels -O1 and higher. */
12963 if (BYTES_BIG_ENDIAN && (location != 0))
12965 /* After setup, we want the high elements of the first vector (stored
12966 at the LSB end of the register), and the low elements of the second
12967 vector (stored at the MSB end of the register). So swap. */
12968 std::swap (d->op0, d->op1);
12969 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12970 location = nelt - location;
12973 offset = GEN_INT (location);
12974 emit_insn (gen (d->target, d->op0, d->op1, offset));
12975 return true;
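/* Example (added commentary): on V4SI, the selector {1, 2, 3, 4} takes a
   contiguous window from the concatenation of the two inputs and emits an
   EXT of the two source registers; with a single input, {3, 0, 1, 2} is a
   rotation and also matches, because the indices are allowed to wrap.  */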
12978 /* Recognize patterns for the REV insns. */
12980 static bool
12981 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12983 unsigned int i, j, diff, nelt = d->nelt;
12984 rtx (*gen) (rtx, rtx);
12986 if (!d->one_vector_p)
12987 return false;
12989 diff = d->perm[0];
12990 switch (diff)
12992 case 7:
12993 switch (d->vmode)
12995 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12996 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12997 default:
12998 return false;
13000 break;
13001 case 3:
13002 switch (d->vmode)
13004 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13005 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13006 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13007 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13008 default:
13009 return false;
13011 break;
13012 case 1:
13013 switch (d->vmode)
13015 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13016 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13017 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13018 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13019 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13020 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13021 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13022 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13023 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13024 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13025 default:
13026 return false;
13028 break;
13029 default:
13030 return false;
13033 for (i = 0; i < nelt ; i += diff + 1)
13034 for (j = 0; j <= diff; j += 1)
13036 /* This is guaranteed to be true as the value of diff
13037 is 7, 3 or 1 and we should have enough elements in the
13038 queue to generate this.  Getting a vector mask with a
13039 diff value other than these implies that something has
13040 gone wrong by the time we get here. */
13041 gcc_assert (i + j < nelt);
13042 if (d->perm[i + j] != i + diff - j)
13043 return false;
13046 /* Success! */
13047 if (d->testing_p)
13048 return true;
13050 emit_insn (gen (d->target, d->op0));
13051 return true;
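/* Example (added commentary): on V8HI the selector {3, 2, 1, 0, 7, 6, 5, 4}
   gives diff == 3 and emits REV64, i.e. the halfwords are reversed within
   each 64-bit chunk of the vector.  */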
13054 static bool
13055 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13057 rtx (*gen) (rtx, rtx, rtx);
13058 rtx out = d->target;
13059 rtx in0;
13060 machine_mode vmode = d->vmode;
13061 unsigned int i, elt, nelt = d->nelt;
13062 rtx lane;
13064 elt = d->perm[0];
13065 for (i = 1; i < nelt; i++)
13067 if (elt != d->perm[i])
13068 return false;
13071 /* The generic preparation in aarch64_expand_vec_perm_const_1
13072 swaps the operand order and the permute indices if it finds
13073 d->perm[0] to be in the second operand. Thus, we can always
13074 use d->op0 and need not do any extra arithmetic to get the
13075 correct lane number. */
13076 in0 = d->op0;
13077 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13079 switch (vmode)
13081 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13082 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13083 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13084 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13085 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13086 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13087 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13088 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13089 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13090 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13091 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13092 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13093 default:
13094 return false;
13097 emit_insn (gen (out, in0, lane));
13098 return true;
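/* Example (added commentary): on V4SI the selector {2, 2, 2, 2} broadcasts
   element 2 of the (possibly swapped) first operand and emits a DUP such as
   "dup v0.4s, v1.s[2]".  */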
13101 static bool
13102 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13104 rtx rperm[MAX_VECT_LEN], sel;
13105 machine_mode vmode = d->vmode;
13106 unsigned int i, nelt = d->nelt;
13108 if (d->testing_p)
13109 return true;
13111 /* Generic code will try constant permutation twice: once with the
13112 original mode and again with the elements lowered to QImode.
13113 So wait and don't do the selector expansion ourselves. */
13114 if (vmode != V8QImode && vmode != V16QImode)
13115 return false;
13117 for (i = 0; i < nelt; ++i)
13119 int nunits = GET_MODE_NUNITS (vmode);
13121 /* If big-endian and two vectors we end up with a weird mixed-endian
13122 mode on NEON. Reverse the index within each word but not the word
13123 itself. */
13124 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13125 : d->perm[i]);
13127 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13128 sel = force_reg (vmode, sel);
13130 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13131 return true;
13134 static bool
13135 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13137 /* The pattern matching functions above are written to look for a small
13138 number to begin the sequence (0, 1, N/2). If we begin with an index
13139 from the second operand, we can swap the operands. */
13140 if (d->perm[0] >= d->nelt)
13142 unsigned i, nelt = d->nelt;
13144 gcc_assert (nelt == (nelt & -nelt));
13145 for (i = 0; i < nelt; ++i)
13146 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13148 std::swap (d->op0, d->op1);
13151 if (TARGET_SIMD)
13153 if (aarch64_evpc_rev (d))
13154 return true;
13155 else if (aarch64_evpc_ext (d))
13156 return true;
13157 else if (aarch64_evpc_dup (d))
13158 return true;
13159 else if (aarch64_evpc_zip (d))
13160 return true;
13161 else if (aarch64_evpc_uzp (d))
13162 return true;
13163 else if (aarch64_evpc_trn (d))
13164 return true;
13165 return aarch64_evpc_tbl (d);
13167 return false;
13170 /* Expand a vec_perm_const pattern. */
13172 bool
13173 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13175 struct expand_vec_perm_d d;
13176 int i, nelt, which;
13178 d.target = target;
13179 d.op0 = op0;
13180 d.op1 = op1;
13182 d.vmode = GET_MODE (target);
13183 gcc_assert (VECTOR_MODE_P (d.vmode));
13184 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13185 d.testing_p = false;
13187 for (i = which = 0; i < nelt; ++i)
13189 rtx e = XVECEXP (sel, 0, i);
13190 int ei = INTVAL (e) & (2 * nelt - 1);
13191 which |= (ei < nelt ? 1 : 2);
13192 d.perm[i] = ei;
13195 switch (which)
13197 default:
13198 gcc_unreachable ();
13200 case 3:
13201 d.one_vector_p = false;
13202 if (!rtx_equal_p (op0, op1))
13203 break;
13205 /* The elements of PERM do not suggest that only the first operand
13206 is used, but both operands are identical. Allow easier matching
13207 of the permutation by folding the permutation into the single
13208 input vector. */
13209 /* Fall Through. */
13210 case 2:
13211 for (i = 0; i < nelt; ++i)
13212 d.perm[i] &= nelt - 1;
13213 d.op0 = op1;
13214 d.one_vector_p = true;
13215 break;
13217 case 1:
13218 d.op1 = op0;
13219 d.one_vector_p = true;
13220 break;
13223 return aarch64_expand_vec_perm_const_1 (&d);
13226 static bool
13227 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13228 const unsigned char *sel)
13230 struct expand_vec_perm_d d;
13231 unsigned int i, nelt, which;
13232 bool ret;
13234 d.vmode = vmode;
13235 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13236 d.testing_p = true;
13237 memcpy (d.perm, sel, nelt);
13239 /* Calculate whether all elements are in one vector. */
13240 for (i = which = 0; i < nelt; ++i)
13242 unsigned char e = d.perm[i];
13243 gcc_assert (e < 2 * nelt);
13244 which |= (e < nelt ? 1 : 2);
13247 /* If all elements are from the second vector, reindex as if from the
13248 first vector. */
13249 if (which == 2)
13250 for (i = 0; i < nelt; ++i)
13251 d.perm[i] -= nelt;
13253 /* Check whether the mask can be applied to a single vector. */
13254 d.one_vector_p = (which != 3);
13256 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13257 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13258 if (!d.one_vector_p)
13259 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13261 start_sequence ();
13262 ret = aarch64_expand_vec_perm_const_1 (&d);
13263 end_sequence ();
13265 return ret;
13269 aarch64_reverse_mask (enum machine_mode mode)
13271 /* We have to reverse each vector because we don't have
13272 a permuted load that can reverse-load according to ABI rules. */
13273 rtx mask;
13274 rtvec v = rtvec_alloc (16);
13275 int i, j;
13276 int nunits = GET_MODE_NUNITS (mode);
13277 int usize = GET_MODE_UNIT_SIZE (mode);
13279 gcc_assert (BYTES_BIG_ENDIAN);
13280 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13282 for (i = 0; i < nunits; i++)
13283 for (j = 0; j < usize; j++)
13284 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13285 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13286 return force_reg (V16QImode, mask);
13289 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13290 However due to issues with register allocation it is preferable to avoid
13291 tying integer scalar and FP scalar modes.  Executing integer operations
13292 in general registers is better than treating them as scalar vector
13293 operations. This reduces latency and avoids redundant int<->FP moves.
13294 So tie modes if they are either the same class, or vector modes with
13295 other vector modes, vector structs or any scalar mode.
13298 bool
13299 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13301 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13302 return true;
13304 /* We specifically want to allow elements of "structure" modes to
13305 be tieable to the structure. This more general condition allows
13306 other rarer situations too. */
13307 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13308 return true;
13310 /* Also allow any scalar modes with vectors. */
13311 if (aarch64_vector_mode_supported_p (mode1)
13312 || aarch64_vector_mode_supported_p (mode2))
13313 return true;
13315 return false;
13318 /* Return a new RTX holding the result of moving POINTER forward by
13319 AMOUNT bytes. */
13321 static rtx
13322 aarch64_move_pointer (rtx pointer, int amount)
13324 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13326 return adjust_automodify_address (pointer, GET_MODE (pointer),
13327 next, amount);
13330 /* Return a new RTX holding the result of moving POINTER forward by the
13331 size of the mode it points to. */
13333 static rtx
13334 aarch64_progress_pointer (rtx pointer)
13336 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13338 return aarch64_move_pointer (pointer, amount);
13341 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13342 MODE bytes. */
13344 static void
13345 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13346 machine_mode mode)
13348 rtx reg = gen_reg_rtx (mode);
13350 /* "Cast" the pointers to the correct mode. */
13351 *src = adjust_address (*src, mode, 0);
13352 *dst = adjust_address (*dst, mode, 0);
13353 /* Emit the memcpy. */
13354 emit_move_insn (reg, *src);
13355 emit_move_insn (*dst, reg);
13356 /* Move the pointers forward. */
13357 *src = aarch64_progress_pointer (*src);
13358 *dst = aarch64_progress_pointer (*dst);
13361 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13362 we succeed, otherwise return false. */
13364 bool
13365 aarch64_expand_movmem (rtx *operands)
13367 unsigned int n;
13368 rtx dst = operands[0];
13369 rtx src = operands[1];
13370 rtx base;
13371 bool speed_p = !optimize_function_for_size_p (cfun);
13373 /* When optimizing for size, give a better estimate of the length of a
13374 memcpy call, but use the default otherwise. */
13375 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13377 /* We can't do anything smart if the amount to copy is not constant. */
13378 if (!CONST_INT_P (operands[2]))
13379 return false;
13381 n = UINTVAL (operands[2]);
13383 /* Try to keep the number of instructions low. For cases below 16 bytes we
13384 need to make at most two moves. For cases above 16 bytes it will be one
13385 move for each 16 byte chunk, then at most two additional moves. */
13386 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13387 return false;
13389 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13390 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13392 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13393 src = adjust_automodify_address (src, VOIDmode, base, 0);
13395 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13396 1-byte chunk. */
13397 if (n < 4)
13399 if (n >= 2)
13401 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13402 n -= 2;
13405 if (n == 1)
13406 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13408 return true;
13411 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13412 4-byte chunk, partially overlapping with the previously copied chunk. */
13413 if (n < 8)
13415 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13416 n -= 4;
13417 if (n > 0)
13419 int move = n - 4;
13421 src = aarch64_move_pointer (src, move);
13422 dst = aarch64_move_pointer (dst, move);
13423 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13425 return true;
13428 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13429 them, then (if applicable) an 8-byte chunk. */
13430 while (n >= 8)
13432 if (n / 16)
13434 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13435 n -= 16;
13437 else
13439 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13440 n -= 8;
13444 /* Finish the final bytes of the copy. We can always do this in one
13445 instruction. We either copy the exact amount we need, or partially
13446 overlap with the previous chunk we copied and copy 8 bytes.
13447 if (n == 0)
13448 return true;
13449 else if (n == 1)
13450 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13451 else if (n == 2)
13452 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13453 else if (n == 4)
13454 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13455 else
13457 if (n == 3)
13459 src = aarch64_move_pointer (src, -1);
13460 dst = aarch64_move_pointer (dst, -1);
13461 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13463 else
13465 int move = n - 8;
13467 src = aarch64_move_pointer (src, move);
13468 dst = aarch64_move_pointer (dst, move);
13469 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13473 return true;
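/* Worked example (added commentary): a constant 15-byte memcpy is expanded
   by the code above as one 8-byte load/store at offset 0 followed by a
   second 8-byte load/store at offset 7, so the two copies overlap by one
   byte and no 1/2/4-byte tail moves are needed.  */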
13476 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13477 SImode stores. Handle the case when the constant has identical
13478 bottom and top halves. This is beneficial when the two stores can be
13479 merged into an STP and we avoid synthesising potentially expensive
13480 immediates twice. Return true if such a split is possible. */
13482 bool
13483 aarch64_split_dimode_const_store (rtx dst, rtx src)
13485 rtx lo = gen_lowpart (SImode, src);
13486 rtx hi = gen_highpart_mode (SImode, DImode, src);
13488 bool size_p = optimize_function_for_size_p (cfun);
13490 if (!rtx_equal_p (lo, hi))
13491 return false;
13493 unsigned int orig_cost
13494 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13495 unsigned int lo_cost
13496 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13498 /* We want to transform:
13499 MOV x1, 49370
13500 MOVK x1, 0x140, lsl 16
13501 MOVK x1, 0xc0da, lsl 32
13502 MOVK x1, 0x140, lsl 48
13503 STR x1, [x0]
13504 into:
13505 MOV w1, 49370
13506 MOVK w1, 0x140, lsl 16
13507 STP w1, w1, [x0]
13508 So we want to perform this only when we save two instructions
13509 or more. When optimizing for size, however, accept any code size
13510 savings we can. */
13511 if (size_p && orig_cost <= lo_cost)
13512 return false;
13514 if (!size_p
13515 && (orig_cost <= lo_cost + 1))
13516 return false;
13518 rtx mem_lo = adjust_address (dst, SImode, 0);
13519 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13520 return false;
13522 rtx tmp_reg = gen_reg_rtx (SImode);
13523 aarch64_expand_mov_immediate (tmp_reg, lo);
13524 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13525 /* Don't emit an explicit store pair as this may not always be profitable.
13526 Let the sched-fusion logic decide whether to merge them. */
13527 emit_move_insn (mem_lo, tmp_reg);
13528 emit_move_insn (mem_hi, tmp_reg);
13530 return true;
13533 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13535 static unsigned HOST_WIDE_INT
13536 aarch64_asan_shadow_offset (void)
13538 return (HOST_WIDE_INT_1 << 36);
13541 static bool
13542 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13543 unsigned int align,
13544 enum by_pieces_operation op,
13545 bool speed_p)
13547 /* STORE_BY_PIECES can be used when copying a constant string, but
13548 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13549 For now we always fail this and let the move_by_pieces code copy
13550 the string from read-only memory. */
13551 if (op == STORE_BY_PIECES)
13552 return false;
13554 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13557 static rtx
13558 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13559 int code, tree treeop0, tree treeop1)
13561 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13562 rtx op0, op1;
13563 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13564 insn_code icode;
13565 struct expand_operand ops[4];
13567 start_sequence ();
13568 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13570 op_mode = GET_MODE (op0);
13571 if (op_mode == VOIDmode)
13572 op_mode = GET_MODE (op1);
13574 switch (op_mode)
13576 case QImode:
13577 case HImode:
13578 case SImode:
13579 cmp_mode = SImode;
13580 icode = CODE_FOR_cmpsi;
13581 break;
13583 case DImode:
13584 cmp_mode = DImode;
13585 icode = CODE_FOR_cmpdi;
13586 break;
13588 case SFmode:
13589 cmp_mode = SFmode;
13590 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13591 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13592 break;
13594 case DFmode:
13595 cmp_mode = DFmode;
13596 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13597 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13598 break;
13600 default:
13601 end_sequence ();
13602 return NULL_RTX;
13605 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13606 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13607 if (!op0 || !op1)
13609 end_sequence ();
13610 return NULL_RTX;
13612 *prep_seq = get_insns ();
13613 end_sequence ();
13615 create_fixed_operand (&ops[0], op0);
13616 create_fixed_operand (&ops[1], op1);
13618 start_sequence ();
13619 if (!maybe_expand_insn (icode, 2, ops))
13621 end_sequence ();
13622 return NULL_RTX;
13624 *gen_seq = get_insns ();
13625 end_sequence ();
13627 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13628 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13631 static rtx
13632 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13633 int cmp_code, tree treeop0, tree treeop1, int bit_code)
13635 rtx op0, op1, target;
13636 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13637 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13638 insn_code icode;
13639 struct expand_operand ops[6];
13640 int aarch64_cond;
13642 push_to_sequence (*prep_seq);
13643 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13645 op_mode = GET_MODE (op0);
13646 if (op_mode == VOIDmode)
13647 op_mode = GET_MODE (op1);
13649 switch (op_mode)
13651 case QImode:
13652 case HImode:
13653 case SImode:
13654 cmp_mode = SImode;
13655 icode = CODE_FOR_ccmpsi;
13656 break;
13658 case DImode:
13659 cmp_mode = DImode;
13660 icode = CODE_FOR_ccmpdi;
13661 break;
13663 case SFmode:
13664 cmp_mode = SFmode;
13665 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13666 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13667 break;
13669 case DFmode:
13670 cmp_mode = DFmode;
13671 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13672 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13673 break;
13675 default:
13676 end_sequence ();
13677 return NULL_RTX;
13680 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13681 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13682 if (!op0 || !op1)
13684 end_sequence ();
13685 return NULL_RTX;
13687 *prep_seq = get_insns ();
13688 end_sequence ();
13690 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13691 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13693 if (bit_code != AND)
13695 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13696 GET_MODE (XEXP (prev, 0))),
13697 VOIDmode, XEXP (prev, 0), const0_rtx);
13698 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13701 create_fixed_operand (&ops[0], XEXP (prev, 0));
13702 create_fixed_operand (&ops[1], target);
13703 create_fixed_operand (&ops[2], op0);
13704 create_fixed_operand (&ops[3], op1);
13705 create_fixed_operand (&ops[4], prev);
13706 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13708 push_to_sequence (*gen_seq);
13709 if (!maybe_expand_insn (icode, 6, ops))
13711 end_sequence ();
13712 return NULL_RTX;
13715 *gen_seq = get_insns ();
13716 end_sequence ();
13718 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
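/* Illustrative use (added commentary): for a condition such as
   "a == 0 && b > 5" the middle end calls aarch64_gen_ccmp_first for the
   first comparison and aarch64_gen_ccmp_next for the chained one, giving
   roughly

       cmp   w0, 0
       ccmp  w1, 5, <nzcv>, eq     // flags substituted when a != 0
       b.gt  ...                   // or a cset, depending on the user

   where the NZCV immediate is derived from aarch64_cond above so that the
   chained test fails whenever the first comparison short-circuits.  */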
13721 #undef TARGET_GEN_CCMP_FIRST
13722 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13724 #undef TARGET_GEN_CCMP_NEXT
13725 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13727 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13728 instruction fusion of some sort. */
13730 static bool
13731 aarch64_macro_fusion_p (void)
13733 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13737 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13738 should be kept together during scheduling. */
13740 static bool
13741 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13743 rtx set_dest;
13744 rtx prev_set = single_set (prev);
13745 rtx curr_set = single_set (curr);
13746 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13747 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13749 if (!aarch64_macro_fusion_p ())
13750 return false;
13752 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13754 /* We are trying to match:
13755 prev (mov) == (set (reg r0) (const_int imm16))
13756 curr (movk) == (set (zero_extract (reg r0)
13757 (const_int 16)
13758 (const_int 16))
13759 (const_int imm16_1)) */
13761 set_dest = SET_DEST (curr_set);
13763 if (GET_CODE (set_dest) == ZERO_EXTRACT
13764 && CONST_INT_P (SET_SRC (curr_set))
13765 && CONST_INT_P (SET_SRC (prev_set))
13766 && CONST_INT_P (XEXP (set_dest, 2))
13767 && INTVAL (XEXP (set_dest, 2)) == 16
13768 && REG_P (XEXP (set_dest, 0))
13769 && REG_P (SET_DEST (prev_set))
13770 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13772 return true;
13776 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13779 /* We're trying to match:
13780 prev (adrp) == (set (reg r1)
13781 (high (symbol_ref ("SYM"))))
13782 curr (add) == (set (reg r0)
13783 (lo_sum (reg r1)
13784 (symbol_ref ("SYM"))))
13785 Note that r0 need not necessarily be the same as r1, especially
13786 during pre-regalloc scheduling. */
13788 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13789 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13791 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13792 && REG_P (XEXP (SET_SRC (curr_set), 0))
13793 && REGNO (XEXP (SET_SRC (curr_set), 0))
13794 == REGNO (SET_DEST (prev_set))
13795 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13796 XEXP (SET_SRC (curr_set), 1)))
13797 return true;
13801 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13804 /* We're trying to match:
13805 prev (movk) == (set (zero_extract (reg r0)
13806 (const_int 16)
13807 (const_int 32))
13808 (const_int imm16_1))
13809 curr (movk) == (set (zero_extract (reg r0)
13810 (const_int 16)
13811 (const_int 48))
13812 (const_int imm16_2)) */
13814 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13815 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13816 && REG_P (XEXP (SET_DEST (prev_set), 0))
13817 && REG_P (XEXP (SET_DEST (curr_set), 0))
13818 && REGNO (XEXP (SET_DEST (prev_set), 0))
13819 == REGNO (XEXP (SET_DEST (curr_set), 0))
13820 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13821 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13822 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13823 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13824 && CONST_INT_P (SET_SRC (prev_set))
13825 && CONST_INT_P (SET_SRC (curr_set)))
13826 return true;
13829 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13831 /* We're trying to match:
13832 prev (adrp) == (set (reg r0)
13833 (high (symbol_ref ("SYM"))))
13834 curr (ldr) == (set (reg r1)
13835 (mem (lo_sum (reg r0)
13836 (symbol_ref ("SYM")))))
13838 curr (ldr) == (set (reg r1)
13839 (zero_extend (mem
13840 (lo_sum (reg r0)
13841 (symbol_ref ("SYM")))))) */
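/* Illustrative assembly form of the pattern above (hypothetical symbol and
   registers):
     adrp	x0, SYM
     ldr	x1, [x0, #:lo12:SYM]  */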
13842 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13843 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13845 rtx curr_src = SET_SRC (curr_set);
13847 if (GET_CODE (curr_src) == ZERO_EXTEND)
13848 curr_src = XEXP (curr_src, 0);
13850 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13851 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13852 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13853 == REGNO (SET_DEST (prev_set))
13854 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13855 XEXP (SET_SRC (prev_set), 0)))
13856 return true;
13860 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13861 && aarch_crypto_can_dual_issue (prev, curr))
13862 return true;
13864 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13865 && any_condjump_p (curr))
13867 enum attr_type prev_type = get_attr_type (prev);
13869 /* FIXME: this misses some instructions that are considered simple
13870 arithmetic for ThunderX.  Simple shifts are missed here. */
13871 if (prev_type == TYPE_ALUS_SREG
13872 || prev_type == TYPE_ALUS_IMM
13873 || prev_type == TYPE_LOGICS_REG
13874 || prev_type == TYPE_LOGICS_IMM)
13875 return true;
13878 return false;
13881 /* Return true iff the instruction fusion described by OP is enabled. */
13883 bool
13884 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13886 return (aarch64_tune_params.fusible_ops & op) != 0;
13889 /* If MEM is in the form of [base+offset], extract the two parts of the
13890 address into BASE and OFFSET and return true; otherwise clear BASE and
13891 OFFSET and return false. */
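/* For example (illustrative only): for a MEM whose address is
   (plus (reg x1) (const_int 8)) this sets *BASE to (reg x1) and *OFFSET to
   (const_int 8); for a plain (reg x1) address *OFFSET becomes const0_rtx;
   for any other address form both are cleared and false is returned.  */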
13893 bool
13894 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13896 rtx addr;
13898 gcc_assert (MEM_P (mem));
13900 addr = XEXP (mem, 0);
13902 if (REG_P (addr))
13904 *base = addr;
13905 *offset = const0_rtx;
13906 return true;
13909 if (GET_CODE (addr) == PLUS
13910 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13912 *base = XEXP (addr, 0);
13913 *offset = XEXP (addr, 1);
13914 return true;
13917 *base = NULL_RTX;
13918 *offset = NULL_RTX;
13920 return false;
13923 /* Types for scheduling fusion. */
13924 enum sched_fusion_type
13926 SCHED_FUSION_NONE = 0,
13927 SCHED_FUSION_LD_SIGN_EXTEND,
13928 SCHED_FUSION_LD_ZERO_EXTEND,
13929 SCHED_FUSION_LD,
13930 SCHED_FUSION_ST,
13931 SCHED_FUSION_NUM
13934 /* If INSN is a load or store with an address in the form of [base+offset],
13935 extract the two parts into BASE and OFFSET.  Return the scheduling
13936 fusion type of INSN. */
13938 static enum sched_fusion_type
13939 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13941 rtx x, dest, src;
13942 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13944 gcc_assert (INSN_P (insn));
13945 x = PATTERN (insn);
13946 if (GET_CODE (x) != SET)
13947 return SCHED_FUSION_NONE;
13949 src = SET_SRC (x);
13950 dest = SET_DEST (x);
13952 machine_mode dest_mode = GET_MODE (dest);
13954 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13955 return SCHED_FUSION_NONE;
13957 if (GET_CODE (src) == SIGN_EXTEND)
13959 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13960 src = XEXP (src, 0);
13961 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13962 return SCHED_FUSION_NONE;
13964 else if (GET_CODE (src) == ZERO_EXTEND)
13966 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13967 src = XEXP (src, 0);
13968 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13969 return SCHED_FUSION_NONE;
13972 if (GET_CODE (src) == MEM && REG_P (dest))
13973 extract_base_offset_in_addr (src, base, offset);
13974 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13976 fusion = SCHED_FUSION_ST;
13977 extract_base_offset_in_addr (dest, base, offset);
13979 else
13980 return SCHED_FUSION_NONE;
13982 if (*base == NULL_RTX || *offset == NULL_RTX)
13983 fusion = SCHED_FUSION_NONE;
13985 return fusion;
13988 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13990 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13991 and PRI are only calculated for these instructions.  For other instructions,
13992 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
13993 types of instruction fusion can be added by returning different priorities.
13995 It's important that irrelevant instructions get the largest FUSION_PRI. */
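/* A hedged, illustrative example: for two SImode loads from [x1, #8] and
   [x1, #12] (hypothetical), fusion_load_store returns the same fusion type
   and base register, so both get the same FUSION_PRI; the load with the
   smaller offset (#8) gets the larger PRI below and is therefore preferred
   first, keeping the pair adjacent for later ldp formation.  */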
13997 static void
13998 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13999 int *fusion_pri, int *pri)
14001 int tmp, off_val;
14002 rtx base, offset;
14003 enum sched_fusion_type fusion;
14005 gcc_assert (INSN_P (insn));
14007 tmp = max_pri - 1;
14008 fusion = fusion_load_store (insn, &base, &offset);
14009 if (fusion == SCHED_FUSION_NONE)
14011 *pri = tmp;
14012 *fusion_pri = tmp;
14013 return;
14016 /* Set FUSION_PRI according to fusion type and base register. */
14017 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14019 /* Calculate PRI. */
14020 tmp /= 2;
14022 /* INSN with smaller offset goes first. */
14023 off_val = (int)(INTVAL (offset));
14024 if (off_val >= 0)
14025 tmp -= (off_val & 0xfffff);
14026 else
14027 tmp += ((- off_val) & 0xfffff);
14029 *pri = tmp;
14030 return;
14033 /* Given OPERANDS of consecutive load/store, check if we can merge
14034 them into ldp/stp. LOAD is true if they are load instructions.
14035 MODE is the mode of memory operands. */
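/* For instance (illustrative only), two SImode loads
     ldr	w0, [x2, 4]
     ldr	w1, [x2, 8]
   use the same base, consecutive offsets and the same register class, so
   this check allows them to be merged into "ldp w0, w1, [x2, 4]".  */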
14037 bool
14038 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14039 enum machine_mode mode)
14041 HOST_WIDE_INT offval_1, offval_2, msize;
14042 enum reg_class rclass_1, rclass_2;
14043 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14045 if (load)
14047 mem_1 = operands[1];
14048 mem_2 = operands[3];
14049 reg_1 = operands[0];
14050 reg_2 = operands[2];
14051 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14052 if (REGNO (reg_1) == REGNO (reg_2))
14053 return false;
14055 else
14057 mem_1 = operands[0];
14058 mem_2 = operands[2];
14059 reg_1 = operands[1];
14060 reg_2 = operands[3];
14063 /* The mems cannot be volatile. */
14064 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14065 return false;
14067 /* If we have SImode and slow unaligned ldp,
14068 check that the alignment is at least 8 bytes. */
14069 if (mode == SImode
14070 && (aarch64_tune_params.extra_tuning_flags
14071 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14072 && !optimize_size
14073 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14074 return false;
14076 /* Check if the addresses are in the form of [base+offset]. */
14077 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14078 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14079 return false;
14080 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14081 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14082 return false;
14084 /* Check if the bases are the same. */
14085 if (!rtx_equal_p (base_1, base_2))
14086 return false;
14088 offval_1 = INTVAL (offset_1);
14089 offval_2 = INTVAL (offset_2);
14090 msize = GET_MODE_SIZE (mode);
14091 /* Check if the offsets are consecutive. */
14092 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14093 return false;
14095 /* Check if the addresses are clobbered by load. */
14096 if (load)
14098 if (reg_mentioned_p (reg_1, mem_1))
14099 return false;
14101 /* In increasing order, the last load can clobber the address. */
14102 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14103 return false;
14106 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14107 rclass_1 = FP_REGS;
14108 else
14109 rclass_1 = GENERAL_REGS;
14111 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14112 rclass_2 = FP_REGS;
14113 else
14114 rclass_2 = GENERAL_REGS;
14116 /* Check if the registers are of the same class. */
14117 if (rclass_1 != rclass_2)
14118 return false;
14120 return true;
14123 /* Given OPERANDS of consecutive load/store, check if we can merge
14124 them into ldp/stp by adjusting the offset. LOAD is true if they
14125 are load instructions. MODE is the mode of memory operands.
14127 Given the consecutive stores below:
14129 str w1, [xb, 0x100]
14130 str w1, [xb, 0x104]
14131 str w1, [xb, 0x108]
14132 str w1, [xb, 0x10c]
14134 Though the offsets are out of the range supported by stp, we can
14135 still pair them after adjusting the offset, like:
14137 add scratch, xb, 0x100
14138 stp w1, w1, [scratch]
14139 stp w1, w1, [scratch, 0x8]
14141 The peephole patterns detecting this opportunity should guarantee
14142 the scratch register is available. */
14144 bool
14145 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14146 enum machine_mode mode)
14148 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14149 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14150 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14151 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14153 if (load)
14155 reg_1 = operands[0];
14156 mem_1 = operands[1];
14157 reg_2 = operands[2];
14158 mem_2 = operands[3];
14159 reg_3 = operands[4];
14160 mem_3 = operands[5];
14161 reg_4 = operands[6];
14162 mem_4 = operands[7];
14163 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14164 && REG_P (reg_3) && REG_P (reg_4));
14165 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14166 return false;
14168 else
14170 mem_1 = operands[0];
14171 reg_1 = operands[1];
14172 mem_2 = operands[2];
14173 reg_2 = operands[3];
14174 mem_3 = operands[4];
14175 reg_3 = operands[5];
14176 mem_4 = operands[6];
14177 reg_4 = operands[7];
14179 /* Skip if the memory operand is by itself valid for ldp/stp. */
14180 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14181 return false;
14183 /* The mems cannot be volatile. */
14184 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14185 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14186 return false;
14188 /* Check if the addresses are in the form of [base+offset]. */
14189 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14190 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14191 return false;
14192 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14193 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14194 return false;
14195 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14196 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14197 return false;
14198 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14199 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14200 return false;
14202 /* Check if the bases are the same. */
14203 if (!rtx_equal_p (base_1, base_2)
14204 || !rtx_equal_p (base_2, base_3)
14205 || !rtx_equal_p (base_3, base_4))
14206 return false;
14208 offval_1 = INTVAL (offset_1);
14209 offval_2 = INTVAL (offset_2);
14210 offval_3 = INTVAL (offset_3);
14211 offval_4 = INTVAL (offset_4);
14212 msize = GET_MODE_SIZE (mode);
14213 /* Check if the offsets are consecutive. */
14214 if ((offval_1 != (offval_2 + msize)
14215 || offval_1 != (offval_3 + msize * 2)
14216 || offval_1 != (offval_4 + msize * 3))
14217 && (offval_4 != (offval_3 + msize)
14218 || offval_4 != (offval_2 + msize * 2)
14219 || offval_4 != (offval_1 + msize * 3)))
14220 return false;
14222 /* Check if the addresses are clobbered by load. */
14223 if (load)
14225 if (reg_mentioned_p (reg_1, mem_1)
14226 || reg_mentioned_p (reg_2, mem_2)
14227 || reg_mentioned_p (reg_3, mem_3))
14228 return false;
14230 /* In increasing order, the last load can clobber the address. */
14231 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14232 return false;
14235 /* If we have SImode and slow unaligned ldp,
14236 check that the alignment is at least 8 bytes. */
14237 if (mode == SImode
14238 && (aarch64_tune_params.extra_tuning_flags
14239 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14240 && !optimize_size
14241 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14242 return false;
14244 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14245 rclass_1 = FP_REGS;
14246 else
14247 rclass_1 = GENERAL_REGS;
14249 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14250 rclass_2 = FP_REGS;
14251 else
14252 rclass_2 = GENERAL_REGS;
14254 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14255 rclass_3 = FP_REGS;
14256 else
14257 rclass_3 = GENERAL_REGS;
14259 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14260 rclass_4 = FP_REGS;
14261 else
14262 rclass_4 = GENERAL_REGS;
14264 /* Check if the registers are of the same class. */
14265 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14266 return false;
14268 return true;
14271 /* Given OPERANDS of consecutive load/store, this function pairs them
14272 into ldp/stp after adjusting the offset. It depends on the fact
14273 that addresses of load/store instructions are in increasing order.
14274 MODE is the mode of memory operands. CODE is the rtl operator
14275 which should be applied to all memory operands; it is SIGN_EXTEND,
14276 ZERO_EXTEND or UNKNOWN. */
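/* A hedged walk-through of the example shown above
   aarch64_operands_adjust_ok_for_ldpstp (SImode stores at xb+0x100..0x10c):
   msize is 4, so the pairing limit used below is 4 * 0x40 = 0x100.
   off_val is 0x100, hence new_off = 0x100 % 0x100 = 0 and adj_off = 0x100,
   which fits in an ADD immediate (< 0x1000).  The code therefore emits
   "add scratch, xb, 0x100" followed by two stp instructions at offsets
   0 and 8 from the scratch register.  */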
14278 bool
14279 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14280 enum machine_mode mode, RTX_CODE code)
14282 rtx base, offset, t1, t2;
14283 rtx mem_1, mem_2, mem_3, mem_4;
14284 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14286 if (load)
14288 mem_1 = operands[1];
14289 mem_2 = operands[3];
14290 mem_3 = operands[5];
14291 mem_4 = operands[7];
14293 else
14295 mem_1 = operands[0];
14296 mem_2 = operands[2];
14297 mem_3 = operands[4];
14298 mem_4 = operands[6];
14299 gcc_assert (code == UNKNOWN);
14302 extract_base_offset_in_addr (mem_1, &base, &offset);
14303 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14305 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14306 msize = GET_MODE_SIZE (mode);
14307 stp_off_limit = msize * 0x40;
14308 off_val = INTVAL (offset);
14309 abs_off = (off_val < 0) ? -off_val : off_val;
14310 new_off = abs_off % stp_off_limit;
14311 adj_off = abs_off - new_off;
14313 /* Further adjust to make sure all offsets are OK. */
14314 if ((new_off + msize * 2) >= stp_off_limit)
14316 adj_off += stp_off_limit;
14317 new_off -= stp_off_limit;
14320 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14321 if (adj_off >= 0x1000)
14322 return false;
14324 if (off_val < 0)
14326 adj_off = -adj_off;
14327 new_off = -new_off;
14330 /* Create new memory references. */
14331 mem_1 = change_address (mem_1, VOIDmode,
14332 plus_constant (DImode, operands[8], new_off));
14334 /* Check if the adjusted address is OK for ldp/stp. */
14335 if (!aarch64_mem_pair_operand (mem_1, mode))
14336 return false;
14338 msize = GET_MODE_SIZE (mode);
14339 mem_2 = change_address (mem_2, VOIDmode,
14340 plus_constant (DImode,
14341 operands[8],
14342 new_off + msize));
14343 mem_3 = change_address (mem_3, VOIDmode,
14344 plus_constant (DImode,
14345 operands[8],
14346 new_off + msize * 2));
14347 mem_4 = change_address (mem_4, VOIDmode,
14348 plus_constant (DImode,
14349 operands[8],
14350 new_off + msize * 3));
14352 if (code == ZERO_EXTEND)
14354 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14355 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14356 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14357 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14359 else if (code == SIGN_EXTEND)
14361 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14362 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14363 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14364 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14367 if (load)
14369 operands[1] = mem_1;
14370 operands[3] = mem_2;
14371 operands[5] = mem_3;
14372 operands[7] = mem_4;
14374 else
14376 operands[0] = mem_1;
14377 operands[2] = mem_2;
14378 operands[4] = mem_3;
14379 operands[6] = mem_4;
14382 /* Emit adjusting instruction. */
14383 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14384 /* Emit ldp/stp instructions. */
14385 t1 = gen_rtx_SET (operands[0], operands[1]);
14386 t2 = gen_rtx_SET (operands[2], operands[3]);
14387 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14388 t1 = gen_rtx_SET (operands[4], operands[5]);
14389 t2 = gen_rtx_SET (operands[6], operands[7]);
14390 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14391 return true;
14394 /* Return true if a pseudo register should be created and used to hold
14395 the GOT address for PIC code. */
14397 bool
14398 aarch64_use_pseudo_pic_reg (void)
14400 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14403 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14405 static int
14406 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14408 switch (XINT (x, 1))
14410 case UNSPEC_GOTSMALLPIC:
14411 case UNSPEC_GOTSMALLPIC28K:
14412 case UNSPEC_GOTTINYPIC:
14413 return 0;
14414 default:
14415 break;
14418 return default_unspec_may_trap_p (x, flags);
14422 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14423 return the log2 of that value. Otherwise return -1. */
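/* Illustrative values: 4.0 yields 2, 1.0 yields 0, while -2.0, 0.5 and 3.0
   all yield -1 (negative, non-integral and non-power-of-2 respectively).  */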
14426 aarch64_fpconst_pow_of_2 (rtx x)
14428 const REAL_VALUE_TYPE *r;
14430 if (!CONST_DOUBLE_P (x))
14431 return -1;
14433 r = CONST_DOUBLE_REAL_VALUE (x);
14435 if (REAL_VALUE_NEGATIVE (*r)
14436 || REAL_VALUE_ISNAN (*r)
14437 || REAL_VALUE_ISINF (*r)
14438 || !real_isinteger (r, DFmode))
14439 return -1;
14441 return exact_log2 (real_to_integer (r));
14444 /* If X is a vector of equal CONST_DOUBLE values and that value is
14445 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14448 aarch64_vec_fpconst_pow_of_2 (rtx x)
14450 if (GET_CODE (x) != CONST_VECTOR)
14451 return -1;
14453 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14454 return -1;
14456 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14457 if (firstval <= 0)
14458 return -1;
14460 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14461 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14462 return -1;
14464 return firstval;
14467 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14468 to float.
14470 __fp16 always promotes through this hook.
14471 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14472 through the generic excess precision logic rather than here. */
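/* For example (illustrative): given "__fp16 a, b;", the expression "a + b"
   is evaluated in float and only converted back to __fp16 when the result
   is stored, in line with the storage-only ACLE semantics of __fp16.  */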
14474 static tree
14475 aarch64_promoted_type (const_tree t)
14477 if (SCALAR_FLOAT_TYPE_P (t)
14478 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14479 return float_type_node;
14481 return NULL_TREE;
14484 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14486 static bool
14487 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14488 optimization_type opt_type)
14490 switch (op)
14492 case rsqrt_optab:
14493 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14495 default:
14496 return true;
14500 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
14501 if MODE is HFmode, and punt to the generic implementation otherwise. */
14503 static bool
14504 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14506 return (mode == HFmode
14507 ? true
14508 : default_libgcc_floating_mode_supported_p (mode));
14511 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14512 if MODE is HFmode, and punt to the generic implementation otherwise. */
14514 static bool
14515 aarch64_scalar_mode_supported_p (machine_mode mode)
14517 return (mode == HFmode
14518 ? true
14519 : default_scalar_mode_supported_p (mode));
14522 /* Set the value of FLT_EVAL_METHOD.
14523 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14525 0: evaluate all operations and constants, whose semantic type has at
14526 most the range and precision of type float, to the range and
14527 precision of float; evaluate all other operations and constants to
14528 the range and precision of the semantic type;
14530 N, where _FloatN is a supported interchange floating type:
14531 evaluate all operations and constants, whose semantic type has at
14532 most the range and precision of _FloatN type, to the range and
14533 precision of the _FloatN type; evaluate all other operations and
14534 constants to the range and precision of the semantic type;
14536 If we have the ARMv8.2-A extensions then we support _Float16 in native
14537 precision, so we should set this to 16. Otherwise, we support the type,
14538 but want to evaluate expressions in float precision, so set this to
14539 0. */
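/* As an illustration (hypothetical code): for "_Float16 x, y;", "x + y" is
   evaluated directly in half precision when TARGET_FP_F16INST is available
   (FLT_EVAL_METHOD of 16), and in float precision otherwise
   (FLT_EVAL_METHOD of 0), with the result converted back as needed.  */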
14541 static enum flt_eval_method
14542 aarch64_excess_precision (enum excess_precision_type type)
14544 switch (type)
14546 case EXCESS_PRECISION_TYPE_FAST:
14547 case EXCESS_PRECISION_TYPE_STANDARD:
14548 /* We can calculate either in 16-bit range and precision or
14549 32-bit range and precision. Make that decision based on whether
14550 we have native support for the ARMv8.2-A 16-bit floating-point
14551 instructions or not. */
14552 return (TARGET_FP_F16INST
14553 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14554 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14555 case EXCESS_PRECISION_TYPE_IMPLICIT:
14556 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14557 default:
14558 gcc_unreachable ();
14560 return FLT_EVAL_METHOD_UNPREDICTABLE;
14563 #undef TARGET_ADDRESS_COST
14564 #define TARGET_ADDRESS_COST aarch64_address_cost
14566 /* This hook determines whether unnamed bitfields affect the alignment
14567 of the containing structure. The hook returns true if the structure
14568 should inherit the alignment requirements of an unnamed bitfield's
14569 type. */
14570 #undef TARGET_ALIGN_ANON_BITFIELD
14571 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14573 #undef TARGET_ASM_ALIGNED_DI_OP
14574 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14576 #undef TARGET_ASM_ALIGNED_HI_OP
14577 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14579 #undef TARGET_ASM_ALIGNED_SI_OP
14580 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14582 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14583 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14584 hook_bool_const_tree_hwi_hwi_const_tree_true
14586 #undef TARGET_ASM_FILE_START
14587 #define TARGET_ASM_FILE_START aarch64_start_file
14589 #undef TARGET_ASM_OUTPUT_MI_THUNK
14590 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14592 #undef TARGET_ASM_SELECT_RTX_SECTION
14593 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14595 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14596 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14598 #undef TARGET_BUILD_BUILTIN_VA_LIST
14599 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14601 #undef TARGET_CALLEE_COPIES
14602 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14604 #undef TARGET_CAN_ELIMINATE
14605 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14607 #undef TARGET_CAN_INLINE_P
14608 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14610 #undef TARGET_CANNOT_FORCE_CONST_MEM
14611 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14613 #undef TARGET_CASE_VALUES_THRESHOLD
14614 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14616 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14617 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14619 /* Only the least significant bit is used for initialization guard
14620 variables. */
14621 #undef TARGET_CXX_GUARD_MASK_BIT
14622 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14624 #undef TARGET_C_MODE_FOR_SUFFIX
14625 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14627 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14628 #undef TARGET_DEFAULT_TARGET_FLAGS
14629 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14630 #endif
14632 #undef TARGET_CLASS_MAX_NREGS
14633 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14635 #undef TARGET_BUILTIN_DECL
14636 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14638 #undef TARGET_BUILTIN_RECIPROCAL
14639 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14641 #undef TARGET_C_EXCESS_PRECISION
14642 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
14644 #undef TARGET_EXPAND_BUILTIN
14645 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14647 #undef TARGET_EXPAND_BUILTIN_VA_START
14648 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14650 #undef TARGET_FOLD_BUILTIN
14651 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14653 #undef TARGET_FUNCTION_ARG
14654 #define TARGET_FUNCTION_ARG aarch64_function_arg
14656 #undef TARGET_FUNCTION_ARG_ADVANCE
14657 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14659 #undef TARGET_FUNCTION_ARG_BOUNDARY
14660 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14662 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14663 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14665 #undef TARGET_FUNCTION_VALUE
14666 #define TARGET_FUNCTION_VALUE aarch64_function_value
14668 #undef TARGET_FUNCTION_VALUE_REGNO_P
14669 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14671 #undef TARGET_FRAME_POINTER_REQUIRED
14672 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14674 #undef TARGET_GIMPLE_FOLD_BUILTIN
14675 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14677 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14678 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14680 #undef TARGET_INIT_BUILTINS
14681 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14683 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14684 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14685 aarch64_ira_change_pseudo_allocno_class
14687 #undef TARGET_LEGITIMATE_ADDRESS_P
14688 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14690 #undef TARGET_LEGITIMATE_CONSTANT_P
14691 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14693 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14694 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14695 aarch64_legitimize_address_displacement
14697 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14698 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14700 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
14701 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
14702 aarch64_libgcc_floating_mode_supported_p
14704 #undef TARGET_MANGLE_TYPE
14705 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14707 #undef TARGET_MEMORY_MOVE_COST
14708 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14710 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14711 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14713 #undef TARGET_MUST_PASS_IN_STACK
14714 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14716 /* This target hook should return true if accesses to volatile bitfields
14717 should use the narrowest mode possible. It should return false if these
14718 accesses should use the bitfield container type. */
14719 #undef TARGET_NARROW_VOLATILE_BITFIELD
14720 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14722 #undef TARGET_OPTION_OVERRIDE
14723 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14725 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14726 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14727 aarch64_override_options_after_change
14729 #undef TARGET_OPTION_SAVE
14730 #define TARGET_OPTION_SAVE aarch64_option_save
14732 #undef TARGET_OPTION_RESTORE
14733 #define TARGET_OPTION_RESTORE aarch64_option_restore
14735 #undef TARGET_OPTION_PRINT
14736 #define TARGET_OPTION_PRINT aarch64_option_print
14738 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14739 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14741 #undef TARGET_SET_CURRENT_FUNCTION
14742 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14744 #undef TARGET_PASS_BY_REFERENCE
14745 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14747 #undef TARGET_PREFERRED_RELOAD_CLASS
14748 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14750 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14751 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14753 #undef TARGET_PROMOTED_TYPE
14754 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14756 #undef TARGET_SECONDARY_RELOAD
14757 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14759 #undef TARGET_SHIFT_TRUNCATION_MASK
14760 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14762 #undef TARGET_SETUP_INCOMING_VARARGS
14763 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14765 #undef TARGET_STRUCT_VALUE_RTX
14766 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14768 #undef TARGET_REGISTER_MOVE_COST
14769 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14771 #undef TARGET_RETURN_IN_MEMORY
14772 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14774 #undef TARGET_RETURN_IN_MSB
14775 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14777 #undef TARGET_RTX_COSTS
14778 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14780 #undef TARGET_SCALAR_MODE_SUPPORTED_P
14781 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
14783 #undef TARGET_SCHED_ISSUE_RATE
14784 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14786 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14787 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14788 aarch64_sched_first_cycle_multipass_dfa_lookahead
14790 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14791 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14792 aarch64_first_cycle_multipass_dfa_lookahead_guard
14794 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
14795 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
14796 aarch64_get_separate_components
14798 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
14799 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
14800 aarch64_components_for_bb
14802 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
14803 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
14804 aarch64_disqualify_components
14806 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
14807 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
14808 aarch64_emit_prologue_components
14810 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
14811 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
14812 aarch64_emit_epilogue_components
14814 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
14815 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
14816 aarch64_set_handled_components
14818 #undef TARGET_TRAMPOLINE_INIT
14819 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14821 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14822 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14824 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14825 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14827 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14828 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14830 #undef TARGET_VECTORIZE_ADD_STMT_COST
14831 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14833 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14834 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14835 aarch64_builtin_vectorization_cost
14837 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14838 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14840 #undef TARGET_VECTORIZE_BUILTINS
14841 #define TARGET_VECTORIZE_BUILTINS
14843 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14844 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14845 aarch64_builtin_vectorized_function
14847 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14848 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14849 aarch64_autovectorize_vector_sizes
14851 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14852 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14853 aarch64_atomic_assign_expand_fenv
14855 /* Section anchor support. */
14857 #undef TARGET_MIN_ANCHOR_OFFSET
14858 #define TARGET_MIN_ANCHOR_OFFSET -256
14860 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14861 byte offset; we can do much more for larger data types, but have no way
14862 to determine the size of the access. We assume accesses are aligned. */
14863 #undef TARGET_MAX_ANCHOR_OFFSET
14864 #define TARGET_MAX_ANCHOR_OFFSET 4095
14866 #undef TARGET_VECTOR_ALIGNMENT
14867 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14869 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14870 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14871 aarch64_simd_vector_alignment_reachable
14873 /* vec_perm support. */
14875 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14876 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14877 aarch64_vectorize_vec_perm_const_ok
14879 #undef TARGET_INIT_LIBFUNCS
14880 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14882 #undef TARGET_FIXED_CONDITION_CODE_REGS
14883 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14885 #undef TARGET_FLAGS_REGNUM
14886 #define TARGET_FLAGS_REGNUM CC_REGNUM
14888 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14889 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14891 #undef TARGET_ASAN_SHADOW_OFFSET
14892 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14894 #undef TARGET_LEGITIMIZE_ADDRESS
14895 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14897 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14898 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14899 aarch64_use_by_pieces_infrastructure_p
14901 #undef TARGET_CAN_USE_DOLOOP_P
14902 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14904 #undef TARGET_SCHED_MACRO_FUSION_P
14905 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14907 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14908 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14910 #undef TARGET_SCHED_FUSION_PRIORITY
14911 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14913 #undef TARGET_UNSPEC_MAY_TRAP_P
14914 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14916 #undef TARGET_USE_PSEUDO_PIC_REG
14917 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14919 #undef TARGET_PRINT_OPERAND
14920 #define TARGET_PRINT_OPERAND aarch64_print_operand
14922 #undef TARGET_PRINT_OPERAND_ADDRESS
14923 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14925 #undef TARGET_OPTAB_SUPPORTED_P
14926 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14928 #undef TARGET_OMIT_STRUCT_RETURN_REG
14929 #define TARGET_OMIT_STRUCT_RETURN_REG true
14931 struct gcc_target targetm = TARGET_INITIALIZER;
14933 #include "gt-aarch64.h"