gcc/config/aarch64/aarch64.c

   1 /* Machine description for AArch64 architecture.
   2    Copyright (C) 2009-2016 Free Software Foundation, Inc.
   3    Contributed by ARM Ltd.
   4
   5    This file is part of GCC.
   6
   7    GCC is free software; you can redistribute it and/or modify it
   8    under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3, or (at your option)
  10    any later version.
  11
  12    GCC is distributed in the hope that it will be useful, but
  13    WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15    General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with GCC; see the file COPYING3.  If not see
  19    <http://www.gnu.org/licenses/>.  */
  20
  21 #include "config.h"
  22 #define INCLUDE_STRING
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "memmodel.h"
  30 #include "gimple.h"
  31 #include "cfghooks.h"
  32 #include "cfgloop.h"
  33 #include "df.h"
  34 #include "tm_p.h"
  35 #include "stringpool.h"
  36 #include "optabs.h"
  37 #include "regs.h"
  38 #include "emit-rtl.h"
  39 #include "recog.h"
  40 #include "diagnostic.h"
  41 #include "insn-attr.h"
  42 #include "alias.h"
  43 #include "fold-const.h"
  44 #include "stor-layout.h"
  45 #include "calls.h"
  46 #include "varasm.h"
  47 #include "output.h"
  48 #include "flags.h"
  49 #include "explow.h"
  50 #include "expr.h"
  51 #include "reload.h"
  52 #include "langhooks.h"
  53 #include "opts.h"
  54 #include "params.h"
  55 #include "gimplify.h"
  56 #include "dwarf2.h"
  57 #include "gimple-iterator.h"
  58 #include "tree-vectorizer.h"
  59 #include "aarch64-cost-tables.h"
  60 #include "dumpfile.h"
  61 #include "builtins.h"
  62 #include "rtl-iter.h"
  63 #include "tm-constrs.h"
  64 #include "sched-int.h"
  65 #include "target-globals.h"
  66 #include "common/common-target.h"
  67
  68 /* This file should be included last.  */
  69 #include "target-def.h"
  70
  71 /* Defined for convenience.  */
  72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
  73
  74 /* Classifies an address.
  75
  76    ADDRESS_REG_IMM
  77        A simple base register plus immediate offset.
  78
  79    ADDRESS_REG_WB
  80        A base register indexed by immediate offset with writeback.
  81
  82    ADDRESS_REG_REG
  83        A base register indexed by (optionally scaled) register.
  84
  85    ADDRESS_REG_UXTW
  86        A base register indexed by (optionally scaled) zero-extended register.
  87
  88    ADDRESS_REG_SXTW
  89        A base register indexed by (optionally scaled) sign-extended register.
  90
  91    ADDRESS_LO_SUM
  92        A LO_SUM rtx with a base register and "LO12" symbol relocation.
  93
  94    ADDRESS_SYMBOLIC:
  95        A constant symbolic address, in pc-relative literal pool.  */
  96
  97 enum aarch64_address_type {
  98   ADDRESS_REG_IMM,
  99   ADDRESS_REG_WB,
 100   ADDRESS_REG_REG,
 101   ADDRESS_REG_UXTW,
 102   ADDRESS_REG_SXTW,
 103   ADDRESS_LO_SUM,
 104   ADDRESS_SYMBOLIC
 105 };
 106
 107 struct aarch64_address_info {
 108   enum aarch64_address_type type;
 109   rtx base;
 110   rtx offset;
 111   int shift;
 112   enum aarch64_symbol_type symbol_type;
 113 };
 114
 115 struct simd_immediate_info
 116 {
 117   rtx value;
 118   int shift;
 119   int element_width;
 120   bool mvn;
 121   bool msl;
 122 };
 123
 124 /* The current code model.  */
 125 enum aarch64_code_model aarch64_cmodel;
 126
 127 #ifdef HAVE_AS_TLS
 128 #undef TARGET_HAVE_TLS
 129 #define TARGET_HAVE_TLS 1
 130 #endif
 131
 132 static bool aarch64_composite_type_p (const_tree, machine_mode);
 133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
 134                                                      const_tree,
 135                                                      machine_mode *, int *,
 136                                                      bool *);
 137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
 138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
 139 static void aarch64_override_options_after_change (void);
 140 static bool aarch64_vector_mode_supported_p (machine_mode);
 141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
 142                                                  const unsigned char *sel);
 143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
 144
 145 /* Major revision number of the ARM Architecture implemented by the target.  */
 146 unsigned aarch64_architecture_version;
 147
 148 /* The processor for which instructions should be scheduled.  */
 149 enum aarch64_processor aarch64_tune = cortexa53;
 150
 151 /* Mask to specify which instruction scheduling options should be used.  */
 152 unsigned long aarch64_tune_flags = 0;
 153
 154 /* Global flag for PC relative loads.  */
 155 bool aarch64_pcrelative_literal_loads;
 156
 157 /* Support for command line parsing of boolean flags in the tuning
 158    structures.  */
 159 struct aarch64_flag_desc
 160 {
 161   const char* name;
 162   unsigned int flag;
 163 };
 164
 165 #define AARCH64_FUSION_PAIR(name, internal_name) \
 166   { name, AARCH64_FUSE_##internal_name },
 167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
 168 {
 169   { "none", AARCH64_FUSE_NOTHING },
 170 #include "aarch64-fusion-pairs.def"
 171   { "all", AARCH64_FUSE_ALL },
 172   { NULL, AARCH64_FUSE_NOTHING }
 173 };
 174
 175 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
 176   { name, AARCH64_EXTRA_TUNE_##internal_name },
 177 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
 178 {
 179   { "none", AARCH64_EXTRA_TUNE_NONE },
 180 #include "aarch64-tuning-flags.def"
 181   { "all", AARCH64_EXTRA_TUNE_ALL },
 182   { NULL, AARCH64_EXTRA_TUNE_NONE }
 183 };
 184
 185 /* Tuning parameters.  */
 186
 187 static const struct cpu_addrcost_table generic_addrcost_table =
 188 {
 189     {
 190       0, /* hi  */
 191       0, /* si  */
 192       0, /* di  */
 193       0, /* ti  */
 194     },
 195   0, /* pre_modify  */
 196   0, /* post_modify  */
 197   0, /* register_offset  */
 198   0, /* register_sextend  */
 199   0, /* register_zextend  */
 200   0 /* imm_offset  */
 201 };
 202
 203 static const struct cpu_addrcost_table cortexa57_addrcost_table =
 204 {
 205     {
 206       1, /* hi  */
 207       0, /* si  */
 208       0, /* di  */
 209       1, /* ti  */
 210     },
 211   0, /* pre_modify  */
 212   0, /* post_modify  */
 213   0, /* register_offset  */
 214   0, /* register_sextend  */
 215   0, /* register_zextend  */
 216   0, /* imm_offset  */
 217 };
 218
 219 static const struct cpu_addrcost_table exynosm1_addrcost_table =
 220 {
 221     {
 222       0, /* hi  */
 223       0, /* si  */
 224       0, /* di  */
 225       2, /* ti  */
 226     },
 227   0, /* pre_modify  */
 228   0, /* post_modify  */
 229   1, /* register_offset  */
 230   1, /* register_sextend  */
 231   2, /* register_zextend  */
 232   0, /* imm_offset  */
 233 };
 234
 235 static const struct cpu_addrcost_table xgene1_addrcost_table =
 236 {
 237     {
 238       1, /* hi  */
 239       0, /* si  */
 240       0, /* di  */
 241       1, /* ti  */
 242     },
 243   1, /* pre_modify  */
 244   0, /* post_modify  */
 245   0, /* register_offset  */
 246   1, /* register_sextend  */
 247   1, /* register_zextend  */
 248   0, /* imm_offset  */
 249 };
 250
 251 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
 252 {
 253     {
 254       1, /* hi  */
 255       0, /* si  */
 256       0, /* di  */
 257       1, /* ti  */
 258     },
 259   0, /* pre_modify  */
 260   0, /* post_modify  */
 261   0, /* register_offset  */
 262   0, /* register_sextend  */
 263   0, /* register_zextend  */
 264   0 /* imm_offset  */
 265 };
 266
 267 static const struct cpu_addrcost_table vulcan_addrcost_table =
 268 {
 269     {
 270       0, /* hi  */
 271       0, /* si  */
 272       0, /* di  */
 273       2, /* ti  */
 274     },
 275   0, /* pre_modify  */
 276   0, /* post_modify  */
 277   2, /* register_offset  */
 278   3, /* register_sextend  */
 279   3, /* register_zextend  */
 280   0, /* imm_offset  */
 281 };
 282
 283 static const struct cpu_regmove_cost generic_regmove_cost =
 284 {
 285   1, /* GP2GP  */
 286   /* Avoid the use of slow int<->fp moves for spilling by setting
 287      their cost higher than memmov_cost.  */
 288   5, /* GP2FP  */
 289   5, /* FP2GP  */
 290   2 /* FP2FP  */
 291 };
 292
 293 static const struct cpu_regmove_cost cortexa57_regmove_cost =
 294 {
 295   1, /* GP2GP  */
 296   /* Avoid the use of slow int<->fp moves for spilling by setting
 297      their cost higher than memmov_cost.  */
 298   5, /* GP2FP  */
 299   5, /* FP2GP  */
 300   2 /* FP2FP  */
 301 };
 302
 303 static const struct cpu_regmove_cost cortexa53_regmove_cost =
 304 {
 305   1, /* GP2GP  */
 306   /* Avoid the use of slow int<->fp moves for spilling by setting
 307      their cost higher than memmov_cost.  */
 308   5, /* GP2FP  */
 309   5, /* FP2GP  */
 310   2 /* FP2FP  */
 311 };
 312
 313 static const struct cpu_regmove_cost exynosm1_regmove_cost =
 314 {
 315   1, /* GP2GP  */
 316   /* Avoid the use of slow int<->fp moves for spilling by setting
 317      their cost higher than memmov_cost (actual, 4 and 9).  */
 318   9, /* GP2FP  */
 319   9, /* FP2GP  */
 320   1 /* FP2FP  */
 321 };
 322
 323 static const struct cpu_regmove_cost thunderx_regmove_cost =
 324 {
 325   2, /* GP2GP  */
 326   2, /* GP2FP  */
 327   6, /* FP2GP  */
 328   4 /* FP2FP  */
 329 };
 330
 331 static const struct cpu_regmove_cost xgene1_regmove_cost =
 332 {
 333   1, /* GP2GP  */
 334   /* Avoid the use of slow int<->fp moves for spilling by setting
 335      their cost higher than memmov_cost.  */
 336   8, /* GP2FP  */
 337   8, /* FP2GP  */
 338   2 /* FP2FP  */
 339 };
 340
 341 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
 342 {
 343   2, /* GP2GP  */
 344   /* Avoid the use of int<->fp moves for spilling.  */
 345   6, /* GP2FP  */
 346   6, /* FP2GP  */
 347   4 /* FP2FP  */
 348 };
 349
 350 static const struct cpu_regmove_cost vulcan_regmove_cost =
 351 {
 352   1, /* GP2GP  */
 353   /* Avoid the use of int<->fp moves for spilling.  */
 354   8, /* GP2FP  */
 355   8, /* FP2GP  */
 356   4  /* FP2FP  */
 357 };
 358
 359 /* Generic costs for vector insn classes.  */
 360 static const struct cpu_vector_cost generic_vector_cost =
 361 {
 362   1, /* scalar_stmt_cost  */
 363   1, /* scalar_load_cost  */
 364   1, /* scalar_store_cost  */
 365   1, /* vec_stmt_cost  */
 366   2, /* vec_permute_cost  */
 367   1, /* vec_to_scalar_cost  */
 368   1, /* scalar_to_vec_cost  */
 369   1, /* vec_align_load_cost  */
 370   1, /* vec_unalign_load_cost  */
 371   1, /* vec_unalign_store_cost  */
 372   1, /* vec_store_cost  */
 373   3, /* cond_taken_branch_cost  */
 374   1 /* cond_not_taken_branch_cost  */
 375 };
 376
 377 /* ThunderX costs for vector insn classes.  */
 378 static const struct cpu_vector_cost thunderx_vector_cost =
 379 {
 380   1, /* scalar_stmt_cost  */
 381   3, /* scalar_load_cost  */
 382   1, /* scalar_store_cost  */
 383   4, /* vec_stmt_cost  */
 384   4, /* vec_permute_cost  */
 385   2, /* vec_to_scalar_cost  */
 386   2, /* scalar_to_vec_cost  */
 387   3, /* vec_align_load_cost  */
 388   10, /* vec_unalign_load_cost  */
 389   10, /* vec_unalign_store_cost  */
 390   1, /* vec_store_cost  */
 391   3, /* cond_taken_branch_cost  */
 392   3 /* cond_not_taken_branch_cost  */
 393 };
 394
 395 /* Generic costs for vector insn classes.  */
 396 static const struct cpu_vector_cost cortexa57_vector_cost =
 397 {
 398   1, /* scalar_stmt_cost  */
 399   4, /* scalar_load_cost  */
 400   1, /* scalar_store_cost  */
 401   2, /* vec_stmt_cost  */
 402   3, /* vec_permute_cost  */
 403   8, /* vec_to_scalar_cost  */
 404   8, /* scalar_to_vec_cost  */
 405   4, /* vec_align_load_cost  */
 406   4, /* vec_unalign_load_cost  */
 407   1, /* vec_unalign_store_cost  */
 408   1, /* vec_store_cost  */
 409   1, /* cond_taken_branch_cost  */
 410   1 /* cond_not_taken_branch_cost  */
 411 };
 412
 413 static const struct cpu_vector_cost exynosm1_vector_cost =
 414 {
 415   1, /* scalar_stmt_cost  */
 416   5, /* scalar_load_cost  */
 417   1, /* scalar_store_cost  */
 418   3, /* vec_stmt_cost  */
 419   3, /* vec_permute_cost  */
 420   3, /* vec_to_scalar_cost  */
 421   3, /* scalar_to_vec_cost  */
 422   5, /* vec_align_load_cost  */
 423   5, /* vec_unalign_load_cost  */
 424   1, /* vec_unalign_store_cost  */
 425   1, /* vec_store_cost  */
 426   1, /* cond_taken_branch_cost  */
 427   1 /* cond_not_taken_branch_cost  */
 428 };
 429
 430 /* Generic costs for vector insn classes.  */
 431 static const struct cpu_vector_cost xgene1_vector_cost =
 432 {
 433   1, /* scalar_stmt_cost  */
 434   5, /* scalar_load_cost  */
 435   1, /* scalar_store_cost  */
 436   2, /* vec_stmt_cost  */
 437   2, /* vec_permute_cost  */
 438   4, /* vec_to_scalar_cost  */
 439   4, /* scalar_to_vec_cost  */
 440   10, /* vec_align_load_cost  */
 441   10, /* vec_unalign_load_cost  */
 442   2, /* vec_unalign_store_cost  */
 443   2, /* vec_store_cost  */
 444   2, /* cond_taken_branch_cost  */
 445   1 /* cond_not_taken_branch_cost  */
 446 };
 447
 448 /* Costs for vector insn classes for Vulcan.  */
 449 static const struct cpu_vector_cost vulcan_vector_cost =
 450 {
 451   6, /* scalar_stmt_cost  */
 452   4, /* scalar_load_cost  */
 453   1, /* scalar_store_cost  */
 454   6, /* vec_stmt_cost  */
 455   3, /* vec_permute_cost  */
 456   6, /* vec_to_scalar_cost  */
 457   5, /* scalar_to_vec_cost  */
 458   8, /* vec_align_load_cost  */
 459   8, /* vec_unalign_load_cost  */
 460   4, /* vec_unalign_store_cost  */
 461   4, /* vec_store_cost  */
 462   2, /* cond_taken_branch_cost  */
 463   1  /* cond_not_taken_branch_cost  */
 464 };
 465
 466 /* Generic costs for branch instructions.  */
 467 static const struct cpu_branch_cost generic_branch_cost =
 468 {
 469   2,  /* Predictable.  */
 470   2   /* Unpredictable.  */
 471 };
 472
 473 /* Branch costs for Cortex-A57.  */
 474 static const struct cpu_branch_cost cortexa57_branch_cost =
 475 {
 476   1,  /* Predictable.  */
 477   3   /* Unpredictable.  */
 478 };
 479
 480 /* Branch costs for Vulcan.  */
 481 static const struct cpu_branch_cost vulcan_branch_cost =
 482 {
 483   1,  /* Predictable.  */
 484   3   /* Unpredictable.  */
 485 };
 486
 487 /* Generic approximation modes.  */
 488 static const cpu_approx_modes generic_approx_modes =
 489 {
 490   AARCH64_APPROX_NONE,  /* division  */
 491   AARCH64_APPROX_NONE,  /* sqrt  */
 492   AARCH64_APPROX_NONE   /* recip_sqrt  */
 493 };
 494
 495 /* Approximation modes for Exynos M1.  */
 496 static const cpu_approx_modes exynosm1_approx_modes =
 497 {
 498   AARCH64_APPROX_NONE,  /* division  */
 499   AARCH64_APPROX_ALL,   /* sqrt  */
 500   AARCH64_APPROX_ALL    /* recip_sqrt  */
 501 };
 502
 503 /* Approximation modes for X-Gene 1.  */
 504 static const cpu_approx_modes xgene1_approx_modes =
 505 {
 506   AARCH64_APPROX_NONE,  /* division  */
 507   AARCH64_APPROX_NONE,  /* sqrt  */
 508   AARCH64_APPROX_ALL    /* recip_sqrt  */
 509 };
 510
 511 static const struct tune_params generic_tunings =
 512 {
 513   &cortexa57_extra_costs,
 514   &generic_addrcost_table,
 515   &generic_regmove_cost,
 516   &generic_vector_cost,
 517   &generic_branch_cost,
 518   &generic_approx_modes,
 519   4, /* memmov_cost  */
 520   2, /* issue_rate  */
 521   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 522   8,    /* function_align.  */
 523   8,    /* jump_align.  */
 524   4,    /* loop_align.  */
 525   2,    /* int_reassoc_width.  */
 526   4,    /* fp_reassoc_width.  */
 527   1,    /* vec_reassoc_width.  */
 528   2,    /* min_div_recip_mul_sf.  */
 529   2,    /* min_div_recip_mul_df.  */
 530   0,    /* max_case_values.  */
 531   0,    /* cache_line_size.  */
 532   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 533   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 534 };
 535
 536 static const struct tune_params cortexa35_tunings =
 537 {
 538   &cortexa53_extra_costs,
 539   &generic_addrcost_table,
 540   &cortexa53_regmove_cost,
 541   &generic_vector_cost,
 542   &cortexa57_branch_cost,
 543   &generic_approx_modes,
 544   4, /* memmov_cost  */
 545   1, /* issue_rate  */
 546   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 547    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 548   16,   /* function_align.  */
 549   8,    /* jump_align.  */
 550   8,    /* loop_align.  */
 551   2,    /* int_reassoc_width.  */
 552   4,    /* fp_reassoc_width.  */
 553   1,    /* vec_reassoc_width.  */
 554   2,    /* min_div_recip_mul_sf.  */
 555   2,    /* min_div_recip_mul_df.  */
 556   0,    /* max_case_values.  */
 557   0,    /* cache_line_size.  */
 558   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 559   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 560 };
 561
 562 static const struct tune_params cortexa53_tunings =
 563 {
 564   &cortexa53_extra_costs,
 565   &generic_addrcost_table,
 566   &cortexa53_regmove_cost,
 567   &generic_vector_cost,
 568   &cortexa57_branch_cost,
 569   &generic_approx_modes,
 570   4, /* memmov_cost  */
 571   2, /* issue_rate  */
 572   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 573    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 574   16,   /* function_align.  */
 575   8,    /* jump_align.  */
 576   8,    /* loop_align.  */
 577   2,    /* int_reassoc_width.  */
 578   4,    /* fp_reassoc_width.  */
 579   1,    /* vec_reassoc_width.  */
 580   2,    /* min_div_recip_mul_sf.  */
 581   2,    /* min_div_recip_mul_df.  */
 582   0,    /* max_case_values.  */
 583   0,    /* cache_line_size.  */
 584   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 585   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 586 };
 587
 588 static const struct tune_params cortexa57_tunings =
 589 {
 590   &cortexa57_extra_costs,
 591   &cortexa57_addrcost_table,
 592   &cortexa57_regmove_cost,
 593   &cortexa57_vector_cost,
 594   &cortexa57_branch_cost,
 595   &generic_approx_modes,
 596   4, /* memmov_cost  */
 597   3, /* issue_rate  */
 598   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 599    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 600   16,   /* function_align.  */
 601   8,    /* jump_align.  */
 602   8,    /* loop_align.  */
 603   2,    /* int_reassoc_width.  */
 604   4,    /* fp_reassoc_width.  */
 605   1,    /* vec_reassoc_width.  */
 606   2,    /* min_div_recip_mul_sf.  */
 607   2,    /* min_div_recip_mul_df.  */
 608   0,    /* max_case_values.  */
 609   0,    /* cache_line_size.  */
 610   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 611   (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS)  /* tune_flags.  */
 612 };
 613
 614 static const struct tune_params cortexa72_tunings =
 615 {
 616   &cortexa57_extra_costs,
 617   &cortexa57_addrcost_table,
 618   &cortexa57_regmove_cost,
 619   &cortexa57_vector_cost,
 620   &cortexa57_branch_cost,
 621   &generic_approx_modes,
 622   4, /* memmov_cost  */
 623   3, /* issue_rate  */
 624   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 625    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
 626   16,   /* function_align.  */
 627   8,    /* jump_align.  */
 628   8,    /* loop_align.  */
 629   2,    /* int_reassoc_width.  */
 630   4,    /* fp_reassoc_width.  */
 631   1,    /* vec_reassoc_width.  */
 632   2,    /* min_div_recip_mul_sf.  */
 633   2,    /* min_div_recip_mul_df.  */
 634   0,    /* max_case_values.  */
 635   0,    /* cache_line_size.  */
 636   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 637   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 638 };
 639
 640 static const struct tune_params cortexa73_tunings =
 641 {
 642   &cortexa57_extra_costs,
 643   &cortexa57_addrcost_table,
 644   &cortexa57_regmove_cost,
 645   &cortexa57_vector_cost,
 646   &cortexa57_branch_cost,
 647   &generic_approx_modes,
 648   4, /* memmov_cost.  */
 649   2, /* issue_rate.  */
 650   (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 651    | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops  */
 652   16,   /* function_align.  */
 653   8,    /* jump_align.  */
 654   8,    /* loop_align.  */
 655   2,    /* int_reassoc_width.  */
 656   4,    /* fp_reassoc_width.  */
 657   1,    /* vec_reassoc_width.  */
 658   2,    /* min_div_recip_mul_sf.  */
 659   2,    /* min_div_recip_mul_df.  */
 660   0,    /* max_case_values.  */
 661   0,    /* cache_line_size.  */
 662   tune_params::AUTOPREFETCHER_WEAK,     /* autoprefetcher_model.  */
 663   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 664 };
 665
 666 static const struct tune_params exynosm1_tunings =
 667 {
 668   &exynosm1_extra_costs,
 669   &exynosm1_addrcost_table,
 670   &exynosm1_regmove_cost,
 671   &exynosm1_vector_cost,
 672   &generic_branch_cost,
 673   &exynosm1_approx_modes,
 674   4,    /* memmov_cost  */
 675   3,    /* issue_rate  */
 676   (AARCH64_FUSE_AES_AESMC), /* fusible_ops  */
 677   4,    /* function_align.  */
 678   4,    /* jump_align.  */
 679   4,    /* loop_align.  */
 680   2,    /* int_reassoc_width.  */
 681   4,    /* fp_reassoc_width.  */
 682   1,    /* vec_reassoc_width.  */
 683   2,    /* min_div_recip_mul_sf.  */
 684   2,    /* min_div_recip_mul_df.  */
 685   48,   /* max_case_values.  */
 686   64,   /* cache_line_size.  */
 687   tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model.  */
 688   (AARCH64_EXTRA_TUNE_NONE) /* tune_flags.  */
 689 };
 690
 691 static const struct tune_params thunderx_tunings =
 692 {
 693   &thunderx_extra_costs,
 694   &generic_addrcost_table,
 695   &thunderx_regmove_cost,
 696   &thunderx_vector_cost,
 697   &generic_branch_cost,
 698   &generic_approx_modes,
 699   6, /* memmov_cost  */
 700   2, /* issue_rate  */
 701   AARCH64_FUSE_CMP_BRANCH, /* fusible_ops  */
 702   8,    /* function_align.  */
 703   8,    /* jump_align.  */
 704   8,    /* loop_align.  */
 705   2,    /* int_reassoc_width.  */
 706   4,    /* fp_reassoc_width.  */
 707   1,    /* vec_reassoc_width.  */
 708   2,    /* min_div_recip_mul_sf.  */
 709   2,    /* min_div_recip_mul_df.  */
 710   0,    /* max_case_values.  */
 711   0,    /* cache_line_size.  */
 712   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 713   (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)      /* tune_flags.  */
 714 };
 715
 716 static const struct tune_params xgene1_tunings =
 717 {
 718   &xgene1_extra_costs,
 719   &xgene1_addrcost_table,
 720   &xgene1_regmove_cost,
 721   &xgene1_vector_cost,
 722   &generic_branch_cost,
 723   &xgene1_approx_modes,
 724   6, /* memmov_cost  */
 725   4, /* issue_rate  */
 726   AARCH64_FUSE_NOTHING, /* fusible_ops  */
 727   16,   /* function_align.  */
 728   8,    /* jump_align.  */
 729   16,   /* loop_align.  */
 730   2,    /* int_reassoc_width.  */
 731   4,    /* fp_reassoc_width.  */
 732   1,    /* vec_reassoc_width.  */
 733   2,    /* min_div_recip_mul_sf.  */
 734   2,    /* min_div_recip_mul_df.  */
 735   0,    /* max_case_values.  */
 736   0,    /* cache_line_size.  */
 737   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 738   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 739 };
 740
 741 static const struct tune_params qdf24xx_tunings =
 742 {
 743   &qdf24xx_extra_costs,
 744   &qdf24xx_addrcost_table,
 745   &qdf24xx_regmove_cost,
 746   &generic_vector_cost,
 747   &generic_branch_cost,
 748   &generic_approx_modes,
 749   4, /* memmov_cost  */
 750   4, /* issue_rate  */
 751   (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 752    | AARCH64_FUSE_MOVK_MOVK), /* fuseable_ops  */
 753   16,   /* function_align.  */
 754   8,    /* jump_align.  */
 755   16,   /* loop_align.  */
 756   2,    /* int_reassoc_width.  */
 757   4,    /* fp_reassoc_width.  */
 758   1,    /* vec_reassoc_width.  */
 759   2,    /* min_div_recip_mul_sf.  */
 760   2,    /* min_div_recip_mul_df.  */
 761   0,    /* max_case_values.  */
 762   64,   /* cache_line_size.  */
 763   tune_params::AUTOPREFETCHER_STRONG,   /* autoprefetcher_model.  */
 764   (AARCH64_EXTRA_TUNE_NONE)             /* tune_flags.  */
 765 };
 766
 767 static const struct tune_params vulcan_tunings =
 768 {
 769   &vulcan_extra_costs,
 770   &vulcan_addrcost_table,
 771   &vulcan_regmove_cost,
 772   &vulcan_vector_cost,
 773   &vulcan_branch_cost,
 774   &generic_approx_modes,
 775   4, /* memmov_cost.  */
 776   4, /* issue_rate.  */
 777   AARCH64_FUSE_NOTHING, /* fuseable_ops.  */
 778   16,   /* function_align.  */
 779   8,    /* jump_align.  */
 780   16,   /* loop_align.  */
 781   3,    /* int_reassoc_width.  */
 782   2,    /* fp_reassoc_width.  */
 783   2,    /* vec_reassoc_width.  */
 784   2,    /* min_div_recip_mul_sf.  */
 785   2,    /* min_div_recip_mul_df.  */
 786   0,    /* max_case_values.  */
 787   64,   /* cache_line_size.  */
 788   tune_params::AUTOPREFETCHER_OFF,      /* autoprefetcher_model.  */
 789   (AARCH64_EXTRA_TUNE_NONE)     /* tune_flags.  */
 790 };
 791
 792 /* Support for fine-grained override of the tuning structures.  */
 793 struct aarch64_tuning_override_function
 794 {
 795   const char* name;
 796   void (*parse_override)(const char*, struct tune_params*);
 797 };
 798
 799 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
 800 static void aarch64_parse_tune_string (const char*, struct tune_params*);
 801
 802 static const struct aarch64_tuning_override_function
 803 aarch64_tuning_override_functions[] =
 804 {
 805   { "fuse", aarch64_parse_fuse_string },
 806   { "tune", aarch64_parse_tune_string },
 807   { NULL, NULL }
 808 };
 809
 810 /* A processor implementing AArch64.  */
 811 struct processor
 812 {
 813   const char *const name;
 814   enum aarch64_processor ident;
 815   enum aarch64_processor sched_core;
 816   enum aarch64_arch arch;
 817   unsigned architecture_version;
 818   const unsigned long flags;
 819   const struct tune_params *const tune;
 820 };
 821
 822 /* Architectures implementing AArch64.  */
 823 static const struct processor all_architectures[] =
 824 {
 825 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
 826   {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
 827 #include "aarch64-arches.def"
 828   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 829 };
 830
 831 /* Processor cores implementing AArch64.  */
 832 static const struct processor all_cores[] =
 833 {
 834 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
 835   {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH,                             \
 836   all_architectures[AARCH64_ARCH_##ARCH].architecture_version,  \
 837   FLAGS, &COSTS##_tunings},
 838 #include "aarch64-cores.def"
 839   {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
 840     AARCH64_FL_FOR_ARCH8, &generic_tunings},
 841   {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
 842 };
 843
 844
 845 /* Target specification.  These are populated by the -march, -mtune, -mcpu
 846    handling code or by target attributes.  */
 847 static const struct processor *selected_arch;
 848 static const struct processor *selected_cpu;
 849 static const struct processor *selected_tune;
 850
 851 /* The current tuning set.  */
 852 struct tune_params aarch64_tune_params = generic_tunings;
 853
 854 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
 855
 856 /* An ISA extension in the co-processor and main instruction set space.  */
 857 struct aarch64_option_extension
 858 {
 859   const char *const name;
 860   const unsigned long flags_on;
 861   const unsigned long flags_off;
 862 };
 863
 864 typedef enum aarch64_cond_code
 865 {
 866   AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
 867   AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
 868   AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
 869 }
 870 aarch64_cc;
 871
 872 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
 873
 874 /* The condition codes of the processor, and the inverse function.  */
 875 static const char * const aarch64_condition_codes[] =
 876 {
 877   "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
 878   "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
 879 };
 880
 881 /* Generate code to enable conditional branches in functions over 1 MiB.  */
 882 const char *
 883 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
 884                         const char * branch_format)
 885 {
 886     rtx_code_label * tmp_label = gen_label_rtx ();
 887     char label_buf[256];
 888     char buffer[128];
 889     ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
 890                                  CODE_LABEL_NUMBER (tmp_label));
 891     const char *label_ptr = targetm.strip_name_encoding (label_buf);
 892     rtx dest_label = operands[pos_label];
 893     operands[pos_label] = tmp_label;
 894
 895     snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
 896     output_asm_insn (buffer, operands);
 897
 898     snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
 899     operands[pos_label] = dest_label;
 900     output_asm_insn (buffer, operands);
 901     return "";
 902 }
 903
 904 void
 905 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
 906 {
 907   const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
 908   if (TARGET_GENERAL_REGS_ONLY)
 909     error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
 910   else
 911     error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
 912 }
 913
 914 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
 915    The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
 916    the same cost even if ALL_REGS has a much larger cost.  ALL_REGS is also
 917    used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
 918    cost (in this case the best class is the lowest cost one).  Using ALL_REGS
 919    irrespectively of its cost results in bad allocations with many redundant
 920    int<->FP moves which are expensive on various cores.
 921    To avoid this we don't allow ALL_REGS as the allocno class, but force a
 922    decision between FP_REGS and GENERAL_REGS.  We use the allocno class if it
 923    isn't ALL_REGS.  Similarly, use the best class if it isn't ALL_REGS.
 924    Otherwise set the allocno class depending on the mode.
 925    The result of this is that it is no longer inefficient to have a higher
 926    memory move cost than the register move cost.
 927 */
 928
 929 static reg_class_t
 930 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
 931                                          reg_class_t best_class)
 932 {
 933   enum machine_mode mode;
 934
 935   if (allocno_class != ALL_REGS)
 936     return allocno_class;
 937
 938   if (best_class != ALL_REGS)
 939     return best_class;
 940
 941   mode = PSEUDO_REGNO_MODE (regno);
 942   return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
 943 }
 944
 945 static unsigned int
 946 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
 947 {
 948   if (GET_MODE_UNIT_SIZE (mode) == 4)
 949     return aarch64_tune_params.min_div_recip_mul_sf;
 950   return aarch64_tune_params.min_div_recip_mul_df;
 951 }
 952
 953 static int
 954 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
 955                              enum machine_mode mode)
 956 {
 957   if (VECTOR_MODE_P (mode))
 958     return aarch64_tune_params.vec_reassoc_width;
 959   if (INTEGRAL_MODE_P (mode))
 960     return aarch64_tune_params.int_reassoc_width;
 961   if (FLOAT_MODE_P (mode))
 962     return aarch64_tune_params.fp_reassoc_width;
 963   return 1;
 964 }
 965
 966 /* Provide a mapping from gcc register numbers to dwarf register numbers.  */
 967 unsigned
 968 aarch64_dbx_register_number (unsigned regno)
 969 {
 970    if (GP_REGNUM_P (regno))
 971      return AARCH64_DWARF_R0 + regno - R0_REGNUM;
 972    else if (regno == SP_REGNUM)
 973      return AARCH64_DWARF_SP;
 974    else if (FP_REGNUM_P (regno))
 975      return AARCH64_DWARF_V0 + regno - V0_REGNUM;
 976
 977    /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
 978       equivalent DWARF register.  */
 979    return DWARF_FRAME_REGISTERS;
 980 }
 981
 982 /* Return TRUE if MODE is any of the large INT modes.  */
 983 static bool
 984 aarch64_vect_struct_mode_p (machine_mode mode)
 985 {
 986   return mode == OImode || mode == CImode || mode == XImode;
 987 }
 988
 989 /* Return TRUE if MODE is any of the vector modes.  */
 990 static bool
 991 aarch64_vector_mode_p (machine_mode mode)
 992 {
 993   return aarch64_vector_mode_supported_p (mode)
 994          || aarch64_vect_struct_mode_p (mode);
 995 }
 996
 997 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
 998 static bool
 999 aarch64_array_mode_supported_p (machine_mode mode,
1000                                 unsigned HOST_WIDE_INT nelems)
1001 {
1002   if (TARGET_SIMD
1003       && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1004           || AARCH64_VALID_SIMD_DREG_MODE (mode))
1005       && (nelems >= 2 && nelems <= 4))
1006     return true;
1007
1008   return false;
1009 }
1010
1011 /* Implement HARD_REGNO_NREGS.  */
1012
1013 int
1014 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1015 {
1016   switch (aarch64_regno_regclass (regno))
1017     {
1018     case FP_REGS:
1019     case FP_LO_REGS:
1020       return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1021     default:
1022       return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1023     }
1024   gcc_unreachable ();
1025 }
1026
1027 /* Implement HARD_REGNO_MODE_OK.  */
1028
1029 int
1030 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1031 {
1032   if (GET_MODE_CLASS (mode) == MODE_CC)
1033     return regno == CC_REGNUM;
1034
1035   if (regno == SP_REGNUM)
1036     /* The purpose of comparing with ptr_mode is to support the
1037        global register variable associated with the stack pointer
1038        register via the syntax of asm ("wsp") in ILP32.  */
1039     return mode == Pmode || mode == ptr_mode;
1040
1041   if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1042     return mode == Pmode;
1043
1044   if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1045     return 1;
1046
1047   if (FP_REGNUM_P (regno))
1048     {
1049       if (aarch64_vect_struct_mode_p (mode))
1050         return
1051           (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1052       else
1053         return 1;
1054     }
1055
1056   return 0;
1057 }
1058
1059 /* Implement HARD_REGNO_CALLER_SAVE_MODE.  */
1060 machine_mode
1061 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1062                                      machine_mode mode)
1063 {
1064   /* Handle modes that fit within single registers.  */
1065   if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1066     {
1067       if (GET_MODE_SIZE (mode) >= 4)
1068         return mode;
1069       else
1070         return SImode;
1071     }
1072   /* Fall back to generic for multi-reg and very large modes.  */
1073   else
1074     return choose_hard_reg_mode (regno, nregs, false);
1075 }
1076
1077 /* Return true if calls to DECL should be treated as
1078    long-calls (ie called via a register).  */
1079 static bool
1080 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1081 {
1082   return false;
1083 }
1084
1085 /* Return true if calls to symbol-ref SYM should be treated as
1086    long-calls (ie called via a register).  */
1087 bool
1088 aarch64_is_long_call_p (rtx sym)
1089 {
1090   return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1091 }
1092
1093 /* Return true if calls to symbol-ref SYM should not go through
1094    plt stubs.  */
1095
1096 bool
1097 aarch64_is_noplt_call_p (rtx sym)
1098 {
1099   const_tree decl = SYMBOL_REF_DECL (sym);
1100
1101   if (flag_pic
1102       && decl
1103       && (!flag_plt
1104           || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1105       && !targetm.binds_local_p (decl))
1106     return true;
1107
1108   return false;
1109 }
1110
1111 /* Return true if the offsets to a zero/sign-extract operation
1112    represent an expression that matches an extend operation.  The
1113    operands represent the paramters from
1114
1115    (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)).  */
1116 bool
1117 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1118                                 rtx extract_imm)
1119 {
1120   HOST_WIDE_INT mult_val, extract_val;
1121
1122   if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1123     return false;
1124
1125   mult_val = INTVAL (mult_imm);
1126   extract_val = INTVAL (extract_imm);
1127
1128   if (extract_val > 8
1129       && extract_val < GET_MODE_BITSIZE (mode)
1130       && exact_log2 (extract_val & ~7) > 0
1131       && (extract_val & 7) <= 4
1132       && mult_val == (1 << (extract_val & 7)))
1133     return true;
1134
1135   return false;
1136 }
1137
1138 /* Emit an insn that's a simple single-set.  Both the operands must be
1139    known to be valid.  */
1140 inline static rtx
1141 emit_set_insn (rtx x, rtx y)
1142 {
1143   return emit_insn (gen_rtx_SET (x, y));
1144 }
1145
1146 /* X and Y are two things to compare using CODE.  Emit the compare insn and
1147    return the rtx for register 0 in the proper mode.  */
1148 rtx
1149 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1150 {
1151   machine_mode mode = SELECT_CC_MODE (code, x, y);
1152   rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1153
1154   emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1155   return cc_reg;
1156 }
1157
1158 /* Build the SYMBOL_REF for __tls_get_addr.  */
1159
1160 static GTY(()) rtx tls_get_addr_libfunc;
1161
1162 rtx
1163 aarch64_tls_get_addr (void)
1164 {
1165   if (!tls_get_addr_libfunc)
1166     tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1167   return tls_get_addr_libfunc;
1168 }
1169
1170 /* Return the TLS model to use for ADDR.  */
1171
1172 static enum tls_model
1173 tls_symbolic_operand_type (rtx addr)
1174 {
1175   enum tls_model tls_kind = TLS_MODEL_NONE;
1176   rtx sym, addend;
1177
1178   if (GET_CODE (addr) == CONST)
1179     {
1180       split_const (addr, &sym, &addend);
1181       if (GET_CODE (sym) == SYMBOL_REF)
1182         tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1183     }
1184   else if (GET_CODE (addr) == SYMBOL_REF)
1185     tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1186
1187   return tls_kind;
1188 }
1189
1190 /* We'll allow lo_sum's in addresses in our legitimate addresses
1191    so that combine would take care of combining addresses where
1192    necessary, but for generation purposes, we'll generate the address
1193    as :
1194    RTL                               Absolute
1195    tmp = hi (symbol_ref);            adrp  x1, foo
1196    dest = lo_sum (tmp, symbol_ref);  add dest, x1, :lo_12:foo
1197                                      nop
1198
1199    PIC                               TLS
1200    adrp x1, :got:foo                 adrp tmp, :tlsgd:foo
1201    ldr  x1, [:got_lo12:foo]          add  dest, tmp, :tlsgd_lo12:foo
1202                                      bl   __tls_get_addr
1203                                      nop
1204
1205    Load TLS symbol, depending on TLS mechanism and TLS access model.
1206
1207    Global Dynamic - Traditional TLS:
1208    adrp tmp, :tlsgd:imm
1209    add  dest, tmp, #:tlsgd_lo12:imm
1210    bl   __tls_get_addr
1211
1212    Global Dynamic - TLS Descriptors:
1213    adrp dest, :tlsdesc:imm
1214    ldr  tmp, [dest, #:tlsdesc_lo12:imm]
1215    add  dest, dest, #:tlsdesc_lo12:imm
1216    blr  tmp
1217    mrs  tp, tpidr_el0
1218    add  dest, dest, tp
1219
1220    Initial Exec:
1221    mrs  tp, tpidr_el0
1222    adrp tmp, :gottprel:imm
1223    ldr  dest, [tmp, #:gottprel_lo12:imm]
1224    add  dest, dest, tp
1225
1226    Local Exec:
1227    mrs  tp, tpidr_el0
1228    add  t0, tp, #:tprel_hi12:imm, lsl #12
1229    add  t0, t0, #:tprel_lo12_nc:imm
1230 */
1231
1232 static void
1233 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1234                                    enum aarch64_symbol_type type)
1235 {
1236   switch (type)
1237     {
1238     case SYMBOL_SMALL_ABSOLUTE:
1239       {
1240         /* In ILP32, the mode of dest can be either SImode or DImode.  */
1241         rtx tmp_reg = dest;
1242         machine_mode mode = GET_MODE (dest);
1243
1244         gcc_assert (mode == Pmode || mode == ptr_mode);
1245
1246         if (can_create_pseudo_p ())
1247           tmp_reg = gen_reg_rtx (mode);
1248
1249         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1250         emit_insn (gen_add_losym (dest, tmp_reg, imm));
1251         return;
1252       }
1253
1254     case SYMBOL_TINY_ABSOLUTE:
1255       emit_insn (gen_rtx_SET (dest, imm));
1256       return;
1257
1258     case SYMBOL_SMALL_GOT_28K:
1259       {
1260         machine_mode mode = GET_MODE (dest);
1261         rtx gp_rtx = pic_offset_table_rtx;
1262         rtx insn;
1263         rtx mem;
1264
1265         /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1266            here before rtl expand.  Tree IVOPT will generate rtl pattern to
1267            decide rtx costs, in which case pic_offset_table_rtx is not
1268            initialized.  For that case no need to generate the first adrp
1269            instruction as the final cost for global variable access is
1270            one instruction.  */
1271         if (gp_rtx != NULL)
1272           {
1273             /* -fpic for -mcmodel=small allow 32K GOT table size (but we are
1274                using the page base as GOT base, the first page may be wasted,
1275                in the worst scenario, there is only 28K space for GOT).
1276
1277                The generate instruction sequence for accessing global variable
1278                is:
1279
1280                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1281
1282                Only one instruction needed. But we must initialize
1283                pic_offset_table_rtx properly.  We generate initialize insn for
1284                every global access, and allow CSE to remove all redundant.
1285
1286                The final instruction sequences will look like the following
1287                for multiply global variables access.
1288
1289                  adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1290
1291                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1292                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1293                  ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1294                  ...  */
1295
1296             rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1297             crtl->uses_pic_offset_table = 1;
1298             emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1299
1300             if (mode != GET_MODE (gp_rtx))
1301               gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1302           }
1303
1304         if (mode == ptr_mode)
1305           {
1306             if (mode == DImode)
1307               insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1308             else
1309               insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1310
1311             mem = XVECEXP (SET_SRC (insn), 0, 0);
1312           }
1313         else
1314           {
1315             gcc_assert (mode == Pmode);
1316
1317             insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1318             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1319           }
1320
1321         /* The operand is expected to be MEM.  Whenever the related insn
1322            pattern changed, above code which calculate mem should be
1323            updated.  */
1324         gcc_assert (GET_CODE (mem) == MEM);
1325         MEM_READONLY_P (mem) = 1;
1326         MEM_NOTRAP_P (mem) = 1;
1327         emit_insn (insn);
1328         return;
1329       }
1330
1331     case SYMBOL_SMALL_GOT_4G:
1332       {
1333         /* In ILP32, the mode of dest can be either SImode or DImode,
1334            while the got entry is always of SImode size.  The mode of
1335            dest depends on how dest is used: if dest is assigned to a
1336            pointer (e.g. in the memory), it has SImode; it may have
1337            DImode if dest is dereferenced to access the memeory.
1338            This is why we have to handle three different ldr_got_small
1339            patterns here (two patterns for ILP32).  */
1340
1341         rtx insn;
1342         rtx mem;
1343         rtx tmp_reg = dest;
1344         machine_mode mode = GET_MODE (dest);
1345
1346         if (can_create_pseudo_p ())
1347           tmp_reg = gen_reg_rtx (mode);
1348
1349         emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1350         if (mode == ptr_mode)
1351           {
1352             if (mode == DImode)
1353               insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1354             else
1355               insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1356
1357             mem = XVECEXP (SET_SRC (insn), 0, 0);
1358           }
1359         else
1360           {
1361             gcc_assert (mode == Pmode);
1362
1363             insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1364             mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1365           }
1366
1367         gcc_assert (GET_CODE (mem) == MEM);
1368         MEM_READONLY_P (mem) = 1;
1369         MEM_NOTRAP_P (mem) = 1;
1370         emit_insn (insn);
1371         return;
1372       }
1373
1374     case SYMBOL_SMALL_TLSGD:
1375       {
1376         rtx_insn *insns;
1377         rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1378
1379         start_sequence ();
1380         aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1381         insns = get_insns ();
1382         end_sequence ();
1383
1384         RTL_CONST_CALL_P (insns) = 1;
1385         emit_libcall_block (insns, dest, result, imm);
1386         return;
1387       }
1388
1389     case SYMBOL_SMALL_TLSDESC:
1390       {
1391         machine_mode mode = GET_MODE (dest);
1392         rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1393         rtx tp;
1394
1395         gcc_assert (mode == Pmode || mode == ptr_mode);
1396
1397         /* In ILP32, the got entry is always of SImode size.  Unlike
1398            small GOT, the dest is fixed at reg 0.  */
1399         if (TARGET_ILP32)
1400           emit_insn (gen_tlsdesc_small_si (imm));
1401         else
1402           emit_insn (gen_tlsdesc_small_di (imm));
1403         tp = aarch64_load_tp (NULL);
1404
1405         if (mode != Pmode)
1406           tp = gen_lowpart (mode, tp);
1407
1408         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1409         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1410         return;
1411       }
1412
1413     case SYMBOL_SMALL_TLSIE:
1414       {
1415         /* In ILP32, the mode of dest can be either SImode or DImode,
1416            while the got entry is always of SImode size.  The mode of
1417            dest depends on how dest is used: if dest is assigned to a
1418            pointer (e.g. in the memory), it has SImode; it may have
1419            DImode if dest is dereferenced to access the memeory.
1420            This is why we have to handle three different tlsie_small
1421            patterns here (two patterns for ILP32).  */
1422         machine_mode mode = GET_MODE (dest);
1423         rtx tmp_reg = gen_reg_rtx (mode);
1424         rtx tp = aarch64_load_tp (NULL);
1425
1426         if (mode == ptr_mode)
1427           {
1428             if (mode == DImode)
1429               emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1430             else
1431               {
1432                 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1433                 tp = gen_lowpart (mode, tp);
1434               }
1435           }
1436         else
1437           {
1438             gcc_assert (mode == Pmode);
1439             emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1440           }
1441
1442         emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1443         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1444         return;
1445       }
1446
1447     case SYMBOL_TLSLE12:
1448     case SYMBOL_TLSLE24:
1449     case SYMBOL_TLSLE32:
1450     case SYMBOL_TLSLE48:
1451       {
1452         machine_mode mode = GET_MODE (dest);
1453         rtx tp = aarch64_load_tp (NULL);
1454
1455         if (mode != Pmode)
1456           tp = gen_lowpart (mode, tp);
1457
1458         switch (type)
1459           {
1460           case SYMBOL_TLSLE12:
1461             emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1462                         (dest, tp, imm));
1463             break;
1464           case SYMBOL_TLSLE24:
1465             emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1466                         (dest, tp, imm));
1467           break;
1468           case SYMBOL_TLSLE32:
1469             emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1470                         (dest, imm));
1471             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1472                         (dest, dest, tp));
1473           break;
1474           case SYMBOL_TLSLE48:
1475             emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1476                         (dest, imm));
1477             emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1478                         (dest, dest, tp));
1479             break;
1480           default:
1481             gcc_unreachable ();
1482           }
1483
1484         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1485         return;
1486       }
1487
1488     case SYMBOL_TINY_GOT:
1489       emit_insn (gen_ldr_got_tiny (dest, imm));
1490       return;
1491
1492     case SYMBOL_TINY_TLSIE:
1493       {
1494         machine_mode mode = GET_MODE (dest);
1495         rtx tp = aarch64_load_tp (NULL);
1496
1497         if (mode == ptr_mode)
1498           {
1499             if (mode == DImode)
1500               emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1501             else
1502               {
1503                 tp = gen_lowpart (mode, tp);
1504                 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1505               }
1506           }
1507         else
1508           {
1509             gcc_assert (mode == Pmode);
1510             emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1511           }
1512
1513         set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1514         return;
1515       }
1516
1517     default:
1518       gcc_unreachable ();
1519     }
1520 }
1521
1522 /* Emit a move from SRC to DEST.  Assume that the move expanders can
1523    handle all moves if !can_create_pseudo_p ().  The distinction is
1524    important because, unlike emit_move_insn, the move expanders know
1525    how to force Pmode objects into the constant pool even when the
1526    constant pool address is not itself legitimate.  */
1527 static rtx
1528 aarch64_emit_move (rtx dest, rtx src)
1529 {
1530   return (can_create_pseudo_p ()
1531           ? emit_move_insn (dest, src)
1532           : emit_move_insn_1 (dest, src));
1533 }
1534
1535 /* Split a 128-bit move operation into two 64-bit move operations,
1536    taking care to handle partial overlap of register to register
1537    copies.  Special cases are needed when moving between GP regs and
1538    FP regs.  SRC can be a register, constant or memory; DST a register
1539    or memory.  If either operand is memory it must not have any side
1540    effects.  */
1541 void
1542 aarch64_split_128bit_move (rtx dst, rtx src)
1543 {
1544   rtx dst_lo, dst_hi;
1545   rtx src_lo, src_hi;
1546
1547   machine_mode mode = GET_MODE (dst);
1548
1549   gcc_assert (mode == TImode || mode == TFmode);
1550   gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1551   gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1552
1553   if (REG_P (dst) && REG_P (src))
1554     {
1555       int src_regno = REGNO (src);
1556       int dst_regno = REGNO (dst);
1557
1558       /* Handle FP <-> GP regs.  */
1559       if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1560         {
1561           src_lo = gen_lowpart (word_mode, src);
1562           src_hi = gen_highpart (word_mode, src);
1563
1564           if (mode == TImode)
1565             {
1566               emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1567               emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1568             }
1569           else
1570             {
1571               emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1572               emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1573             }
1574           return;
1575         }
1576       else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1577         {
1578           dst_lo = gen_lowpart (word_mode, dst);
1579           dst_hi = gen_highpart (word_mode, dst);
1580
1581           if (mode == TImode)
1582             {
1583               emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1584               emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1585             }
1586           else
1587             {
1588               emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1589               emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1590             }
1591           return;
1592         }
1593     }
1594
1595   dst_lo = gen_lowpart (word_mode, dst);
1596   dst_hi = gen_highpart (word_mode, dst);
1597   src_lo = gen_lowpart (word_mode, src);
1598   src_hi = gen_highpart_mode (word_mode, mode, src);
1599
1600   /* At most one pairing may overlap.  */
1601   if (reg_overlap_mentioned_p (dst_lo, src_hi))
1602     {
1603       aarch64_emit_move (dst_hi, src_hi);
1604       aarch64_emit_move (dst_lo, src_lo);
1605     }
1606   else
1607     {
1608       aarch64_emit_move (dst_lo, src_lo);
1609       aarch64_emit_move (dst_hi, src_hi);
1610     }
1611 }
1612
1613 bool
1614 aarch64_split_128bit_move_p (rtx dst, rtx src)
1615 {
1616   return (! REG_P (src)
1617           || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1618 }
1619
1620 /* Split a complex SIMD combine.  */
1621
1622 void
1623 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1624 {
1625   machine_mode src_mode = GET_MODE (src1);
1626   machine_mode dst_mode = GET_MODE (dst);
1627
1628   gcc_assert (VECTOR_MODE_P (dst_mode));
1629
1630   if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1631     {
1632       rtx (*gen) (rtx, rtx, rtx);
1633
1634       switch (src_mode)
1635         {
1636         case V8QImode:
1637           gen = gen_aarch64_simd_combinev8qi;
1638           break;
1639         case V4HImode:
1640           gen = gen_aarch64_simd_combinev4hi;
1641           break;
1642         case V2SImode:
1643           gen = gen_aarch64_simd_combinev2si;
1644           break;
1645         case V4HFmode:
1646           gen = gen_aarch64_simd_combinev4hf;
1647           break;
1648         case V2SFmode:
1649           gen = gen_aarch64_simd_combinev2sf;
1650           break;
1651         case DImode:
1652           gen = gen_aarch64_simd_combinedi;
1653           break;
1654         case DFmode:
1655           gen = gen_aarch64_simd_combinedf;
1656           break;
1657         default:
1658           gcc_unreachable ();
1659         }
1660
1661       emit_insn (gen (dst, src1, src2));
1662       return;
1663     }
1664 }
1665
1666 /* Split a complex SIMD move.  */
1667
1668 void
1669 aarch64_split_simd_move (rtx dst, rtx src)
1670 {
1671   machine_mode src_mode = GET_MODE (src);
1672   machine_mode dst_mode = GET_MODE (dst);
1673
1674   gcc_assert (VECTOR_MODE_P (dst_mode));
1675
1676   if (REG_P (dst) && REG_P (src))
1677     {
1678       rtx (*gen) (rtx, rtx);
1679
1680       gcc_assert (VECTOR_MODE_P (src_mode));
1681
1682       switch (src_mode)
1683         {
1684         case V16QImode:
1685           gen = gen_aarch64_split_simd_movv16qi;
1686           break;
1687         case V8HImode:
1688           gen = gen_aarch64_split_simd_movv8hi;
1689           break;
1690         case V4SImode:
1691           gen = gen_aarch64_split_simd_movv4si;
1692           break;
1693         case V2DImode:
1694           gen = gen_aarch64_split_simd_movv2di;
1695           break;
1696         case V8HFmode:
1697           gen = gen_aarch64_split_simd_movv8hf;
1698           break;
1699         case V4SFmode:
1700           gen = gen_aarch64_split_simd_movv4sf;
1701           break;
1702         case V2DFmode:
1703           gen = gen_aarch64_split_simd_movv2df;
1704           break;
1705         default:
1706           gcc_unreachable ();
1707         }
1708
1709       emit_insn (gen (dst, src));
1710       return;
1711     }
1712 }
1713
1714 bool
1715 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1716                               machine_mode ymode, rtx y)
1717 {
1718   rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1719   gcc_assert (r != NULL);
1720   return rtx_equal_p (x, r);
1721 }
1722
1723
1724 static rtx
1725 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1726 {
1727   if (can_create_pseudo_p ())
1728     return force_reg (mode, value);
1729   else
1730     {
1731       x = aarch64_emit_move (x, value);
1732       return x;
1733     }
1734 }
1735
1736
1737 static rtx
1738 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1739 {
1740   if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1741     {
1742       rtx high;
1743       /* Load the full offset into a register.  This
1744          might be improvable in the future.  */
1745       high = GEN_INT (offset);
1746       offset = 0;
1747       high = aarch64_force_temporary (mode, temp, high);
1748       reg = aarch64_force_temporary (mode, temp,
1749                                      gen_rtx_PLUS (mode, high, reg));
1750     }
1751   return plus_constant (mode, reg, offset);
1752 }
1753
1754 static int
1755 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1756                                 machine_mode mode)
1757 {
1758   int i;
1759   unsigned HOST_WIDE_INT val, val2, mask;
1760   int one_match, zero_match;
1761   int num_insns;
1762
1763   val = INTVAL (imm);
1764
1765   if (aarch64_move_imm (val, mode))
1766     {
1767       if (generate)
1768         emit_insn (gen_rtx_SET (dest, imm));
1769       return 1;
1770     }
1771
1772   if ((val >> 32) == 0 || mode == SImode)
1773     {
1774       if (generate)
1775         {
1776           emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1777           if (mode == SImode)
1778             emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1779                                        GEN_INT ((val >> 16) & 0xffff)));
1780           else
1781             emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1782                                        GEN_INT ((val >> 16) & 0xffff)));
1783         }
1784       return 2;
1785     }
1786
1787   /* Remaining cases are all for DImode.  */
1788
1789   mask = 0xffff;
1790   zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1791     ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1792   one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1793     ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1794
1795   if (zero_match != 2 && one_match != 2)
1796     {
1797       /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1798          For a 64-bit bitmask try whether changing 16 bits to all ones or
1799          zeroes creates a valid bitmask.  To check any repeated bitmask,
1800          try using 16 bits from the other 32-bit half of val.  */
1801
1802       for (i = 0; i < 64; i += 16, mask <<= 16)
1803         {
1804           val2 = val & ~mask;
1805           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1806             break;
1807           val2 = val | mask;
1808           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1809             break;
1810           val2 = val2 & ~mask;
1811           val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1812           if (val2 != val && aarch64_bitmask_imm (val2, mode))
1813             break;
1814         }
1815       if (i != 64)
1816         {
1817           if (generate)
1818             {
1819               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1820               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1821                                          GEN_INT ((val >> i) & 0xffff)));
1822             }
1823           return 2;
1824         }
1825     }
1826
1827   /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1828      are emitted by the initial mov.  If one_match > zero_match, skip set bits,
1829      otherwise skip zero bits.  */
1830
1831   num_insns = 1;
1832   mask = 0xffff;
1833   val2 = one_match > zero_match ? ~val : val;
1834   i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1835
1836   if (generate)
1837     emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1838                                            ? (val | ~(mask << i))
1839                                            : (val & (mask << i)))));
1840   for (i += 16; i < 64; i += 16)
1841     {
1842       if ((val2 & (mask << i)) == 0)
1843         continue;
1844       if (generate)
1845         emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1846                                    GEN_INT ((val >> i) & 0xffff)));
1847       num_insns ++;
1848     }
1849
1850   return num_insns;
1851 }
1852
1853
1854 void
1855 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1856 {
1857   machine_mode mode = GET_MODE (dest);
1858
1859   gcc_assert (mode == SImode || mode == DImode);
1860
1861   /* Check on what type of symbol it is.  */
1862   if (GET_CODE (imm) == SYMBOL_REF
1863       || GET_CODE (imm) == LABEL_REF
1864       || GET_CODE (imm) == CONST)
1865     {
1866       rtx mem, base, offset;
1867       enum aarch64_symbol_type sty;
1868
1869       /* If we have (const (plus symbol offset)), separate out the offset
1870          before we start classifying the symbol.  */
1871       split_const (imm, &base, &offset);
1872
1873       sty = aarch64_classify_symbol (base, offset);
1874       switch (sty)
1875         {
1876         case SYMBOL_FORCE_TO_MEM:
1877           if (offset != const0_rtx
1878               && targetm.cannot_force_const_mem (mode, imm))
1879             {
1880               gcc_assert (can_create_pseudo_p ());
1881               base = aarch64_force_temporary (mode, dest, base);
1882               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1883               aarch64_emit_move (dest, base);
1884               return;
1885             }
1886
1887           mem = force_const_mem (ptr_mode, imm);
1888           gcc_assert (mem);
1889
1890           /* If we aren't generating PC relative literals, then
1891              we need to expand the literal pool access carefully.
1892              This is something that needs to be done in a number
1893              of places, so could well live as a separate function.  */
1894           if (!aarch64_pcrelative_literal_loads)
1895             {
1896               gcc_assert (can_create_pseudo_p ());
1897               base = gen_reg_rtx (ptr_mode);
1898               aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1899               mem = gen_rtx_MEM (ptr_mode, base);
1900             }
1901
1902           if (mode != ptr_mode)
1903             mem = gen_rtx_ZERO_EXTEND (mode, mem);
1904
1905           emit_insn (gen_rtx_SET (dest, mem));
1906
1907           return;
1908
1909         case SYMBOL_SMALL_TLSGD:
1910         case SYMBOL_SMALL_TLSDESC:
1911         case SYMBOL_SMALL_TLSIE:
1912         case SYMBOL_SMALL_GOT_28K:
1913         case SYMBOL_SMALL_GOT_4G:
1914         case SYMBOL_TINY_GOT:
1915         case SYMBOL_TINY_TLSIE:
1916           if (offset != const0_rtx)
1917             {
1918               gcc_assert(can_create_pseudo_p ());
1919               base = aarch64_force_temporary (mode, dest, base);
1920               base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1921               aarch64_emit_move (dest, base);
1922               return;
1923             }
1924           /* FALLTHRU */
1925
1926         case SYMBOL_SMALL_ABSOLUTE:
1927         case SYMBOL_TINY_ABSOLUTE:
1928         case SYMBOL_TLSLE12:
1929         case SYMBOL_TLSLE24:
1930         case SYMBOL_TLSLE32:
1931         case SYMBOL_TLSLE48:
1932           aarch64_load_symref_appropriately (dest, imm, sty);
1933           return;
1934
1935         default:
1936           gcc_unreachable ();
1937         }
1938     }
1939
1940   if (!CONST_INT_P (imm))
1941     {
1942       if (GET_CODE (imm) == HIGH)
1943         emit_insn (gen_rtx_SET (dest, imm));
1944       else
1945         {
1946           rtx mem = force_const_mem (mode, imm);
1947           gcc_assert (mem);
1948           emit_insn (gen_rtx_SET (dest, mem));
1949         }
1950
1951       return;
1952     }
1953
1954   aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1955 }
1956
1957 /* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
1958    temporary value if necessary.  FRAME_RELATED_P should be true if
1959    the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1960    to the generated instructions.  If SCRATCHREG is known to hold
1961    abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1962    immediate again.
1963
1964    Since this function may be used to adjust the stack pointer, we must
1965    ensure that it cannot cause transient stack deallocation (for example
1966    by first incrementing SP and then decrementing when adjusting by a
1967    large immediate).  */
1968
1969 static void
1970 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1971                                HOST_WIDE_INT delta, bool frame_related_p,
1972                                bool emit_move_imm)
1973 {
1974   HOST_WIDE_INT mdelta = abs_hwi (delta);
1975   rtx this_rtx = gen_rtx_REG (mode, regnum);
1976   rtx_insn *insn;
1977
1978   if (!mdelta)
1979     return;
1980
1981   /* Single instruction adjustment.  */
1982   if (aarch64_uimm12_shift (mdelta))
1983     {
1984       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1985       RTX_FRAME_RELATED_P (insn) = frame_related_p;
1986       return;
1987     }
1988
1989   /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
1990      Only do this if mdelta is not a 16-bit move as adjusting using a move
1991      is better.  */
1992   if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
1993     {
1994       HOST_WIDE_INT low_off = mdelta & 0xfff;
1995
1996       low_off = delta < 0 ? -low_off : low_off;
1997       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
1998       RTX_FRAME_RELATED_P (insn) = frame_related_p;
1999       insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2000       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2001       return;
2002     }
2003
2004   /* Emit a move immediate if required and an addition/subtraction.  */
2005   rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2006   if (emit_move_imm)
2007     aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2008   insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2009                               : gen_add2_insn (this_rtx, scratch_rtx));
2010   if (frame_related_p)
2011     {
2012       RTX_FRAME_RELATED_P (insn) = frame_related_p;
2013       rtx adj = plus_constant (mode, this_rtx, delta);
2014       add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2015     }
2016 }
2017
2018 static inline void
2019 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2020                       HOST_WIDE_INT delta)
2021 {
2022   aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2023 }
2024
2025 static inline void
2026 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2027 {
2028   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2029                                  true, emit_move_imm);
2030 }
2031
2032 static inline void
2033 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2034 {
2035   aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2036                                  frame_related_p, true);
2037 }
2038
2039 static bool
2040 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2041                                  tree exp ATTRIBUTE_UNUSED)
2042 {
2043   /* Currently, always true.  */
2044   return true;
2045 }
2046
2047 /* Implement TARGET_PASS_BY_REFERENCE.  */
2048
2049 static bool
2050 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2051                            machine_mode mode,
2052                            const_tree type,
2053                            bool named ATTRIBUTE_UNUSED)
2054 {
2055   HOST_WIDE_INT size;
2056   machine_mode dummymode;
2057   int nregs;
2058
2059   /* GET_MODE_SIZE (BLKmode) is useless since it is 0.  */
2060   size = (mode == BLKmode && type)
2061     ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2062
2063   /* Aggregates are passed by reference based on their size.  */
2064   if (type && AGGREGATE_TYPE_P (type))
2065     {
2066       size = int_size_in_bytes (type);
2067     }
2068
2069   /* Variable sized arguments are always returned by reference.  */
2070   if (size < 0)
2071     return true;
2072
2073   /* Can this be a candidate to be passed in fp/simd register(s)?  */
2074   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2075                                                &dummymode, &nregs,
2076                                                NULL))
2077     return false;
2078
2079   /* Arguments which are variable sized or larger than 2 registers are
2080      passed by reference unless they are a homogenous floating point
2081      aggregate.  */
2082   return size > 2 * UNITS_PER_WORD;
2083 }
2084
2085 /* Return TRUE if VALTYPE is padded to its least significant bits.  */
2086 static bool
2087 aarch64_return_in_msb (const_tree valtype)
2088 {
2089   machine_mode dummy_mode;
2090   int dummy_int;
2091
2092   /* Never happens in little-endian mode.  */
2093   if (!BYTES_BIG_ENDIAN)
2094     return false;
2095
2096   /* Only composite types smaller than or equal to 16 bytes can
2097      be potentially returned in registers.  */
2098   if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2099       || int_size_in_bytes (valtype) <= 0
2100       || int_size_in_bytes (valtype) > 16)
2101     return false;
2102
2103   /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2104      or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2105      is always passed/returned in the least significant bits of fp/simd
2106      register(s).  */
2107   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2108                                                &dummy_mode, &dummy_int, NULL))
2109     return false;
2110
2111   return true;
2112 }
2113
2114 /* Implement TARGET_FUNCTION_VALUE.
2115    Define how to find the value returned by a function.  */
2116
2117 static rtx
2118 aarch64_function_value (const_tree type, const_tree func,
2119                         bool outgoing ATTRIBUTE_UNUSED)
2120 {
2121   machine_mode mode;
2122   int unsignedp;
2123   int count;
2124   machine_mode ag_mode;
2125
2126   mode = TYPE_MODE (type);
2127   if (INTEGRAL_TYPE_P (type))
2128     mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2129
2130   if (aarch64_return_in_msb (type))
2131     {
2132       HOST_WIDE_INT size = int_size_in_bytes (type);
2133
2134       if (size % UNITS_PER_WORD != 0)
2135         {
2136           size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2137           mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2138         }
2139     }
2140
2141   if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2142                                                &ag_mode, &count, NULL))
2143     {
2144       if (!aarch64_composite_type_p (type, mode))
2145         {
2146           gcc_assert (count == 1 && mode == ag_mode);
2147           return gen_rtx_REG (mode, V0_REGNUM);
2148         }
2149       else
2150         {
2151           int i;
2152           rtx par;
2153
2154           par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2155           for (i = 0; i < count; i++)
2156             {
2157               rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2158               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2159                                        GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2160               XVECEXP (par, 0, i) = tmp;
2161             }
2162           return par;
2163         }
2164     }
2165   else
2166     return gen_rtx_REG (mode, R0_REGNUM);
2167 }
2168
2169 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2170    Return true if REGNO is the number of a hard register in which the values
2171    of called function may come back.  */
2172
2173 static bool
2174 aarch64_function_value_regno_p (const unsigned int regno)
2175 {
2176   /* Maximum of 16 bytes can be returned in the general registers.  Examples
2177      of 16-byte return values are: 128-bit integers and 16-byte small
2178      structures (excluding homogeneous floating-point aggregates).  */
2179   if (regno == R0_REGNUM || regno == R1_REGNUM)
2180     return true;
2181
2182   /* Up to four fp/simd registers can return a function value, e.g. a
2183      homogeneous floating-point aggregate having four members.  */
2184   if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2185     return TARGET_FLOAT;
2186
2187   return false;
2188 }
2189
2190 /* Implement TARGET_RETURN_IN_MEMORY.
2191
2192    If the type T of the result of a function is such that
2193      void func (T arg)
2194    would require that arg be passed as a value in a register (or set of
2195    registers) according to the parameter passing rules, then the result
2196    is returned in the same registers as would be used for such an
2197    argument.  */
2198
2199 static bool
2200 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2201 {
2202   HOST_WIDE_INT size;
2203   machine_mode ag_mode;
2204   int count;
2205
2206   if (!AGGREGATE_TYPE_P (type)
2207       && TREE_CODE (type) != COMPLEX_TYPE
2208       && TREE_CODE (type) != VECTOR_TYPE)
2209     /* Simple scalar types always returned in registers.  */
2210     return false;
2211
2212   if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2213                                                type,
2214                                                &ag_mode,
2215                                                &count,
2216                                                NULL))
2217     return false;
2218
2219   /* Types larger than 2 registers returned in memory.  */
2220   size = int_size_in_bytes (type);
2221   return (size < 0 || size > 2 * UNITS_PER_WORD);
2222 }
2223
2224 static bool
2225 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2226                                const_tree type, int *nregs)
2227 {
2228   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2229   return aarch64_vfp_is_call_or_return_candidate (mode,
2230                                                   type,
2231                                                   &pcum->aapcs_vfp_rmode,
2232                                                   nregs,
2233                                                   NULL);
2234 }
2235
2236 /* Given MODE and TYPE of a function argument, return the alignment in
2237    bits.  The idea is to suppress any stronger alignment requested by
2238    the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2239    This is a helper function for local use only.  */
2240
2241 static unsigned int
2242 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2243 {
2244   if (!type)
2245     return GET_MODE_ALIGNMENT (mode);
2246   if (integer_zerop (TYPE_SIZE (type)))
2247     return 0;
2248
2249   gcc_assert (TYPE_MODE (type) == mode);
2250
2251   if (!AGGREGATE_TYPE_P (type))
2252     return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2253
2254   if (TREE_CODE (type) == ARRAY_TYPE)
2255     return TYPE_ALIGN (TREE_TYPE (type));
2256
2257   unsigned int alignment = 0;
2258
2259   for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2260     alignment = std::max (alignment, DECL_ALIGN (field));
2261
2262   return alignment;
2263 }
2264
2265 /* Layout a function argument according to the AAPCS64 rules.  The rule
2266    numbers refer to the rule numbers in the AAPCS64.  */
2267
2268 static void
2269 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2270                     const_tree type,
2271                     bool named ATTRIBUTE_UNUSED)
2272 {
2273   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2274   int ncrn, nvrn, nregs;
2275   bool allocate_ncrn, allocate_nvrn;
2276   HOST_WIDE_INT size;
2277
2278   /* We need to do this once per argument.  */
2279   if (pcum->aapcs_arg_processed)
2280     return;
2281
2282   pcum->aapcs_arg_processed = true;
2283
2284   /* Size in bytes, rounded to the nearest multiple of 8 bytes.  */
2285   size
2286     = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2287                 UNITS_PER_WORD);
2288
2289   allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2290   allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2291                                                  mode,
2292                                                  type,
2293                                                  &nregs);
2294
2295   /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
2296      The following code thus handles passing by SIMD/FP registers first.  */
2297
2298   nvrn = pcum->aapcs_nvrn;
2299
2300   /* C1 - C5 for floating point, homogenous floating point aggregates (HFA)
2301      and homogenous short-vector aggregates (HVA).  */
2302   if (allocate_nvrn)
2303     {
2304       if (!TARGET_FLOAT)
2305         aarch64_err_no_fpadvsimd (mode, "argument");
2306
2307       if (nvrn + nregs <= NUM_FP_ARG_REGS)
2308         {
2309           pcum->aapcs_nextnvrn = nvrn + nregs;
2310           if (!aarch64_composite_type_p (type, mode))
2311             {
2312               gcc_assert (nregs == 1);
2313               pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2314             }
2315           else
2316             {
2317               rtx par;
2318               int i;
2319               par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2320               for (i = 0; i < nregs; i++)
2321                 {
2322                   rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2323                                          V0_REGNUM + nvrn + i);
2324                   tmp = gen_rtx_EXPR_LIST
2325                     (VOIDmode, tmp,
2326                      GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2327                   XVECEXP (par, 0, i) = tmp;
2328                 }
2329               pcum->aapcs_reg = par;
2330             }
2331           return;
2332         }
2333       else
2334         {
2335           /* C.3 NSRN is set to 8.  */
2336           pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2337           goto on_stack;
2338         }
2339     }
2340
2341   ncrn = pcum->aapcs_ncrn;
2342   nregs = size / UNITS_PER_WORD;
2343
2344   /* C6 - C9.  though the sign and zero extension semantics are
2345      handled elsewhere.  This is the case where the argument fits
2346      entirely general registers.  */
2347   if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2348     {
2349       unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2350
2351       gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2352
2353       /* C.8 if the argument has an alignment of 16 then the NGRN is
2354          rounded up to the next even number.  */
2355       if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2356         {
2357           ++ncrn;
2358           gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2359         }
2360       /* NREGS can be 0 when e.g. an empty structure is to be passed.
2361          A reg is still generated for it, but the caller should be smart
2362          enough not to use it.  */
2363       if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2364         {
2365           pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2366         }
2367       else
2368         {
2369           rtx par;
2370           int i;
2371
2372           par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2373           for (i = 0; i < nregs; i++)
2374             {
2375               rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2376               tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2377                                        GEN_INT (i * UNITS_PER_WORD));
2378               XVECEXP (par, 0, i) = tmp;
2379             }
2380           pcum->aapcs_reg = par;
2381         }
2382
2383       pcum->aapcs_nextncrn = ncrn + nregs;
2384       return;
2385     }
2386
2387   /* C.11  */
2388   pcum->aapcs_nextncrn = NUM_ARG_REGS;
2389
2390   /* The argument is passed on stack; record the needed number of words for
2391      this argument and align the total size if necessary.  */
2392 on_stack:
2393   pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2394   if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2395     pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2396                                        16 / UNITS_PER_WORD);
2397   return;
2398 }
2399
2400 /* Implement TARGET_FUNCTION_ARG.  */
2401
2402 static rtx
2403 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2404                       const_tree type, bool named)
2405 {
2406   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2407   gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2408
2409   if (mode == VOIDmode)
2410     return NULL_RTX;
2411
2412   aarch64_layout_arg (pcum_v, mode, type, named);
2413   return pcum->aapcs_reg;
2414 }
2415
2416 void
2417 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2418                            const_tree fntype ATTRIBUTE_UNUSED,
2419                            rtx libname ATTRIBUTE_UNUSED,
2420                            const_tree fndecl ATTRIBUTE_UNUSED,
2421                            unsigned n_named ATTRIBUTE_UNUSED)
2422 {
2423   pcum->aapcs_ncrn = 0;
2424   pcum->aapcs_nvrn = 0;
2425   pcum->aapcs_nextncrn = 0;
2426   pcum->aapcs_nextnvrn = 0;
2427   pcum->pcs_variant = ARM_PCS_AAPCS64;
2428   pcum->aapcs_reg = NULL_RTX;
2429   pcum->aapcs_arg_processed = false;
2430   pcum->aapcs_stack_words = 0;
2431   pcum->aapcs_stack_size = 0;
2432
2433   if (!TARGET_FLOAT
2434       && fndecl && TREE_PUBLIC (fndecl)
2435       && fntype && fntype != error_mark_node)
2436     {
2437       const_tree type = TREE_TYPE (fntype);
2438       machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument.  */
2439       int nregs ATTRIBUTE_UNUSED; /* Likewise.  */
2440       if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2441                                                    &mode, &nregs, NULL))
2442         aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2443     }
2444   return;
2445 }
2446
2447 static void
2448 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2449                               machine_mode mode,
2450                               const_tree type,
2451                               bool named)
2452 {
2453   CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2454   if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2455     {
2456       aarch64_layout_arg (pcum_v, mode, type, named);
2457       gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2458                   != (pcum->aapcs_stack_words != 0));
2459       pcum->aapcs_arg_processed = false;
2460       pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2461       pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2462       pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2463       pcum->aapcs_stack_words = 0;
2464       pcum->aapcs_reg = NULL_RTX;
2465     }
2466 }
2467
2468 bool
2469 aarch64_function_arg_regno_p (unsigned regno)
2470 {
2471   return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2472           || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2473 }
2474
2475 /* Implement FUNCTION_ARG_BOUNDARY.  Every parameter gets at least
2476    PARM_BOUNDARY bits of alignment, but will be given anything up
2477    to STACK_BOUNDARY bits if the type requires it.  This makes sure
2478    that both before and after the layout of each argument, the Next
2479    Stacked Argument Address (NSAA) will have a minimum alignment of
2480    8 bytes.  */
2481
2482 static unsigned int
2483 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2484 {
2485   unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2486
2487   if (alignment < PARM_BOUNDARY)
2488     alignment = PARM_BOUNDARY;
2489   if (alignment > STACK_BOUNDARY)
2490     alignment = STACK_BOUNDARY;
2491   return alignment;
2492 }
2493
2494 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2495
2496    Return true if an argument passed on the stack should be padded upwards,
2497    i.e. if the least-significant byte of the stack slot has useful data.
2498
2499    Small aggregate types are placed in the lowest memory address.
2500
2501    The related parameter passing rules are B.4, C.3, C.5 and C.14.  */
2502
2503 bool
2504 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2505 {
2506   /* On little-endian targets, the least significant byte of every stack
2507      argument is passed at the lowest byte address of the stack slot.  */
2508   if (!BYTES_BIG_ENDIAN)
2509     return true;
2510
2511   /* Otherwise, integral, floating-point and pointer types are padded downward:
2512      the least significant byte of a stack argument is passed at the highest
2513      byte address of the stack slot.  */
2514   if (type
2515       ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2516          || POINTER_TYPE_P (type))
2517       : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2518     return false;
2519
2520   /* Everything else padded upward, i.e. data in first byte of stack slot.  */
2521   return true;
2522 }
2523
2524 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2525
2526    It specifies padding for the last (may also be the only)
2527    element of a block move between registers and memory.  If
2528    assuming the block is in the memory, padding upward means that
2529    the last element is padded after its highest significant byte,
2530    while in downward padding, the last element is padded at the
2531    its least significant byte side.
2532
2533    Small aggregates and small complex types are always padded
2534    upwards.
2535
2536    We don't need to worry about homogeneous floating-point or
2537    short-vector aggregates; their move is not affected by the
2538    padding direction determined here.  Regardless of endianness,
2539    each element of such an aggregate is put in the least
2540    significant bits of a fp/simd register.
2541
2542    Return !BYTES_BIG_ENDIAN if the least significant byte of the
2543    register has useful data, and return the opposite if the most
2544    significant byte does.  */
2545
2546 bool
2547 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2548                      bool first ATTRIBUTE_UNUSED)
2549 {
2550
2551   /* Small composite types are always padded upward.  */
2552   if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2553     {
2554       HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2555                             : GET_MODE_SIZE (mode));
2556       if (size < 2 * UNITS_PER_WORD)
2557         return true;
2558     }
2559
2560   /* Otherwise, use the default padding.  */
2561   return !BYTES_BIG_ENDIAN;
2562 }
2563
2564 static machine_mode
2565 aarch64_libgcc_cmp_return_mode (void)
2566 {
2567   return SImode;
2568 }
2569
2570 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2571
2572 /* We use the 12-bit shifted immediate arithmetic instructions so values
2573    must be multiple of (1 << 12), i.e. 4096.  */
2574 #define ARITH_FACTOR 4096
2575
2576 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2577 #error Cannot use simple address calculation for stack probing
2578 #endif
2579
2580 /* The pair of scratch registers used for stack probing.  */
2581 #define PROBE_STACK_FIRST_REG  9
2582 #define PROBE_STACK_SECOND_REG 10
2583
2584 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2585    inclusive.  These are offsets from the current stack pointer.  */
2586
2587 static void
2588 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2589 {
2590   rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2591
2592   /* See the same assertion on PROBE_INTERVAL above.  */
2593   gcc_assert ((first % ARITH_FACTOR) == 0);
2594
2595   /* See if we have a constant small number of probes to generate.  If so,
2596      that's the easy case.  */
2597   if (size <= PROBE_INTERVAL)
2598     {
2599       const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2600
2601       emit_set_insn (reg1,
2602                      plus_constant (ptr_mode,
2603                                     stack_pointer_rtx, -(first + base)));
2604       emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2605     }
2606
2607   /* The run-time loop is made up of 8 insns in the generic case while the
2608      compile-time loop is made up of 4+2*(n-2) insns for n # of intervals.  */
2609   else if (size <= 4 * PROBE_INTERVAL)
2610     {
2611       HOST_WIDE_INT i, rem;
2612
2613       emit_set_insn (reg1,
2614                      plus_constant (ptr_mode,
2615                                     stack_pointer_rtx,
2616                                     -(first + PROBE_INTERVAL)));
2617       emit_stack_probe (reg1);
2618
2619       /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2620          it exceeds SIZE.  If only two probes are needed, this will not
2621          generate any code.  Then probe at FIRST + SIZE.  */
2622       for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2623         {
2624           emit_set_insn (reg1,
2625                          plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2626           emit_stack_probe (reg1);
2627         }
2628
2629       rem = size - (i - PROBE_INTERVAL);
2630       if (rem > 256)
2631         {
2632           const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2633
2634           emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2635           emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2636         }
2637       else
2638         emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2639     }
2640
2641   /* Otherwise, do the same as above, but in a loop.  Note that we must be
2642      extra careful with variables wrapping around because we might be at
2643      the very top (or the very bottom) of the address space and we have
2644      to be able to handle this case properly; in particular, we use an
2645      equality test for the loop condition.  */
2646   else
2647     {
2648       rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2649
2650       /* Step 1: round SIZE to the previous multiple of the interval.  */
2651
2652       HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2653
2654
2655       /* Step 2: compute initial and final value of the loop counter.  */
2656
2657       /* TEST_ADDR = SP + FIRST.  */
2658       emit_set_insn (reg1,
2659                      plus_constant (ptr_mode, stack_pointer_rtx, -first));
2660
2661       /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE.  */
2662       emit_set_insn (reg2,
2663                      plus_constant (ptr_mode, stack_pointer_rtx,
2664                                     -(first + rounded_size)));
2665
2666
2667       /* Step 3: the loop
2668
2669          do
2670            {
2671              TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2672              probe at TEST_ADDR
2673            }
2674          while (TEST_ADDR != LAST_ADDR)
2675
2676          probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2677          until it is equal to ROUNDED_SIZE.  */
2678
2679       if (ptr_mode == DImode)
2680         emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2681       else
2682         emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2683
2684
2685       /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2686          that SIZE is equal to ROUNDED_SIZE.  */
2687
2688       if (size != rounded_size)
2689         {
2690           HOST_WIDE_INT rem = size - rounded_size;
2691
2692           if (rem > 256)
2693             {
2694               const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2695
2696               emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2697               emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2698             }
2699           else
2700             emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2701         }
2702     }
2703
2704   /* Make sure nothing is scheduled before we are done.  */
2705   emit_insn (gen_blockage ());
2706 }
2707
2708 /* Probe a range of stack addresses from REG1 to REG2 inclusive.  These are
2709    absolute addresses.  */
2710
2711 const char *
2712 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2713 {
2714   static int labelno = 0;
2715   char loop_lab[32];
2716   rtx xops[2];
2717
2718   ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2719
2720   /* Loop.  */
2721   ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2722
2723   /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL.  */
2724   xops[0] = reg1;
2725   xops[1] = GEN_INT (PROBE_INTERVAL);
2726   output_asm_insn ("sub\t%0, %0, %1", xops);
2727
2728   /* Probe at TEST_ADDR.  */
2729   output_asm_insn ("str\txzr, [%0]", xops);
2730
2731   /* Test if TEST_ADDR == LAST_ADDR.  */
2732   xops[1] = reg2;
2733   output_asm_insn ("cmp\t%0, %1", xops);
2734
2735   /* Branch.  */
2736   fputs ("\tb.ne\t", asm_out_file);
2737   assemble_name_raw (asm_out_file, loop_lab);
2738   fputc ('\n', asm_out_file);
2739
2740   return "";
2741 }
2742
2743 static bool
2744 aarch64_frame_pointer_required (void)
2745 {
2746   /* In aarch64_override_options_after_change
2747      flag_omit_leaf_frame_pointer turns off the frame pointer by
2748      default.  Turn it back on now if we've not got a leaf
2749      function.  */
2750   if (flag_omit_leaf_frame_pointer
2751       && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2752     return true;
2753
2754   return false;
2755 }
2756
2757 /* Mark the registers that need to be saved by the callee and calculate
2758    the size of the callee-saved registers area and frame record (both FP
2759    and LR may be omitted).  */
2760 static void
2761 aarch64_layout_frame (void)
2762 {
2763   HOST_WIDE_INT offset = 0;
2764   int regno, last_fp_reg = INVALID_REGNUM;
2765
2766   if (reload_completed && cfun->machine->frame.laid_out)
2767     return;
2768
2769 #define SLOT_NOT_REQUIRED (-2)
2770 #define SLOT_REQUIRED     (-1)
2771
2772   cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2773   cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2774
2775   /* First mark all the registers that really need to be saved...  */
2776   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2777     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2778
2779   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2780     cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2781
2782   /* ... that includes the eh data registers (if needed)...  */
2783   if (crtl->calls_eh_return)
2784     for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2785       cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2786         = SLOT_REQUIRED;
2787
2788   /* ... and any callee saved register that dataflow says is live.  */
2789   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2790     if (df_regs_ever_live_p (regno)
2791         && (regno == R30_REGNUM
2792             || !call_used_regs[regno]))
2793       cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2794
2795   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2796     if (df_regs_ever_live_p (regno)
2797         && !call_used_regs[regno])
2798       {
2799         cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2800         last_fp_reg = regno;
2801       }
2802
2803   if (frame_pointer_needed)
2804     {
2805       /* FP and LR are placed in the linkage record.  */
2806       cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2807       cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2808       cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2809       cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2810       offset += 2 * UNITS_PER_WORD;
2811     }
2812
2813   /* Now assign stack slots for them.  */
2814   for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2815     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2816       {
2817         cfun->machine->frame.reg_offset[regno] = offset;
2818         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2819           cfun->machine->frame.wb_candidate1 = regno;
2820         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2821           cfun->machine->frame.wb_candidate2 = regno;
2822         offset += UNITS_PER_WORD;
2823       }
2824
2825   HOST_WIDE_INT max_int_offset = offset;
2826   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2827   bool has_align_gap = offset != max_int_offset;
2828
2829   for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2830     if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2831       {
2832         /* If there is an alignment gap between integer and fp callee-saves,
2833            allocate the last fp register to it if possible.  */
2834         if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2835           {
2836             cfun->machine->frame.reg_offset[regno] = max_int_offset;
2837             break;
2838           }
2839
2840         cfun->machine->frame.reg_offset[regno] = offset;
2841         if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2842           cfun->machine->frame.wb_candidate1 = regno;
2843         else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2844                  && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2845           cfun->machine->frame.wb_candidate2 = regno;
2846         offset += UNITS_PER_WORD;
2847       }
2848
2849   offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2850
2851   cfun->machine->frame.saved_regs_size = offset;
2852
2853   HOST_WIDE_INT varargs_and_saved_regs_size
2854     = offset + cfun->machine->frame.saved_varargs_size;
2855
2856   cfun->machine->frame.hard_fp_offset
2857     = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2858                 STACK_BOUNDARY / BITS_PER_UNIT);
2859
2860   cfun->machine->frame.frame_size
2861     = ROUND_UP (cfun->machine->frame.hard_fp_offset
2862                 + crtl->outgoing_args_size,
2863                 STACK_BOUNDARY / BITS_PER_UNIT);
2864
2865   cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2866
2867   cfun->machine->frame.initial_adjust = 0;
2868   cfun->machine->frame.final_adjust = 0;
2869   cfun->machine->frame.callee_adjust = 0;
2870   cfun->machine->frame.callee_offset = 0;
2871
2872   HOST_WIDE_INT max_push_offset = 0;
2873   if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2874     max_push_offset = 512;
2875   else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2876     max_push_offset = 256;
2877
2878   if (cfun->machine->frame.frame_size < max_push_offset
2879       && crtl->outgoing_args_size == 0)
2880     {
2881       /* Simple, small frame with no outgoing arguments:
2882          stp reg1, reg2, [sp, -frame_size]!
2883          stp reg3, reg4, [sp, 16]  */
2884       cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2885     }
2886   else if ((crtl->outgoing_args_size
2887             + cfun->machine->frame.saved_regs_size < 512)
2888            && !(cfun->calls_alloca
2889                 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2890     {
2891       /* Frame with small outgoing arguments:
2892          sub sp, sp, frame_size
2893          stp reg1, reg2, [sp, outgoing_args_size]
2894          stp reg3, reg4, [sp, outgoing_args_size + 16]  */
2895       cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2896       cfun->machine->frame.callee_offset
2897         = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2898     }
2899   else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2900     {
2901       /* Frame with large outgoing arguments but a small local area:
2902          stp reg1, reg2, [sp, -hard_fp_offset]!
2903          stp reg3, reg4, [sp, 16]
2904          sub sp, sp, outgoing_args_size  */
2905       cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2906       cfun->machine->frame.final_adjust
2907         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2908     }
2909   else if (!frame_pointer_needed
2910            && varargs_and_saved_regs_size < max_push_offset)
2911     {
2912       /* Frame with large local area and outgoing arguments (this pushes the
2913          callee-saves first, followed by the locals and outgoing area):
2914          stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2915          stp reg3, reg4, [sp, 16]
2916          sub sp, sp, frame_size - varargs_and_saved_regs_size  */
2917       cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2918       cfun->machine->frame.final_adjust
2919         = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2920       cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2921       cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2922     }
2923   else
2924     {
2925       /* Frame with large local area and outgoing arguments using frame pointer:
2926          sub sp, sp, hard_fp_offset
2927          stp x29, x30, [sp, 0]
2928          add x29, sp, 0
2929          stp reg3, reg4, [sp, 16]
2930          sub sp, sp, outgoing_args_size  */
2931       cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2932       cfun->machine->frame.final_adjust
2933         = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2934     }
2935
2936   cfun->machine->frame.laid_out = true;
2937 }
2938
2939 /* Return true if the register REGNO is saved on entry to
2940    the current function.  */
2941
2942 static bool
2943 aarch64_register_saved_on_entry (int regno)
2944 {
2945   return cfun->machine->frame.reg_offset[regno] >= 0;
2946 }
2947
2948 /* Return the next register up from REGNO up to LIMIT for the callee
2949    to save.  */
2950
2951 static unsigned
2952 aarch64_next_callee_save (unsigned regno, unsigned limit)
2953 {
2954   while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2955     regno ++;
2956   return regno;
2957 }
2958
2959 /* Push the register number REGNO of mode MODE to the stack with write-back
2960    adjusting the stack by ADJUSTMENT.  */
2961
2962 static void
2963 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2964                            HOST_WIDE_INT adjustment)
2965  {
2966   rtx base_rtx = stack_pointer_rtx;
2967   rtx insn, reg, mem;
2968
2969   reg = gen_rtx_REG (mode, regno);
2970   mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2971                             plus_constant (Pmode, base_rtx, -adjustment));
2972   mem = gen_rtx_MEM (mode, mem);
2973
2974   insn = emit_move_insn (mem, reg);
2975   RTX_FRAME_RELATED_P (insn) = 1;
2976 }
2977
2978 /* Generate and return an instruction to store the pair of registers
2979    REG and REG2 of mode MODE to location BASE with write-back adjusting
2980    the stack location BASE by ADJUSTMENT.  */
2981
2982 static rtx
2983 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2984                           HOST_WIDE_INT adjustment)
2985 {
2986   switch (mode)
2987     {
2988     case DImode:
2989       return gen_storewb_pairdi_di (base, base, reg, reg2,
2990                                     GEN_INT (-adjustment),
2991                                     GEN_INT (UNITS_PER_WORD - adjustment));
2992     case DFmode:
2993       return gen_storewb_pairdf_di (base, base, reg, reg2,
2994                                     GEN_INT (-adjustment),
2995                                     GEN_INT (UNITS_PER_WORD - adjustment));
2996     default:
2997       gcc_unreachable ();
2998     }
2999 }
3000
3001 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3002    stack pointer by ADJUSTMENT.  */
3003
3004 static void
3005 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3006 {
3007   rtx_insn *insn;
3008   machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3009
3010   if (regno2 == INVALID_REGNUM)
3011     return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3012
3013   rtx reg1 = gen_rtx_REG (mode, regno1);
3014   rtx reg2 = gen_rtx_REG (mode, regno2);
3015
3016   insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3017                                               reg2, adjustment));
3018   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3019   RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3020   RTX_FRAME_RELATED_P (insn) = 1;
3021 }
3022
3023 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
3024    adjusting it by ADJUSTMENT afterwards.  */
3025
3026 static rtx
3027 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3028                          HOST_WIDE_INT adjustment)
3029 {
3030   switch (mode)
3031     {
3032     case DImode:
3033       return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3034                                    GEN_INT (UNITS_PER_WORD));
3035     case DFmode:
3036       return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3037                                    GEN_INT (UNITS_PER_WORD));
3038     default:
3039       gcc_unreachable ();
3040     }
3041 }
3042
3043 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3044    afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3045    into CFI_OPS.  */
3046
3047 static void
3048 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3049                   rtx *cfi_ops)
3050 {
3051   machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3052   rtx reg1 = gen_rtx_REG (mode, regno1);
3053
3054   *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3055
3056   if (regno2 == INVALID_REGNUM)
3057     {
3058       rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3059       mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3060       emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3061     }
3062   else
3063     {
3064       rtx reg2 = gen_rtx_REG (mode, regno2);
3065       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3066       emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3067                                           reg2, adjustment));
3068     }
3069 }
3070
3071 /* Generate and return a store pair instruction of mode MODE to store
3072    register REG1 to MEM1 and register REG2 to MEM2.  */
3073
3074 static rtx
3075 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3076                         rtx reg2)
3077 {
3078   switch (mode)
3079     {
3080     case DImode:
3081       return gen_store_pairdi (mem1, reg1, mem2, reg2);
3082
3083     case DFmode:
3084       return gen_store_pairdf (mem1, reg1, mem2, reg2);
3085
3086     default:
3087       gcc_unreachable ();
3088     }
3089 }
3090
3091 /* Generate and regurn a load pair isntruction of mode MODE to load register
3092    REG1 from MEM1 and register REG2 from MEM2.  */
3093
3094 static rtx
3095 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3096                        rtx mem2)
3097 {
3098   switch (mode)
3099     {
3100     case DImode:
3101       return gen_load_pairdi (reg1, mem1, reg2, mem2);
3102
3103     case DFmode:
3104       return gen_load_pairdf (reg1, mem1, reg2, mem2);
3105
3106     default:
3107       gcc_unreachable ();
3108     }
3109 }
3110
3111 /* Emit code to save the callee-saved registers from register number START
3112    to LIMIT to the stack at the location starting at offset START_OFFSET,
3113    skipping any write-back candidates if SKIP_WB is true.  */
3114
3115 static void
3116 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3117                            unsigned start, unsigned limit, bool skip_wb)
3118 {
3119   rtx_insn *insn;
3120   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3121                                                  ? gen_frame_mem : gen_rtx_MEM);
3122   unsigned regno;
3123   unsigned regno2;
3124
3125   for (regno = aarch64_next_callee_save (start, limit);
3126        regno <= limit;
3127        regno = aarch64_next_callee_save (regno + 1, limit))
3128     {
3129       rtx reg, mem;
3130       HOST_WIDE_INT offset;
3131
3132       if (skip_wb
3133           && (regno == cfun->machine->frame.wb_candidate1
3134               || regno == cfun->machine->frame.wb_candidate2))
3135         continue;
3136
3137       reg = gen_rtx_REG (mode, regno);
3138       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3139       mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3140                                               offset));
3141
3142       regno2 = aarch64_next_callee_save (regno + 1, limit);
3143
3144       if (regno2 <= limit
3145           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3146               == cfun->machine->frame.reg_offset[regno2]))
3147
3148         {
3149           rtx reg2 = gen_rtx_REG (mode, regno2);
3150           rtx mem2;
3151
3152           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3153           mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3154                                                    offset));
3155           insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3156                                                     reg2));
3157
3158           /* The first part of a frame-related parallel insn is
3159              always assumed to be relevant to the frame
3160              calculations; subsequent parts, are only
3161              frame-related if explicitly marked.  */
3162           RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3163           regno = regno2;
3164         }
3165       else
3166         insn = emit_move_insn (mem, reg);
3167
3168       RTX_FRAME_RELATED_P (insn) = 1;
3169     }
3170 }
3171
3172 /* Emit code to restore the callee registers of mode MODE from register
3173    number START up to and including LIMIT.  Restore from the stack offset
3174    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3175    Write the appropriate REG_CFA_RESTORE notes into CFI_OPS.  */
3176
3177 static void
3178 aarch64_restore_callee_saves (machine_mode mode,
3179                               HOST_WIDE_INT start_offset, unsigned start,
3180                               unsigned limit, bool skip_wb, rtx *cfi_ops)
3181 {
3182   rtx base_rtx = stack_pointer_rtx;
3183   rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3184                                                  ? gen_frame_mem : gen_rtx_MEM);
3185   unsigned regno;
3186   unsigned regno2;
3187   HOST_WIDE_INT offset;
3188
3189   for (regno = aarch64_next_callee_save (start, limit);
3190        regno <= limit;
3191        regno = aarch64_next_callee_save (regno + 1, limit))
3192     {
3193       rtx reg, mem;
3194
3195       if (skip_wb
3196           && (regno == cfun->machine->frame.wb_candidate1
3197               || regno == cfun->machine->frame.wb_candidate2))
3198         continue;
3199
3200       reg = gen_rtx_REG (mode, regno);
3201       offset = start_offset + cfun->machine->frame.reg_offset[regno];
3202       mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3203
3204       regno2 = aarch64_next_callee_save (regno + 1, limit);
3205
3206       if (regno2 <= limit
3207           && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3208               == cfun->machine->frame.reg_offset[regno2]))
3209         {
3210           rtx reg2 = gen_rtx_REG (mode, regno2);
3211           rtx mem2;
3212
3213           offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3214           mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3215           emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3216
3217           *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3218           regno = regno2;
3219         }
3220       else
3221         emit_move_insn (reg, mem);
3222       *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3223     }
3224 }
3225
3226 /* AArch64 stack frames generated by this compiler look like:
3227
3228         +-------------------------------+
3229         |                               |
3230         |  incoming stack arguments     |
3231         |                               |
3232         +-------------------------------+
3233         |                               | <-- incoming stack pointer (aligned)
3234         |  callee-allocated save area   |
3235         |  for register varargs         |
3236         |                               |
3237         +-------------------------------+
3238         |  local variables              | <-- frame_pointer_rtx
3239         |                               |
3240         +-------------------------------+
3241         |  padding0                     | \
3242         +-------------------------------+  |
3243         |  callee-saved registers       |  | frame.saved_regs_size
3244         +-------------------------------+  |
3245         |  LR'                          |  |
3246         +-------------------------------+  |
3247         |  FP'                          | / <- hard_frame_pointer_rtx (aligned)
3248         +-------------------------------+
3249         |  dynamic allocation           |
3250         +-------------------------------+
3251         |  padding                      |
3252         +-------------------------------+
3253         |  outgoing stack arguments     | <-- arg_pointer
3254         |                               |
3255         +-------------------------------+
3256         |                               | <-- stack_pointer_rtx (aligned)
3257
3258    Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3259    but leave frame_pointer_rtx and hard_frame_pointer_rtx
3260    unchanged.  */
3261
3262 /* Generate the prologue instructions for entry into a function.
3263    Establish the stack frame by decreasing the stack pointer with a
3264    properly calculated size and, if necessary, create a frame record
3265    filled with the values of LR and previous frame pointer.  The
3266    current FP is also set up if it is in use.  */
3267
3268 void
3269 aarch64_expand_prologue (void)
3270 {
3271   aarch64_layout_frame ();
3272
3273   HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3274   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3275   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3276   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3277   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3278   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3279   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3280   rtx_insn *insn;
3281
3282   if (flag_stack_usage_info)
3283     current_function_static_stack_size = frame_size;
3284
3285   if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3286     {
3287       if (crtl->is_leaf && !cfun->calls_alloca)
3288         {
3289           if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3290             aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3291                                             frame_size - STACK_CHECK_PROTECT);
3292         }
3293       else if (frame_size > 0)
3294         aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3295     }
3296
3297   aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3298
3299   if (callee_adjust != 0)
3300     aarch64_push_regs (reg1, reg2, callee_adjust);
3301
3302   if (frame_pointer_needed)
3303     {
3304       if (callee_adjust == 0)
3305         aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3306                                    R30_REGNUM, false);
3307       insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3308                                        stack_pointer_rtx,
3309                                        GEN_INT (callee_offset)));
3310       RTX_FRAME_RELATED_P (insn) = 1;
3311       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3312     }
3313
3314   aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3315                              callee_adjust != 0 || frame_pointer_needed);
3316   aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3317                              callee_adjust != 0 || frame_pointer_needed);
3318   aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3319 }
3320
3321 /* Return TRUE if we can use a simple_return insn.
3322
3323    This function checks whether the callee saved stack is empty, which
3324    means no restore actions are need. The pro_and_epilogue will use
3325    this to check whether shrink-wrapping opt is feasible.  */
3326
3327 bool
3328 aarch64_use_return_insn_p (void)
3329 {
3330   if (!reload_completed)
3331     return false;
3332
3333   if (crtl->profile)
3334     return false;
3335
3336   aarch64_layout_frame ();
3337
3338   return cfun->machine->frame.frame_size == 0;
3339 }
3340
3341 /* Generate the epilogue instructions for returning from a function.
3342    This is almost exactly the reverse of the prolog sequence, except
3343    that we need to insert barriers to avoid scheduling loads that read
3344    from a deallocated stack, and we optimize the unwind records by
3345    emitting them all together if possible.  */
3346 void
3347 aarch64_expand_epilogue (bool for_sibcall)
3348 {
3349   aarch64_layout_frame ();
3350
3351   HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3352   HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3353   HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3354   HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3355   unsigned reg1 = cfun->machine->frame.wb_candidate1;
3356   unsigned reg2 = cfun->machine->frame.wb_candidate2;
3357   rtx cfi_ops = NULL;
3358   rtx_insn *insn;
3359
3360   /* We need to add memory barrier to prevent read from deallocated stack.  */
3361   bool need_barrier_p = (get_frame_size ()
3362                          + cfun->machine->frame.saved_varargs_size) != 0;
3363
3364   /* Emit a barrier to prevent loads from a deallocated stack.  */
3365   if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca)
3366     {
3367       emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3368       need_barrier_p = false;
3369     }
3370
3371   /* Restore the stack pointer from the frame pointer if it may not
3372      be the same as the stack pointer.  */
3373   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3374     {
3375       insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3376                                        hard_frame_pointer_rtx,
3377                                        GEN_INT (-callee_offset)));
3378       /* If writeback is used when restoring callee-saves, the CFA
3379          is restored on the instruction doing the writeback.  */
3380       RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3381     }
3382   else
3383     aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3384
3385   aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3386                                 callee_adjust != 0, &cfi_ops);
3387   aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3388                                 callee_adjust != 0, &cfi_ops);
3389
3390   if (need_barrier_p)
3391     emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3392
3393   if (callee_adjust != 0)
3394     aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3395
3396   if (callee_adjust != 0 || initial_adjust > 65536)
3397     {
3398       /* Emit delayed restores and set the CFA to be SP + initial_adjust.  */
3399       insn = get_last_insn ();
3400       rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3401       REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3402       RTX_FRAME_RELATED_P (insn) = 1;
3403       cfi_ops = NULL;
3404     }
3405
3406   aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3407
3408   if (cfi_ops)
3409     {
3410       /* Emit delayed restores and reset the CFA to be SP.  */
3411       insn = get_last_insn ();
3412       cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3413       REG_NOTES (insn) = cfi_ops;
3414       RTX_FRAME_RELATED_P (insn) = 1;
3415     }
3416
3417   /* Stack adjustment for exception handler.  */
3418   if (crtl->calls_eh_return)
3419     {
3420       /* We need to unwind the stack by the offset computed by
3421          EH_RETURN_STACKADJ_RTX.  We have already reset the CFA
3422          to be SP; letting the CFA move during this adjustment
3423          is just as correct as retaining the CFA from the body
3424          of the function.  Therefore, do nothing special.  */
3425       emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3426     }
3427
3428   emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3429   if (!for_sibcall)
3430     emit_jump_insn (ret_rtx);
3431 }
3432
3433 /* Return the place to copy the exception unwinding return address to.
3434    This will probably be a stack slot, but could (in theory be the
3435    return register).  */
3436 rtx
3437 aarch64_final_eh_return_addr (void)
3438 {
3439   HOST_WIDE_INT fp_offset;
3440
3441   aarch64_layout_frame ();
3442
3443   fp_offset = cfun->machine->frame.frame_size
3444               - cfun->machine->frame.hard_fp_offset;
3445
3446   if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3447     return gen_rtx_REG (DImode, LR_REGNUM);
3448
3449   /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2.  This can
3450      result in a store to save LR introduced by builtin_eh_return () being
3451      incorrectly deleted because the alias is not detected.
3452      So in the calculation of the address to copy the exception unwinding
3453      return address to, we note 2 cases.
3454      If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3455      we return a SP-relative location since all the addresses are SP-relative
3456      in this case.  This prevents the store from being optimized away.
3457      If the fp_offset is not 0, then the addresses will be FP-relative and
3458      therefore we return a FP-relative location.  */
3459
3460   if (frame_pointer_needed)
3461     {
3462       if (fp_offset)
3463         return gen_frame_mem (DImode,
3464                               plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3465       else
3466         return gen_frame_mem (DImode,
3467                               plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3468     }
3469
3470   /* If FP is not needed, we calculate the location of LR, which would be
3471      at the top of the saved registers block.  */
3472
3473   return gen_frame_mem (DImode,
3474                         plus_constant (Pmode,
3475                                        stack_pointer_rtx,
3476                                        fp_offset
3477                                        + cfun->machine->frame.saved_regs_size
3478                                        - 2 * UNITS_PER_WORD));
3479 }
3480
3481 /* Output code to add DELTA to the first argument, and then jump
3482    to FUNCTION.  Used for C++ multiple inheritance.  */
3483 static void
3484 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3485                          HOST_WIDE_INT delta,
3486                          HOST_WIDE_INT vcall_offset,
3487                          tree function)
3488 {
3489   /* The this pointer is always in x0.  Note that this differs from
3490      Arm where the this pointer maybe bumped to r1 if r0 is required
3491      to return a pointer to an aggregate.  On AArch64 a result value
3492      pointer will be in x8.  */
3493   int this_regno = R0_REGNUM;
3494   rtx this_rtx, temp0, temp1, addr, funexp;
3495   rtx_insn *insn;
3496
3497   reload_completed = 1;
3498   emit_note (NOTE_INSN_PROLOGUE_END);
3499
3500   if (vcall_offset == 0)
3501     aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3502   else
3503     {
3504       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3505
3506       this_rtx = gen_rtx_REG (Pmode, this_regno);
3507       temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3508       temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3509
3510       addr = this_rtx;
3511       if (delta != 0)
3512         {
3513           if (delta >= -256 && delta < 256)
3514             addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3515                                        plus_constant (Pmode, this_rtx, delta));
3516           else
3517             aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3518         }
3519
3520       if (Pmode == ptr_mode)
3521         aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3522       else
3523         aarch64_emit_move (temp0,
3524                            gen_rtx_ZERO_EXTEND (Pmode,
3525                                                 gen_rtx_MEM (ptr_mode, addr)));
3526
3527       if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3528           addr = plus_constant (Pmode, temp0, vcall_offset);
3529       else
3530         {
3531           aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3532                                           Pmode);
3533           addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3534         }
3535
3536       if (Pmode == ptr_mode)
3537         aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3538       else
3539         aarch64_emit_move (temp1,
3540                            gen_rtx_SIGN_EXTEND (Pmode,
3541                                                 gen_rtx_MEM (ptr_mode, addr)));
3542
3543       emit_insn (gen_add2_insn (this_rtx, temp1));
3544     }
3545
3546   /* Generate a tail call to the target function.  */
3547   if (!TREE_USED (function))
3548     {
3549       assemble_external (function);
3550       TREE_USED (function) = 1;
3551     }
3552   funexp = XEXP (DECL_RTL (function), 0);
3553   funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3554   insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3555   SIBLING_CALL_P (insn) = 1;
3556
3557   insn = get_insns ();
3558   shorten_branches (insn);
3559   final_start_function (insn, file, 1);
3560   final (insn, file, 1);
3561   final_end_function ();
3562
3563   /* Stop pretending to be a post-reload pass.  */
3564   reload_completed = 0;
3565 }
3566
3567 static bool
3568 aarch64_tls_referenced_p (rtx x)
3569 {
3570   if (!TARGET_HAVE_TLS)
3571     return false;
3572   subrtx_iterator::array_type array;
3573   FOR_EACH_SUBRTX (iter, array, x, ALL)
3574     {
3575       const_rtx x = *iter;
3576       if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3577         return true;
3578       /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3579          TLS offsets, not real symbol references.  */
3580       if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3581         iter.skip_subrtxes ();
3582     }
3583   return false;
3584 }
3585
3586
3587 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3588    a left shift of 0 or 12 bits.  */
3589 bool
3590 aarch64_uimm12_shift (HOST_WIDE_INT val)
3591 {
3592   return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3593           || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3594           );
3595 }
3596
3597
3598 /* Return true if val is an immediate that can be loaded into a
3599    register by a MOVZ instruction.  */
3600 static bool
3601 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3602 {
3603   if (GET_MODE_SIZE (mode) > 4)
3604     {
3605       if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3606           || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3607         return 1;
3608     }
3609   else
3610     {
3611       /* Ignore sign extension.  */
3612       val &= (HOST_WIDE_INT) 0xffffffff;
3613     }
3614   return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3615           || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3616 }
3617
3618 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2.  */
3619
3620 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3621   {
3622     0x0000000100000001ull,
3623     0x0001000100010001ull,
3624     0x0101010101010101ull,
3625     0x1111111111111111ull,
3626     0x5555555555555555ull,
3627   };
3628
3629
3630 /* Return true if val is a valid bitmask immediate.  */
3631
3632 bool
3633 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3634 {
3635   unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3636   int bits;
3637
3638   /* Check for a single sequence of one bits and return quickly if so.
3639      The special cases of all ones and all zeroes returns false.  */
3640   val = (unsigned HOST_WIDE_INT) val_in;
3641   tmp = val + (val & -val);
3642
3643   if (tmp == (tmp & -tmp))
3644     return (val + 1) > 1;
3645
3646   /* Replicate 32-bit immediates so we can treat them as 64-bit.  */
3647   if (mode == SImode)
3648     val = (val << 32) | (val & 0xffffffff);
3649
3650   /* Invert if the immediate doesn't start with a zero bit - this means we
3651      only need to search for sequences of one bits.  */
3652   if (val & 1)
3653     val = ~val;
3654
3655   /* Find the first set bit and set tmp to val with the first sequence of one
3656      bits removed.  Return success if there is a single sequence of ones.  */
3657   first_one = val & -val;
3658   tmp = val & (val + first_one);
3659
3660   if (tmp == 0)
3661     return true;
3662
3663   /* Find the next set bit and compute the difference in bit position.  */
3664   next_one = tmp & -tmp;
3665   bits = clz_hwi (first_one) - clz_hwi (next_one);
3666   mask = val ^ tmp;
3667
3668   /* Check the bit position difference is a power of 2, and that the first
3669      sequence of one bits fits within 'bits' bits.  */
3670   if ((mask >> bits) != 0 || bits != (bits & -bits))
3671     return false;
3672
3673   /* Check the sequence of one bits is repeated 64/bits times.  */
3674   return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
3675 }
3676
3677
3678 /* Return true if val is an immediate that can be loaded into a
3679    register in a single instruction.  */
3680 bool
3681 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3682 {
3683   if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3684     return 1;
3685   return aarch64_bitmask_imm (val, mode);
3686 }
3687
3688 static bool
3689 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3690 {
3691   rtx base, offset;
3692
3693   if (GET_CODE (x) == HIGH)
3694     return true;
3695
3696   split_const (x, &base, &offset);
3697   if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3698     {
3699       if (aarch64_classify_symbol (base, offset)
3700           != SYMBOL_FORCE_TO_MEM)
3701         return true;
3702       else
3703         /* Avoid generating a 64-bit relocation in ILP32; leave
3704            to aarch64_expand_mov_immediate to handle it properly.  */
3705         return mode != ptr_mode;
3706     }
3707
3708   return aarch64_tls_referenced_p (x);
3709 }
3710
3711 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3712    The expansion for a table switch is quite expensive due to the number
3713    of instructions, the table lookup and hard to predict indirect jump.
3714    When optimizing for speed, and -O3 enabled, use the per-core tuning if
3715    set, otherwise use tables for > 16 cases as a tradeoff between size and
3716    performance.  When optimizing for size, use the default setting.  */
3717
3718 static unsigned int
3719 aarch64_case_values_threshold (void)
3720 {
3721   /* Use the specified limit for the number of cases before using jump
3722      tables at higher optimization levels.  */
3723   if (optimize > 2
3724       && selected_cpu->tune->max_case_values != 0)
3725     return selected_cpu->tune->max_case_values;
3726   else
3727     return optimize_size ? default_case_values_threshold () : 17;
3728 }
3729
3730 /* Return true if register REGNO is a valid index register.
3731    STRICT_P is true if REG_OK_STRICT is in effect.  */
3732
3733 bool
3734 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3735 {
3736   if (!HARD_REGISTER_NUM_P (regno))
3737     {
3738       if (!strict_p)
3739         return true;
3740
3741       if (!reg_renumber)
3742         return false;
3743
3744       regno = reg_renumber[regno];
3745     }
3746   return GP_REGNUM_P (regno);
3747 }
3748
3749 /* Return true if register REGNO is a valid base register for mode MODE.
3750    STRICT_P is true if REG_OK_STRICT is in effect.  */
3751
3752 bool
3753 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3754 {
3755   if (!HARD_REGISTER_NUM_P (regno))
3756     {
3757       if (!strict_p)
3758         return true;
3759
3760       if (!reg_renumber)
3761         return false;
3762
3763       regno = reg_renumber[regno];
3764     }
3765
3766   /* The fake registers will be eliminated to either the stack or
3767      hard frame pointer, both of which are usually valid base registers.
3768      Reload deals with the cases where the eliminated form isn't valid.  */
3769   return (GP_REGNUM_P (regno)
3770           || regno == SP_REGNUM
3771           || regno == FRAME_POINTER_REGNUM
3772           || regno == ARG_POINTER_REGNUM);
3773 }
3774
3775 /* Return true if X is a valid base register for mode MODE.
3776    STRICT_P is true if REG_OK_STRICT is in effect.  */
3777
3778 static bool
3779 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3780 {
3781   if (!strict_p && GET_CODE (x) == SUBREG)
3782     x = SUBREG_REG (x);
3783
3784   return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3785 }
3786
3787 /* Return true if address offset is a valid index.  If it is, fill in INFO
3788    appropriately.  STRICT_P is true if REG_OK_STRICT is in effect.  */
3789
3790 static bool
3791 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3792                         machine_mode mode, bool strict_p)
3793 {
3794   enum aarch64_address_type type;
3795   rtx index;
3796   int shift;
3797
3798   /* (reg:P) */
3799   if ((REG_P (x) || GET_CODE (x) == SUBREG)
3800       && GET_MODE (x) == Pmode)
3801     {
3802       type = ADDRESS_REG_REG;
3803       index = x;
3804       shift = 0;
3805     }
3806   /* (sign_extend:DI (reg:SI)) */
3807   else if ((GET_CODE (x) == SIGN_EXTEND
3808             || GET_CODE (x) == ZERO_EXTEND)
3809            && GET_MODE (x) == DImode
3810            && GET_MODE (XEXP (x, 0)) == SImode)
3811     {
3812       type = (GET_CODE (x) == SIGN_EXTEND)
3813         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3814       index = XEXP (x, 0);
3815       shift = 0;
3816     }
3817   /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3818   else if (GET_CODE (x) == MULT
3819            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3820                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3821            && GET_MODE (XEXP (x, 0)) == DImode
3822            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3823            && CONST_INT_P (XEXP (x, 1)))
3824     {
3825       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3826         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3827       index = XEXP (XEXP (x, 0), 0);
3828       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3829     }
3830   /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3831   else if (GET_CODE (x) == ASHIFT
3832            && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3833                || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3834            && GET_MODE (XEXP (x, 0)) == DImode
3835            && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3836            && CONST_INT_P (XEXP (x, 1)))
3837     {
3838       type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3839         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3840       index = XEXP (XEXP (x, 0), 0);
3841       shift = INTVAL (XEXP (x, 1));
3842     }
3843   /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3844   else if ((GET_CODE (x) == SIGN_EXTRACT
3845             || GET_CODE (x) == ZERO_EXTRACT)
3846            && GET_MODE (x) == DImode
3847            && GET_CODE (XEXP (x, 0)) == MULT
3848            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3849            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3850     {
3851       type = (GET_CODE (x) == SIGN_EXTRACT)
3852         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3853       index = XEXP (XEXP (x, 0), 0);
3854       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3855       if (INTVAL (XEXP (x, 1)) != 32 + shift
3856           || INTVAL (XEXP (x, 2)) != 0)
3857         shift = -1;
3858     }
3859   /* (and:DI (mult:DI (reg:DI) (const_int scale))
3860      (const_int 0xffffffff<<shift)) */
3861   else if (GET_CODE (x) == AND
3862            && GET_MODE (x) == DImode
3863            && GET_CODE (XEXP (x, 0)) == MULT
3864            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3865            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3866            && CONST_INT_P (XEXP (x, 1)))
3867     {
3868       type = ADDRESS_REG_UXTW;
3869       index = XEXP (XEXP (x, 0), 0);
3870       shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3871       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3872         shift = -1;
3873     }
3874   /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3875   else if ((GET_CODE (x) == SIGN_EXTRACT
3876             || GET_CODE (x) == ZERO_EXTRACT)
3877            && GET_MODE (x) == DImode
3878            && GET_CODE (XEXP (x, 0)) == ASHIFT
3879            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3880            && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3881     {
3882       type = (GET_CODE (x) == SIGN_EXTRACT)
3883         ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3884       index = XEXP (XEXP (x, 0), 0);
3885       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3886       if (INTVAL (XEXP (x, 1)) != 32 + shift
3887           || INTVAL (XEXP (x, 2)) != 0)
3888         shift = -1;
3889     }
3890   /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3891      (const_int 0xffffffff<<shift)) */
3892   else if (GET_CODE (x) == AND
3893            && GET_MODE (x) == DImode
3894            && GET_CODE (XEXP (x, 0)) == ASHIFT
3895            && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3896            && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3897            && CONST_INT_P (XEXP (x, 1)))
3898     {
3899       type = ADDRESS_REG_UXTW;
3900       index = XEXP (XEXP (x, 0), 0);
3901       shift = INTVAL (XEXP (XEXP (x, 0), 1));
3902       if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3903         shift = -1;
3904     }
3905   /* (mult:P (reg:P) (const_int scale)) */
3906   else if (GET_CODE (x) == MULT
3907            && GET_MODE (x) == Pmode
3908            && GET_MODE (XEXP (x, 0)) == Pmode
3909            && CONST_INT_P (XEXP (x, 1)))
3910     {
3911       type = ADDRESS_REG_REG;
3912       index = XEXP (x, 0);
3913       shift = exact_log2 (INTVAL (XEXP (x, 1)));
3914     }
3915   /* (ashift:P (reg:P) (const_int shift)) */
3916   else if (GET_CODE (x) == ASHIFT
3917            && GET_MODE (x) == Pmode
3918            && GET_MODE (XEXP (x, 0)) == Pmode
3919            && CONST_INT_P (XEXP (x, 1)))
3920     {
3921       type = ADDRESS_REG_REG;
3922       index = XEXP (x, 0);
3923       shift = INTVAL (XEXP (x, 1));
3924     }
3925   else
3926     return false;
3927
3928   if (GET_CODE (index) == SUBREG)
3929     index = SUBREG_REG (index);
3930
3931   if ((shift == 0 ||
3932        (shift > 0 && shift <= 3
3933         && (1 << shift) == GET_MODE_SIZE (mode)))
3934       && REG_P (index)
3935       && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3936     {
3937       info->type = type;
3938       info->offset = index;
3939       info->shift = shift;
3940       return true;
3941     }
3942
3943   return false;
3944 }
3945
3946 bool
3947 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3948 {
3949   return (offset >= -64 * GET_MODE_SIZE (mode)
3950           && offset < 64 * GET_MODE_SIZE (mode)
3951           && offset % GET_MODE_SIZE (mode) == 0);
3952 }
3953
3954 static inline bool
3955 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3956                                HOST_WIDE_INT offset)
3957 {
3958   return offset >= -256 && offset < 256;
3959 }
3960
3961 static inline bool
3962 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3963 {
3964   return (offset >= 0
3965           && offset < 4096 * GET_MODE_SIZE (mode)
3966           && offset % GET_MODE_SIZE (mode) == 0);
3967 }
3968
3969 /* Return true if MODE is one of the modes for which we
3970    support LDP/STP operations.  */
3971
3972 static bool
3973 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3974 {
3975   return mode == SImode || mode == DImode
3976          || mode == SFmode || mode == DFmode
3977          || (aarch64_vector_mode_supported_p (mode)
3978              && GET_MODE_SIZE (mode) == 8);
3979 }
3980
3981 /* Return true if REGNO is a virtual pointer register, or an eliminable
3982    "soft" frame register.  Like REGNO_PTR_FRAME_P except that we don't
3983    include stack_pointer or hard_frame_pointer.  */
3984 static bool
3985 virt_or_elim_regno_p (unsigned regno)
3986 {
3987   return ((regno >= FIRST_VIRTUAL_REGISTER
3988            && regno <= LAST_VIRTUAL_POINTER_REGISTER)
3989           || regno == FRAME_POINTER_REGNUM
3990           || regno == ARG_POINTER_REGNUM);
3991 }
3992
3993 /* Return true if X is a valid address for machine mode MODE.  If it is,
3994    fill in INFO appropriately.  STRICT_P is true if REG_OK_STRICT is in
3995    effect.  OUTER_CODE is PARALLEL for a load/store pair.  */
3996
3997 static bool
3998 aarch64_classify_address (struct aarch64_address_info *info,
3999                           rtx x, machine_mode mode,
4000                           RTX_CODE outer_code, bool strict_p)
4001 {
4002   enum rtx_code code = GET_CODE (x);
4003   rtx op0, op1;
4004
4005   /* On BE, we use load/store pair for all large int mode load/stores.  */
4006   bool load_store_pair_p = (outer_code == PARALLEL
4007                             || (BYTES_BIG_ENDIAN
4008                                 && aarch64_vect_struct_mode_p (mode)));
4009
4010   bool allow_reg_index_p =
4011     !load_store_pair_p
4012     && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4013     && !aarch64_vect_struct_mode_p (mode);
4014
4015   /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4016      REG addressing.  */
4017   if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4018       && (code != POST_INC && code != REG))
4019     return false;
4020
4021   switch (code)
4022     {
4023     case REG:
4024     case SUBREG:
4025       info->type = ADDRESS_REG_IMM;
4026       info->base = x;
4027       info->offset = const0_rtx;
4028       return aarch64_base_register_rtx_p (x, strict_p);
4029
4030     case PLUS:
4031       op0 = XEXP (x, 0);
4032       op1 = XEXP (x, 1);
4033
4034       if (! strict_p
4035           && REG_P (op0)
4036           && virt_or_elim_regno_p (REGNO (op0))
4037           && CONST_INT_P (op1))
4038         {
4039           info->type = ADDRESS_REG_IMM;
4040           info->base = op0;
4041           info->offset = op1;
4042
4043           return true;
4044         }
4045
4046       if (GET_MODE_SIZE (mode) != 0
4047           && CONST_INT_P (op1)
4048           && aarch64_base_register_rtx_p (op0, strict_p))
4049         {
4050           HOST_WIDE_INT offset = INTVAL (op1);
4051
4052           info->type = ADDRESS_REG_IMM;
4053           info->base = op0;
4054           info->offset = op1;
4055
4056           /* TImode and TFmode values are allowed in both pairs of X
4057              registers and individual Q registers.  The available
4058              address modes are:
4059              X,X: 7-bit signed scaled offset
4060              Q:   9-bit signed offset
4061              We conservatively require an offset representable in either mode.
4062              When performing the check for pairs of X registers i.e.  LDP/STP
4063              pass down DImode since that is the natural size of the LDP/STP
4064              instruction memory accesses.  */
4065           if (mode == TImode || mode == TFmode)
4066             return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4067                     && offset_9bit_signed_unscaled_p (mode, offset));
4068
4069           /* A 7bit offset check because OImode will emit a ldp/stp
4070              instruction (only big endian will get here).
4071              For ldp/stp instructions, the offset is scaled for the size of a
4072              single element of the pair.  */
4073           if (mode == OImode)
4074             return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4075
4076           /* Three 9/12 bit offsets checks because CImode will emit three
4077              ldr/str instructions (only big endian will get here).  */
4078           if (mode == CImode)
4079             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4080                     && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4081                         || offset_12bit_unsigned_scaled_p (V16QImode,
4082                                                            offset + 32)));
4083
4084           /* Two 7bit offsets checks because XImode will emit two ldp/stp
4085              instructions (only big endian will get here).  */
4086           if (mode == XImode)
4087             return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4088                     && aarch64_offset_7bit_signed_scaled_p (TImode,
4089                                                             offset + 32));
4090
4091           if (load_store_pair_p)
4092             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4093                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4094           else
4095             return (offset_9bit_signed_unscaled_p (mode, offset)
4096                     || offset_12bit_unsigned_scaled_p (mode, offset));
4097         }
4098
4099       if (allow_reg_index_p)
4100         {
4101           /* Look for base + (scaled/extended) index register.  */
4102           if (aarch64_base_register_rtx_p (op0, strict_p)
4103               && aarch64_classify_index (info, op1, mode, strict_p))
4104             {
4105               info->base = op0;
4106               return true;
4107             }
4108           if (aarch64_base_register_rtx_p (op1, strict_p)
4109               && aarch64_classify_index (info, op0, mode, strict_p))
4110             {
4111               info->base = op1;
4112               return true;
4113             }
4114         }
4115
4116       return false;
4117
4118     case POST_INC:
4119     case POST_DEC:
4120     case PRE_INC:
4121     case PRE_DEC:
4122       info->type = ADDRESS_REG_WB;
4123       info->base = XEXP (x, 0);
4124       info->offset = NULL_RTX;
4125       return aarch64_base_register_rtx_p (info->base, strict_p);
4126
4127     case POST_MODIFY:
4128     case PRE_MODIFY:
4129       info->type = ADDRESS_REG_WB;
4130       info->base = XEXP (x, 0);
4131       if (GET_CODE (XEXP (x, 1)) == PLUS
4132           && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4133           && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4134           && aarch64_base_register_rtx_p (info->base, strict_p))
4135         {
4136           HOST_WIDE_INT offset;
4137           info->offset = XEXP (XEXP (x, 1), 1);
4138           offset = INTVAL (info->offset);
4139
4140           /* TImode and TFmode values are allowed in both pairs of X
4141              registers and individual Q registers.  The available
4142              address modes are:
4143              X,X: 7-bit signed scaled offset
4144              Q:   9-bit signed offset
4145              We conservatively require an offset representable in either mode.
4146            */
4147           if (mode == TImode || mode == TFmode)
4148             return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4149                     && offset_9bit_signed_unscaled_p (mode, offset));
4150
4151           if (load_store_pair_p)
4152             return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4153                     && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4154           else
4155             return offset_9bit_signed_unscaled_p (mode, offset);
4156         }
4157       return false;
4158
4159     case CONST:
4160     case SYMBOL_REF:
4161     case LABEL_REF:
4162       /* load literal: pc-relative constant pool entry.  Only supported
4163          for SI mode or larger.  */
4164       info->type = ADDRESS_SYMBOLIC;
4165
4166       if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4167         {
4168           rtx sym, addend;
4169
4170           split_const (x, &sym, &addend);
4171           return ((GET_CODE (sym) == LABEL_REF
4172                    || (GET_CODE (sym) == SYMBOL_REF
4173                        && CONSTANT_POOL_ADDRESS_P (sym)
4174                        && aarch64_pcrelative_literal_loads)));
4175         }
4176       return false;
4177
4178     case LO_SUM:
4179       info->type = ADDRESS_LO_SUM;
4180       info->base = XEXP (x, 0);
4181       info->offset = XEXP (x, 1);
4182       if (allow_reg_index_p
4183           && aarch64_base_register_rtx_p (info->base, strict_p))
4184         {
4185           rtx sym, offs;
4186           split_const (info->offset, &sym, &offs);
4187           if (GET_CODE (sym) == SYMBOL_REF
4188               && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4189             {
4190               /* The symbol and offset must be aligned to the access size.  */
4191               unsigned int align;
4192               unsigned int ref_size;
4193
4194               if (CONSTANT_POOL_ADDRESS_P (sym))
4195                 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4196               else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4197                 {
4198                   tree exp = SYMBOL_REF_DECL (sym);
4199                   align = TYPE_ALIGN (TREE_TYPE (exp));
4200                   align = CONSTANT_ALIGNMENT (exp, align);
4201                 }
4202               else if (SYMBOL_REF_DECL (sym))
4203                 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4204               else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4205                        && SYMBOL_REF_BLOCK (sym) != NULL)
4206                 align = SYMBOL_REF_BLOCK (sym)->alignment;
4207               else
4208                 align = BITS_PER_UNIT;
4209
4210               ref_size = GET_MODE_SIZE (mode);
4211               if (ref_size == 0)
4212                 ref_size = GET_MODE_SIZE (DImode);
4213
4214               return ((INTVAL (offs) & (ref_size - 1)) == 0
4215                       && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4216             }
4217         }
4218       return false;
4219
4220     default:
4221       return false;
4222     }
4223 }
4224
4225 bool
4226 aarch64_symbolic_address_p (rtx x)
4227 {
4228   rtx offset;
4229
4230   split_const (x, &x, &offset);
4231   return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4232 }
4233
4234 /* Classify the base of symbolic expression X.  */
4235
4236 enum aarch64_symbol_type
4237 aarch64_classify_symbolic_expression (rtx x)
4238 {
4239   rtx offset;
4240
4241   split_const (x, &x, &offset);
4242   return aarch64_classify_symbol (x, offset);
4243 }
4244
4245
4246 /* Return TRUE if X is a legitimate address for accessing memory in
4247    mode MODE.  */
4248 static bool
4249 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4250 {
4251   struct aarch64_address_info addr;
4252
4253   return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4254 }
4255
4256 /* Return TRUE if X is a legitimate address for accessing memory in
4257    mode MODE.  OUTER_CODE will be PARALLEL if this is a load/store
4258    pair operation.  */
4259 bool
4260 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4261                               RTX_CODE outer_code, bool strict_p)
4262 {
4263   struct aarch64_address_info addr;
4264
4265   return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4266 }
4267
4268 /* Split an out-of-range address displacement into a base and offset.
4269    Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4270    to increase opportunities for sharing the base address of different sizes.
4271    For TI/TFmode and unaligned accesses use a 256-byte range.  */
4272 static bool
4273 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4274 {
4275   HOST_WIDE_INT mask = GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3fff;
4276
4277   if (mode == TImode || mode == TFmode ||
4278       (INTVAL (*disp) & (GET_MODE_SIZE (mode) - 1)) != 0)
4279     mask = 0xff;
4280
4281   *off = GEN_INT (INTVAL (*disp) & ~mask);
4282   *disp = GEN_INT (INTVAL (*disp) & mask);
4283   return true;
4284 }
4285
4286 /* Return TRUE if rtx X is immediate constant 0.0 */
4287 bool
4288 aarch64_float_const_zero_rtx_p (rtx x)
4289 {
4290   if (GET_MODE (x) == VOIDmode)
4291     return false;
4292
4293   if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4294     return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4295   return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4296 }
4297
4298 /* Return the fixed registers used for condition codes.  */
4299
4300 static bool
4301 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4302 {
4303   *p1 = CC_REGNUM;
4304   *p2 = INVALID_REGNUM;
4305   return true;
4306 }
4307
4308 /* Emit call insn with PAT and do aarch64-specific handling.  */
4309
4310 void
4311 aarch64_emit_call_insn (rtx pat)
4312 {
4313   rtx insn = emit_call_insn (pat);
4314
4315   rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4316   clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4317   clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4318 }
4319
4320 machine_mode
4321 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4322 {
4323   /* All floating point compares return CCFP if it is an equality
4324      comparison, and CCFPE otherwise.  */
4325   if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4326     {
4327       switch (code)
4328         {
4329         case EQ:
4330         case NE:
4331         case UNORDERED:
4332         case ORDERED:
4333         case UNLT:
4334         case UNLE:
4335         case UNGT:
4336         case UNGE:
4337         case UNEQ:
4338         case LTGT:
4339           return CCFPmode;
4340
4341         case LT:
4342         case LE:
4343         case GT:
4344         case GE:
4345           return CCFPEmode;
4346
4347         default:
4348           gcc_unreachable ();
4349         }
4350     }
4351
4352   /* Equality comparisons of short modes against zero can be performed
4353      using the TST instruction with the appropriate bitmask.  */
4354   if (y == const0_rtx && REG_P (x)
4355       && (code == EQ || code == NE)
4356       && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4357     return CC_NZmode;
4358
4359   /* Similarly, comparisons of zero_extends from shorter modes can
4360      be performed using an ANDS with an immediate mask.  */
4361   if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4362       && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4363       && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4364       && (code == EQ || code == NE))
4365     return CC_NZmode;
4366
4367   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4368       && y == const0_rtx
4369       && (code == EQ || code == NE || code == LT || code == GE)
4370       && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4371           || GET_CODE (x) == NEG
4372           || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4373               && CONST_INT_P (XEXP (x, 2)))))
4374     return CC_NZmode;
4375
4376   /* A compare with a shifted operand.  Because of canonicalization,
4377      the comparison will have to be swapped when we emit the assembly
4378      code.  */
4379   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4380       && (REG_P (y) || GET_CODE (y) == SUBREG)
4381       && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4382           || GET_CODE (x) == LSHIFTRT
4383           || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4384     return CC_SWPmode;
4385
4386   /* Similarly for a negated operand, but we can only do this for
4387      equalities.  */
4388   if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4389       && (REG_P (y) || GET_CODE (y) == SUBREG)
4390       && (code == EQ || code == NE)
4391       && GET_CODE (x) == NEG)
4392     return CC_Zmode;
4393
4394   /* A test for unsigned overflow.  */
4395   if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4396       && code == NE
4397       && GET_CODE (x) == PLUS
4398       && GET_CODE (y) == ZERO_EXTEND)
4399     return CC_Cmode;
4400
4401   /* For everything else, return CCmode.  */
4402   return CCmode;
4403 }
4404
4405 static int
4406 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4407
4408 int
4409 aarch64_get_condition_code (rtx x)
4410 {
4411   machine_mode mode = GET_MODE (XEXP (x, 0));
4412   enum rtx_code comp_code = GET_CODE (x);
4413
4414   if (GET_MODE_CLASS (mode) != MODE_CC)
4415     mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4416   return aarch64_get_condition_code_1 (mode, comp_code);
4417 }
4418
4419 static int
4420 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4421 {
4422   switch (mode)
4423     {
4424     case CCFPmode:
4425     case CCFPEmode:
4426       switch (comp_code)
4427         {
4428         case GE: return AARCH64_GE;
4429         case GT: return AARCH64_GT;
4430         case LE: return AARCH64_LS;
4431         case LT: return AARCH64_MI;
4432         case NE: return AARCH64_NE;
4433         case EQ: return AARCH64_EQ;
4434         case ORDERED: return AARCH64_VC;
4435         case UNORDERED: return AARCH64_VS;
4436         case UNLT: return AARCH64_LT;
4437         case UNLE: return AARCH64_LE;
4438         case UNGT: return AARCH64_HI;
4439         case UNGE: return AARCH64_PL;
4440         default: return -1;
4441         }
4442       break;
4443
4444     case CCmode:
4445       switch (comp_code)
4446         {
4447         case NE: return AARCH64_NE;
4448         case EQ: return AARCH64_EQ;
4449         case GE: return AARCH64_GE;
4450         case GT: return AARCH64_GT;
4451         case LE: return AARCH64_LE;
4452         case LT: return AARCH64_LT;
4453         case GEU: return AARCH64_CS;
4454         case GTU: return AARCH64_HI;
4455         case LEU: return AARCH64_LS;
4456         case LTU: return AARCH64_CC;
4457         default: return -1;
4458         }
4459       break;
4460
4461     case CC_SWPmode:
4462       switch (comp_code)
4463         {
4464         case NE: return AARCH64_NE;
4465         case EQ: return AARCH64_EQ;
4466         case GE: return AARCH64_LE;
4467         case GT: return AARCH64_LT;
4468         case LE: return AARCH64_GE;
4469         case LT: return AARCH64_GT;
4470         case GEU: return AARCH64_LS;
4471         case GTU: return AARCH64_CC;
4472         case LEU: return AARCH64_CS;
4473         case LTU: return AARCH64_HI;
4474         default: return -1;
4475         }
4476       break;
4477
4478     case CC_NZmode:
4479       switch (comp_code)
4480         {
4481         case NE: return AARCH64_NE;
4482         case EQ: return AARCH64_EQ;
4483         case GE: return AARCH64_PL;
4484         case LT: return AARCH64_MI;
4485         default: return -1;
4486         }
4487       break;
4488
4489     case CC_Zmode:
4490       switch (comp_code)
4491         {
4492         case NE: return AARCH64_NE;
4493         case EQ: return AARCH64_EQ;
4494         default: return -1;
4495         }
4496       break;
4497
4498     case CC_Cmode:
4499       switch (comp_code)
4500         {
4501         case NE: return AARCH64_CS;
4502         case EQ: return AARCH64_CC;
4503         default: return -1;
4504         }
4505       break;
4506
4507     default:
4508       return -1;
4509     }
4510
4511   return -1;
4512 }
4513
4514 bool
4515 aarch64_const_vec_all_same_in_range_p (rtx x,
4516                                   HOST_WIDE_INT minval,
4517                                   HOST_WIDE_INT maxval)
4518 {
4519   HOST_WIDE_INT firstval;
4520   int count, i;
4521
4522   if (GET_CODE (x) != CONST_VECTOR
4523       || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4524     return false;
4525
4526   firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4527   if (firstval < minval || firstval > maxval)
4528     return false;
4529
4530   count = CONST_VECTOR_NUNITS (x);
4531   for (i = 1; i < count; i++)
4532     if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4533       return false;
4534
4535   return true;
4536 }
4537
4538 bool
4539 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4540 {
4541   return aarch64_const_vec_all_same_in_range_p (x, val, val);
4542 }
4543
4544
4545 /* N Z C V.  */
4546 #define AARCH64_CC_V 1
4547 #define AARCH64_CC_C (1 << 1)
4548 #define AARCH64_CC_Z (1 << 2)
4549 #define AARCH64_CC_N (1 << 3)
4550
4551 /* N Z C V flags for ccmp.  Indexed by AARCH64_COND_CODE.  */
4552 static const int aarch64_nzcv_codes[] =
4553 {
4554   0,            /* EQ, Z == 1.  */
4555   AARCH64_CC_Z, /* NE, Z == 0.  */
4556   0,            /* CS, C == 1.  */
4557   AARCH64_CC_C, /* CC, C == 0.  */
4558   0,            /* MI, N == 1.  */
4559   AARCH64_CC_N, /* PL, N == 0.  */
4560   0,            /* VS, V == 1.  */
4561   AARCH64_CC_V, /* VC, V == 0.  */
4562   0,            /* HI, C ==1 && Z == 0.  */
4563   AARCH64_CC_C, /* LS, !(C == 1 && Z == 0).  */
4564   AARCH64_CC_V, /* GE, N == V.  */
4565   0,            /* LT, N != V.  */
4566   AARCH64_CC_Z, /* GT, Z == 0 && N == V.  */
4567   0,            /* LE, !(Z == 0 && N == V).  */
4568   0,            /* AL, Any.  */
4569   0             /* NV, Any.  */
4570 };
4571
4572 static void
4573 aarch64_print_operand (FILE *f, rtx x, int code)
4574 {
4575   switch (code)
4576     {
4577     /* An integer or symbol address without a preceding # sign.  */
4578     case 'c':
4579       switch (GET_CODE (x))
4580         {
4581         case CONST_INT:
4582           fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4583           break;
4584
4585         case SYMBOL_REF:
4586           output_addr_const (f, x);
4587           break;
4588
4589         case CONST:
4590           if (GET_CODE (XEXP (x, 0)) == PLUS
4591               && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4592             {
4593               output_addr_const (f, x);
4594               break;
4595             }
4596           /* Fall through.  */
4597
4598         default:
4599           output_operand_lossage ("Unsupported operand for code '%c'", code);
4600         }
4601       break;
4602
4603     case 'e':
4604       /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w.  */
4605       {
4606         int n;
4607
4608         if (!CONST_INT_P (x)
4609             || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4610           {
4611             output_operand_lossage ("invalid operand for '%%%c'", code);
4612             return;
4613           }
4614
4615         switch (n)
4616           {
4617           case 3:
4618             fputc ('b', f);
4619             break;
4620           case 4:
4621             fputc ('h', f);
4622             break;
4623           case 5:
4624             fputc ('w', f);
4625             break;
4626           default:
4627             output_operand_lossage ("invalid operand for '%%%c'", code);
4628             return;
4629           }
4630       }
4631       break;
4632
4633     case 'p':
4634       {
4635         int n;
4636
4637         /* Print N such that 2^N == X.  */
4638         if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4639           {
4640             output_operand_lossage ("invalid operand for '%%%c'", code);
4641             return;
4642           }
4643
4644         asm_fprintf (f, "%d", n);
4645       }
4646       break;
4647
4648     case 'P':
4649       /* Print the number of non-zero bits in X (a const_int).  */
4650       if (!CONST_INT_P (x))
4651         {
4652           output_operand_lossage ("invalid operand for '%%%c'", code);
4653           return;
4654         }
4655
4656       asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4657       break;
4658
4659     case 'H':
4660       /* Print the higher numbered register of a pair (TImode) of regs.  */
4661       if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4662         {
4663           output_operand_lossage ("invalid operand for '%%%c'", code);
4664           return;
4665         }
4666
4667       asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4668       break;
4669
4670     case 'M':
4671     case 'm':
4672       {
4673         int cond_code;
4674         /* Print a condition (eq, ne, etc) or its inverse.  */
4675
4676         /* CONST_TRUE_RTX means al/nv (al is the default, don't print it).  */
4677         if (x == const_true_rtx)
4678           {
4679             if (code == 'M')
4680               fputs ("nv", f);
4681             return;
4682           }
4683
4684         if (!COMPARISON_P (x))
4685           {
4686             output_operand_lossage ("invalid operand for '%%%c'", code);
4687             return;
4688           }
4689
4690         cond_code = aarch64_get_condition_code (x);
4691         gcc_assert (cond_code >= 0);
4692         if (code == 'M')
4693           cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4694         fputs (aarch64_condition_codes[cond_code], f);
4695       }
4696       break;
4697
4698     case 'b':
4699     case 'h':
4700     case 's':
4701     case 'd':
4702     case 'q':
4703       /* Print a scalar FP/SIMD register name.  */
4704       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4705         {
4706           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4707           return;
4708         }
4709       asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4710       break;
4711
4712     case 'S':
4713     case 'T':
4714     case 'U':
4715     case 'V':
4716       /* Print the first FP/SIMD register name in a list.  */
4717       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4718         {
4719           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4720           return;
4721         }
4722       asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4723       break;
4724
4725     case 'R':
4726       /* Print a scalar FP/SIMD register name + 1.  */
4727       if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4728         {
4729           output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4730           return;
4731         }
4732       asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4733       break;
4734
4735     case 'X':
4736       /* Print bottom 16 bits of integer constant in hex.  */
4737       if (!CONST_INT_P (x))
4738         {
4739           output_operand_lossage ("invalid operand for '%%%c'", code);
4740           return;
4741         }
4742       asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4743       break;
4744
4745     case 'w':
4746     case 'x':
4747       /* Print a general register name or the zero register (32-bit or
4748          64-bit).  */
4749       if (x == const0_rtx
4750           || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4751         {
4752           asm_fprintf (f, "%czr", code);
4753           break;
4754         }
4755
4756       if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4757         {
4758           asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4759           break;
4760         }
4761
4762       if (REG_P (x) && REGNO (x) == SP_REGNUM)
4763         {
4764           asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4765           break;
4766         }
4767
4768       /* Fall through */
4769
4770     case 0:
4771       /* Print a normal operand, if it's a general register, then we
4772          assume DImode.  */
4773       if (x == NULL)
4774         {
4775           output_operand_lossage ("missing operand");
4776           return;
4777         }
4778
4779       switch (GET_CODE (x))
4780         {
4781         case REG:
4782           asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4783           break;
4784
4785         case MEM:
4786           output_address (GET_MODE (x), XEXP (x, 0));
4787           break;
4788
4789         case CONST:
4790         case LABEL_REF:
4791         case SYMBOL_REF:
4792           output_addr_const (asm_out_file, x);
4793           break;
4794
4795         case CONST_INT:
4796           asm_fprintf (f, "%wd", INTVAL (x));
4797           break;
4798
4799         case CONST_VECTOR:
4800           if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4801             {
4802               gcc_assert (
4803                   aarch64_const_vec_all_same_in_range_p (x,
4804                                                          HOST_WIDE_INT_MIN,
4805                                                          HOST_WIDE_INT_MAX));
4806               asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4807             }
4808           else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4809             {
4810               fputc ('0', f);
4811             }
4812           else
4813             gcc_unreachable ();
4814           break;
4815
4816         case CONST_DOUBLE:
4817           /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4818              be getting CONST_DOUBLEs holding integers.  */
4819           gcc_assert (GET_MODE (x) != VOIDmode);
4820           if (aarch64_float_const_zero_rtx_p (x))
4821             {
4822               fputc ('0', f);
4823               break;
4824             }
4825           else if (aarch64_float_const_representable_p (x))
4826             {
4827 #define buf_size 20
4828               char float_buf[buf_size] = {'\0'};
4829               real_to_decimal_for_mode (float_buf,
4830                                         CONST_DOUBLE_REAL_VALUE (x),
4831                                         buf_size, buf_size,
4832                                         1, GET_MODE (x));
4833               asm_fprintf (asm_out_file, "%s", float_buf);
4834               break;
4835 #undef buf_size
4836             }
4837           output_operand_lossage ("invalid constant");
4838           return;
4839         default:
4840           output_operand_lossage ("invalid operand");
4841           return;
4842         }
4843       break;
4844
4845     case 'A':
4846       if (GET_CODE (x) == HIGH)
4847         x = XEXP (x, 0);
4848
4849       switch (aarch64_classify_symbolic_expression (x))
4850         {
4851         case SYMBOL_SMALL_GOT_4G:
4852           asm_fprintf (asm_out_file, ":got:");
4853           break;
4854
4855         case SYMBOL_SMALL_TLSGD:
4856           asm_fprintf (asm_out_file, ":tlsgd:");
4857           break;
4858
4859         case SYMBOL_SMALL_TLSDESC:
4860           asm_fprintf (asm_out_file, ":tlsdesc:");
4861           break;
4862
4863         case SYMBOL_SMALL_TLSIE:
4864           asm_fprintf (asm_out_file, ":gottprel:");
4865           break;
4866
4867         case SYMBOL_TLSLE24:
4868           asm_fprintf (asm_out_file, ":tprel:");
4869           break;
4870
4871         case SYMBOL_TINY_GOT:
4872           gcc_unreachable ();
4873           break;
4874
4875         default:
4876           break;
4877         }
4878       output_addr_const (asm_out_file, x);
4879       break;
4880
4881     case 'L':
4882       switch (aarch64_classify_symbolic_expression (x))
4883         {
4884         case SYMBOL_SMALL_GOT_4G:
4885           asm_fprintf (asm_out_file, ":lo12:");
4886           break;
4887
4888         case SYMBOL_SMALL_TLSGD:
4889           asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4890           break;
4891
4892         case SYMBOL_SMALL_TLSDESC:
4893           asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4894           break;
4895
4896         case SYMBOL_SMALL_TLSIE:
4897           asm_fprintf (asm_out_file, ":gottprel_lo12:");
4898           break;
4899
4900         case SYMBOL_TLSLE12:
4901           asm_fprintf (asm_out_file, ":tprel_lo12:");
4902           break;
4903
4904         case SYMBOL_TLSLE24:
4905           asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4906           break;
4907
4908         case SYMBOL_TINY_GOT:
4909           asm_fprintf (asm_out_file, ":got:");
4910           break;
4911
4912         case SYMBOL_TINY_TLSIE:
4913           asm_fprintf (asm_out_file, ":gottprel:");
4914           break;
4915
4916         default:
4917           break;
4918         }
4919       output_addr_const (asm_out_file, x);
4920       break;
4921
4922     case 'G':
4923
4924       switch (aarch64_classify_symbolic_expression (x))
4925         {
4926         case SYMBOL_TLSLE24:
4927           asm_fprintf (asm_out_file, ":tprel_hi12:");
4928           break;
4929         default:
4930           break;
4931         }
4932       output_addr_const (asm_out_file, x);
4933       break;
4934
4935     case 'k':
4936       {
4937         HOST_WIDE_INT cond_code;
4938         /* Print nzcv.  */
4939
4940         if (!CONST_INT_P (x))
4941           {
4942             output_operand_lossage ("invalid operand for '%%%c'", code);
4943             return;
4944           }
4945
4946         cond_code = INTVAL (x);
4947         gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4948         asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4949       }
4950       break;
4951
4952     default:
4953       output_operand_lossage ("invalid operand prefix '%%%c'", code);
4954       return;
4955     }
4956 }
4957
4958 static void
4959 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4960 {
4961   struct aarch64_address_info addr;
4962
4963   if (aarch64_classify_address (&addr, x, mode, MEM, true))
4964     switch (addr.type)
4965       {
4966       case ADDRESS_REG_IMM:
4967         if (addr.offset == const0_rtx)
4968           asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4969         else
4970           asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4971                        INTVAL (addr.offset));
4972         return;
4973
4974       case ADDRESS_REG_REG:
4975         if (addr.shift == 0)
4976           asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4977                        reg_names [REGNO (addr.offset)]);
4978         else
4979           asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4980                        reg_names [REGNO (addr.offset)], addr.shift);
4981         return;
4982
4983       case ADDRESS_REG_UXTW:
4984         if (addr.shift == 0)
4985           asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4986                        REGNO (addr.offset) - R0_REGNUM);
4987         else
4988           asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4989                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4990         return;
4991
4992       case ADDRESS_REG_SXTW:
4993         if (addr.shift == 0)
4994           asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4995                        REGNO (addr.offset) - R0_REGNUM);
4996         else
4997           asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4998                        REGNO (addr.offset) - R0_REGNUM, addr.shift);
4999         return;
5000
5001       case ADDRESS_REG_WB:
5002         switch (GET_CODE (x))
5003           {
5004           case PRE_INC:
5005             asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5006                          GET_MODE_SIZE (mode));
5007             return;
5008           case POST_INC:
5009             asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5010                          GET_MODE_SIZE (mode));
5011             return;
5012           case PRE_DEC:
5013             asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5014                          GET_MODE_SIZE (mode));
5015             return;
5016           case POST_DEC:
5017             asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5018                          GET_MODE_SIZE (mode));
5019             return;
5020           case PRE_MODIFY:
5021             asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5022                          INTVAL (addr.offset));
5023             return;
5024           case POST_MODIFY:
5025             asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5026                          INTVAL (addr.offset));
5027             return;
5028           default:
5029             break;
5030           }
5031         break;
5032
5033       case ADDRESS_LO_SUM:
5034         asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5035         output_addr_const (f, addr.offset);
5036         asm_fprintf (f, "]");
5037         return;
5038
5039       case ADDRESS_SYMBOLIC:
5040         break;
5041       }
5042
5043   output_addr_const (f, x);
5044 }
5045
5046 bool
5047 aarch64_label_mentioned_p (rtx x)
5048 {
5049   const char *fmt;
5050   int i;
5051
5052   if (GET_CODE (x) == LABEL_REF)
5053     return true;
5054
5055   /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5056      referencing instruction, but they are constant offsets, not
5057      symbols.  */
5058   if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5059     return false;
5060
5061   fmt = GET_RTX_FORMAT (GET_CODE (x));
5062   for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5063     {
5064       if (fmt[i] == 'E')
5065         {
5066           int j;
5067
5068           for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5069             if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5070               return 1;
5071         }
5072       else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5073         return 1;
5074     }
5075
5076   return 0;
5077 }
5078
5079 /* Implement REGNO_REG_CLASS.  */
5080
5081 enum reg_class
5082 aarch64_regno_regclass (unsigned regno)
5083 {
5084   if (GP_REGNUM_P (regno))
5085     return GENERAL_REGS;
5086
5087   if (regno == SP_REGNUM)
5088     return STACK_REG;
5089
5090   if (regno == FRAME_POINTER_REGNUM
5091       || regno == ARG_POINTER_REGNUM)
5092     return POINTER_REGS;
5093
5094   if (FP_REGNUM_P (regno))
5095     return FP_LO_REGNUM_P (regno) ?  FP_LO_REGS : FP_REGS;
5096
5097   return NO_REGS;
5098 }
5099
5100 static rtx
5101 aarch64_legitimize_address (rtx x, rtx /* orig_x  */, machine_mode mode)
5102 {
5103   /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5104      where mask is selected by alignment and size of the offset.
5105      We try to pick as large a range for the offset as possible to
5106      maximize the chance of a CSE.  However, for aligned addresses
5107      we limit the range to 4k so that structures with different sized
5108      elements are likely to use the same base.  We need to be careful
5109      not to split a CONST for some forms of address expression, otherwise
5110      it will generate sub-optimal code.  */
5111
5112   if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5113     {
5114       rtx base = XEXP (x, 0);
5115       rtx offset_rtx = XEXP (x, 1);
5116       HOST_WIDE_INT offset = INTVAL (offset_rtx);
5117
5118       if (GET_CODE (base) == PLUS)
5119         {
5120           rtx op0 = XEXP (base, 0);
5121           rtx op1 = XEXP (base, 1);
5122
5123           /* Force any scaling into a temp for CSE.  */
5124           op0 = force_reg (Pmode, op0);
5125           op1 = force_reg (Pmode, op1);
5126
5127           /* Let the pointer register be in op0.  */
5128           if (REG_POINTER (op1))
5129             std::swap (op0, op1);
5130
5131           /* If the pointer is virtual or frame related, then we know that
5132              virtual register instantiation or register elimination is going
5133              to apply a second constant.  We want the two constants folded
5134              together easily.  Therefore, emit as (OP0 + CONST) + OP1.  */
5135           if (virt_or_elim_regno_p (REGNO (op0)))
5136             {
5137               base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5138                                    NULL_RTX, true, OPTAB_DIRECT);
5139               return gen_rtx_PLUS (Pmode, base, op1);
5140             }
5141
5142           /* Otherwise, in order to encourage CSE (and thence loop strength
5143              reduce) scaled addresses, emit as (OP0 + OP1) + CONST.  */
5144           base = expand_binop (Pmode, add_optab, op0, op1,
5145                                NULL_RTX, true, OPTAB_DIRECT);
5146           x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5147         }
5148
5149       /* Does it look like we'll need a load/store-pair operation?  */
5150       HOST_WIDE_INT base_offset;
5151       if (GET_MODE_SIZE (mode) > 16
5152           || mode == TImode)
5153         base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5154                        & ~((128 * GET_MODE_SIZE (mode)) - 1));
5155       /* For offsets aren't a multiple of the access size, the limit is
5156          -256...255.  */
5157       else if (offset & (GET_MODE_SIZE (mode) - 1))
5158         {
5159           base_offset = (offset + 0x100) & ~0x1ff;
5160
5161           /* BLKmode typically uses LDP of X-registers.  */
5162           if (mode == BLKmode)
5163             base_offset = (offset + 512) & ~0x3ff;
5164         }
5165       /* Small negative offsets are supported.  */
5166       else if (IN_RANGE (offset, -256, 0))
5167         base_offset = 0;
5168       /* Use 12-bit offset by access size.  */
5169       else
5170         base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5171
5172       if (base_offset != 0)
5173         {
5174           base = plus_constant (Pmode, base, base_offset);
5175           base = force_operand (base, NULL_RTX);
5176           return plus_constant (Pmode, base, offset - base_offset);
5177         }
5178     }
5179
5180   return x;
5181 }
5182
5183 /* Return the reload icode required for a constant pool in mode.  */
5184 static enum insn_code
5185 aarch64_constant_pool_reload_icode (machine_mode mode)
5186 {
5187   switch (mode)
5188     {
5189     case SFmode:
5190       return CODE_FOR_aarch64_reload_movcpsfdi;
5191
5192     case DFmode:
5193       return CODE_FOR_aarch64_reload_movcpdfdi;
5194
5195     case TFmode:
5196       return CODE_FOR_aarch64_reload_movcptfdi;
5197
5198     case V8QImode:
5199       return CODE_FOR_aarch64_reload_movcpv8qidi;
5200
5201     case V16QImode:
5202       return CODE_FOR_aarch64_reload_movcpv16qidi;
5203
5204     case V4HImode:
5205       return CODE_FOR_aarch64_reload_movcpv4hidi;
5206
5207     case V8HImode:
5208       return CODE_FOR_aarch64_reload_movcpv8hidi;
5209
5210     case V2SImode:
5211       return CODE_FOR_aarch64_reload_movcpv2sidi;
5212
5213     case V4SImode:
5214       return CODE_FOR_aarch64_reload_movcpv4sidi;
5215
5216     case V2DImode:
5217       return CODE_FOR_aarch64_reload_movcpv2didi;
5218
5219     case V2DFmode:
5220       return CODE_FOR_aarch64_reload_movcpv2dfdi;
5221
5222     default:
5223       gcc_unreachable ();
5224     }
5225
5226   gcc_unreachable ();
5227 }
5228 static reg_class_t
5229 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5230                           reg_class_t rclass,
5231                           machine_mode mode,
5232                           secondary_reload_info *sri)
5233 {
5234
5235   /* If we have to disable direct literal pool loads and stores because the
5236      function is too big, then we need a scratch register.  */
5237   if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5238       && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5239           || targetm.vector_mode_supported_p (GET_MODE (x)))
5240       && !aarch64_pcrelative_literal_loads)
5241     {
5242       sri->icode = aarch64_constant_pool_reload_icode (mode);
5243       return NO_REGS;
5244     }
5245
5246   /* Without the TARGET_SIMD instructions we cannot move a Q register
5247      to a Q register directly.  We need a scratch.  */
5248   if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5249       && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5250       && reg_class_subset_p (rclass, FP_REGS))
5251     {
5252       if (mode == TFmode)
5253         sri->icode = CODE_FOR_aarch64_reload_movtf;
5254       else if (mode == TImode)
5255         sri->icode = CODE_FOR_aarch64_reload_movti;
5256       return NO_REGS;
5257     }
5258
5259   /* A TFmode or TImode memory access should be handled via an FP_REGS
5260      because AArch64 has richer addressing modes for LDR/STR instructions
5261      than LDP/STP instructions.  */
5262   if (TARGET_FLOAT && rclass == GENERAL_REGS
5263       && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5264     return FP_REGS;
5265
5266   if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5267       return GENERAL_REGS;
5268
5269   return NO_REGS;
5270 }
5271
5272 static bool
5273 aarch64_can_eliminate (const int from, const int to)
5274 {
5275   /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5276      HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM.  */
5277
5278   if (frame_pointer_needed)
5279     {
5280       if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5281         return true;
5282       if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5283         return false;
5284       if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5285           && !cfun->calls_alloca)
5286         return true;
5287       if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5288         return true;
5289
5290       return false;
5291     }
5292   else
5293     {
5294       /* If we decided that we didn't need a leaf frame pointer but then used
5295          LR in the function, then we'll want a frame pointer after all, so
5296          prevent this elimination to ensure a frame pointer is used.  */
5297       if (to == STACK_POINTER_REGNUM
5298           && flag_omit_leaf_frame_pointer
5299           && df_regs_ever_live_p (LR_REGNUM))
5300         return false;
5301     }
5302
5303   return true;
5304 }
5305
5306 HOST_WIDE_INT
5307 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5308 {
5309   aarch64_layout_frame ();
5310
5311   if (to == HARD_FRAME_POINTER_REGNUM)
5312     {
5313       if (from == ARG_POINTER_REGNUM)
5314         return cfun->machine->frame.hard_fp_offset;
5315
5316       if (from == FRAME_POINTER_REGNUM)
5317         return cfun->machine->frame.hard_fp_offset
5318                - cfun->machine->frame.locals_offset;
5319     }
5320
5321   if (to == STACK_POINTER_REGNUM)
5322     {
5323       if (from == FRAME_POINTER_REGNUM)
5324           return cfun->machine->frame.frame_size
5325                  - cfun->machine->frame.locals_offset;
5326     }
5327
5328   return cfun->machine->frame.frame_size;
5329 }
5330
5331 /* Implement RETURN_ADDR_RTX.  We do not support moving back to a
5332    previous frame.  */
5333
5334 rtx
5335 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5336 {
5337   if (count != 0)
5338     return const0_rtx;
5339   return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5340 }
5341
5342
5343 static void
5344 aarch64_asm_trampoline_template (FILE *f)
5345 {
5346   if (TARGET_ILP32)
5347     {
5348       asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5349       asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5350     }
5351   else
5352     {
5353       asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5354       asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5355     }
5356   asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5357   assemble_aligned_integer (4, const0_rtx);
5358   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5359   assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5360 }
5361
5362 static void
5363 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5364 {
5365   rtx fnaddr, mem, a_tramp;
5366   const int tramp_code_sz = 16;
5367
5368   /* Don't need to copy the trailing D-words, we fill those in below.  */
5369   emit_block_move (m_tramp, assemble_trampoline_template (),
5370                    GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5371   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5372   fnaddr = XEXP (DECL_RTL (fndecl), 0);
5373   if (GET_MODE (fnaddr) != ptr_mode)
5374     fnaddr = convert_memory_address (ptr_mode, fnaddr);
5375   emit_move_insn (mem, fnaddr);
5376
5377   mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5378   emit_move_insn (mem, chain_value);
5379
5380   /* XXX We should really define a "clear_cache" pattern and use
5381      gen_clear_cache().  */
5382   a_tramp = XEXP (m_tramp, 0);
5383   emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5384                      LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5385                      plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5386                      ptr_mode);
5387 }
5388
5389 static unsigned char
5390 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5391 {
5392   switch (regclass)
5393     {
5394     case CALLER_SAVE_REGS:
5395     case POINTER_REGS:
5396     case GENERAL_REGS:
5397     case ALL_REGS:
5398     case FP_REGS:
5399     case FP_LO_REGS:
5400       return
5401         aarch64_vector_mode_p (mode)
5402           ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5403           : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5404     case STACK_REG:
5405       return 1;
5406
5407     case NO_REGS:
5408       return 0;
5409
5410     default:
5411       break;
5412     }
5413   gcc_unreachable ();
5414 }
5415
5416 static reg_class_t
5417 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5418 {
5419   if (regclass == POINTER_REGS)
5420     return GENERAL_REGS;
5421
5422   if (regclass == STACK_REG)
5423     {
5424       if (REG_P(x)
5425           && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5426           return regclass;
5427
5428       return NO_REGS;
5429     }
5430
5431   /* If it's an integer immediate that MOVI can't handle, then
5432      FP_REGS is not an option, so we return NO_REGS instead.  */
5433   if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5434       && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5435     return NO_REGS;
5436
5437   /* Register eliminiation can result in a request for
5438      SP+constant->FP_REGS.  We cannot support such operations which
5439      use SP as source and an FP_REG as destination, so reject out
5440      right now.  */
5441   if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5442     {
5443       rtx lhs = XEXP (x, 0);
5444
5445       /* Look through a possible SUBREG introduced by ILP32.  */
5446       if (GET_CODE (lhs) == SUBREG)
5447         lhs = SUBREG_REG (lhs);
5448
5449       gcc_assert (REG_P (lhs));
5450       gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5451                                       POINTER_REGS));
5452       return NO_REGS;
5453     }
5454
5455   return regclass;
5456 }
5457
5458 void
5459 aarch64_asm_output_labelref (FILE* f, const char *name)
5460 {
5461   asm_fprintf (f, "%U%s", name);
5462 }
5463
5464 static void
5465 aarch64_elf_asm_constructor (rtx symbol, int priority)
5466 {
5467   if (priority == DEFAULT_INIT_PRIORITY)
5468     default_ctor_section_asm_out_constructor (symbol, priority);
5469   else
5470     {
5471       section *s;
5472       char buf[18];
5473       snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5474       s = get_section (buf, SECTION_WRITE, NULL);
5475       switch_to_section (s);
5476       assemble_align (POINTER_SIZE);
5477       assemble_aligned_integer (POINTER_BYTES, symbol);
5478     }
5479 }
5480
5481 static void
5482 aarch64_elf_asm_destructor (rtx symbol, int priority)
5483 {
5484   if (priority == DEFAULT_INIT_PRIORITY)
5485     default_dtor_section_asm_out_destructor (symbol, priority);
5486   else
5487     {
5488       section *s;
5489       char buf[18];
5490       snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5491       s = get_section (buf, SECTION_WRITE, NULL);
5492       switch_to_section (s);
5493       assemble_align (POINTER_SIZE);
5494       assemble_aligned_integer (POINTER_BYTES, symbol);
5495     }
5496 }
5497
5498 const char*
5499 aarch64_output_casesi (rtx *operands)
5500 {
5501   char buf[100];
5502   char label[100];
5503   rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5504   int index;
5505   static const char *const patterns[4][2] =
5506   {
5507     {
5508       "ldrb\t%w3, [%0,%w1,uxtw]",
5509       "add\t%3, %4, %w3, sxtb #2"
5510     },
5511     {
5512       "ldrh\t%w3, [%0,%w1,uxtw #1]",
5513       "add\t%3, %4, %w3, sxth #2"
5514     },
5515     {
5516       "ldr\t%w3, [%0,%w1,uxtw #2]",
5517       "add\t%3, %4, %w3, sxtw #2"
5518     },
5519     /* We assume that DImode is only generated when not optimizing and
5520        that we don't really need 64-bit address offsets.  That would
5521        imply an object file with 8GB of code in a single function!  */
5522     {
5523       "ldr\t%w3, [%0,%w1,uxtw #2]",
5524       "add\t%3, %4, %w3, sxtw #2"
5525     }
5526   };
5527
5528   gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5529
5530   index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5531
5532   gcc_assert (index >= 0 && index <= 3);
5533
5534   /* Need to implement table size reduction, by chaning the code below.  */
5535   output_asm_insn (patterns[index][0], operands);
5536   ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5537   snprintf (buf, sizeof (buf),
5538             "adr\t%%4, %s", targetm.strip_name_encoding (label));
5539   output_asm_insn (buf, operands);
5540   output_asm_insn (patterns[index][1], operands);
5541   output_asm_insn ("br\t%3", operands);
5542   assemble_label (asm_out_file, label);
5543   return "";
5544 }
5545
5546
5547 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5548    masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5549    operator.  */
5550
5551 int
5552 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5553 {
5554   if (shift >= 0 && shift <= 3)
5555     {
5556       int size;
5557       for (size = 8; size <= 32; size *= 2)
5558         {
5559           HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5560           if (mask == bits << shift)
5561             return size;
5562         }
5563     }
5564   return 0;
5565 }
5566
5567 /* Constant pools are per function only when PC relative
5568    literal loads are true or we are in the large memory
5569    model.  */
5570
5571 static inline bool
5572 aarch64_can_use_per_function_literal_pools_p (void)
5573 {
5574   return (aarch64_pcrelative_literal_loads
5575           || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5576 }
5577
5578 static bool
5579 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5580 {
5581   /* Fixme:: In an ideal world this would work similar
5582      to the logic in aarch64_select_rtx_section but this
5583      breaks bootstrap in gcc go.  For now we workaround
5584      this by returning false here.  */
5585   return false;
5586 }
5587
5588 /* Select appropriate section for constants depending
5589    on where we place literal pools.  */
5590
5591 static section *
5592 aarch64_select_rtx_section (machine_mode mode,
5593                             rtx x,
5594                             unsigned HOST_WIDE_INT align)
5595 {
5596   if (aarch64_can_use_per_function_literal_pools_p ())
5597     return function_section (current_function_decl);
5598
5599   return default_elf_select_rtx_section (mode, x, align);
5600 }
5601
5602 /* Implement ASM_OUTPUT_POOL_EPILOGUE.  */
5603 void
5604 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5605                                   HOST_WIDE_INT offset)
5606 {
5607   /* When using per-function literal pools, we must ensure that any code
5608      section is aligned to the minimal instruction length, lest we get
5609      errors from the assembler re "unaligned instructions".  */
5610   if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5611     ASM_OUTPUT_ALIGN (f, 2);
5612 }
5613
5614 /* Costs.  */
5615
5616 /* Helper function for rtx cost calculation.  Strip a shift expression
5617    from X.  Returns the inner operand if successful, or the original
5618    expression on failure.  */
5619 static rtx
5620 aarch64_strip_shift (rtx x)
5621 {
5622   rtx op = x;
5623
5624   /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5625      we can convert both to ROR during final output.  */
5626   if ((GET_CODE (op) == ASHIFT
5627        || GET_CODE (op) == ASHIFTRT
5628        || GET_CODE (op) == LSHIFTRT
5629        || GET_CODE (op) == ROTATERT
5630        || GET_CODE (op) == ROTATE)
5631       && CONST_INT_P (XEXP (op, 1)))
5632     return XEXP (op, 0);
5633
5634   if (GET_CODE (op) == MULT
5635       && CONST_INT_P (XEXP (op, 1))
5636       && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5637     return XEXP (op, 0);
5638
5639   return x;
5640 }
5641
5642 /* Helper function for rtx cost calculation.  Strip an extend
5643    expression from X.  Returns the inner operand if successful, or the
5644    original expression on failure.  We deal with a number of possible
5645    canonicalization variations here.  */
5646 static rtx
5647 aarch64_strip_extend (rtx x)
5648 {
5649   rtx op = x;
5650
5651   /* Zero and sign extraction of a widened value.  */
5652   if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5653       && XEXP (op, 2) == const0_rtx
5654       && GET_CODE (XEXP (op, 0)) == MULT
5655       && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5656                                          XEXP (op, 1)))
5657     return XEXP (XEXP (op, 0), 0);
5658
5659   /* It can also be represented (for zero-extend) as an AND with an
5660      immediate.  */
5661   if (GET_CODE (op) == AND
5662       && GET_CODE (XEXP (op, 0)) == MULT
5663       && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5664       && CONST_INT_P (XEXP (op, 1))
5665       && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5666                            INTVAL (XEXP (op, 1))) != 0)
5667     return XEXP (XEXP (op, 0), 0);
5668
5669   /* Now handle extended register, as this may also have an optional
5670      left shift by 1..4.  */
5671   if (GET_CODE (op) == ASHIFT
5672       && CONST_INT_P (XEXP (op, 1))
5673       && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5674     op = XEXP (op, 0);
5675
5676   if (GET_CODE (op) == ZERO_EXTEND
5677       || GET_CODE (op) == SIGN_EXTEND)
5678     op = XEXP (op, 0);
5679
5680   if (op != x)
5681     return op;
5682
5683   return x;
5684 }
5685
5686 /* Return true iff CODE is a shift supported in combination
5687    with arithmetic instructions.  */
5688
5689 static bool
5690 aarch64_shift_p (enum rtx_code code)
5691 {
5692   return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5693 }
5694
5695 /* Helper function for rtx cost calculation.  Calculate the cost of
5696    a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5697    Return the calculated cost of the expression, recursing manually in to
5698    operands where needed.  */
5699
5700 static int
5701 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5702 {
5703   rtx op0, op1;
5704   const struct cpu_cost_table *extra_cost
5705     = aarch64_tune_params.insn_extra_cost;
5706   int cost = 0;
5707   bool compound_p = (outer == PLUS || outer == MINUS);
5708   machine_mode mode = GET_MODE (x);
5709
5710   gcc_checking_assert (code == MULT);
5711
5712   op0 = XEXP (x, 0);
5713   op1 = XEXP (x, 1);
5714
5715   if (VECTOR_MODE_P (mode))
5716     mode = GET_MODE_INNER (mode);
5717
5718   /* Integer multiply/fma.  */
5719   if (GET_MODE_CLASS (mode) == MODE_INT)
5720     {
5721       /* The multiply will be canonicalized as a shift, cost it as such.  */
5722       if (aarch64_shift_p (GET_CODE (x))
5723           || (CONST_INT_P (op1)
5724               && exact_log2 (INTVAL (op1)) > 0))
5725         {
5726           bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5727                            || GET_CODE (op0) == SIGN_EXTEND;
5728           if (speed)
5729             {
5730               if (compound_p)
5731                 {
5732                   if (REG_P (op1))
5733                     /* ARITH + shift-by-register.  */
5734                     cost += extra_cost->alu.arith_shift_reg;
5735                   else if (is_extend)
5736                     /* ARITH + extended register.  We don't have a cost field
5737                        for ARITH+EXTEND+SHIFT, so use extend_arith here.  */
5738                     cost += extra_cost->alu.extend_arith;
5739                   else
5740                     /* ARITH + shift-by-immediate.  */
5741                     cost += extra_cost->alu.arith_shift;
5742                 }
5743               else
5744                 /* LSL (immediate).  */
5745                 cost += extra_cost->alu.shift;
5746
5747             }
5748           /* Strip extends as we will have costed them in the case above.  */
5749           if (is_extend)
5750             op0 = aarch64_strip_extend (op0);
5751
5752           cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5753
5754           return cost;
5755         }
5756
5757       /* MNEG or [US]MNEGL.  Extract the NEG operand and indicate that it's a
5758          compound and let the below cases handle it.  After all, MNEG is a
5759          special-case alias of MSUB.  */
5760       if (GET_CODE (op0) == NEG)
5761         {
5762           op0 = XEXP (op0, 0);
5763           compound_p = true;
5764         }
5765
5766       /* Integer multiplies or FMAs have zero/sign extending variants.  */
5767       if ((GET_CODE (op0) == ZERO_EXTEND
5768            && GET_CODE (op1) == ZERO_EXTEND)
5769           || (GET_CODE (op0) == SIGN_EXTEND
5770               && GET_CODE (op1) == SIGN_EXTEND))
5771         {
5772           cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5773           cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5774
5775           if (speed)
5776             {
5777               if (compound_p)
5778                 /* SMADDL/UMADDL/UMSUBL/SMSUBL.  */
5779                 cost += extra_cost->mult[0].extend_add;
5780               else
5781                 /* MUL/SMULL/UMULL.  */
5782                 cost += extra_cost->mult[0].extend;
5783             }
5784
5785           return cost;
5786         }
5787
5788       /* This is either an integer multiply or a MADD.  In both cases
5789          we want to recurse and cost the operands.  */
5790       cost += rtx_cost (op0, mode, MULT, 0, speed);
5791       cost += rtx_cost (op1, mode, MULT, 1, speed);
5792
5793       if (speed)
5794         {
5795           if (compound_p)
5796             /* MADD/MSUB.  */
5797             cost += extra_cost->mult[mode == DImode].add;
5798           else
5799             /* MUL.  */
5800             cost += extra_cost->mult[mode == DImode].simple;
5801         }
5802
5803       return cost;
5804     }
5805   else
5806     {
5807       if (speed)
5808         {
5809           /* Floating-point FMA/FMUL can also support negations of the
5810              operands, unless the rounding mode is upward or downward in
5811              which case FNMUL is different than FMUL with operand negation.  */
5812           bool neg0 = GET_CODE (op0) == NEG;
5813           bool neg1 = GET_CODE (op1) == NEG;
5814           if (compound_p || !flag_rounding_math || (neg0 && neg1))
5815             {
5816               if (neg0)
5817                 op0 = XEXP (op0, 0);
5818               if (neg1)
5819                 op1 = XEXP (op1, 0);
5820             }
5821
5822           if (compound_p)
5823             /* FMADD/FNMADD/FNMSUB/FMSUB.  */
5824             cost += extra_cost->fp[mode == DFmode].fma;
5825           else
5826             /* FMUL/FNMUL.  */
5827             cost += extra_cost->fp[mode == DFmode].mult;
5828         }
5829
5830       cost += rtx_cost (op0, mode, MULT, 0, speed);
5831       cost += rtx_cost (op1, mode, MULT, 1, speed);
5832       return cost;
5833     }
5834 }
5835
5836 static int
5837 aarch64_address_cost (rtx x,
5838                       machine_mode mode,
5839                       addr_space_t as ATTRIBUTE_UNUSED,
5840                       bool speed)
5841 {
5842   enum rtx_code c = GET_CODE (x);
5843   const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5844   struct aarch64_address_info info;
5845   int cost = 0;
5846   info.shift = 0;
5847
5848   if (!aarch64_classify_address (&info, x, mode, c, false))
5849     {
5850       if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5851         {
5852           /* This is a CONST or SYMBOL ref which will be split
5853              in a different way depending on the code model in use.
5854              Cost it through the generic infrastructure.  */
5855           int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5856           /* Divide through by the cost of one instruction to
5857              bring it to the same units as the address costs.  */
5858           cost_symbol_ref /= COSTS_N_INSNS (1);
5859           /* The cost is then the cost of preparing the address,
5860              followed by an immediate (possibly 0) offset.  */
5861           return cost_symbol_ref + addr_cost->imm_offset;
5862         }
5863       else
5864         {
5865           /* This is most likely a jump table from a case
5866              statement.  */
5867           return addr_cost->register_offset;
5868         }
5869     }
5870
5871   switch (info.type)
5872     {
5873       case ADDRESS_LO_SUM:
5874       case ADDRESS_SYMBOLIC:
5875       case ADDRESS_REG_IMM:
5876         cost += addr_cost->imm_offset;
5877         break;
5878
5879       case ADDRESS_REG_WB:
5880         if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5881           cost += addr_cost->pre_modify;
5882         else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5883           cost += addr_cost->post_modify;
5884         else
5885           gcc_unreachable ();
5886
5887         break;
5888
5889       case ADDRESS_REG_REG:
5890         cost += addr_cost->register_offset;
5891         break;
5892
5893       case ADDRESS_REG_SXTW:
5894         cost += addr_cost->register_sextend;
5895         break;
5896
5897       case ADDRESS_REG_UXTW:
5898         cost += addr_cost->register_zextend;
5899         break;
5900
5901       default:
5902         gcc_unreachable ();
5903     }
5904
5905
5906   if (info.shift > 0)
5907     {
5908       /* For the sake of calculating the cost of the shifted register
5909          component, we can treat same sized modes in the same way.  */
5910       switch (GET_MODE_BITSIZE (mode))
5911         {
5912           case 16:
5913             cost += addr_cost->addr_scale_costs.hi;
5914             break;
5915
5916           case 32:
5917             cost += addr_cost->addr_scale_costs.si;
5918             break;
5919
5920           case 64:
5921             cost += addr_cost->addr_scale_costs.di;
5922             break;
5923
5924           /* We can't tell, or this is a 128-bit vector.  */
5925           default:
5926             cost += addr_cost->addr_scale_costs.ti;
5927             break;
5928         }
5929     }
5930
5931   return cost;
5932 }
5933
5934 /* Return the cost of a branch.  If SPEED_P is true then the compiler is
5935    optimizing for speed.  If PREDICTABLE_P is true then the branch is predicted
5936    to be taken.  */
5937
5938 int
5939 aarch64_branch_cost (bool speed_p, bool predictable_p)
5940 {
5941   /* When optimizing for speed, use the cost of unpredictable branches.  */
5942   const struct cpu_branch_cost *branch_costs =
5943     aarch64_tune_params.branch_costs;
5944
5945   if (!speed_p || predictable_p)
5946     return branch_costs->predictable;
5947   else
5948     return branch_costs->unpredictable;
5949 }
5950
5951 /* Return true if the RTX X in mode MODE is a zero or sign extract
5952    usable in an ADD or SUB (extended register) instruction.  */
5953 static bool
5954 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5955 {
5956   /* Catch add with a sign extract.
5957      This is add_<optab><mode>_multp2.  */
5958   if (GET_CODE (x) == SIGN_EXTRACT
5959       || GET_CODE (x) == ZERO_EXTRACT)
5960     {
5961       rtx op0 = XEXP (x, 0);
5962       rtx op1 = XEXP (x, 1);
5963       rtx op2 = XEXP (x, 2);
5964
5965       if (GET_CODE (op0) == MULT
5966           && CONST_INT_P (op1)
5967           && op2 == const0_rtx
5968           && CONST_INT_P (XEXP (op0, 1))
5969           && aarch64_is_extend_from_extract (mode,
5970                                              XEXP (op0, 1),
5971                                              op1))
5972         {
5973           return true;
5974         }
5975     }
5976   /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5977      No shift.  */
5978   else if (GET_CODE (x) == SIGN_EXTEND
5979            || GET_CODE (x) == ZERO_EXTEND)
5980     return REG_P (XEXP (x, 0));
5981
5982   return false;
5983 }
5984
5985 static bool
5986 aarch64_frint_unspec_p (unsigned int u)
5987 {
5988   switch (u)
5989     {
5990       case UNSPEC_FRINTZ:
5991       case UNSPEC_FRINTP:
5992       case UNSPEC_FRINTM:
5993       case UNSPEC_FRINTA:
5994       case UNSPEC_FRINTN:
5995       case UNSPEC_FRINTX:
5996       case UNSPEC_FRINTI:
5997         return true;
5998
5999       default:
6000         return false;
6001     }
6002 }
6003
6004 /* Return true iff X is an rtx that will match an extr instruction
6005    i.e. as described in the *extr<mode>5_insn family of patterns.
6006    OP0 and OP1 will be set to the operands of the shifts involved
6007    on success and will be NULL_RTX otherwise.  */
6008
6009 static bool
6010 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6011 {
6012   rtx op0, op1;
6013   machine_mode mode = GET_MODE (x);
6014
6015   *res_op0 = NULL_RTX;
6016   *res_op1 = NULL_RTX;
6017
6018   if (GET_CODE (x) != IOR)
6019     return false;
6020
6021   op0 = XEXP (x, 0);
6022   op1 = XEXP (x, 1);
6023
6024   if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6025       || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6026     {
6027      /* Canonicalise locally to ashift in op0, lshiftrt in op1.  */
6028       if (GET_CODE (op1) == ASHIFT)
6029         std::swap (op0, op1);
6030
6031       if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6032         return false;
6033
6034       unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6035       unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6036
6037       if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6038           && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6039         {
6040           *res_op0 = XEXP (op0, 0);
6041           *res_op1 = XEXP (op1, 0);
6042           return true;
6043         }
6044     }
6045
6046   return false;
6047 }
6048
6049 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6050    storing it in *COST.  Result is true if the total cost of the operation
6051    has now been calculated.  */
6052 static bool
6053 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6054 {
6055   rtx inner;
6056   rtx comparator;
6057   enum rtx_code cmpcode;
6058
6059   if (COMPARISON_P (op0))
6060     {
6061       inner = XEXP (op0, 0);
6062       comparator = XEXP (op0, 1);
6063       cmpcode = GET_CODE (op0);
6064     }
6065   else
6066     {
6067       inner = op0;
6068       comparator = const0_rtx;
6069       cmpcode = NE;
6070     }
6071
6072   if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6073     {
6074       /* Conditional branch.  */
6075       if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6076         return true;
6077       else
6078         {
6079           if (cmpcode == NE || cmpcode == EQ)
6080             {
6081               if (comparator == const0_rtx)
6082                 {
6083                   /* TBZ/TBNZ/CBZ/CBNZ.  */
6084                   if (GET_CODE (inner) == ZERO_EXTRACT)
6085                     /* TBZ/TBNZ.  */
6086                     *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6087                                        ZERO_EXTRACT, 0, speed);
6088                   else
6089                     /* CBZ/CBNZ.  */
6090                     *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6091
6092                 return true;
6093               }
6094             }
6095           else if (cmpcode == LT || cmpcode == GE)
6096             {
6097               /* TBZ/TBNZ.  */
6098               if (comparator == const0_rtx)
6099                 return true;
6100             }
6101         }
6102     }
6103   else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6104     {
6105       /* CCMP.  */
6106       if (GET_CODE (op1) == COMPARE)
6107         {
6108           /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0.  */
6109           if (XEXP (op1, 1) == const0_rtx)
6110             *cost += 1;
6111           if (speed)
6112             {
6113               machine_mode mode = GET_MODE (XEXP (op1, 0));
6114               const struct cpu_cost_table *extra_cost
6115                 = aarch64_tune_params.insn_extra_cost;
6116
6117               if (GET_MODE_CLASS (mode) == MODE_INT)
6118                 *cost += extra_cost->alu.arith;
6119               else
6120                 *cost += extra_cost->fp[mode == DFmode].compare;
6121             }
6122           return true;
6123         }
6124
6125       /* It's a conditional operation based on the status flags,
6126          so it must be some flavor of CSEL.  */
6127
6128       /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL.  */
6129       if (GET_CODE (op1) == NEG
6130           || GET_CODE (op1) == NOT
6131           || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6132         op1 = XEXP (op1, 0);
6133       else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6134         {
6135           /* CSEL with zero-extension (*cmovdi_insn_uxtw).  */
6136           op1 = XEXP (op1, 0);
6137           op2 = XEXP (op2, 0);
6138         }
6139
6140       *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6141       *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6142       return true;
6143     }
6144
6145   /* We don't know what this is, cost all operands.  */
6146   return false;
6147 }
6148
6149 /* Check whether X is a bitfield operation of the form shift + extend that
6150    maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction.  If so, return the
6151    operand to which the bitfield operation is applied.  Otherwise return
6152    NULL_RTX.  */
6153
6154 static rtx
6155 aarch64_extend_bitfield_pattern_p (rtx x)
6156 {
6157   rtx_code outer_code = GET_CODE (x);
6158   machine_mode outer_mode = GET_MODE (x);
6159
6160   if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6161       && outer_mode != SImode && outer_mode != DImode)
6162     return NULL_RTX;
6163
6164   rtx inner = XEXP (x, 0);
6165   rtx_code inner_code = GET_CODE (inner);
6166   machine_mode inner_mode = GET_MODE (inner);
6167   rtx op = NULL_RTX;
6168
6169   switch (inner_code)
6170     {
6171       case ASHIFT:
6172         if (CONST_INT_P (XEXP (inner, 1))
6173             && (inner_mode == QImode || inner_mode == HImode))
6174           op = XEXP (inner, 0);
6175         break;
6176       case LSHIFTRT:
6177         if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6178             && (inner_mode == QImode || inner_mode == HImode))
6179           op = XEXP (inner, 0);
6180         break;
6181       case ASHIFTRT:
6182         if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6183             && (inner_mode == QImode || inner_mode == HImode))
6184           op = XEXP (inner, 0);
6185         break;
6186       default:
6187         break;
6188     }
6189
6190   return op;
6191 }
6192
6193 /* Return true if the mask and a shift amount from an RTX of the form
6194    (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6195    mode MODE.  See the *andim_ashift<mode>_bfiz pattern.  */
6196
6197 bool
6198 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6199 {
6200   return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6201          && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6202          && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6203          && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6204 }
6205
6206 /* Calculate the cost of calculating X, storing it in *COST.  Result
6207    is true if the total cost of the operation has now been calculated.  */
6208 static bool
6209 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6210                    int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6211 {
6212   rtx op0, op1, op2;
6213   const struct cpu_cost_table *extra_cost
6214     = aarch64_tune_params.insn_extra_cost;
6215   int code = GET_CODE (x);
6216
6217   /* By default, assume that everything has equivalent cost to the
6218      cheapest instruction.  Any additional costs are applied as a delta
6219      above this default.  */
6220   *cost = COSTS_N_INSNS (1);
6221
6222   switch (code)
6223     {
6224     case SET:
6225       /* The cost depends entirely on the operands to SET.  */
6226       *cost = 0;
6227       op0 = SET_DEST (x);
6228       op1 = SET_SRC (x);
6229
6230       switch (GET_CODE (op0))
6231         {
6232         case MEM:
6233           if (speed)
6234             {
6235               rtx address = XEXP (op0, 0);
6236               if (VECTOR_MODE_P (mode))
6237                 *cost += extra_cost->ldst.storev;
6238               else if (GET_MODE_CLASS (mode) == MODE_INT)
6239                 *cost += extra_cost->ldst.store;
6240               else if (mode == SFmode)
6241                 *cost += extra_cost->ldst.storef;
6242               else if (mode == DFmode)
6243                 *cost += extra_cost->ldst.stored;
6244
6245               *cost +=
6246                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6247                                                      0, speed));
6248             }
6249
6250           *cost += rtx_cost (op1, mode, SET, 1, speed);
6251           return true;
6252
6253         case SUBREG:
6254           if (! REG_P (SUBREG_REG (op0)))
6255             *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6256
6257           /* Fall through.  */
6258         case REG:
6259           /* The cost is one per vector-register copied.  */
6260           if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6261             {
6262               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6263                               / GET_MODE_SIZE (V4SImode);
6264               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6265             }
6266           /* const0_rtx is in general free, but we will use an
6267              instruction to set a register to 0.  */
6268           else if (REG_P (op1) || op1 == const0_rtx)
6269             {
6270               /* The cost is 1 per register copied.  */
6271               int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6272                               / UNITS_PER_WORD;
6273               *cost = COSTS_N_INSNS (n_minus_1 + 1);
6274             }
6275           else
6276             /* Cost is just the cost of the RHS of the set.  */
6277             *cost += rtx_cost (op1, mode, SET, 1, speed);
6278           return true;
6279
6280         case ZERO_EXTRACT:
6281         case SIGN_EXTRACT:
6282           /* Bit-field insertion.  Strip any redundant widening of
6283              the RHS to meet the width of the target.  */
6284           if (GET_CODE (op1) == SUBREG)
6285             op1 = SUBREG_REG (op1);
6286           if ((GET_CODE (op1) == ZERO_EXTEND
6287                || GET_CODE (op1) == SIGN_EXTEND)
6288               && CONST_INT_P (XEXP (op0, 1))
6289               && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6290                   >= INTVAL (XEXP (op0, 1))))
6291             op1 = XEXP (op1, 0);
6292
6293           if (CONST_INT_P (op1))
6294             {
6295               /* MOV immediate is assumed to always be cheap.  */
6296               *cost = COSTS_N_INSNS (1);
6297             }
6298           else
6299             {
6300               /* BFM.  */
6301               if (speed)
6302                 *cost += extra_cost->alu.bfi;
6303               *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6304             }
6305
6306           return true;
6307
6308         default:
6309           /* We can't make sense of this, assume default cost.  */
6310           *cost = COSTS_N_INSNS (1);
6311           return false;
6312         }
6313       return false;
6314
6315     case CONST_INT:
6316       /* If an instruction can incorporate a constant within the
6317          instruction, the instruction's expression avoids calling
6318          rtx_cost() on the constant.  If rtx_cost() is called on a
6319          constant, then it is usually because the constant must be
6320          moved into a register by one or more instructions.
6321
6322          The exception is constant 0, which can be expressed
6323          as XZR/WZR and is therefore free.  The exception to this is
6324          if we have (set (reg) (const0_rtx)) in which case we must cost
6325          the move.  However, we can catch that when we cost the SET, so
6326          we don't need to consider that here.  */
6327       if (x == const0_rtx)
6328         *cost = 0;
6329       else
6330         {
6331           /* To an approximation, building any other constant is
6332              proportionally expensive to the number of instructions
6333              required to build that constant.  This is true whether we
6334              are compiling for SPEED or otherwise.  */
6335           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6336                                  (NULL_RTX, x, false, mode));
6337         }
6338       return true;
6339
6340     case CONST_DOUBLE:
6341       if (speed)
6342         {
6343           /* mov[df,sf]_aarch64.  */
6344           if (aarch64_float_const_representable_p (x))
6345             /* FMOV (scalar immediate).  */
6346             *cost += extra_cost->fp[mode == DFmode].fpconst;
6347           else if (!aarch64_float_const_zero_rtx_p (x))
6348             {
6349               /* This will be a load from memory.  */
6350               if (mode == DFmode)
6351                 *cost += extra_cost->ldst.loadd;
6352               else
6353                 *cost += extra_cost->ldst.loadf;
6354             }
6355           else
6356             /* Otherwise this is +0.0.  We get this using MOVI d0, #0
6357                or MOV v0.s[0], wzr - neither of which are modeled by the
6358                cost tables.  Just use the default cost.  */
6359             {
6360             }
6361         }
6362
6363       return true;
6364
6365     case MEM:
6366       if (speed)
6367         {
6368           /* For loads we want the base cost of a load, plus an
6369              approximation for the additional cost of the addressing
6370              mode.  */
6371           rtx address = XEXP (x, 0);
6372           if (VECTOR_MODE_P (mode))
6373             *cost += extra_cost->ldst.loadv;
6374           else if (GET_MODE_CLASS (mode) == MODE_INT)
6375             *cost += extra_cost->ldst.load;
6376           else if (mode == SFmode)
6377             *cost += extra_cost->ldst.loadf;
6378           else if (mode == DFmode)
6379             *cost += extra_cost->ldst.loadd;
6380
6381           *cost +=
6382                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6383                                                      0, speed));
6384         }
6385
6386       return true;
6387
6388     case NEG:
6389       op0 = XEXP (x, 0);
6390
6391       if (VECTOR_MODE_P (mode))
6392         {
6393           if (speed)
6394             {
6395               /* FNEG.  */
6396               *cost += extra_cost->vect.alu;
6397             }
6398           return false;
6399         }
6400
6401       if (GET_MODE_CLASS (mode) == MODE_INT)
6402         {
6403           if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6404               || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6405             {
6406               /* CSETM.  */
6407               *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6408               return true;
6409             }
6410
6411           /* Cost this as SUB wzr, X.  */
6412           op0 = CONST0_RTX (mode);
6413           op1 = XEXP (x, 0);
6414           goto cost_minus;
6415         }
6416
6417       if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6418         {
6419           /* Support (neg(fma...)) as a single instruction only if
6420              sign of zeros is unimportant.  This matches the decision
6421              making in aarch64.md.  */
6422           if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6423             {
6424               /* FNMADD.  */
6425               *cost = rtx_cost (op0, mode, NEG, 0, speed);
6426               return true;
6427             }
6428           if (GET_CODE (op0) == MULT)
6429             {
6430               /* FNMUL.  */
6431               *cost = rtx_cost (op0, mode, NEG, 0, speed);
6432               return true;
6433             }
6434           if (speed)
6435             /* FNEG.  */
6436             *cost += extra_cost->fp[mode == DFmode].neg;
6437           return false;
6438         }
6439
6440       return false;
6441
6442     case CLRSB:
6443     case CLZ:
6444       if (speed)
6445         {
6446           if (VECTOR_MODE_P (mode))
6447             *cost += extra_cost->vect.alu;
6448           else
6449             *cost += extra_cost->alu.clz;
6450         }
6451
6452       return false;
6453
6454     case COMPARE:
6455       op0 = XEXP (x, 0);
6456       op1 = XEXP (x, 1);
6457
6458       if (op1 == const0_rtx
6459           && GET_CODE (op0) == AND)
6460         {
6461           x = op0;
6462           mode = GET_MODE (op0);
6463           goto cost_logic;
6464         }
6465
6466       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6467         {
6468           /* TODO: A write to the CC flags possibly costs extra, this
6469              needs encoding in the cost tables.  */
6470
6471           mode = GET_MODE (op0);
6472           /* ANDS.  */
6473           if (GET_CODE (op0) == AND)
6474             {
6475               x = op0;
6476               goto cost_logic;
6477             }
6478
6479           if (GET_CODE (op0) == PLUS)
6480             {
6481               /* ADDS (and CMN alias).  */
6482               x = op0;
6483               goto cost_plus;
6484             }
6485
6486           if (GET_CODE (op0) == MINUS)
6487             {
6488               /* SUBS.  */
6489               x = op0;
6490               goto cost_minus;
6491             }
6492
6493           if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6494               && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6495               && CONST_INT_P (XEXP (op0, 2)))
6496             {
6497               /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6498                  Handle it here directly rather than going to cost_logic
6499                  since we know the immediate generated for the TST is valid
6500                  so we can avoid creating an intermediate rtx for it only
6501                  for costing purposes.  */
6502               if (speed)
6503                 *cost += extra_cost->alu.logical;
6504
6505               *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6506                                  ZERO_EXTRACT, 0, speed);
6507               return true;
6508             }
6509
6510           if (GET_CODE (op1) == NEG)
6511             {
6512               /* CMN.  */
6513               if (speed)
6514                 *cost += extra_cost->alu.arith;
6515
6516               *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6517               *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6518               return true;
6519             }
6520
6521           /* CMP.
6522
6523              Compare can freely swap the order of operands, and
6524              canonicalization puts the more complex operation first.
6525              But the integer MINUS logic expects the shift/extend
6526              operation in op1.  */
6527           if (! (REG_P (op0)
6528                  || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6529           {
6530             op0 = XEXP (x, 1);
6531             op1 = XEXP (x, 0);
6532           }
6533           goto cost_minus;
6534         }
6535
6536       if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6537         {
6538           /* FCMP.  */
6539           if (speed)
6540             *cost += extra_cost->fp[mode == DFmode].compare;
6541
6542           if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6543             {
6544               *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6545               /* FCMP supports constant 0.0 for no extra cost. */
6546               return true;
6547             }
6548           return false;
6549         }
6550
6551       if (VECTOR_MODE_P (mode))
6552         {
6553           /* Vector compare.  */
6554           if (speed)
6555             *cost += extra_cost->vect.alu;
6556
6557           if (aarch64_float_const_zero_rtx_p (op1))
6558             {
6559               /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6560                  cost.  */
6561               return true;
6562             }
6563           return false;
6564         }
6565       return false;
6566
6567     case MINUS:
6568       {
6569         op0 = XEXP (x, 0);
6570         op1 = XEXP (x, 1);
6571
6572 cost_minus:
6573         *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6574
6575         /* Detect valid immediates.  */
6576         if ((GET_MODE_CLASS (mode) == MODE_INT
6577              || (GET_MODE_CLASS (mode) == MODE_CC
6578                  && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6579             && CONST_INT_P (op1)
6580             && aarch64_uimm12_shift (INTVAL (op1)))
6581           {
6582             if (speed)
6583               /* SUB(S) (immediate).  */
6584               *cost += extra_cost->alu.arith;
6585             return true;
6586           }
6587
6588         /* Look for SUB (extended register).  */
6589         if (aarch64_rtx_arith_op_extract_p (op1, mode))
6590           {
6591             if (speed)
6592               *cost += extra_cost->alu.extend_arith;
6593
6594             op1 = aarch64_strip_extend (op1);
6595             *cost += rtx_cost (op1, VOIDmode,
6596                                (enum rtx_code) GET_CODE (op1), 0, speed);
6597             return true;
6598           }
6599
6600         rtx new_op1 = aarch64_strip_extend (op1);
6601
6602         /* Cost this as an FMA-alike operation.  */
6603         if ((GET_CODE (new_op1) == MULT
6604              || aarch64_shift_p (GET_CODE (new_op1)))
6605             && code != COMPARE)
6606           {
6607             *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6608                                             (enum rtx_code) code,
6609                                             speed);
6610             return true;
6611           }
6612
6613         *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6614
6615         if (speed)
6616           {
6617             if (VECTOR_MODE_P (mode))
6618               {
6619                 /* Vector SUB.  */
6620                 *cost += extra_cost->vect.alu;
6621               }
6622             else if (GET_MODE_CLASS (mode) == MODE_INT)
6623               {
6624                 /* SUB(S).  */
6625                 *cost += extra_cost->alu.arith;
6626               }
6627             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6628               {
6629                 /* FSUB.  */
6630                 *cost += extra_cost->fp[mode == DFmode].addsub;
6631               }
6632           }
6633         return true;
6634       }
6635
6636     case PLUS:
6637       {
6638         rtx new_op0;
6639
6640         op0 = XEXP (x, 0);
6641         op1 = XEXP (x, 1);
6642
6643 cost_plus:
6644         if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6645             || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6646           {
6647             /* CSINC.  */
6648             *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6649             *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6650             return true;
6651           }
6652
6653         if (GET_MODE_CLASS (mode) == MODE_INT
6654             && CONST_INT_P (op1)
6655             && aarch64_uimm12_shift (INTVAL (op1)))
6656           {
6657             *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6658
6659             if (speed)
6660               /* ADD (immediate).  */
6661               *cost += extra_cost->alu.arith;
6662             return true;
6663           }
6664
6665         *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6666
6667         /* Look for ADD (extended register).  */
6668         if (aarch64_rtx_arith_op_extract_p (op0, mode))
6669           {
6670             if (speed)
6671               *cost += extra_cost->alu.extend_arith;
6672
6673             op0 = aarch64_strip_extend (op0);
6674             *cost += rtx_cost (op0, VOIDmode,
6675                                (enum rtx_code) GET_CODE (op0), 0, speed);
6676             return true;
6677           }
6678
6679         /* Strip any extend, leave shifts behind as we will
6680            cost them through mult_cost.  */
6681         new_op0 = aarch64_strip_extend (op0);
6682
6683         if (GET_CODE (new_op0) == MULT
6684             || aarch64_shift_p (GET_CODE (new_op0)))
6685           {
6686             *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6687                                             speed);
6688             return true;
6689           }
6690
6691         *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6692
6693         if (speed)
6694           {
6695             if (VECTOR_MODE_P (mode))
6696               {
6697                 /* Vector ADD.  */
6698                 *cost += extra_cost->vect.alu;
6699               }
6700             else if (GET_MODE_CLASS (mode) == MODE_INT)
6701               {
6702                 /* ADD.  */
6703                 *cost += extra_cost->alu.arith;
6704               }
6705             else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6706               {
6707                 /* FADD.  */
6708                 *cost += extra_cost->fp[mode == DFmode].addsub;
6709               }
6710           }
6711         return true;
6712       }
6713
6714     case BSWAP:
6715       *cost = COSTS_N_INSNS (1);
6716
6717       if (speed)
6718         {
6719           if (VECTOR_MODE_P (mode))
6720             *cost += extra_cost->vect.alu;
6721           else
6722             *cost += extra_cost->alu.rev;
6723         }
6724       return false;
6725
6726     case IOR:
6727       if (aarch_rev16_p (x))
6728         {
6729           *cost = COSTS_N_INSNS (1);
6730
6731           if (speed)
6732             {
6733               if (VECTOR_MODE_P (mode))
6734                 *cost += extra_cost->vect.alu;
6735               else
6736                 *cost += extra_cost->alu.rev;
6737             }
6738           return true;
6739         }
6740
6741       if (aarch64_extr_rtx_p (x, &op0, &op1))
6742         {
6743           *cost += rtx_cost (op0, mode, IOR, 0, speed);
6744           *cost += rtx_cost (op1, mode, IOR, 1, speed);
6745           if (speed)
6746             *cost += extra_cost->alu.shift;
6747
6748           return true;
6749         }
6750     /* Fall through.  */
6751     case XOR:
6752     case AND:
6753     cost_logic:
6754       op0 = XEXP (x, 0);
6755       op1 = XEXP (x, 1);
6756
6757       if (VECTOR_MODE_P (mode))
6758         {
6759           if (speed)
6760             *cost += extra_cost->vect.alu;
6761           return true;
6762         }
6763
6764       if (code == AND
6765           && GET_CODE (op0) == MULT
6766           && CONST_INT_P (XEXP (op0, 1))
6767           && CONST_INT_P (op1)
6768           && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6769                                INTVAL (op1)) != 0)
6770         {
6771           /* This is a UBFM/SBFM.  */
6772           *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6773           if (speed)
6774             *cost += extra_cost->alu.bfx;
6775           return true;
6776         }
6777
6778       if (GET_MODE_CLASS (mode) == MODE_INT)
6779         {
6780           if (CONST_INT_P (op1))
6781             {
6782               /* We have a mask + shift version of a UBFIZ
6783                  i.e. the *andim_ashift<mode>_bfiz pattern.  */
6784               if (GET_CODE (op0) == ASHIFT
6785                   && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
6786                                                           XEXP (op0, 1)))
6787                 {
6788                   *cost += rtx_cost (XEXP (op0, 0), mode,
6789                                      (enum rtx_code) code, 0, speed);
6790                   if (speed)
6791                     *cost += extra_cost->alu.bfx;
6792
6793                   return true;
6794                 }
6795               else if (aarch64_bitmask_imm (INTVAL (op1), mode))
6796                 {
6797                 /* We possibly get the immediate for free, this is not
6798                    modelled.  */
6799                   *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6800                   if (speed)
6801                     *cost += extra_cost->alu.logical;
6802
6803                   return true;
6804                 }
6805             }
6806           else
6807             {
6808               rtx new_op0 = op0;
6809
6810               /* Handle ORN, EON, or BIC.  */
6811               if (GET_CODE (op0) == NOT)
6812                 op0 = XEXP (op0, 0);
6813
6814               new_op0 = aarch64_strip_shift (op0);
6815
6816               /* If we had a shift on op0 then this is a logical-shift-
6817                  by-register/immediate operation.  Otherwise, this is just
6818                  a logical operation.  */
6819               if (speed)
6820                 {
6821                   if (new_op0 != op0)
6822                     {
6823                       /* Shift by immediate.  */
6824                       if (CONST_INT_P (XEXP (op0, 1)))
6825                         *cost += extra_cost->alu.log_shift;
6826                       else
6827                         *cost += extra_cost->alu.log_shift_reg;
6828                     }
6829                   else
6830                     *cost += extra_cost->alu.logical;
6831                 }
6832
6833               /* In both cases we want to cost both operands.  */
6834               *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6835               *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6836
6837               return true;
6838             }
6839         }
6840       return false;
6841
6842     case NOT:
6843       x = XEXP (x, 0);
6844       op0 = aarch64_strip_shift (x);
6845
6846       if (VECTOR_MODE_P (mode))
6847         {
6848           /* Vector NOT.  */
6849           *cost += extra_cost->vect.alu;
6850           return false;
6851         }
6852
6853       /* MVN-shifted-reg.  */
6854       if (op0 != x)
6855         {
6856           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6857
6858           if (speed)
6859             *cost += extra_cost->alu.log_shift;
6860
6861           return true;
6862         }
6863       /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6864          Handle the second form here taking care that 'a' in the above can
6865          be a shift.  */
6866       else if (GET_CODE (op0) == XOR)
6867         {
6868           rtx newop0 = XEXP (op0, 0);
6869           rtx newop1 = XEXP (op0, 1);
6870           rtx op0_stripped = aarch64_strip_shift (newop0);
6871
6872           *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6873           *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6874
6875           if (speed)
6876             {
6877               if (op0_stripped != newop0)
6878                 *cost += extra_cost->alu.log_shift;
6879               else
6880                 *cost += extra_cost->alu.logical;
6881             }
6882
6883           return true;
6884         }
6885       /* MVN.  */
6886       if (speed)
6887         *cost += extra_cost->alu.logical;
6888
6889       return false;
6890
6891     case ZERO_EXTEND:
6892
6893       op0 = XEXP (x, 0);
6894       /* If a value is written in SI mode, then zero extended to DI
6895          mode, the operation will in general be free as a write to
6896          a 'w' register implicitly zeroes the upper bits of an 'x'
6897          register.  However, if this is
6898
6899            (set (reg) (zero_extend (reg)))
6900
6901          we must cost the explicit register move.  */
6902       if (mode == DImode
6903           && GET_MODE (op0) == SImode
6904           && outer == SET)
6905         {
6906           int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6907
6908         /* If OP_COST is non-zero, then the cost of the zero extend
6909            is effectively the cost of the inner operation.  Otherwise
6910            we have a MOV instruction and we take the cost from the MOV
6911            itself.  This is true independently of whether we are
6912            optimizing for space or time.  */
6913           if (op_cost)
6914             *cost = op_cost;
6915
6916           return true;
6917         }
6918       else if (MEM_P (op0))
6919         {
6920           /* All loads can zero extend to any size for free.  */
6921           *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6922           return true;
6923         }
6924
6925       op0 = aarch64_extend_bitfield_pattern_p (x);
6926       if (op0)
6927         {
6928           *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6929           if (speed)
6930             *cost += extra_cost->alu.bfx;
6931           return true;
6932         }
6933
6934       if (speed)
6935         {
6936           if (VECTOR_MODE_P (mode))
6937             {
6938               /* UMOV.  */
6939               *cost += extra_cost->vect.alu;
6940             }
6941           else
6942             {
6943               /* We generate an AND instead of UXTB/UXTH.  */
6944               *cost += extra_cost->alu.logical;
6945             }
6946         }
6947       return false;
6948
6949     case SIGN_EXTEND:
6950       if (MEM_P (XEXP (x, 0)))
6951         {
6952           /* LDRSH.  */
6953           if (speed)
6954             {
6955               rtx address = XEXP (XEXP (x, 0), 0);
6956               *cost += extra_cost->ldst.load_sign_extend;
6957
6958               *cost +=
6959                 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6960                                                      0, speed));
6961             }
6962           return true;
6963         }
6964
6965       op0 = aarch64_extend_bitfield_pattern_p (x);
6966       if (op0)
6967         {
6968           *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6969           if (speed)
6970             *cost += extra_cost->alu.bfx;
6971           return true;
6972         }
6973
6974       if (speed)
6975         {
6976           if (VECTOR_MODE_P (mode))
6977             *cost += extra_cost->vect.alu;
6978           else
6979             *cost += extra_cost->alu.extend;
6980         }
6981       return false;
6982
6983     case ASHIFT:
6984       op0 = XEXP (x, 0);
6985       op1 = XEXP (x, 1);
6986
6987       if (CONST_INT_P (op1))
6988         {
6989           if (speed)
6990             {
6991               if (VECTOR_MODE_P (mode))
6992                 {
6993                   /* Vector shift (immediate).  */
6994                   *cost += extra_cost->vect.alu;
6995                 }
6996               else
6997                 {
6998                   /* LSL (immediate), UBMF, UBFIZ and friends.  These are all
6999                      aliases.  */
7000                   *cost += extra_cost->alu.shift;
7001                 }
7002             }
7003
7004           /* We can incorporate zero/sign extend for free.  */
7005           if (GET_CODE (op0) == ZERO_EXTEND
7006               || GET_CODE (op0) == SIGN_EXTEND)
7007             op0 = XEXP (op0, 0);
7008
7009           *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7010           return true;
7011         }
7012       else
7013         {
7014           if (speed)
7015             {
7016               if (VECTOR_MODE_P (mode))
7017                 {
7018                   /* Vector shift (register).  */
7019                   *cost += extra_cost->vect.alu;
7020                 }
7021               else
7022                 {
7023                   /* LSLV.  */
7024                   *cost += extra_cost->alu.shift_reg;
7025                 }
7026             }
7027           return false;  /* All arguments need to be in registers.  */
7028         }
7029
7030     case ROTATE:
7031     case ROTATERT:
7032     case LSHIFTRT:
7033     case ASHIFTRT:
7034       op0 = XEXP (x, 0);
7035       op1 = XEXP (x, 1);
7036
7037       if (CONST_INT_P (op1))
7038         {
7039           /* ASR (immediate) and friends.  */
7040           if (speed)
7041             {
7042               if (VECTOR_MODE_P (mode))
7043                 *cost += extra_cost->vect.alu;
7044               else
7045                 *cost += extra_cost->alu.shift;
7046             }
7047
7048           *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7049           return true;
7050         }
7051       else
7052         {
7053
7054           /* ASR (register) and friends.  */
7055           if (speed)
7056             {
7057               if (VECTOR_MODE_P (mode))
7058                 *cost += extra_cost->vect.alu;
7059               else
7060                 *cost += extra_cost->alu.shift_reg;
7061             }
7062           return false;  /* All arguments need to be in registers.  */
7063         }
7064
7065     case SYMBOL_REF:
7066
7067       if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7068           || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7069         {
7070           /* LDR.  */
7071           if (speed)
7072             *cost += extra_cost->ldst.load;
7073         }
7074       else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7075                || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7076         {
7077           /* ADRP, followed by ADD.  */
7078           *cost += COSTS_N_INSNS (1);
7079           if (speed)
7080             *cost += 2 * extra_cost->alu.arith;
7081         }
7082       else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7083                || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7084         {
7085           /* ADR.  */
7086           if (speed)
7087             *cost += extra_cost->alu.arith;
7088         }
7089
7090       if (flag_pic)
7091         {
7092           /* One extra load instruction, after accessing the GOT.  */
7093           *cost += COSTS_N_INSNS (1);
7094           if (speed)
7095             *cost += extra_cost->ldst.load;
7096         }
7097       return true;
7098
7099     case HIGH:
7100     case LO_SUM:
7101       /* ADRP/ADD (immediate).  */
7102       if (speed)
7103         *cost += extra_cost->alu.arith;
7104       return true;
7105
7106     case ZERO_EXTRACT:
7107     case SIGN_EXTRACT:
7108       /* UBFX/SBFX.  */
7109       if (speed)
7110         {
7111           if (VECTOR_MODE_P (mode))
7112             *cost += extra_cost->vect.alu;
7113           else
7114             *cost += extra_cost->alu.bfx;
7115         }
7116
7117       /* We can trust that the immediates used will be correct (there
7118          are no by-register forms), so we need only cost op0.  */
7119       *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7120       return true;
7121
7122     case MULT:
7123       *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7124       /* aarch64_rtx_mult_cost always handles recursion to its
7125          operands.  */
7126       return true;
7127
7128     case MOD:
7129     /* We can expand signed mod by power of 2 using a NEGS, two parallel
7130        ANDs and a CSNEG.  Assume here that CSNEG is the same as the cost of
7131        an unconditional negate.  This case should only ever be reached through
7132        the set_smod_pow2_cheap check in expmed.c.  */
7133       if (CONST_INT_P (XEXP (x, 1))
7134           && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7135           && (mode == SImode || mode == DImode))
7136         {
7137           /* We expand to 4 instructions.  Reset the baseline.  */
7138           *cost = COSTS_N_INSNS (4);
7139
7140           if (speed)
7141             *cost += 2 * extra_cost->alu.logical
7142                      + 2 * extra_cost->alu.arith;
7143
7144           return true;
7145         }
7146
7147     /* Fall-through.  */
7148     case UMOD:
7149       if (speed)
7150         {
7151           if (VECTOR_MODE_P (mode))
7152             *cost += extra_cost->vect.alu;
7153           else if (GET_MODE_CLASS (mode) == MODE_INT)
7154             *cost += (extra_cost->mult[mode == DImode].add
7155                       + extra_cost->mult[mode == DImode].idiv);
7156           else if (mode == DFmode)
7157             *cost += (extra_cost->fp[1].mult
7158                       + extra_cost->fp[1].div);
7159           else if (mode == SFmode)
7160             *cost += (extra_cost->fp[0].mult
7161                       + extra_cost->fp[0].div);
7162         }
7163       return false;  /* All arguments need to be in registers.  */
7164
7165     case DIV:
7166     case UDIV:
7167     case SQRT:
7168       if (speed)
7169         {
7170           if (VECTOR_MODE_P (mode))
7171             *cost += extra_cost->vect.alu;
7172           else if (GET_MODE_CLASS (mode) == MODE_INT)
7173             /* There is no integer SQRT, so only DIV and UDIV can get
7174                here.  */
7175             *cost += extra_cost->mult[mode == DImode].idiv;
7176           else
7177             *cost += extra_cost->fp[mode == DFmode].div;
7178         }
7179       return false;  /* All arguments need to be in registers.  */
7180
7181     case IF_THEN_ELSE:
7182       return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7183                                          XEXP (x, 2), cost, speed);
7184
7185     case EQ:
7186     case NE:
7187     case GT:
7188     case GTU:
7189     case LT:
7190     case LTU:
7191     case GE:
7192     case GEU:
7193     case LE:
7194     case LEU:
7195
7196       return false; /* All arguments must be in registers.  */
7197
7198     case FMA:
7199       op0 = XEXP (x, 0);
7200       op1 = XEXP (x, 1);
7201       op2 = XEXP (x, 2);
7202
7203       if (speed)
7204         {
7205           if (VECTOR_MODE_P (mode))
7206             *cost += extra_cost->vect.alu;
7207           else
7208             *cost += extra_cost->fp[mode == DFmode].fma;
7209         }
7210
7211       /* FMSUB, FNMADD, and FNMSUB are free.  */
7212       if (GET_CODE (op0) == NEG)
7213         op0 = XEXP (op0, 0);
7214
7215       if (GET_CODE (op2) == NEG)
7216         op2 = XEXP (op2, 0);
7217
7218       /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7219          and the by-element operand as operand 0.  */
7220       if (GET_CODE (op1) == NEG)
7221         op1 = XEXP (op1, 0);
7222
7223       /* Catch vector-by-element operations.  The by-element operand can
7224          either be (vec_duplicate (vec_select (x))) or just
7225          (vec_select (x)), depending on whether we are multiplying by
7226          a vector or a scalar.
7227
7228          Canonicalization is not very good in these cases, FMA4 will put the
7229          by-element operand as operand 0, FNMA4 will have it as operand 1.  */
7230       if (GET_CODE (op0) == VEC_DUPLICATE)
7231         op0 = XEXP (op0, 0);
7232       else if (GET_CODE (op1) == VEC_DUPLICATE)
7233         op1 = XEXP (op1, 0);
7234
7235       if (GET_CODE (op0) == VEC_SELECT)
7236         op0 = XEXP (op0, 0);
7237       else if (GET_CODE (op1) == VEC_SELECT)
7238         op1 = XEXP (op1, 0);
7239
7240       /* If the remaining parameters are not registers,
7241          get the cost to put them into registers.  */
7242       *cost += rtx_cost (op0, mode, FMA, 0, speed);
7243       *cost += rtx_cost (op1, mode, FMA, 1, speed);
7244       *cost += rtx_cost (op2, mode, FMA, 2, speed);
7245       return true;
7246
7247     case FLOAT:
7248     case UNSIGNED_FLOAT:
7249       if (speed)
7250         *cost += extra_cost->fp[mode == DFmode].fromint;
7251       return false;
7252
7253     case FLOAT_EXTEND:
7254       if (speed)
7255         {
7256           if (VECTOR_MODE_P (mode))
7257             {
7258               /*Vector truncate.  */
7259               *cost += extra_cost->vect.alu;
7260             }
7261           else
7262             *cost += extra_cost->fp[mode == DFmode].widen;
7263         }
7264       return false;
7265
7266     case FLOAT_TRUNCATE:
7267       if (speed)
7268         {
7269           if (VECTOR_MODE_P (mode))
7270             {
7271               /*Vector conversion.  */
7272               *cost += extra_cost->vect.alu;
7273             }
7274           else
7275             *cost += extra_cost->fp[mode == DFmode].narrow;
7276         }
7277       return false;
7278
7279     case FIX:
7280     case UNSIGNED_FIX:
7281       x = XEXP (x, 0);
7282       /* Strip the rounding part.  They will all be implemented
7283          by the fcvt* family of instructions anyway.  */
7284       if (GET_CODE (x) == UNSPEC)
7285         {
7286           unsigned int uns_code = XINT (x, 1);
7287
7288           if (uns_code == UNSPEC_FRINTA
7289               || uns_code == UNSPEC_FRINTM
7290               || uns_code == UNSPEC_FRINTN
7291               || uns_code == UNSPEC_FRINTP
7292               || uns_code == UNSPEC_FRINTZ)
7293             x = XVECEXP (x, 0, 0);
7294         }
7295
7296       if (speed)
7297         {
7298           if (VECTOR_MODE_P (mode))
7299             *cost += extra_cost->vect.alu;
7300           else
7301             *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7302         }
7303
7304       /* We can combine fmul by a power of 2 followed by a fcvt into a single
7305          fixed-point fcvt.  */
7306       if (GET_CODE (x) == MULT
7307           && ((VECTOR_MODE_P (mode)
7308                && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7309               || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7310         {
7311           *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7312                              0, speed);
7313           return true;
7314         }
7315
7316       *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7317       return true;
7318
7319     case ABS:
7320       if (VECTOR_MODE_P (mode))
7321         {
7322           /* ABS (vector).  */
7323           if (speed)
7324             *cost += extra_cost->vect.alu;
7325         }
7326       else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7327         {
7328           op0 = XEXP (x, 0);
7329
7330           /* FABD, which is analogous to FADD.  */
7331           if (GET_CODE (op0) == MINUS)
7332             {
7333               *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7334               *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7335               if (speed)
7336                 *cost += extra_cost->fp[mode == DFmode].addsub;
7337
7338               return true;
7339             }
7340           /* Simple FABS is analogous to FNEG.  */
7341           if (speed)
7342             *cost += extra_cost->fp[mode == DFmode].neg;
7343         }
7344       else
7345         {
7346           /* Integer ABS will either be split to
7347              two arithmetic instructions, or will be an ABS
7348              (scalar), which we don't model.  */
7349           *cost = COSTS_N_INSNS (2);
7350           if (speed)
7351             *cost += 2 * extra_cost->alu.arith;
7352         }
7353       return false;
7354
7355     case SMAX:
7356     case SMIN:
7357       if (speed)
7358         {
7359           if (VECTOR_MODE_P (mode))
7360             *cost += extra_cost->vect.alu;
7361           else
7362             {
7363               /* FMAXNM/FMINNM/FMAX/FMIN.
7364                  TODO: This may not be accurate for all implementations, but
7365                  we do not model this in the cost tables.  */
7366               *cost += extra_cost->fp[mode == DFmode].addsub;
7367             }
7368         }
7369       return false;
7370
7371     case UNSPEC:
7372       /* The floating point round to integer frint* instructions.  */
7373       if (aarch64_frint_unspec_p (XINT (x, 1)))
7374         {
7375           if (speed)
7376             *cost += extra_cost->fp[mode == DFmode].roundint;
7377
7378           return false;
7379         }
7380
7381       if (XINT (x, 1) == UNSPEC_RBIT)
7382         {
7383           if (speed)
7384             *cost += extra_cost->alu.rev;
7385
7386           return false;
7387         }
7388       break;
7389
7390     case TRUNCATE:
7391
7392       /* Decompose <su>muldi3_highpart.  */
7393       if (/* (truncate:DI  */
7394           mode == DImode
7395           /*   (lshiftrt:TI  */
7396           && GET_MODE (XEXP (x, 0)) == TImode
7397           && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7398           /*      (mult:TI  */
7399           && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7400           /*        (ANY_EXTEND:TI (reg:DI))
7401                     (ANY_EXTEND:TI (reg:DI)))  */
7402           && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7403                && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7404               || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7405                   && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7406           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7407           && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7408           /*     (const_int 64)  */
7409           && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7410           && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7411         {
7412           /* UMULH/SMULH.  */
7413           if (speed)
7414             *cost += extra_cost->mult[mode == DImode].extend;
7415           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7416                              mode, MULT, 0, speed);
7417           *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7418                              mode, MULT, 1, speed);
7419           return true;
7420         }
7421
7422       /* Fall through.  */
7423     default:
7424       break;
7425     }
7426
7427   if (dump_file
7428       && flag_aarch64_verbose_cost)
7429     fprintf (dump_file,
7430       "\nFailed to cost RTX.  Assuming default cost.\n");
7431
7432   return true;
7433 }
7434
7435 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7436    calculated for X.  This cost is stored in *COST.  Returns true
7437    if the total cost of X was calculated.  */
7438 static bool
7439 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7440                    int param, int *cost, bool speed)
7441 {
7442   bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7443
7444   if (dump_file
7445       && flag_aarch64_verbose_cost)
7446     {
7447       print_rtl_single (dump_file, x);
7448       fprintf (dump_file, "\n%s cost: %d (%s)\n",
7449                speed ? "Hot" : "Cold",
7450                *cost, result ? "final" : "partial");
7451     }
7452
7453   return result;
7454 }
7455
7456 static int
7457 aarch64_register_move_cost (machine_mode mode,
7458                             reg_class_t from_i, reg_class_t to_i)
7459 {
7460   enum reg_class from = (enum reg_class) from_i;
7461   enum reg_class to = (enum reg_class) to_i;
7462   const struct cpu_regmove_cost *regmove_cost
7463     = aarch64_tune_params.regmove_cost;
7464
7465   /* Caller save and pointer regs are equivalent to GENERAL_REGS.  */
7466   if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7467     to = GENERAL_REGS;
7468
7469   if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7470     from = GENERAL_REGS;
7471
7472   /* Moving between GPR and stack cost is the same as GP2GP.  */
7473   if ((from == GENERAL_REGS && to == STACK_REG)
7474       || (to == GENERAL_REGS && from == STACK_REG))
7475     return regmove_cost->GP2GP;
7476
7477   /* To/From the stack register, we move via the gprs.  */
7478   if (to == STACK_REG || from == STACK_REG)
7479     return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7480             + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7481
7482   if (GET_MODE_SIZE (mode) == 16)
7483     {
7484       /* 128-bit operations on general registers require 2 instructions.  */
7485       if (from == GENERAL_REGS && to == GENERAL_REGS)
7486         return regmove_cost->GP2GP * 2;
7487       else if (from == GENERAL_REGS)
7488         return regmove_cost->GP2FP * 2;
7489       else if (to == GENERAL_REGS)
7490         return regmove_cost->FP2GP * 2;
7491
7492       /* When AdvSIMD instructions are disabled it is not possible to move
7493          a 128-bit value directly between Q registers.  This is handled in
7494          secondary reload.  A general register is used as a scratch to move
7495          the upper DI value and the lower DI value is moved directly,
7496          hence the cost is the sum of three moves. */
7497       if (! TARGET_SIMD)
7498         return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7499
7500       return regmove_cost->FP2FP;
7501     }
7502
7503   if (from == GENERAL_REGS && to == GENERAL_REGS)
7504     return regmove_cost->GP2GP;
7505   else if (from == GENERAL_REGS)
7506     return regmove_cost->GP2FP;
7507   else if (to == GENERAL_REGS)
7508     return regmove_cost->FP2GP;
7509
7510   return regmove_cost->FP2FP;
7511 }
7512
7513 static int
7514 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7515                           reg_class_t rclass ATTRIBUTE_UNUSED,
7516                           bool in ATTRIBUTE_UNUSED)
7517 {
7518   return aarch64_tune_params.memmov_cost;
7519 }
7520
7521 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7522    to optimize 1.0/sqrt.  */
7523
7524 static bool
7525 use_rsqrt_p (machine_mode mode)
7526 {
7527   return (!flag_trapping_math
7528           && flag_unsafe_math_optimizations
7529           && ((aarch64_tune_params.approx_modes->recip_sqrt
7530                & AARCH64_APPROX_MODE (mode))
7531               || flag_mrecip_low_precision_sqrt));
7532 }
7533
7534 /* Function to decide when to use the approximate reciprocal square root
7535    builtin.  */
7536
7537 static tree
7538 aarch64_builtin_reciprocal (tree fndecl)
7539 {
7540   machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7541
7542   if (!use_rsqrt_p (mode))
7543     return NULL_TREE;
7544   return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7545 }
7546
7547 typedef rtx (*rsqrte_type) (rtx, rtx);
7548
7549 /* Select reciprocal square root initial estimate insn depending on machine
7550    mode.  */
7551
7552 static rsqrte_type
7553 get_rsqrte_type (machine_mode mode)
7554 {
7555   switch (mode)
7556   {
7557     case DFmode:   return gen_aarch64_rsqrtedf;
7558     case SFmode:   return gen_aarch64_rsqrtesf;
7559     case V2DFmode: return gen_aarch64_rsqrtev2df;
7560     case V2SFmode: return gen_aarch64_rsqrtev2sf;
7561     case V4SFmode: return gen_aarch64_rsqrtev4sf;
7562     default: gcc_unreachable ();
7563   }
7564 }
7565
7566 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7567
7568 /* Select reciprocal square root series step insn depending on machine mode.  */
7569
7570 static rsqrts_type
7571 get_rsqrts_type (machine_mode mode)
7572 {
7573   switch (mode)
7574   {
7575     case DFmode:   return gen_aarch64_rsqrtsdf;
7576     case SFmode:   return gen_aarch64_rsqrtssf;
7577     case V2DFmode: return gen_aarch64_rsqrtsv2df;
7578     case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7579     case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7580     default: gcc_unreachable ();
7581   }
7582 }
7583
7584 /* Emit instruction sequence to compute either the approximate square root
7585    or its approximate reciprocal, depending on the flag RECP, and return
7586    whether the sequence was emitted or not.  */
7587
7588 bool
7589 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7590 {
7591   machine_mode mode = GET_MODE (dst);
7592
7593   if (GET_MODE_INNER (mode) == HFmode)
7594     return false;
7595
7596   machine_mode mmsk = mode_for_vector
7597                         (int_mode_for_mode (GET_MODE_INNER (mode)),
7598                          GET_MODE_NUNITS (mode));
7599   bool use_approx_sqrt_p = (!recp
7600                             && (flag_mlow_precision_sqrt
7601                                 || (aarch64_tune_params.approx_modes->sqrt
7602                                     & AARCH64_APPROX_MODE (mode))));
7603   bool use_approx_rsqrt_p = (recp
7604                              && (flag_mrecip_low_precision_sqrt
7605                                  || (aarch64_tune_params.approx_modes->recip_sqrt
7606                                      & AARCH64_APPROX_MODE (mode))));
7607
7608   if (!flag_finite_math_only
7609       || flag_trapping_math
7610       || !flag_unsafe_math_optimizations
7611       || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7612       || optimize_function_for_size_p (cfun))
7613     return false;
7614
7615   rtx xmsk = gen_reg_rtx (mmsk);
7616   if (!recp)
7617     /* When calculating the approximate square root, compare the argument with
7618        0.0 and create a mask.  */
7619     emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7620                                                           CONST0_RTX (mode)))));
7621
7622   /* Estimate the approximate reciprocal square root.  */
7623   rtx xdst = gen_reg_rtx (mode);
7624   emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7625
7626   /* Iterate over the series twice for SF and thrice for DF.  */
7627   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7628
7629   /* Optionally iterate over the series once less for faster performance
7630      while sacrificing the accuracy.  */
7631   if ((recp && flag_mrecip_low_precision_sqrt)
7632       || (!recp && flag_mlow_precision_sqrt))
7633     iterations--;
7634
7635   /* Iterate over the series to calculate the approximate reciprocal square
7636      root.  */
7637   rtx x1 = gen_reg_rtx (mode);
7638   while (iterations--)
7639     {
7640       rtx x2 = gen_reg_rtx (mode);
7641       emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7642
7643       emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7644
7645       if (iterations > 0)
7646         emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7647     }
7648
7649   if (!recp)
7650     {
7651       /* Qualify the approximate reciprocal square root when the argument is
7652          0.0 by squashing the intermediary result to 0.0.  */
7653       rtx xtmp = gen_reg_rtx (mmsk);
7654       emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7655                                               gen_rtx_SUBREG (mmsk, xdst, 0)));
7656       emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7657
7658       /* Calculate the approximate square root.  */
7659       emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7660     }
7661
7662   /* Finalize the approximation.  */
7663   emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7664
7665   return true;
7666 }
7667
7668 typedef rtx (*recpe_type) (rtx, rtx);
7669
7670 /* Select reciprocal initial estimate insn depending on machine mode.  */
7671
7672 static recpe_type
7673 get_recpe_type (machine_mode mode)
7674 {
7675   switch (mode)
7676   {
7677     case SFmode:   return (gen_aarch64_frecpesf);
7678     case V2SFmode: return (gen_aarch64_frecpev2sf);
7679     case V4SFmode: return (gen_aarch64_frecpev4sf);
7680     case DFmode:   return (gen_aarch64_frecpedf);
7681     case V2DFmode: return (gen_aarch64_frecpev2df);
7682     default:       gcc_unreachable ();
7683   }
7684 }
7685
7686 typedef rtx (*recps_type) (rtx, rtx, rtx);
7687
7688 /* Select reciprocal series step insn depending on machine mode.  */
7689
7690 static recps_type
7691 get_recps_type (machine_mode mode)
7692 {
7693   switch (mode)
7694   {
7695     case SFmode:   return (gen_aarch64_frecpssf);
7696     case V2SFmode: return (gen_aarch64_frecpsv2sf);
7697     case V4SFmode: return (gen_aarch64_frecpsv4sf);
7698     case DFmode:   return (gen_aarch64_frecpsdf);
7699     case V2DFmode: return (gen_aarch64_frecpsv2df);
7700     default:       gcc_unreachable ();
7701   }
7702 }
7703
7704 /* Emit the instruction sequence to compute the approximation for the division
7705    of NUM by DEN in QUO and return whether the sequence was emitted or not.  */
7706
7707 bool
7708 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7709 {
7710   machine_mode mode = GET_MODE (quo);
7711
7712   if (GET_MODE_INNER (mode) == HFmode)
7713     return false;
7714
7715   bool use_approx_division_p = (flag_mlow_precision_div
7716                                 || (aarch64_tune_params.approx_modes->division
7717                                     & AARCH64_APPROX_MODE (mode)));
7718
7719   if (!flag_finite_math_only
7720       || flag_trapping_math
7721       || !flag_unsafe_math_optimizations
7722       || optimize_function_for_size_p (cfun)
7723       || !use_approx_division_p)
7724     return false;
7725
7726   /* Estimate the approximate reciprocal.  */
7727   rtx xrcp = gen_reg_rtx (mode);
7728   emit_insn ((*get_recpe_type (mode)) (xrcp, den));
7729
7730   /* Iterate over the series twice for SF and thrice for DF.  */
7731   int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7732
7733   /* Optionally iterate over the series once less for faster performance,
7734      while sacrificing the accuracy.  */
7735   if (flag_mlow_precision_div)
7736     iterations--;
7737
7738   /* Iterate over the series to calculate the approximate reciprocal.  */
7739   rtx xtmp = gen_reg_rtx (mode);
7740   while (iterations--)
7741     {
7742       emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
7743
7744       if (iterations > 0)
7745         emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
7746     }
7747
7748   if (num != CONST1_RTX (mode))
7749     {
7750       /* As the approximate reciprocal of DEN is already calculated, only
7751          calculate the approximate division when NUM is not 1.0.  */
7752       rtx xnum = force_reg (mode, num);
7753       emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
7754     }
7755
7756   /* Finalize the approximation.  */
7757   emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
7758   return true;
7759 }
7760
7761 /* Return the number of instructions that can be issued per cycle.  */
7762 static int
7763 aarch64_sched_issue_rate (void)
7764 {
7765   return aarch64_tune_params.issue_rate;
7766 }
7767
7768 static int
7769 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7770 {
7771   int issue_rate = aarch64_sched_issue_rate ();
7772
7773   return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7774 }
7775
7776
7777 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7778    autopref_multipass_dfa_lookahead_guard from haifa-sched.c.  It only
7779    has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0.  */
7780
7781 static int
7782 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7783                                                     int ready_index)
7784 {
7785   return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7786 }
7787
7788
7789 /* Vectorizer cost model target hooks.  */
7790
7791 /* Implement targetm.vectorize.builtin_vectorization_cost.  */
7792 static int
7793 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7794                                     tree vectype,
7795                                     int misalign ATTRIBUTE_UNUSED)
7796 {
7797   unsigned elements;
7798
7799   switch (type_of_cost)
7800     {
7801       case scalar_stmt:
7802         return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7803
7804       case scalar_load:
7805         return aarch64_tune_params.vec_costs->scalar_load_cost;
7806
7807       case scalar_store:
7808         return aarch64_tune_params.vec_costs->scalar_store_cost;
7809
7810       case vector_stmt:
7811         return aarch64_tune_params.vec_costs->vec_stmt_cost;
7812
7813       case vector_load:
7814         return aarch64_tune_params.vec_costs->vec_align_load_cost;
7815
7816       case vector_store:
7817         return aarch64_tune_params.vec_costs->vec_store_cost;
7818
7819       case vec_to_scalar:
7820         return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7821
7822       case scalar_to_vec:
7823         return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7824
7825       case unaligned_load:
7826         return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7827
7828       case unaligned_store:
7829         return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7830
7831       case cond_branch_taken:
7832         return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7833
7834       case cond_branch_not_taken:
7835         return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7836
7837       case vec_perm:
7838         return aarch64_tune_params.vec_costs->vec_permute_cost;
7839
7840       case vec_promote_demote:
7841         return aarch64_tune_params.vec_costs->vec_stmt_cost;
7842
7843       case vec_construct:
7844         elements = TYPE_VECTOR_SUBPARTS (vectype);
7845         return elements / 2 + 1;
7846
7847       default:
7848         gcc_unreachable ();
7849     }
7850 }
7851
7852 /* Implement targetm.vectorize.add_stmt_cost.  */
7853 static unsigned
7854 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7855                        struct _stmt_vec_info *stmt_info, int misalign,
7856                        enum vect_cost_model_location where)
7857 {
7858   unsigned *cost = (unsigned *) data;
7859   unsigned retval = 0;
7860
7861   if (flag_vect_cost_model)
7862     {
7863       tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7864       int stmt_cost =
7865             aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7866
7867       /* Statements in an inner loop relative to the loop being
7868          vectorized are weighted more heavily.  The value here is
7869          arbitrary and could potentially be improved with analysis.  */
7870       if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7871         count *= 50; /*  FIXME  */
7872
7873       retval = (unsigned) (count * stmt_cost);
7874       cost[where] += retval;
7875     }
7876
7877   return retval;
7878 }
7879
7880 static void initialize_aarch64_code_model (struct gcc_options *);
7881
7882 /* Parse the TO_PARSE string and put the architecture struct that it
7883    selects into RES and the architectural features into ISA_FLAGS.
7884    Return an aarch64_parse_opt_result describing the parse result.
7885    If there is an error parsing, RES and ISA_FLAGS are left unchanged.  */
7886
7887 static enum aarch64_parse_opt_result
7888 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7889                     unsigned long *isa_flags)
7890 {
7891   char *ext;
7892   const struct processor *arch;
7893   char *str = (char *) alloca (strlen (to_parse) + 1);
7894   size_t len;
7895
7896   strcpy (str, to_parse);
7897
7898   ext = strchr (str, '+');
7899
7900   if (ext != NULL)
7901     len = ext - str;
7902   else
7903     len = strlen (str);
7904
7905   if (len == 0)
7906     return AARCH64_PARSE_MISSING_ARG;
7907
7908
7909   /* Loop through the list of supported ARCHes to find a match.  */
7910   for (arch = all_architectures; arch->name != NULL; arch++)
7911     {
7912       if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7913         {
7914           unsigned long isa_temp = arch->flags;
7915
7916           if (ext != NULL)
7917             {
7918               /* TO_PARSE string contains at least one extension.  */
7919               enum aarch64_parse_opt_result ext_res
7920                 = aarch64_parse_extension (ext, &isa_temp);
7921
7922               if (ext_res != AARCH64_PARSE_OK)
7923                 return ext_res;
7924             }
7925           /* Extension parsing was successful.  Confirm the result
7926              arch and ISA flags.  */
7927           *res = arch;
7928           *isa_flags = isa_temp;
7929           return AARCH64_PARSE_OK;
7930         }
7931     }
7932
7933   /* ARCH name not found in list.  */
7934   return AARCH64_PARSE_INVALID_ARG;
7935 }
7936
7937 /* Parse the TO_PARSE string and put the result tuning in RES and the
7938    architecture flags in ISA_FLAGS.  Return an aarch64_parse_opt_result
7939    describing the parse result.  If there is an error parsing, RES and
7940    ISA_FLAGS are left unchanged.  */
7941
7942 static enum aarch64_parse_opt_result
7943 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7944                    unsigned long *isa_flags)
7945 {
7946   char *ext;
7947   const struct processor *cpu;
7948   char *str = (char *) alloca (strlen (to_parse) + 1);
7949   size_t len;
7950
7951   strcpy (str, to_parse);
7952
7953   ext = strchr (str, '+');
7954
7955   if (ext != NULL)
7956     len = ext - str;
7957   else
7958     len = strlen (str);
7959
7960   if (len == 0)
7961     return AARCH64_PARSE_MISSING_ARG;
7962
7963
7964   /* Loop through the list of supported CPUs to find a match.  */
7965   for (cpu = all_cores; cpu->name != NULL; cpu++)
7966     {
7967       if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7968         {
7969           unsigned long isa_temp = cpu->flags;
7970
7971
7972           if (ext != NULL)
7973             {
7974               /* TO_PARSE string contains at least one extension.  */
7975               enum aarch64_parse_opt_result ext_res
7976                 = aarch64_parse_extension (ext, &isa_temp);
7977
7978               if (ext_res != AARCH64_PARSE_OK)
7979                 return ext_res;
7980             }
7981           /* Extension parsing was successfull.  Confirm the result
7982              cpu and ISA flags.  */
7983           *res = cpu;
7984           *isa_flags = isa_temp;
7985           return AARCH64_PARSE_OK;
7986         }
7987     }
7988
7989   /* CPU name not found in list.  */
7990   return AARCH64_PARSE_INVALID_ARG;
7991 }
7992
7993 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7994    Return an aarch64_parse_opt_result describing the parse result.
7995    If the parsing fails the RES does not change.  */
7996
7997 static enum aarch64_parse_opt_result
7998 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7999 {
8000   const struct processor *cpu;
8001   char *str = (char *) alloca (strlen (to_parse) + 1);
8002
8003   strcpy (str, to_parse);
8004
8005   /* Loop through the list of supported CPUs to find a match.  */
8006   for (cpu = all_cores; cpu->name != NULL; cpu++)
8007     {
8008       if (strcmp (cpu->name, str) == 0)
8009         {
8010           *res = cpu;
8011           return AARCH64_PARSE_OK;
8012         }
8013     }
8014
8015   /* CPU name not found in list.  */
8016   return AARCH64_PARSE_INVALID_ARG;
8017 }
8018
8019 /* Parse TOKEN, which has length LENGTH to see if it is an option
8020    described in FLAG.  If it is, return the index bit for that fusion type.
8021    If not, error (printing OPTION_NAME) and return zero.  */
8022
8023 static unsigned int
8024 aarch64_parse_one_option_token (const char *token,
8025                                 size_t length,
8026                                 const struct aarch64_flag_desc *flag,
8027                                 const char *option_name)
8028 {
8029   for (; flag->name != NULL; flag++)
8030     {
8031       if (length == strlen (flag->name)
8032           && !strncmp (flag->name, token, length))
8033         return flag->flag;
8034     }
8035
8036   error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8037   return 0;
8038 }
8039
8040 /* Parse OPTION which is a comma-separated list of flags to enable.
8041    FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8042    default state we inherit from the CPU tuning structures.  OPTION_NAME
8043    gives the top-level option we are parsing in the -moverride string,
8044    for use in error messages.  */
8045
8046 static unsigned int
8047 aarch64_parse_boolean_options (const char *option,
8048                                const struct aarch64_flag_desc *flags,
8049                                unsigned int initial_state,
8050                                const char *option_name)
8051 {
8052   const char separator = '.';
8053   const char* specs = option;
8054   const char* ntoken = option;
8055   unsigned int found_flags = initial_state;
8056
8057   while ((ntoken = strchr (specs, separator)))
8058     {
8059       size_t token_length = ntoken - specs;
8060       unsigned token_ops = aarch64_parse_one_option_token (specs,
8061                                                            token_length,
8062                                                            flags,
8063                                                            option_name);
8064       /* If we find "none" (or, for simplicity's sake, an error) anywhere
8065          in the token stream, reset the supported operations.  So:
8066
8067            adrp+add.cmp+branch.none.adrp+add
8068
8069            would have the result of turning on only adrp+add fusion.  */
8070       if (!token_ops)
8071         found_flags = 0;
8072
8073       found_flags |= token_ops;
8074       specs = ++ntoken;
8075     }
8076
8077   /* We ended with a comma, print something.  */
8078   if (!(*specs))
8079     {
8080       error ("%s string ill-formed\n", option_name);
8081       return 0;
8082     }
8083
8084   /* We still have one more token to parse.  */
8085   size_t token_length = strlen (specs);
8086   unsigned token_ops = aarch64_parse_one_option_token (specs,
8087                                                        token_length,
8088                                                        flags,
8089                                                        option_name);
8090    if (!token_ops)
8091      found_flags = 0;
8092
8093   found_flags |= token_ops;
8094   return found_flags;
8095 }
8096
8097 /* Support for overriding instruction fusion.  */
8098
8099 static void
8100 aarch64_parse_fuse_string (const char *fuse_string,
8101                             struct tune_params *tune)
8102 {
8103   tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8104                                                      aarch64_fusible_pairs,
8105                                                      tune->fusible_ops,
8106                                                      "fuse=");
8107 }
8108
8109 /* Support for overriding other tuning flags.  */
8110
8111 static void
8112 aarch64_parse_tune_string (const char *tune_string,
8113                             struct tune_params *tune)
8114 {
8115   tune->extra_tuning_flags
8116     = aarch64_parse_boolean_options (tune_string,
8117                                      aarch64_tuning_flags,
8118                                      tune->extra_tuning_flags,
8119                                      "tune=");
8120 }
8121
8122 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8123    we understand.  If it is, extract the option string and handoff to
8124    the appropriate function.  */
8125
8126 void
8127 aarch64_parse_one_override_token (const char* token,
8128                                   size_t length,
8129                                   struct tune_params *tune)
8130 {
8131   const struct aarch64_tuning_override_function *fn
8132     = aarch64_tuning_override_functions;
8133
8134   const char *option_part = strchr (token, '=');
8135   if (!option_part)
8136     {
8137       error ("tuning string missing in option (%s)", token);
8138       return;
8139     }
8140
8141   /* Get the length of the option name.  */
8142   length = option_part - token;
8143   /* Skip the '=' to get to the option string.  */
8144   option_part++;
8145
8146   for (; fn->name != NULL; fn++)
8147     {
8148       if (!strncmp (fn->name, token, length))
8149         {
8150           fn->parse_override (option_part, tune);
8151           return;
8152         }
8153     }
8154
8155   error ("unknown tuning option (%s)",token);
8156   return;
8157 }
8158
8159 /* A checking mechanism for the implementation of the tls size.  */
8160
8161 static void
8162 initialize_aarch64_tls_size (struct gcc_options *opts)
8163 {
8164   if (aarch64_tls_size == 0)
8165     aarch64_tls_size = 24;
8166
8167   switch (opts->x_aarch64_cmodel_var)
8168     {
8169     case AARCH64_CMODEL_TINY:
8170       /* Both the default and maximum TLS size allowed under tiny is 1M which
8171          needs two instructions to address, so we clamp the size to 24.  */
8172       if (aarch64_tls_size > 24)
8173         aarch64_tls_size = 24;
8174       break;
8175     case AARCH64_CMODEL_SMALL:
8176       /* The maximum TLS size allowed under small is 4G.  */
8177       if (aarch64_tls_size > 32)
8178         aarch64_tls_size = 32;
8179       break;
8180     case AARCH64_CMODEL_LARGE:
8181       /* The maximum TLS size allowed under large is 16E.
8182          FIXME: 16E should be 64bit, we only support 48bit offset now.  */
8183       if (aarch64_tls_size > 48)
8184         aarch64_tls_size = 48;
8185       break;
8186     default:
8187       gcc_unreachable ();
8188     }
8189
8190   return;
8191 }
8192
8193 /* Parse STRING looking for options in the format:
8194      string     :: option:string
8195      option     :: name=substring
8196      name       :: {a-z}
8197      substring  :: defined by option.  */
8198
8199 static void
8200 aarch64_parse_override_string (const char* input_string,
8201                                struct tune_params* tune)
8202 {
8203   const char separator = ':';
8204   size_t string_length = strlen (input_string) + 1;
8205   char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8206   char *string = string_root;
8207   strncpy (string, input_string, string_length);
8208   string[string_length - 1] = '\0';
8209
8210   char* ntoken = string;
8211
8212   while ((ntoken = strchr (string, separator)))
8213     {
8214       size_t token_length = ntoken - string;
8215       /* Make this substring look like a string.  */
8216       *ntoken = '\0';
8217       aarch64_parse_one_override_token (string, token_length, tune);
8218       string = ++ntoken;
8219     }
8220
8221   /* One last option to parse.  */
8222   aarch64_parse_one_override_token (string, strlen (string), tune);
8223   free (string_root);
8224 }
8225
8226
8227 static void
8228 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8229 {
8230   /* The logic here is that if we are disabling all frame pointer generation
8231      then we do not need to disable leaf frame pointer generation as a
8232      separate operation.  But if we are *only* disabling leaf frame pointer
8233      generation then we set flag_omit_frame_pointer to true, but in
8234      aarch64_frame_pointer_required we return false only for leaf functions.
8235
8236      PR 70044: We have to be careful about being called multiple times for the
8237      same function.  Once we have decided to set flag_omit_frame_pointer just
8238      so that we can omit leaf frame pointers, we must then not interpret a
8239      second call as meaning that all frame pointer generation should be
8240      omitted.  We do this by setting flag_omit_frame_pointer to a special,
8241      non-zero value.  */
8242   if (opts->x_flag_omit_frame_pointer == 2)
8243     opts->x_flag_omit_frame_pointer = 0;
8244
8245   if (opts->x_flag_omit_frame_pointer)
8246     opts->x_flag_omit_leaf_frame_pointer = false;
8247   else if (opts->x_flag_omit_leaf_frame_pointer)
8248     opts->x_flag_omit_frame_pointer = 2;
8249
8250   /* If not optimizing for size, set the default
8251      alignment to what the target wants.  */
8252   if (!opts->x_optimize_size)
8253     {
8254       if (opts->x_align_loops <= 0)
8255         opts->x_align_loops = aarch64_tune_params.loop_align;
8256       if (opts->x_align_jumps <= 0)
8257         opts->x_align_jumps = aarch64_tune_params.jump_align;
8258       if (opts->x_align_functions <= 0)
8259         opts->x_align_functions = aarch64_tune_params.function_align;
8260     }
8261
8262   /* We default to no pc-relative literal loads.  */
8263
8264   aarch64_pcrelative_literal_loads = false;
8265
8266   /* If -mpc-relative-literal-loads is set on the command line, this
8267      implies that the user asked for PC relative literal loads.  */
8268   if (opts->x_pcrelative_literal_loads == 1)
8269     aarch64_pcrelative_literal_loads = true;
8270
8271   /* This is PR70113. When building the Linux kernel with
8272      CONFIG_ARM64_ERRATUM_843419, support for relocations
8273      R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8274      removed from the kernel to avoid loading objects with possibly
8275      offending sequences.  Without -mpc-relative-literal-loads we would
8276      generate such relocations, preventing the kernel build from
8277      succeeding.  */
8278   if (opts->x_pcrelative_literal_loads == 2
8279       && TARGET_FIX_ERR_A53_843419)
8280     aarch64_pcrelative_literal_loads = true;
8281
8282   /* In the tiny memory model it makes no sense to disallow PC relative
8283      literal pool loads.  */
8284   if (aarch64_cmodel == AARCH64_CMODEL_TINY
8285       || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8286     aarch64_pcrelative_literal_loads = true;
8287
8288   /* When enabling the lower precision Newton series for the square root, also
8289      enable it for the reciprocal square root, since the latter is an
8290      intermediary step for the former.  */
8291   if (flag_mlow_precision_sqrt)
8292     flag_mrecip_low_precision_sqrt = true;
8293 }
8294
8295 /* 'Unpack' up the internal tuning structs and update the options
8296     in OPTS.  The caller must have set up selected_tune and selected_arch
8297     as all the other target-specific codegen decisions are
8298     derived from them.  */
8299
8300 void
8301 aarch64_override_options_internal (struct gcc_options *opts)
8302 {
8303   aarch64_tune_flags = selected_tune->flags;
8304   aarch64_tune = selected_tune->sched_core;
8305   /* Make a copy of the tuning parameters attached to the core, which
8306      we may later overwrite.  */
8307   aarch64_tune_params = *(selected_tune->tune);
8308   aarch64_architecture_version = selected_arch->architecture_version;
8309
8310   if (opts->x_aarch64_override_tune_string)
8311     aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8312                                   &aarch64_tune_params);
8313
8314   /* This target defaults to strict volatile bitfields.  */
8315   if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8316     opts->x_flag_strict_volatile_bitfields = 1;
8317
8318   initialize_aarch64_code_model (opts);
8319   initialize_aarch64_tls_size (opts);
8320
8321   int queue_depth = 0;
8322   switch (aarch64_tune_params.autoprefetcher_model)
8323     {
8324       case tune_params::AUTOPREFETCHER_OFF:
8325         queue_depth = -1;
8326         break;
8327       case tune_params::AUTOPREFETCHER_WEAK:
8328         queue_depth = 0;
8329         break;
8330       case tune_params::AUTOPREFETCHER_STRONG:
8331         queue_depth = max_insn_queue_index + 1;
8332         break;
8333       default:
8334         gcc_unreachable ();
8335     }
8336
8337   /* We don't mind passing in global_options_set here as we don't use
8338      the *options_set structs anyway.  */
8339   maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8340                          queue_depth,
8341                          opts->x_param_values,
8342                          global_options_set.x_param_values);
8343
8344   /* Set the L1 cache line size.  */
8345   if (selected_cpu->tune->cache_line_size != 0)
8346     maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8347                            selected_cpu->tune->cache_line_size,
8348                            opts->x_param_values,
8349                            global_options_set.x_param_values);
8350
8351   aarch64_override_options_after_change_1 (opts);
8352 }
8353
8354 /* Print a hint with a suggestion for a core or architecture name that
8355    most closely resembles what the user passed in STR.  ARCH is true if
8356    the user is asking for an architecture name.  ARCH is false if the user
8357    is asking for a core name.  */
8358
8359 static void
8360 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8361 {
8362   auto_vec<const char *> candidates;
8363   const struct processor *entry = arch ? all_architectures : all_cores;
8364   for (; entry->name != NULL; entry++)
8365     candidates.safe_push (entry->name);
8366   char *s;
8367   const char *hint = candidates_list_and_hint (str, s, candidates);
8368   if (hint)
8369     inform (input_location, "valid arguments are: %s;"
8370                              " did you mean %qs?", s, hint);
8371   XDELETEVEC (s);
8372 }
8373
8374 /* Print a hint with a suggestion for a core name that most closely resembles
8375    what the user passed in STR.  */
8376
8377 inline static void
8378 aarch64_print_hint_for_core (const char *str)
8379 {
8380   aarch64_print_hint_for_core_or_arch (str, false);
8381 }
8382
8383 /* Print a hint with a suggestion for an architecture name that most closely
8384    resembles what the user passed in STR.  */
8385
8386 inline static void
8387 aarch64_print_hint_for_arch (const char *str)
8388 {
8389   aarch64_print_hint_for_core_or_arch (str, true);
8390 }
8391
8392 /* Validate a command-line -mcpu option.  Parse the cpu and extensions (if any)
8393    specified in STR and throw errors if appropriate.  Put the results if
8394    they are valid in RES and ISA_FLAGS.  Return whether the option is
8395    valid.  */
8396
8397 static bool
8398 aarch64_validate_mcpu (const char *str, const struct processor **res,
8399                        unsigned long *isa_flags)
8400 {
8401   enum aarch64_parse_opt_result parse_res
8402     = aarch64_parse_cpu (str, res, isa_flags);
8403
8404   if (parse_res == AARCH64_PARSE_OK)
8405     return true;
8406
8407   switch (parse_res)
8408     {
8409       case AARCH64_PARSE_MISSING_ARG:
8410         error ("missing cpu name in -mcpu=%qs", str);
8411         break;
8412       case AARCH64_PARSE_INVALID_ARG:
8413         error ("unknown value %qs for -mcpu", str);
8414         aarch64_print_hint_for_core (str);
8415         break;
8416       case AARCH64_PARSE_INVALID_FEATURE:
8417         error ("invalid feature modifier in -mcpu=%qs", str);
8418         break;
8419       default:
8420         gcc_unreachable ();
8421     }
8422
8423   return false;
8424 }
8425
8426 /* Validate a command-line -march option.  Parse the arch and extensions
8427    (if any) specified in STR and throw errors if appropriate.  Put the
8428    results, if they are valid, in RES and ISA_FLAGS.  Return whether the
8429    option is valid.  */
8430
8431 static bool
8432 aarch64_validate_march (const char *str, const struct processor **res,
8433                          unsigned long *isa_flags)
8434 {
8435   enum aarch64_parse_opt_result parse_res
8436     = aarch64_parse_arch (str, res, isa_flags);
8437
8438   if (parse_res == AARCH64_PARSE_OK)
8439     return true;
8440
8441   switch (parse_res)
8442     {
8443       case AARCH64_PARSE_MISSING_ARG:
8444         error ("missing arch name in -march=%qs", str);
8445         break;
8446       case AARCH64_PARSE_INVALID_ARG:
8447         error ("unknown value %qs for -march", str);
8448         aarch64_print_hint_for_arch (str);
8449         break;
8450       case AARCH64_PARSE_INVALID_FEATURE:
8451         error ("invalid feature modifier in -march=%qs", str);
8452         break;
8453       default:
8454         gcc_unreachable ();
8455     }
8456
8457   return false;
8458 }
8459
8460 /* Validate a command-line -mtune option.  Parse the cpu
8461    specified in STR and throw errors if appropriate.  Put the
8462    result, if it is valid, in RES.  Return whether the option is
8463    valid.  */
8464
8465 static bool
8466 aarch64_validate_mtune (const char *str, const struct processor **res)
8467 {
8468   enum aarch64_parse_opt_result parse_res
8469     = aarch64_parse_tune (str, res);
8470
8471   if (parse_res == AARCH64_PARSE_OK)
8472     return true;
8473
8474   switch (parse_res)
8475     {
8476       case AARCH64_PARSE_MISSING_ARG:
8477         error ("missing cpu name in -mtune=%qs", str);
8478         break;
8479       case AARCH64_PARSE_INVALID_ARG:
8480         error ("unknown value %qs for -mtune", str);
8481         aarch64_print_hint_for_core (str);
8482         break;
8483       default:
8484         gcc_unreachable ();
8485     }
8486   return false;
8487 }
8488
8489 /* Return the CPU corresponding to the enum CPU.
8490    If it doesn't specify a cpu, return the default.  */
8491
8492 static const struct processor *
8493 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8494 {
8495   if (cpu != aarch64_none)
8496     return &all_cores[cpu];
8497
8498   /* The & 0x3f is to extract the bottom 6 bits that encode the
8499      default cpu as selected by the --with-cpu GCC configure option
8500      in config.gcc.
8501      ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8502      flags mechanism should be reworked to make it more sane.  */
8503   return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8504 }
8505
8506 /* Return the architecture corresponding to the enum ARCH.
8507    If it doesn't specify a valid architecture, return the default.  */
8508
8509 static const struct processor *
8510 aarch64_get_arch (enum aarch64_arch arch)
8511 {
8512   if (arch != aarch64_no_arch)
8513     return &all_architectures[arch];
8514
8515   const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8516
8517   return &all_architectures[cpu->arch];
8518 }
8519
8520 /* Implement TARGET_OPTION_OVERRIDE.  This is called once in the beginning
8521    and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8522    tuning structs.  In particular it must set selected_tune and
8523    aarch64_isa_flags that define the available ISA features and tuning
8524    decisions.  It must also set selected_arch as this will be used to
8525    output the .arch asm tags for each function.  */
8526
8527 static void
8528 aarch64_override_options (void)
8529 {
8530   unsigned long cpu_isa = 0;
8531   unsigned long arch_isa = 0;
8532   aarch64_isa_flags = 0;
8533
8534   bool valid_cpu = true;
8535   bool valid_tune = true;
8536   bool valid_arch = true;
8537
8538   selected_cpu = NULL;
8539   selected_arch = NULL;
8540   selected_tune = NULL;
8541
8542   /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8543      If either of -march or -mtune is given, they override their
8544      respective component of -mcpu.  */
8545   if (aarch64_cpu_string)
8546     valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8547                                         &cpu_isa);
8548
8549   if (aarch64_arch_string)
8550     valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8551                                           &arch_isa);
8552
8553   if (aarch64_tune_string)
8554     valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8555
8556   /* If the user did not specify a processor, choose the default
8557      one for them.  This will be the CPU set during configuration using
8558      --with-cpu, otherwise it is "generic".  */
8559   if (!selected_cpu)
8560     {
8561       if (selected_arch)
8562         {
8563           selected_cpu = &all_cores[selected_arch->ident];
8564           aarch64_isa_flags = arch_isa;
8565           explicit_arch = selected_arch->arch;
8566         }
8567       else
8568         {
8569           /* Get default configure-time CPU.  */
8570           selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8571           aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8572         }
8573
8574       if (selected_tune)
8575         explicit_tune_core = selected_tune->ident;
8576     }
8577   /* If both -mcpu and -march are specified check that they are architecturally
8578      compatible, warn if they're not and prefer the -march ISA flags.  */
8579   else if (selected_arch)
8580     {
8581       if (selected_arch->arch != selected_cpu->arch)
8582         {
8583           warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8584                        all_architectures[selected_cpu->arch].name,
8585                        selected_arch->name);
8586         }
8587       aarch64_isa_flags = arch_isa;
8588       explicit_arch = selected_arch->arch;
8589       explicit_tune_core = selected_tune ? selected_tune->ident
8590                                           : selected_cpu->ident;
8591     }
8592   else
8593     {
8594       /* -mcpu but no -march.  */
8595       aarch64_isa_flags = cpu_isa;
8596       explicit_tune_core = selected_tune ? selected_tune->ident
8597                                           : selected_cpu->ident;
8598       gcc_assert (selected_cpu);
8599       selected_arch = &all_architectures[selected_cpu->arch];
8600       explicit_arch = selected_arch->arch;
8601     }
8602
8603   /* Set the arch as well as we will need it when outputing
8604      the .arch directive in assembly.  */
8605   if (!selected_arch)
8606     {
8607       gcc_assert (selected_cpu);
8608       selected_arch = &all_architectures[selected_cpu->arch];
8609     }
8610
8611   if (!selected_tune)
8612     selected_tune = selected_cpu;
8613
8614 #ifndef HAVE_AS_MABI_OPTION
8615   /* The compiler may have been configured with 2.23.* binutils, which does
8616      not have support for ILP32.  */
8617   if (TARGET_ILP32)
8618     error ("Assembler does not support -mabi=ilp32");
8619 #endif
8620
8621   /* Make sure we properly set up the explicit options.  */
8622   if ((aarch64_cpu_string && valid_cpu)
8623        || (aarch64_tune_string && valid_tune))
8624     gcc_assert (explicit_tune_core != aarch64_none);
8625
8626   if ((aarch64_cpu_string && valid_cpu)
8627        || (aarch64_arch_string && valid_arch))
8628     gcc_assert (explicit_arch != aarch64_no_arch);
8629
8630   aarch64_override_options_internal (&global_options);
8631
8632   /* Save these options as the default ones in case we push and pop them later
8633      while processing functions with potential target attributes.  */
8634   target_option_default_node = target_option_current_node
8635       = build_target_option_node (&global_options);
8636 }
8637
8638 /* Implement targetm.override_options_after_change.  */
8639
8640 static void
8641 aarch64_override_options_after_change (void)
8642 {
8643   aarch64_override_options_after_change_1 (&global_options);
8644 }
8645
8646 static struct machine_function *
8647 aarch64_init_machine_status (void)
8648 {
8649   struct machine_function *machine;
8650   machine = ggc_cleared_alloc<machine_function> ();
8651   return machine;
8652 }
8653
8654 void
8655 aarch64_init_expanders (void)
8656 {
8657   init_machine_status = aarch64_init_machine_status;
8658 }
8659
8660 /* A checking mechanism for the implementation of the various code models.  */
8661 static void
8662 initialize_aarch64_code_model (struct gcc_options *opts)
8663 {
8664    if (opts->x_flag_pic)
8665      {
8666        switch (opts->x_aarch64_cmodel_var)
8667          {
8668          case AARCH64_CMODEL_TINY:
8669            aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8670            break;
8671          case AARCH64_CMODEL_SMALL:
8672 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8673            aarch64_cmodel = (flag_pic == 2
8674                              ? AARCH64_CMODEL_SMALL_PIC
8675                              : AARCH64_CMODEL_SMALL_SPIC);
8676 #else
8677            aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8678 #endif
8679            break;
8680          case AARCH64_CMODEL_LARGE:
8681            sorry ("code model %qs with -f%s", "large",
8682                   opts->x_flag_pic > 1 ? "PIC" : "pic");
8683            break;
8684          default:
8685            gcc_unreachable ();
8686          }
8687      }
8688    else
8689      aarch64_cmodel = opts->x_aarch64_cmodel_var;
8690 }
8691
8692 /* Implement TARGET_OPTION_SAVE.  */
8693
8694 static void
8695 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8696 {
8697   ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8698 }
8699
8700 /* Implements TARGET_OPTION_RESTORE.  Restore the backend codegen decisions
8701    using the information saved in PTR.  */
8702
8703 static void
8704 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8705 {
8706   opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8707   selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8708   opts->x_explicit_arch = ptr->x_explicit_arch;
8709   selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8710   opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8711
8712   aarch64_override_options_internal (opts);
8713 }
8714
8715 /* Implement TARGET_OPTION_PRINT.  */
8716
8717 static void
8718 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8719 {
8720   const struct processor *cpu
8721     = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8722   unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8723   const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8724   std::string extension
8725     = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8726
8727   fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8728   fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8729            arch->name, extension.c_str ());
8730 }
8731
8732 static GTY(()) tree aarch64_previous_fndecl;
8733
8734 void
8735 aarch64_reset_previous_fndecl (void)
8736 {
8737   aarch64_previous_fndecl = NULL;
8738 }
8739
8740 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
8741    Used by aarch64_set_current_function and aarch64_pragma_target_parse to
8742    make sure optab availability predicates are recomputed when necessary.  */
8743
8744 void
8745 aarch64_save_restore_target_globals (tree new_tree)
8746 {
8747   if (TREE_TARGET_GLOBALS (new_tree))
8748     restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8749   else if (new_tree == target_option_default_node)
8750     restore_target_globals (&default_target_globals);
8751   else
8752     TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
8753 }
8754
8755 /* Implement TARGET_SET_CURRENT_FUNCTION.  Unpack the codegen decisions
8756    like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8757    of the function, if such exists.  This function may be called multiple
8758    times on a single function so use aarch64_previous_fndecl to avoid
8759    setting up identical state.  */
8760
8761 static void
8762 aarch64_set_current_function (tree fndecl)
8763 {
8764   if (!fndecl || fndecl == aarch64_previous_fndecl)
8765     return;
8766
8767   tree old_tree = (aarch64_previous_fndecl
8768                    ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8769                    : NULL_TREE);
8770
8771   tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8772
8773   /* If current function has no attributes but the previous one did,
8774      use the default node.  */
8775   if (!new_tree && old_tree)
8776     new_tree = target_option_default_node;
8777
8778   /* If nothing to do, return.  #pragma GCC reset or #pragma GCC pop to
8779      the default have been handled by aarch64_save_restore_target_globals from
8780      aarch64_pragma_target_parse.  */
8781   if (old_tree == new_tree)
8782     return;
8783
8784   aarch64_previous_fndecl = fndecl;
8785
8786   /* First set the target options.  */
8787   cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
8788
8789   aarch64_save_restore_target_globals (new_tree);
8790 }
8791
8792 /* Enum describing the various ways we can handle attributes.
8793    In many cases we can reuse the generic option handling machinery.  */
8794
8795 enum aarch64_attr_opt_type
8796 {
8797   aarch64_attr_mask,    /* Attribute should set a bit in target_flags.  */
8798   aarch64_attr_bool,    /* Attribute sets or unsets a boolean variable.  */
8799   aarch64_attr_enum,    /* Attribute sets an enum variable.  */
8800   aarch64_attr_custom   /* Attribute requires a custom handling function.  */
8801 };
8802
8803 /* All the information needed to handle a target attribute.
8804    NAME is the name of the attribute.
8805    ATTR_TYPE specifies the type of behavior of the attribute as described
8806    in the definition of enum aarch64_attr_opt_type.
8807    ALLOW_NEG is true if the attribute supports a "no-" form.
8808    HANDLER is the function that takes the attribute string and whether
8809    it is a pragma or attribute and handles the option.  It is needed only
8810    when the ATTR_TYPE is aarch64_attr_custom.
8811    OPT_NUM is the enum specifying the option that the attribute modifies.
8812    This is needed for attributes that mirror the behavior of a command-line
8813    option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8814    aarch64_attr_enum.  */
8815
8816 struct aarch64_attribute_info
8817 {
8818   const char *name;
8819   enum aarch64_attr_opt_type attr_type;
8820   bool allow_neg;
8821   bool (*handler) (const char *, const char *);
8822   enum opt_code opt_num;
8823 };
8824
8825 /* Handle the ARCH_STR argument to the arch= target attribute.
8826    PRAGMA_OR_ATTR is used in potential error messages.  */
8827
8828 static bool
8829 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8830 {
8831   const struct processor *tmp_arch = NULL;
8832   enum aarch64_parse_opt_result parse_res
8833     = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8834
8835   if (parse_res == AARCH64_PARSE_OK)
8836     {
8837       gcc_assert (tmp_arch);
8838       selected_arch = tmp_arch;
8839       explicit_arch = selected_arch->arch;
8840       return true;
8841     }
8842
8843   switch (parse_res)
8844     {
8845       case AARCH64_PARSE_MISSING_ARG:
8846         error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8847         break;
8848       case AARCH64_PARSE_INVALID_ARG:
8849         error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8850         aarch64_print_hint_for_arch (str);
8851         break;
8852       case AARCH64_PARSE_INVALID_FEATURE:
8853         error ("invalid feature modifier %qs for 'arch' target %s",
8854                str, pragma_or_attr);
8855         break;
8856       default:
8857         gcc_unreachable ();
8858     }
8859
8860   return false;
8861 }
8862
8863 /* Handle the argument CPU_STR to the cpu= target attribute.
8864    PRAGMA_OR_ATTR is used in potential error messages.  */
8865
8866 static bool
8867 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8868 {
8869   const struct processor *tmp_cpu = NULL;
8870   enum aarch64_parse_opt_result parse_res
8871     = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8872
8873   if (parse_res == AARCH64_PARSE_OK)
8874     {
8875       gcc_assert (tmp_cpu);
8876       selected_tune = tmp_cpu;
8877       explicit_tune_core = selected_tune->ident;
8878
8879       selected_arch = &all_architectures[tmp_cpu->arch];
8880       explicit_arch = selected_arch->arch;
8881       return true;
8882     }
8883
8884   switch (parse_res)
8885     {
8886       case AARCH64_PARSE_MISSING_ARG:
8887         error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8888         break;
8889       case AARCH64_PARSE_INVALID_ARG:
8890         error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8891         aarch64_print_hint_for_core (str);
8892         break;
8893       case AARCH64_PARSE_INVALID_FEATURE:
8894         error ("invalid feature modifier %qs for 'cpu' target %s",
8895                str, pragma_or_attr);
8896         break;
8897       default:
8898         gcc_unreachable ();
8899     }
8900
8901   return false;
8902 }
8903
8904 /* Handle the argument STR to the tune= target attribute.
8905    PRAGMA_OR_ATTR is used in potential error messages.  */
8906
8907 static bool
8908 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8909 {
8910   const struct processor *tmp_tune = NULL;
8911   enum aarch64_parse_opt_result parse_res
8912     = aarch64_parse_tune (str, &tmp_tune);
8913
8914   if (parse_res == AARCH64_PARSE_OK)
8915     {
8916       gcc_assert (tmp_tune);
8917       selected_tune = tmp_tune;
8918       explicit_tune_core = selected_tune->ident;
8919       return true;
8920     }
8921
8922   switch (parse_res)
8923     {
8924       case AARCH64_PARSE_INVALID_ARG:
8925         error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8926         aarch64_print_hint_for_core (str);
8927         break;
8928       default:
8929         gcc_unreachable ();
8930     }
8931
8932   return false;
8933 }
8934
8935 /* Parse an architecture extensions target attribute string specified in STR.
8936    For example "+fp+nosimd".  Show any errors if needed.  Return TRUE
8937    if successful.  Update aarch64_isa_flags to reflect the ISA features
8938    modified.
8939    PRAGMA_OR_ATTR is used in potential error messages.  */
8940
8941 static bool
8942 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8943 {
8944   enum aarch64_parse_opt_result parse_res;
8945   unsigned long isa_flags = aarch64_isa_flags;
8946
8947   /* We allow "+nothing" in the beginning to clear out all architectural
8948      features if the user wants to handpick specific features.  */
8949   if (strncmp ("+nothing", str, 8) == 0)
8950     {
8951       isa_flags = 0;
8952       str += 8;
8953     }
8954
8955   parse_res = aarch64_parse_extension (str, &isa_flags);
8956
8957   if (parse_res == AARCH64_PARSE_OK)
8958     {
8959       aarch64_isa_flags = isa_flags;
8960       return true;
8961     }
8962
8963   switch (parse_res)
8964     {
8965       case AARCH64_PARSE_MISSING_ARG:
8966         error ("missing feature modifier in target %s %qs",
8967                pragma_or_attr, str);
8968         break;
8969
8970       case AARCH64_PARSE_INVALID_FEATURE:
8971         error ("invalid feature modifier in target %s %qs",
8972                pragma_or_attr, str);
8973         break;
8974
8975       default:
8976         gcc_unreachable ();
8977     }
8978
8979  return false;
8980 }
8981
8982 /* The target attributes that we support.  On top of these we also support just
8983    ISA extensions, like  __attribute__ ((target ("+crc"))), but that case is
8984    handled explicitly in aarch64_process_one_target_attr.  */
8985
8986 static const struct aarch64_attribute_info aarch64_attributes[] =
8987 {
8988   { "general-regs-only", aarch64_attr_mask, false, NULL,
8989      OPT_mgeneral_regs_only },
8990   { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8991      OPT_mfix_cortex_a53_835769 },
8992   { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
8993      OPT_mfix_cortex_a53_843419 },
8994   { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8995   { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8996   { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8997      OPT_momit_leaf_frame_pointer },
8998   { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8999   { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9000      OPT_march_ },
9001   { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9002   { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9003      OPT_mtune_ },
9004   { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9005 };
9006
9007 /* Parse ARG_STR which contains the definition of one target attribute.
9008    Show appropriate errors if any or return true if the attribute is valid.
9009    PRAGMA_OR_ATTR holds the string to use in error messages about whether
9010    we're processing a target attribute or pragma.  */
9011
9012 static bool
9013 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9014 {
9015   bool invert = false;
9016
9017   size_t len = strlen (arg_str);
9018
9019   if (len == 0)
9020     {
9021       error ("malformed target %s", pragma_or_attr);
9022       return false;
9023     }
9024
9025   char *str_to_check = (char *) alloca (len + 1);
9026   strcpy (str_to_check, arg_str);
9027
9028   /* Skip leading whitespace.  */
9029   while (*str_to_check == ' ' || *str_to_check == '\t')
9030     str_to_check++;
9031
9032   /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9033      It is easier to detect and handle it explicitly here rather than going
9034      through the machinery for the rest of the target attributes in this
9035      function.  */
9036   if (*str_to_check == '+')
9037     return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9038
9039   if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9040     {
9041       invert = true;
9042       str_to_check += 3;
9043     }
9044   char *arg = strchr (str_to_check, '=');
9045
9046   /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9047      and point ARG to "foo".  */
9048   if (arg)
9049     {
9050       *arg = '\0';
9051       arg++;
9052     }
9053   const struct aarch64_attribute_info *p_attr;
9054   bool found = false;
9055   for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9056     {
9057       /* If the names don't match up, or the user has given an argument
9058          to an attribute that doesn't accept one, or didn't give an argument
9059          to an attribute that expects one, fail to match.  */
9060       if (strcmp (str_to_check, p_attr->name) != 0)
9061         continue;
9062
9063       found = true;
9064       bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9065                               || p_attr->attr_type == aarch64_attr_enum;
9066
9067       if (attr_need_arg_p ^ (arg != NULL))
9068         {
9069           error ("target %s %qs does not accept an argument",
9070                   pragma_or_attr, str_to_check);
9071           return false;
9072         }
9073
9074       /* If the name matches but the attribute does not allow "no-" versions
9075          then we can't match.  */
9076       if (invert && !p_attr->allow_neg)
9077         {
9078           error ("target %s %qs does not allow a negated form",
9079                   pragma_or_attr, str_to_check);
9080           return false;
9081         }
9082
9083       switch (p_attr->attr_type)
9084         {
9085         /* Has a custom handler registered.
9086            For example, cpu=, arch=, tune=.  */
9087           case aarch64_attr_custom:
9088             gcc_assert (p_attr->handler);
9089             if (!p_attr->handler (arg, pragma_or_attr))
9090               return false;
9091             break;
9092
9093           /* Either set or unset a boolean option.  */
9094           case aarch64_attr_bool:
9095             {
9096               struct cl_decoded_option decoded;
9097
9098               generate_option (p_attr->opt_num, NULL, !invert,
9099                                CL_TARGET, &decoded);
9100               aarch64_handle_option (&global_options, &global_options_set,
9101                                       &decoded, input_location);
9102               break;
9103             }
9104           /* Set or unset a bit in the target_flags.  aarch64_handle_option
9105              should know what mask to apply given the option number.  */
9106           case aarch64_attr_mask:
9107             {
9108               struct cl_decoded_option decoded;
9109               /* We only need to specify the option number.
9110                  aarch64_handle_option will know which mask to apply.  */
9111               decoded.opt_index = p_attr->opt_num;
9112               decoded.value = !invert;
9113               aarch64_handle_option (&global_options, &global_options_set,
9114                                       &decoded, input_location);
9115               break;
9116             }
9117           /* Use the option setting machinery to set an option to an enum.  */
9118           case aarch64_attr_enum:
9119             {
9120               gcc_assert (arg);
9121               bool valid;
9122               int value;
9123               valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9124                                               &value, CL_TARGET);
9125               if (valid)
9126                 {
9127                   set_option (&global_options, NULL, p_attr->opt_num, value,
9128                               NULL, DK_UNSPECIFIED, input_location,
9129                               global_dc);
9130                 }
9131               else
9132                 {
9133                   error ("target %s %s=%s is not valid",
9134                          pragma_or_attr, str_to_check, arg);
9135                 }
9136               break;
9137             }
9138           default:
9139             gcc_unreachable ();
9140         }
9141     }
9142
9143   /* If we reached here we either have found an attribute and validated
9144      it or didn't match any.  If we matched an attribute but its arguments
9145      were malformed we will have returned false already.  */
9146   return found;
9147 }
9148
9149 /* Count how many times the character C appears in
9150    NULL-terminated string STR.  */
9151
9152 static unsigned int
9153 num_occurences_in_str (char c, char *str)
9154 {
9155   unsigned int res = 0;
9156   while (*str != '\0')
9157     {
9158       if (*str == c)
9159         res++;
9160
9161       str++;
9162     }
9163
9164   return res;
9165 }
9166
9167 /* Parse the tree in ARGS that contains the target attribute information
9168    and update the global target options space.  PRAGMA_OR_ATTR is a string
9169    to be used in error messages, specifying whether this is processing
9170    a target attribute or a target pragma.  */
9171
9172 bool
9173 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9174 {
9175   if (TREE_CODE (args) == TREE_LIST)
9176     {
9177       do
9178         {
9179           tree head = TREE_VALUE (args);
9180           if (head)
9181             {
9182               if (!aarch64_process_target_attr (head, pragma_or_attr))
9183                 return false;
9184             }
9185           args = TREE_CHAIN (args);
9186         } while (args);
9187
9188       return true;
9189     }
9190   /* We expect to find a string to parse.  */
9191   gcc_assert (TREE_CODE (args) == STRING_CST);
9192
9193   size_t len = strlen (TREE_STRING_POINTER (args));
9194   char *str_to_check = (char *) alloca (len + 1);
9195   strcpy (str_to_check, TREE_STRING_POINTER (args));
9196
9197   if (len == 0)
9198     {
9199       error ("malformed target %s value", pragma_or_attr);
9200       return false;
9201     }
9202
9203   /* Used to catch empty spaces between commas i.e.
9204      attribute ((target ("attr1,,attr2"))).  */
9205   unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9206
9207   /* Handle multiple target attributes separated by ','.  */
9208   char *token = strtok (str_to_check, ",");
9209
9210   unsigned int num_attrs = 0;
9211   while (token)
9212     {
9213       num_attrs++;
9214       if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9215         {
9216           error ("target %s %qs is invalid", pragma_or_attr, token);
9217           return false;
9218         }
9219
9220       token = strtok (NULL, ",");
9221     }
9222
9223   if (num_attrs != num_commas + 1)
9224     {
9225       error ("malformed target %s list %qs",
9226               pragma_or_attr, TREE_STRING_POINTER (args));
9227       return false;
9228     }
9229
9230   return true;
9231 }
9232
9233 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P.  This is used to
9234    process attribute ((target ("..."))).  */
9235
9236 static bool
9237 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9238 {
9239   struct cl_target_option cur_target;
9240   bool ret;
9241   tree old_optimize;
9242   tree new_target, new_optimize;
9243   tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9244
9245   /* If what we're processing is the current pragma string then the
9246      target option node is already stored in target_option_current_node
9247      by aarch64_pragma_target_parse in aarch64-c.c.  Use that to avoid
9248      having to re-parse the string.  This is especially useful to keep
9249      arm_neon.h compile times down since that header contains a lot
9250      of intrinsics enclosed in pragmas.  */
9251   if (!existing_target && args == current_target_pragma)
9252     {
9253       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9254       return true;
9255     }
9256   tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9257
9258   old_optimize = build_optimization_node (&global_options);
9259   func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9260
9261   /* If the function changed the optimization levels as well as setting
9262      target options, start with the optimizations specified.  */
9263   if (func_optimize && func_optimize != old_optimize)
9264     cl_optimization_restore (&global_options,
9265                              TREE_OPTIMIZATION (func_optimize));
9266
9267   /* Save the current target options to restore at the end.  */
9268   cl_target_option_save (&cur_target, &global_options);
9269
9270   /* If fndecl already has some target attributes applied to it, unpack
9271      them so that we add this attribute on top of them, rather than
9272      overwriting them.  */
9273   if (existing_target)
9274     {
9275       struct cl_target_option *existing_options
9276         = TREE_TARGET_OPTION (existing_target);
9277
9278       if (existing_options)
9279         cl_target_option_restore (&global_options, existing_options);
9280     }
9281   else
9282     cl_target_option_restore (&global_options,
9283                         TREE_TARGET_OPTION (target_option_current_node));
9284
9285
9286   ret = aarch64_process_target_attr (args, "attribute");
9287
9288   /* Set up any additional state.  */
9289   if (ret)
9290     {
9291       aarch64_override_options_internal (&global_options);
9292       /* Initialize SIMD builtins if we haven't already.
9293          Set current_target_pragma to NULL for the duration so that
9294          the builtin initialization code doesn't try to tag the functions
9295          being built with the attributes specified by any current pragma, thus
9296          going into an infinite recursion.  */
9297       if (TARGET_SIMD)
9298         {
9299           tree saved_current_target_pragma = current_target_pragma;
9300           current_target_pragma = NULL;
9301           aarch64_init_simd_builtins ();
9302           current_target_pragma = saved_current_target_pragma;
9303         }
9304       new_target = build_target_option_node (&global_options);
9305     }
9306   else
9307     new_target = NULL;
9308
9309   new_optimize = build_optimization_node (&global_options);
9310
9311   if (fndecl && ret)
9312     {
9313       DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9314
9315       if (old_optimize != new_optimize)
9316         DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9317     }
9318
9319   cl_target_option_restore (&global_options, &cur_target);
9320
9321   if (old_optimize != new_optimize)
9322     cl_optimization_restore (&global_options,
9323                              TREE_OPTIMIZATION (old_optimize));
9324   return ret;
9325 }
9326
9327 /* Helper for aarch64_can_inline_p.  In the case where CALLER and CALLEE are
9328    tri-bool options (yes, no, don't care) and the default value is
9329    DEF, determine whether to reject inlining.  */
9330
9331 static bool
9332 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9333                                      int dont_care, int def)
9334 {
9335   /* If the callee doesn't care, always allow inlining.  */
9336   if (callee == dont_care)
9337     return true;
9338
9339   /* If the caller doesn't care, always allow inlining.  */
9340   if (caller == dont_care)
9341     return true;
9342
9343   /* Otherwise, allow inlining if either the callee and caller values
9344      agree, or if the callee is using the default value.  */
9345   return (callee == caller || callee == def);
9346 }
9347
9348 /* Implement TARGET_CAN_INLINE_P.  Decide whether it is valid
9349    to inline CALLEE into CALLER based on target-specific info.
9350    Make sure that the caller and callee have compatible architectural
9351    features.  Then go through the other possible target attributes
9352    and see if they can block inlining.  Try not to reject always_inline
9353    callees unless they are incompatible architecturally.  */
9354
9355 static bool
9356 aarch64_can_inline_p (tree caller, tree callee)
9357 {
9358   tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9359   tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9360
9361   /* If callee has no option attributes, then it is ok to inline.  */
9362   if (!callee_tree)
9363     return true;
9364
9365   struct cl_target_option *caller_opts
9366         = TREE_TARGET_OPTION (caller_tree ? caller_tree
9367                                            : target_option_default_node);
9368
9369   struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9370
9371
9372   /* Callee's ISA flags should be a subset of the caller's.  */
9373   if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9374        != callee_opts->x_aarch64_isa_flags)
9375     return false;
9376
9377   /* Allow non-strict aligned functions inlining into strict
9378      aligned ones.  */
9379   if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9380        != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9381       && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9382            && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9383     return false;
9384
9385   bool always_inline = lookup_attribute ("always_inline",
9386                                           DECL_ATTRIBUTES (callee));
9387
9388   /* If the architectural features match up and the callee is always_inline
9389      then the other attributes don't matter.  */
9390   if (always_inline)
9391     return true;
9392
9393   if (caller_opts->x_aarch64_cmodel_var
9394       != callee_opts->x_aarch64_cmodel_var)
9395     return false;
9396
9397   if (caller_opts->x_aarch64_tls_dialect
9398       != callee_opts->x_aarch64_tls_dialect)
9399     return false;
9400
9401   /* Honour explicit requests to workaround errata.  */
9402   if (!aarch64_tribools_ok_for_inlining_p (
9403           caller_opts->x_aarch64_fix_a53_err835769,
9404           callee_opts->x_aarch64_fix_a53_err835769,
9405           2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9406     return false;
9407
9408   if (!aarch64_tribools_ok_for_inlining_p (
9409           caller_opts->x_aarch64_fix_a53_err843419,
9410           callee_opts->x_aarch64_fix_a53_err843419,
9411           2, TARGET_FIX_ERR_A53_843419))
9412     return false;
9413
9414   /* If the user explicitly specified -momit-leaf-frame-pointer for the
9415      caller and calle and they don't match up, reject inlining.  */
9416   if (!aarch64_tribools_ok_for_inlining_p (
9417           caller_opts->x_flag_omit_leaf_frame_pointer,
9418           callee_opts->x_flag_omit_leaf_frame_pointer,
9419           2, 1))
9420     return false;
9421
9422   /* If the callee has specific tuning overrides, respect them.  */
9423   if (callee_opts->x_aarch64_override_tune_string != NULL
9424       && caller_opts->x_aarch64_override_tune_string == NULL)
9425     return false;
9426
9427   /* If the user specified tuning override strings for the
9428      caller and callee and they don't match up, reject inlining.
9429      We just do a string compare here, we don't analyze the meaning
9430      of the string, as it would be too costly for little gain.  */
9431   if (callee_opts->x_aarch64_override_tune_string
9432       && caller_opts->x_aarch64_override_tune_string
9433       && (strcmp (callee_opts->x_aarch64_override_tune_string,
9434                   caller_opts->x_aarch64_override_tune_string) != 0))
9435     return false;
9436
9437   return true;
9438 }
9439
9440 /* Return true if SYMBOL_REF X binds locally.  */
9441
9442 static bool
9443 aarch64_symbol_binds_local_p (const_rtx x)
9444 {
9445   return (SYMBOL_REF_DECL (x)
9446           ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9447           : SYMBOL_REF_LOCAL_P (x));
9448 }
9449
9450 /* Return true if SYMBOL_REF X is thread local */
9451 static bool
9452 aarch64_tls_symbol_p (rtx x)
9453 {
9454   if (! TARGET_HAVE_TLS)
9455     return false;
9456
9457   if (GET_CODE (x) != SYMBOL_REF)
9458     return false;
9459
9460   return SYMBOL_REF_TLS_MODEL (x) != 0;
9461 }
9462
9463 /* Classify a TLS symbol into one of the TLS kinds.  */
9464 enum aarch64_symbol_type
9465 aarch64_classify_tls_symbol (rtx x)
9466 {
9467   enum tls_model tls_kind = tls_symbolic_operand_type (x);
9468
9469   switch (tls_kind)
9470     {
9471     case TLS_MODEL_GLOBAL_DYNAMIC:
9472     case TLS_MODEL_LOCAL_DYNAMIC:
9473       return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9474
9475     case TLS_MODEL_INITIAL_EXEC:
9476       switch (aarch64_cmodel)
9477         {
9478         case AARCH64_CMODEL_TINY:
9479         case AARCH64_CMODEL_TINY_PIC:
9480           return SYMBOL_TINY_TLSIE;
9481         default:
9482           return SYMBOL_SMALL_TLSIE;
9483         }
9484
9485     case TLS_MODEL_LOCAL_EXEC:
9486       if (aarch64_tls_size == 12)
9487         return SYMBOL_TLSLE12;
9488       else if (aarch64_tls_size == 24)
9489         return SYMBOL_TLSLE24;
9490       else if (aarch64_tls_size == 32)
9491         return SYMBOL_TLSLE32;
9492       else if (aarch64_tls_size == 48)
9493         return SYMBOL_TLSLE48;
9494       else
9495         gcc_unreachable ();
9496
9497     case TLS_MODEL_EMULATED:
9498     case TLS_MODEL_NONE:
9499       return SYMBOL_FORCE_TO_MEM;
9500
9501     default:
9502       gcc_unreachable ();
9503     }
9504 }
9505
9506 /* Return the method that should be used to access SYMBOL_REF or
9507    LABEL_REF X.  */
9508
9509 enum aarch64_symbol_type
9510 aarch64_classify_symbol (rtx x, rtx offset)
9511 {
9512   if (GET_CODE (x) == LABEL_REF)
9513     {
9514       switch (aarch64_cmodel)
9515         {
9516         case AARCH64_CMODEL_LARGE:
9517           return SYMBOL_FORCE_TO_MEM;
9518
9519         case AARCH64_CMODEL_TINY_PIC:
9520         case AARCH64_CMODEL_TINY:
9521           return SYMBOL_TINY_ABSOLUTE;
9522
9523         case AARCH64_CMODEL_SMALL_SPIC:
9524         case AARCH64_CMODEL_SMALL_PIC:
9525         case AARCH64_CMODEL_SMALL:
9526           return SYMBOL_SMALL_ABSOLUTE;
9527
9528         default:
9529           gcc_unreachable ();
9530         }
9531     }
9532
9533   if (GET_CODE (x) == SYMBOL_REF)
9534     {
9535       if (aarch64_tls_symbol_p (x))
9536         return aarch64_classify_tls_symbol (x);
9537
9538       switch (aarch64_cmodel)
9539         {
9540         case AARCH64_CMODEL_TINY:
9541           /* When we retrieve symbol + offset address, we have to make sure
9542              the offset does not cause overflow of the final address.  But
9543              we have no way of knowing the address of symbol at compile time
9544              so we can't accurately say if the distance between the PC and
9545              symbol + offset is outside the addressible range of +/-1M in the
9546              TINY code model.  So we rely on images not being greater than
9547              1M and cap the offset at 1M and anything beyond 1M will have to
9548              be loaded using an alternative mechanism.  Furthermore if the
9549              symbol is a weak reference to something that isn't known to
9550              resolve to a symbol in this module, then force to memory.  */
9551           if ((SYMBOL_REF_WEAK (x)
9552                && !aarch64_symbol_binds_local_p (x))
9553               || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9554             return SYMBOL_FORCE_TO_MEM;
9555           return SYMBOL_TINY_ABSOLUTE;
9556
9557         case AARCH64_CMODEL_SMALL:
9558           /* Same reasoning as the tiny code model, but the offset cap here is
9559              4G.  */
9560           if ((SYMBOL_REF_WEAK (x)
9561                && !aarch64_symbol_binds_local_p (x))
9562               || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9563                             HOST_WIDE_INT_C (4294967264)))
9564             return SYMBOL_FORCE_TO_MEM;
9565           return SYMBOL_SMALL_ABSOLUTE;
9566
9567         case AARCH64_CMODEL_TINY_PIC:
9568           if (!aarch64_symbol_binds_local_p (x))
9569             return SYMBOL_TINY_GOT;
9570           return SYMBOL_TINY_ABSOLUTE;
9571
9572         case AARCH64_CMODEL_SMALL_SPIC:
9573         case AARCH64_CMODEL_SMALL_PIC:
9574           if (!aarch64_symbol_binds_local_p (x))
9575             return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9576                     ?  SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9577           return SYMBOL_SMALL_ABSOLUTE;
9578
9579         case AARCH64_CMODEL_LARGE:
9580           /* This is alright even in PIC code as the constant
9581              pool reference is always PC relative and within
9582              the same translation unit.  */
9583           if (CONSTANT_POOL_ADDRESS_P (x))
9584             return SYMBOL_SMALL_ABSOLUTE;
9585           else
9586             return SYMBOL_FORCE_TO_MEM;
9587
9588         default:
9589           gcc_unreachable ();
9590         }
9591     }
9592
9593   /* By default push everything into the constant pool.  */
9594   return SYMBOL_FORCE_TO_MEM;
9595 }
9596
9597 bool
9598 aarch64_constant_address_p (rtx x)
9599 {
9600   return (CONSTANT_P (x) && memory_address_p (DImode, x));
9601 }
9602
9603 bool
9604 aarch64_legitimate_pic_operand_p (rtx x)
9605 {
9606   if (GET_CODE (x) == SYMBOL_REF
9607       || (GET_CODE (x) == CONST
9608           && GET_CODE (XEXP (x, 0)) == PLUS
9609           && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9610      return false;
9611
9612   return true;
9613 }
9614
9615 /* Return true if X holds either a quarter-precision or
9616      floating-point +0.0 constant.  */
9617 static bool
9618 aarch64_valid_floating_const (machine_mode mode, rtx x)
9619 {
9620   if (!CONST_DOUBLE_P (x))
9621     return false;
9622
9623   if (aarch64_float_const_zero_rtx_p (x))
9624     return true;
9625
9626   /* We only handle moving 0.0 to a TFmode register.  */
9627   if (!(mode == SFmode || mode == DFmode))
9628     return false;
9629
9630   return aarch64_float_const_representable_p (x);
9631 }
9632
9633 static bool
9634 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9635 {
9636   /* Do not allow vector struct mode constants.  We could support
9637      0 and -1 easily, but they need support in aarch64-simd.md.  */
9638   if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9639     return false;
9640
9641   /* This could probably go away because
9642      we now decompose CONST_INTs according to expand_mov_immediate.  */
9643   if ((GET_CODE (x) == CONST_VECTOR
9644        && aarch64_simd_valid_immediate (x, mode, false, NULL))
9645       || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9646         return !targetm.cannot_force_const_mem (mode, x);
9647
9648   if (GET_CODE (x) == HIGH
9649       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9650     return true;
9651
9652   return aarch64_constant_address_p (x);
9653 }
9654
9655 rtx
9656 aarch64_load_tp (rtx target)
9657 {
9658   if (!target
9659       || GET_MODE (target) != Pmode
9660       || !register_operand (target, Pmode))
9661     target = gen_reg_rtx (Pmode);
9662
9663   /* Can return in any reg.  */
9664   emit_insn (gen_aarch64_load_tp_hard (target));
9665   return target;
9666 }
9667
9668 /* On AAPCS systems, this is the "struct __va_list".  */
9669 static GTY(()) tree va_list_type;
9670
9671 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9672    Return the type to use as __builtin_va_list.
9673
9674    AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9675
9676    struct __va_list
9677    {
9678      void *__stack;
9679      void *__gr_top;
9680      void *__vr_top;
9681      int   __gr_offs;
9682      int   __vr_offs;
9683    };  */
9684
9685 static tree
9686 aarch64_build_builtin_va_list (void)
9687 {
9688   tree va_list_name;
9689   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9690
9691   /* Create the type.  */
9692   va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9693   /* Give it the required name.  */
9694   va_list_name = build_decl (BUILTINS_LOCATION,
9695                              TYPE_DECL,
9696                              get_identifier ("__va_list"),
9697                              va_list_type);
9698   DECL_ARTIFICIAL (va_list_name) = 1;
9699   TYPE_NAME (va_list_type) = va_list_name;
9700   TYPE_STUB_DECL (va_list_type) = va_list_name;
9701
9702   /* Create the fields.  */
9703   f_stack = build_decl (BUILTINS_LOCATION,
9704                         FIELD_DECL, get_identifier ("__stack"),
9705                         ptr_type_node);
9706   f_grtop = build_decl (BUILTINS_LOCATION,
9707                         FIELD_DECL, get_identifier ("__gr_top"),
9708                         ptr_type_node);
9709   f_vrtop = build_decl (BUILTINS_LOCATION,
9710                         FIELD_DECL, get_identifier ("__vr_top"),
9711                         ptr_type_node);
9712   f_groff = build_decl (BUILTINS_LOCATION,
9713                         FIELD_DECL, get_identifier ("__gr_offs"),
9714                         integer_type_node);
9715   f_vroff = build_decl (BUILTINS_LOCATION,
9716                         FIELD_DECL, get_identifier ("__vr_offs"),
9717                         integer_type_node);
9718
9719   /* Tell tree-stdarg pass about our internal offset fields.
9720      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparision
9721      purpose to identify whether the code is updating va_list internal
9722      offset fields through irregular way.  */
9723   va_list_gpr_counter_field = f_groff;
9724   va_list_fpr_counter_field = f_vroff;
9725
9726   DECL_ARTIFICIAL (f_stack) = 1;
9727   DECL_ARTIFICIAL (f_grtop) = 1;
9728   DECL_ARTIFICIAL (f_vrtop) = 1;
9729   DECL_ARTIFICIAL (f_groff) = 1;
9730   DECL_ARTIFICIAL (f_vroff) = 1;
9731
9732   DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9733   DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9734   DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9735   DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9736   DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9737
9738   TYPE_FIELDS (va_list_type) = f_stack;
9739   DECL_CHAIN (f_stack) = f_grtop;
9740   DECL_CHAIN (f_grtop) = f_vrtop;
9741   DECL_CHAIN (f_vrtop) = f_groff;
9742   DECL_CHAIN (f_groff) = f_vroff;
9743
9744   /* Compute its layout.  */
9745   layout_type (va_list_type);
9746
9747   return va_list_type;
9748 }
9749
9750 /* Implement TARGET_EXPAND_BUILTIN_VA_START.  */
9751 static void
9752 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9753 {
9754   const CUMULATIVE_ARGS *cum;
9755   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9756   tree stack, grtop, vrtop, groff, vroff;
9757   tree t;
9758   int gr_save_area_size = cfun->va_list_gpr_size;
9759   int vr_save_area_size = cfun->va_list_fpr_size;
9760   int vr_offset;
9761
9762   cum = &crtl->args.info;
9763   if (cfun->va_list_gpr_size)
9764     gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
9765                              cfun->va_list_gpr_size);
9766   if (cfun->va_list_fpr_size)
9767     vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
9768                              * UNITS_PER_VREG, cfun->va_list_fpr_size);
9769
9770   if (!TARGET_FLOAT)
9771     {
9772       gcc_assert (cum->aapcs_nvrn == 0);
9773       vr_save_area_size = 0;
9774     }
9775
9776   f_stack = TYPE_FIELDS (va_list_type_node);
9777   f_grtop = DECL_CHAIN (f_stack);
9778   f_vrtop = DECL_CHAIN (f_grtop);
9779   f_groff = DECL_CHAIN (f_vrtop);
9780   f_vroff = DECL_CHAIN (f_groff);
9781
9782   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9783                   NULL_TREE);
9784   grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9785                   NULL_TREE);
9786   vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9787                   NULL_TREE);
9788   groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9789                   NULL_TREE);
9790   vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9791                   NULL_TREE);
9792
9793   /* Emit code to initialize STACK, which points to the next varargs stack
9794      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
9795      by named arguments.  STACK is 8-byte aligned.  */
9796   t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9797   if (cum->aapcs_stack_size > 0)
9798     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9799   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9800   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9801
9802   /* Emit code to initialize GRTOP, the top of the GR save area.
9803      virtual_incoming_args_rtx should have been 16 byte aligned.  */
9804   t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9805   t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9806   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9807
9808   /* Emit code to initialize VRTOP, the top of the VR save area.
9809      This address is gr_save_area_bytes below GRTOP, rounded
9810      down to the next 16-byte boundary.  */
9811   t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9812   vr_offset = ROUND_UP (gr_save_area_size,
9813                         STACK_BOUNDARY / BITS_PER_UNIT);
9814
9815   if (vr_offset)
9816     t = fold_build_pointer_plus_hwi (t, -vr_offset);
9817   t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9818   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9819
9820   /* Emit code to initialize GROFF, the offset from GRTOP of the
9821      next GPR argument.  */
9822   t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9823               build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9824   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9825
9826   /* Likewise emit code to initialize VROFF, the offset from FTOP
9827      of the next VR argument.  */
9828   t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9829               build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9830   expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9831 }
9832
9833 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR.  */
9834
9835 static tree
9836 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9837                               gimple_seq *post_p ATTRIBUTE_UNUSED)
9838 {
9839   tree addr;
9840   bool indirect_p;
9841   bool is_ha;           /* is HFA or HVA.  */
9842   bool dw_align;        /* double-word align.  */
9843   machine_mode ag_mode = VOIDmode;
9844   int nregs;
9845   machine_mode mode;
9846
9847   tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9848   tree stack, f_top, f_off, off, arg, roundup, on_stack;
9849   HOST_WIDE_INT size, rsize, adjust, align;
9850   tree t, u, cond1, cond2;
9851
9852   indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9853   if (indirect_p)
9854     type = build_pointer_type (type);
9855
9856   mode = TYPE_MODE (type);
9857
9858   f_stack = TYPE_FIELDS (va_list_type_node);
9859   f_grtop = DECL_CHAIN (f_stack);
9860   f_vrtop = DECL_CHAIN (f_grtop);
9861   f_groff = DECL_CHAIN (f_vrtop);
9862   f_vroff = DECL_CHAIN (f_groff);
9863
9864   stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9865                   f_stack, NULL_TREE);
9866   size = int_size_in_bytes (type);
9867   align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9868
9869   dw_align = false;
9870   adjust = 0;
9871   if (aarch64_vfp_is_call_or_return_candidate (mode,
9872                                                type,
9873                                                &ag_mode,
9874                                                &nregs,
9875                                                &is_ha))
9876     {
9877       /* TYPE passed in fp/simd registers.  */
9878       if (!TARGET_FLOAT)
9879         aarch64_err_no_fpadvsimd (mode, "varargs");
9880
9881       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9882                       unshare_expr (valist), f_vrtop, NULL_TREE);
9883       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9884                       unshare_expr (valist), f_vroff, NULL_TREE);
9885
9886       rsize = nregs * UNITS_PER_VREG;
9887
9888       if (is_ha)
9889         {
9890           if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9891             adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9892         }
9893       else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9894                && size < UNITS_PER_VREG)
9895         {
9896           adjust = UNITS_PER_VREG - size;
9897         }
9898     }
9899   else
9900     {
9901       /* TYPE passed in general registers.  */
9902       f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9903                       unshare_expr (valist), f_grtop, NULL_TREE);
9904       f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9905                       unshare_expr (valist), f_groff, NULL_TREE);
9906       rsize = ROUND_UP (size, UNITS_PER_WORD);
9907       nregs = rsize / UNITS_PER_WORD;
9908
9909       if (align > 8)
9910         dw_align = true;
9911
9912       if (BLOCK_REG_PADDING (mode, type, 1) == downward
9913           && size < UNITS_PER_WORD)
9914         {
9915           adjust = UNITS_PER_WORD  - size;
9916         }
9917     }
9918
9919   /* Get a local temporary for the field value.  */
9920   off = get_initialized_tmp_var (f_off, pre_p, NULL);
9921
9922   /* Emit code to branch if off >= 0.  */
9923   t = build2 (GE_EXPR, boolean_type_node, off,
9924               build_int_cst (TREE_TYPE (off), 0));
9925   cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9926
9927   if (dw_align)
9928     {
9929       /* Emit: offs = (offs + 15) & -16.  */
9930       t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9931                   build_int_cst (TREE_TYPE (off), 15));
9932       t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9933                   build_int_cst (TREE_TYPE (off), -16));
9934       roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9935     }
9936   else
9937     roundup = NULL;
9938
9939   /* Update ap.__[g|v]r_offs  */
9940   t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9941               build_int_cst (TREE_TYPE (off), rsize));
9942   t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9943
9944   /* String up.  */
9945   if (roundup)
9946     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9947
9948   /* [cond2] if (ap.__[g|v]r_offs > 0)  */
9949   u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9950               build_int_cst (TREE_TYPE (f_off), 0));
9951   cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9952
9953   /* String up: make sure the assignment happens before the use.  */
9954   t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9955   COND_EXPR_ELSE (cond1) = t;
9956
9957   /* Prepare the trees handling the argument that is passed on the stack;
9958      the top level node will store in ON_STACK.  */
9959   arg = get_initialized_tmp_var (stack, pre_p, NULL);
9960   if (align > 8)
9961     {
9962       /* if (alignof(type) > 8) (arg = arg + 15) & -16;  */
9963       t = fold_convert (intDI_type_node, arg);
9964       t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9965                   build_int_cst (TREE_TYPE (t), 15));
9966       t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9967                   build_int_cst (TREE_TYPE (t), -16));
9968       t = fold_convert (TREE_TYPE (arg), t);
9969       roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9970     }
9971   else
9972     roundup = NULL;
9973   /* Advance ap.__stack  */
9974   t = fold_convert (intDI_type_node, arg);
9975   t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9976               build_int_cst (TREE_TYPE (t), size + 7));
9977   t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9978               build_int_cst (TREE_TYPE (t), -8));
9979   t = fold_convert (TREE_TYPE (arg), t);
9980   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9981   /* String up roundup and advance.  */
9982   if (roundup)
9983     t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9984   /* String up with arg */
9985   on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9986   /* Big-endianness related address adjustment.  */
9987   if (BLOCK_REG_PADDING (mode, type, 1) == downward
9988       && size < UNITS_PER_WORD)
9989   {
9990     t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9991                 size_int (UNITS_PER_WORD - size));
9992     on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9993   }
9994
9995   COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9996   COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9997
9998   /* Adjustment to OFFSET in the case of BIG_ENDIAN.  */
9999   t = off;
10000   if (adjust)
10001     t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10002                 build_int_cst (TREE_TYPE (off), adjust));
10003
10004   t = fold_convert (sizetype, t);
10005   t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10006
10007   if (is_ha)
10008     {
10009       /* type ha; // treat as "struct {ftype field[n];}"
10010          ... [computing offs]
10011          for (i = 0; i <nregs; ++i, offs += 16)
10012            ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10013          return ha;  */
10014       int i;
10015       tree tmp_ha, field_t, field_ptr_t;
10016
10017       /* Declare a local variable.  */
10018       tmp_ha = create_tmp_var_raw (type, "ha");
10019       gimple_add_tmp_var (tmp_ha);
10020
10021       /* Establish the base type.  */
10022       switch (ag_mode)
10023         {
10024         case SFmode:
10025           field_t = float_type_node;
10026           field_ptr_t = float_ptr_type_node;
10027           break;
10028         case DFmode:
10029           field_t = double_type_node;
10030           field_ptr_t = double_ptr_type_node;
10031           break;
10032         case TFmode:
10033           field_t = long_double_type_node;
10034           field_ptr_t = long_double_ptr_type_node;
10035           break;
10036         case HFmode:
10037           field_t = aarch64_fp16_type_node;
10038           field_ptr_t = aarch64_fp16_ptr_type_node;
10039           break;
10040         case V2SImode:
10041         case V4SImode:
10042             {
10043               tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10044               field_t = build_vector_type_for_mode (innertype, ag_mode);
10045               field_ptr_t = build_pointer_type (field_t);
10046             }
10047           break;
10048         default:
10049           gcc_assert (0);
10050         }
10051
10052       /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area  */
10053       tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10054       addr = t;
10055       t = fold_convert (field_ptr_t, addr);
10056       t = build2 (MODIFY_EXPR, field_t,
10057                   build1 (INDIRECT_REF, field_t, tmp_ha),
10058                   build1 (INDIRECT_REF, field_t, t));
10059
10060       /* ha.field[i] = *((field_ptr_t)vr_saved_area + i)  */
10061       for (i = 1; i < nregs; ++i)
10062         {
10063           addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10064           u = fold_convert (field_ptr_t, addr);
10065           u = build2 (MODIFY_EXPR, field_t,
10066                       build2 (MEM_REF, field_t, tmp_ha,
10067                               build_int_cst (field_ptr_t,
10068                                              (i *
10069                                               int_size_in_bytes (field_t)))),
10070                       build1 (INDIRECT_REF, field_t, u));
10071           t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10072         }
10073
10074       u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10075       t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10076     }
10077
10078   COND_EXPR_ELSE (cond2) = t;
10079   addr = fold_convert (build_pointer_type (type), cond1);
10080   addr = build_va_arg_indirect_ref (addr);
10081
10082   if (indirect_p)
10083     addr = build_va_arg_indirect_ref (addr);
10084
10085   return addr;
10086 }
10087
10088 /* Implement TARGET_SETUP_INCOMING_VARARGS.  */
10089
10090 static void
10091 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10092                                 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10093                                 int no_rtl)
10094 {
10095   CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10096   CUMULATIVE_ARGS local_cum;
10097   int gr_saved = cfun->va_list_gpr_size;
10098   int vr_saved = cfun->va_list_fpr_size;
10099
10100   /* The caller has advanced CUM up to, but not beyond, the last named
10101      argument.  Advance a local copy of CUM past the last "real" named
10102      argument, to find out how many registers are left over.  */
10103   local_cum = *cum;
10104   aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10105
10106   /* Found out how many registers we need to save.
10107      Honor tree-stdvar analysis results.  */
10108   if (cfun->va_list_gpr_size)
10109     gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10110                     cfun->va_list_gpr_size / UNITS_PER_WORD);
10111   if (cfun->va_list_fpr_size)
10112     vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10113                     cfun->va_list_fpr_size / UNITS_PER_VREG);
10114
10115   if (!TARGET_FLOAT)
10116     {
10117       gcc_assert (local_cum.aapcs_nvrn == 0);
10118       vr_saved = 0;
10119     }
10120
10121   if (!no_rtl)
10122     {
10123       if (gr_saved > 0)
10124         {
10125           rtx ptr, mem;
10126
10127           /* virtual_incoming_args_rtx should have been 16-byte aligned.  */
10128           ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10129                                - gr_saved * UNITS_PER_WORD);
10130           mem = gen_frame_mem (BLKmode, ptr);
10131           set_mem_alias_set (mem, get_varargs_alias_set ());
10132
10133           move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10134                                mem, gr_saved);
10135         }
10136       if (vr_saved > 0)
10137         {
10138           /* We can't use move_block_from_reg, because it will use
10139              the wrong mode, storing D regs only.  */
10140           machine_mode mode = TImode;
10141           int off, i, vr_start;
10142
10143           /* Set OFF to the offset from virtual_incoming_args_rtx of
10144              the first vector register.  The VR save area lies below
10145              the GR one, and is aligned to 16 bytes.  */
10146           off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10147                            STACK_BOUNDARY / BITS_PER_UNIT);
10148           off -= vr_saved * UNITS_PER_VREG;
10149
10150           vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10151           for (i = 0; i < vr_saved; ++i)
10152             {
10153               rtx ptr, mem;
10154
10155               ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10156               mem = gen_frame_mem (mode, ptr);
10157               set_mem_alias_set (mem, get_varargs_alias_set ());
10158               aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10159               off += UNITS_PER_VREG;
10160             }
10161         }
10162     }
10163
10164   /* We don't save the size into *PRETEND_SIZE because we want to avoid
10165      any complication of having crtl->args.pretend_args_size changed.  */
10166   cfun->machine->frame.saved_varargs_size
10167     = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10168                  STACK_BOUNDARY / BITS_PER_UNIT)
10169        + vr_saved * UNITS_PER_VREG);
10170 }
10171
10172 static void
10173 aarch64_conditional_register_usage (void)
10174 {
10175   int i;
10176   if (!TARGET_FLOAT)
10177     {
10178       for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10179         {
10180           fixed_regs[i] = 1;
10181           call_used_regs[i] = 1;
10182         }
10183     }
10184 }
10185
10186 /* Walk down the type tree of TYPE counting consecutive base elements.
10187    If *MODEP is VOIDmode, then set it to the first valid floating point
10188    type.  If a non-floating point type is found, or if a floating point
10189    type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10190    otherwise return the count in the sub-tree.  */
10191 static int
10192 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10193 {
10194   machine_mode mode;
10195   HOST_WIDE_INT size;
10196
10197   switch (TREE_CODE (type))
10198     {
10199     case REAL_TYPE:
10200       mode = TYPE_MODE (type);
10201       if (mode != DFmode && mode != SFmode
10202           && mode != TFmode && mode != HFmode)
10203         return -1;
10204
10205       if (*modep == VOIDmode)
10206         *modep = mode;
10207
10208       if (*modep == mode)
10209         return 1;
10210
10211       break;
10212
10213     case COMPLEX_TYPE:
10214       mode = TYPE_MODE (TREE_TYPE (type));
10215       if (mode != DFmode && mode != SFmode
10216           && mode != TFmode && mode != HFmode)
10217         return -1;
10218
10219       if (*modep == VOIDmode)
10220         *modep = mode;
10221
10222       if (*modep == mode)
10223         return 2;
10224
10225       break;
10226
10227     case VECTOR_TYPE:
10228       /* Use V2SImode and V4SImode as representatives of all 64-bit
10229          and 128-bit vector types.  */
10230       size = int_size_in_bytes (type);
10231       switch (size)
10232         {
10233         case 8:
10234           mode = V2SImode;
10235           break;
10236         case 16:
10237           mode = V4SImode;
10238           break;
10239         default:
10240           return -1;
10241         }
10242
10243       if (*modep == VOIDmode)
10244         *modep = mode;
10245
10246       /* Vector modes are considered to be opaque: two vectors are
10247          equivalent for the purposes of being homogeneous aggregates
10248          if they are the same size.  */
10249       if (*modep == mode)
10250         return 1;
10251
10252       break;
10253
10254     case ARRAY_TYPE:
10255       {
10256         int count;
10257         tree index = TYPE_DOMAIN (type);
10258
10259         /* Can't handle incomplete types nor sizes that are not
10260            fixed.  */
10261         if (!COMPLETE_TYPE_P (type)
10262             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10263           return -1;
10264
10265         count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10266         if (count == -1
10267             || !index
10268             || !TYPE_MAX_VALUE (index)
10269             || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10270             || !TYPE_MIN_VALUE (index)
10271             || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10272             || count < 0)
10273           return -1;
10274
10275         count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10276                       - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10277
10278         /* There must be no padding.  */
10279         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10280           return -1;
10281
10282         return count;
10283       }
10284
10285     case RECORD_TYPE:
10286       {
10287         int count = 0;
10288         int sub_count;
10289         tree field;
10290
10291         /* Can't handle incomplete types nor sizes that are not
10292            fixed.  */
10293         if (!COMPLETE_TYPE_P (type)
10294             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10295           return -1;
10296
10297         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10298           {
10299             if (TREE_CODE (field) != FIELD_DECL)
10300               continue;
10301
10302             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10303             if (sub_count < 0)
10304               return -1;
10305             count += sub_count;
10306           }
10307
10308         /* There must be no padding.  */
10309         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10310           return -1;
10311
10312         return count;
10313       }
10314
10315     case UNION_TYPE:
10316     case QUAL_UNION_TYPE:
10317       {
10318         /* These aren't very interesting except in a degenerate case.  */
10319         int count = 0;
10320         int sub_count;
10321         tree field;
10322
10323         /* Can't handle incomplete types nor sizes that are not
10324            fixed.  */
10325         if (!COMPLETE_TYPE_P (type)
10326             || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10327           return -1;
10328
10329         for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10330           {
10331             if (TREE_CODE (field) != FIELD_DECL)
10332               continue;
10333
10334             sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10335             if (sub_count < 0)
10336               return -1;
10337             count = count > sub_count ? count : sub_count;
10338           }
10339
10340         /* There must be no padding.  */
10341         if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10342           return -1;
10343
10344         return count;
10345       }
10346
10347     default:
10348       break;
10349     }
10350
10351   return -1;
10352 }
10353
10354 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10355    type as described in AAPCS64 \S 4.1.2.
10356
10357    See the comment above aarch64_composite_type_p for the notes on MODE.  */
10358
10359 static bool
10360 aarch64_short_vector_p (const_tree type,
10361                         machine_mode mode)
10362 {
10363   HOST_WIDE_INT size = -1;
10364
10365   if (type && TREE_CODE (type) == VECTOR_TYPE)
10366     size = int_size_in_bytes (type);
10367   else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10368             || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10369     size = GET_MODE_SIZE (mode);
10370
10371   return (size == 8 || size == 16);
10372 }
10373
10374 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10375    type as described in AAPCS64 \S 4.3.  This includes aggregate, union and
10376    array types.  The C99 floating-point complex types are also considered
10377    as composite types, according to AAPCS64 \S 7.1.1.  The complex integer
10378    types, which are GCC extensions and out of the scope of AAPCS64, are
10379    treated as composite types here as well.
10380
10381    Note that MODE itself is not sufficient in determining whether a type
10382    is such a composite type or not.  This is because
10383    stor-layout.c:compute_record_mode may have already changed the MODE
10384    (BLKmode) of a RECORD_TYPE TYPE to some other mode.  For example, a
10385    structure with only one field may have its MODE set to the mode of the
10386    field.  Also an integer mode whose size matches the size of the
10387    RECORD_TYPE type may be used to substitute the original mode
10388    (i.e. BLKmode) in certain circumstances.  In other words, MODE cannot be
10389    solely relied on.  */
10390
10391 static bool
10392 aarch64_composite_type_p (const_tree type,
10393                           machine_mode mode)
10394 {
10395   if (aarch64_short_vector_p (type, mode))
10396     return false;
10397
10398   if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10399     return true;
10400
10401   if (mode == BLKmode
10402       || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10403       || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10404     return true;
10405
10406   return false;
10407 }
10408
10409 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10410    shall be passed or returned in simd/fp register(s) (providing these
10411    parameter passing registers are available).
10412
10413    Upon successful return, *COUNT returns the number of needed registers,
10414    *BASE_MODE returns the mode of the individual register and when IS_HAF
10415    is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10416    floating-point aggregate or a homogeneous short-vector aggregate.  */
10417
10418 static bool
10419 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10420                                          const_tree type,
10421                                          machine_mode *base_mode,
10422                                          int *count,
10423                                          bool *is_ha)
10424 {
10425   machine_mode new_mode = VOIDmode;
10426   bool composite_p = aarch64_composite_type_p (type, mode);
10427
10428   if (is_ha != NULL) *is_ha = false;
10429
10430   if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10431       || aarch64_short_vector_p (type, mode))
10432     {
10433       *count = 1;
10434       new_mode = mode;
10435     }
10436   else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10437     {
10438       if (is_ha != NULL) *is_ha = true;
10439       *count = 2;
10440       new_mode = GET_MODE_INNER (mode);
10441     }
10442   else if (type && composite_p)
10443     {
10444       int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10445
10446       if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10447         {
10448           if (is_ha != NULL) *is_ha = true;
10449           *count = ag_count;
10450         }
10451       else
10452         return false;
10453     }
10454   else
10455     return false;
10456
10457   *base_mode = new_mode;
10458   return true;
10459 }
10460
10461 /* Implement TARGET_STRUCT_VALUE_RTX.  */
10462
10463 static rtx
10464 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10465                           int incoming ATTRIBUTE_UNUSED)
10466 {
10467   return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10468 }
10469
10470 /* Implements target hook vector_mode_supported_p.  */
10471 static bool
10472 aarch64_vector_mode_supported_p (machine_mode mode)
10473 {
10474   if (TARGET_SIMD
10475       && (mode == V4SImode  || mode == V8HImode
10476           || mode == V16QImode || mode == V2DImode
10477           || mode == V2SImode  || mode == V4HImode
10478           || mode == V8QImode || mode == V2SFmode
10479           || mode == V4SFmode || mode == V2DFmode
10480           || mode == V4HFmode || mode == V8HFmode
10481           || mode == V1DFmode))
10482     return true;
10483
10484   return false;
10485 }
10486
10487 /* Return appropriate SIMD container
10488    for MODE within a vector of WIDTH bits.  */
10489 static machine_mode
10490 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10491 {
10492   gcc_assert (width == 64 || width == 128);
10493   if (TARGET_SIMD)
10494     {
10495       if (width == 128)
10496         switch (mode)
10497           {
10498           case DFmode:
10499             return V2DFmode;
10500           case SFmode:
10501             return V4SFmode;
10502           case SImode:
10503             return V4SImode;
10504           case HImode:
10505             return V8HImode;
10506           case QImode:
10507             return V16QImode;
10508           case DImode:
10509             return V2DImode;
10510           default:
10511             break;
10512           }
10513       else
10514         switch (mode)
10515           {
10516           case SFmode:
10517             return V2SFmode;
10518           case SImode:
10519             return V2SImode;
10520           case HImode:
10521             return V4HImode;
10522           case QImode:
10523             return V8QImode;
10524           default:
10525             break;
10526           }
10527     }
10528   return word_mode;
10529 }
10530
10531 /* Return 128-bit container as the preferred SIMD mode for MODE.  */
10532 static machine_mode
10533 aarch64_preferred_simd_mode (machine_mode mode)
10534 {
10535   return aarch64_simd_container_mode (mode, 128);
10536 }
10537
10538 /* Return the bitmask of possible vector sizes for the vectorizer
10539    to iterate over.  */
10540 static unsigned int
10541 aarch64_autovectorize_vector_sizes (void)
10542 {
10543   return (16 | 8);
10544 }
10545
10546 /* Implement TARGET_MANGLE_TYPE.  */
10547
10548 static const char *
10549 aarch64_mangle_type (const_tree type)
10550 {
10551   /* The AArch64 ABI documents say that "__va_list" has to be
10552      managled as if it is in the "std" namespace.  */
10553   if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10554     return "St9__va_list";
10555
10556   /* Half-precision float.  */
10557   if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10558     return "Dh";
10559
10560   /* Mangle AArch64-specific internal types.  TYPE_NAME is non-NULL_TREE for
10561      builtin types.  */
10562   if (TYPE_NAME (type) != NULL)
10563     return aarch64_mangle_builtin_type (type);
10564
10565   /* Use the default mangling.  */
10566   return NULL;
10567 }
10568
10569
10570 /* Return true if the rtx_insn contains a MEM RTX somewhere
10571    in it.  */
10572
10573 static bool
10574 has_memory_op (rtx_insn *mem_insn)
10575 {
10576   subrtx_iterator::array_type array;
10577   FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10578     if (MEM_P (*iter))
10579       return true;
10580
10581   return false;
10582 }
10583
10584 /* Find the first rtx_insn before insn that will generate an assembly
10585    instruction.  */
10586
10587 static rtx_insn *
10588 aarch64_prev_real_insn (rtx_insn *insn)
10589 {
10590   if (!insn)
10591     return NULL;
10592
10593   do
10594     {
10595       insn = prev_real_insn (insn);
10596     }
10597   while (insn && recog_memoized (insn) < 0);
10598
10599   return insn;
10600 }
10601
10602 static bool
10603 is_madd_op (enum attr_type t1)
10604 {
10605   unsigned int i;
10606   /* A number of these may be AArch32 only.  */
10607   enum attr_type mlatypes[] = {
10608     TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10609     TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10610     TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10611   };
10612
10613   for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10614     {
10615       if (t1 == mlatypes[i])
10616         return true;
10617     }
10618
10619   return false;
10620 }
10621
10622 /* Check if there is a register dependency between a load and the insn
10623    for which we hold recog_data.  */
10624
10625 static bool
10626 dep_between_memop_and_curr (rtx memop)
10627 {
10628   rtx load_reg;
10629   int opno;
10630
10631   gcc_assert (GET_CODE (memop) == SET);
10632
10633   if (!REG_P (SET_DEST (memop)))
10634     return false;
10635
10636   load_reg = SET_DEST (memop);
10637   for (opno = 1; opno < recog_data.n_operands; opno++)
10638     {
10639       rtx operand = recog_data.operand[opno];
10640       if (REG_P (operand)
10641           && reg_overlap_mentioned_p (load_reg, operand))
10642         return true;
10643
10644     }
10645   return false;
10646 }
10647
10648
10649 /* When working around the Cortex-A53 erratum 835769,
10650    given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10651    instruction and has a preceding memory instruction such that a NOP
10652    should be inserted between them.  */
10653
10654 bool
10655 aarch64_madd_needs_nop (rtx_insn* insn)
10656 {
10657   enum attr_type attr_type;
10658   rtx_insn *prev;
10659   rtx body;
10660
10661   if (!TARGET_FIX_ERR_A53_835769)
10662     return false;
10663
10664   if (!INSN_P (insn) || recog_memoized (insn) < 0)
10665     return false;
10666
10667   attr_type = get_attr_type (insn);
10668   if (!is_madd_op (attr_type))
10669     return false;
10670
10671   prev = aarch64_prev_real_insn (insn);
10672   /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10673      Restore recog state to INSN to avoid state corruption.  */
10674   extract_constrain_insn_cached (insn);
10675
10676   if (!prev || !has_memory_op (prev))
10677     return false;
10678
10679   body = single_set (prev);
10680
10681   /* If the previous insn is a memory op and there is no dependency between
10682      it and the DImode madd, emit a NOP between them.  If body is NULL then we
10683      have a complex memory operation, probably a load/store pair.
10684      Be conservative for now and emit a NOP.  */
10685   if (GET_MODE (recog_data.operand[0]) == DImode
10686       && (!body || !dep_between_memop_and_curr (body)))
10687     return true;
10688
10689   return false;
10690
10691 }
10692
10693
10694 /* Implement FINAL_PRESCAN_INSN.  */
10695
10696 void
10697 aarch64_final_prescan_insn (rtx_insn *insn)
10698 {
10699   if (aarch64_madd_needs_nop (insn))
10700     fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10701 }
10702
10703
10704 /* Return the equivalent letter for size.  */
10705 static char
10706 sizetochar (int size)
10707 {
10708   switch (size)
10709     {
10710     case 64: return 'd';
10711     case 32: return 's';
10712     case 16: return 'h';
10713     case 8 : return 'b';
10714     default: gcc_unreachable ();
10715     }
10716 }
10717
10718 /* Return true iff x is a uniform vector of floating-point
10719    constants, and the constant can be represented in
10720    quarter-precision form.  Note, as aarch64_float_const_representable
10721    rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
10722 static bool
10723 aarch64_vect_float_const_representable_p (rtx x)
10724 {
10725   rtx elt;
10726   return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10727           && const_vec_duplicate_p (x, &elt)
10728           && aarch64_float_const_representable_p (elt));
10729 }
10730
10731 /* Return true for valid and false for invalid.  */
10732 bool
10733 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10734                               struct simd_immediate_info *info)
10735 {
10736 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG)  \
10737   matches = 1;                                          \
10738   for (i = 0; i < idx; i += (STRIDE))                   \
10739     if (!(TEST))                                        \
10740       matches = 0;                                      \
10741   if (matches)                                          \
10742     {                                                   \
10743       immtype = (CLASS);                                \
10744       elsize = (ELSIZE);                                \
10745       eshift = (SHIFT);                                 \
10746       emvn = (NEG);                                     \
10747       break;                                            \
10748     }
10749
10750   unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10751   unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10752   unsigned char bytes[16];
10753   int immtype = -1, matches;
10754   unsigned int invmask = inverse ? 0xff : 0;
10755   int eshift, emvn;
10756
10757   if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10758     {
10759       if (! (aarch64_simd_imm_zero_p (op, mode)
10760              || aarch64_vect_float_const_representable_p (op)))
10761         return false;
10762
10763       if (info)
10764         {
10765           info->value = CONST_VECTOR_ELT (op, 0);
10766           info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10767           info->mvn = false;
10768           info->shift = 0;
10769         }
10770
10771       return true;
10772     }
10773
10774   /* Splat vector constant out into a byte vector.  */
10775   for (i = 0; i < n_elts; i++)
10776     {
10777       /* The vector is provided in gcc endian-neutral fashion.  For aarch64_be,
10778          it must be laid out in the vector register in reverse order.  */
10779       rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10780       unsigned HOST_WIDE_INT elpart;
10781
10782       gcc_assert (CONST_INT_P (el));
10783       elpart = INTVAL (el);
10784
10785       for (unsigned int byte = 0; byte < innersize; byte++)
10786         {
10787           bytes[idx++] = (elpart & 0xff) ^ invmask;
10788           elpart >>= BITS_PER_UNIT;
10789         }
10790
10791     }
10792
10793   /* Sanity check.  */
10794   gcc_assert (idx == GET_MODE_SIZE (mode));
10795
10796   do
10797     {
10798       CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10799              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10800
10801       CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10802              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10803
10804       CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10805              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10806
10807       CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10808              && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10809
10810       CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10811
10812       CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10813
10814       CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10815              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10816
10817       CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10818              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10819
10820       CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10821              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10822
10823       CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10824              && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10825
10826       CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10827
10828       CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10829
10830       CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10831              && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10832
10833       CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10834              && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10835
10836       CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10837              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10838
10839       CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10840              && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10841
10842       CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10843
10844       CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10845              && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10846     }
10847   while (0);
10848
10849   if (immtype == -1)
10850     return false;
10851
10852   if (info)
10853     {
10854       info->element_width = elsize;
10855       info->mvn = emvn != 0;
10856       info->shift = eshift;
10857
10858       unsigned HOST_WIDE_INT imm = 0;
10859
10860       if (immtype >= 12 && immtype <= 15)
10861         info->msl = true;
10862
10863       /* Un-invert bytes of recognized vector, if necessary.  */
10864       if (invmask != 0)
10865         for (i = 0; i < idx; i++)
10866           bytes[i] ^= invmask;
10867
10868       if (immtype == 17)
10869         {
10870           /* FIXME: Broken on 32-bit H_W_I hosts.  */
10871           gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10872
10873           for (i = 0; i < 8; i++)
10874             imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10875               << (i * BITS_PER_UNIT);
10876
10877
10878           info->value = GEN_INT (imm);
10879         }
10880       else
10881         {
10882           for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10883             imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10884
10885           /* Construct 'abcdefgh' because the assembler cannot handle
10886              generic constants.  */
10887           if (info->mvn)
10888             imm = ~imm;
10889           imm = (imm >> info->shift) & 0xff;
10890           info->value = GEN_INT (imm);
10891         }
10892     }
10893
10894   return true;
10895 #undef CHECK
10896 }
10897
10898 /* Check of immediate shift constants are within range.  */
10899 bool
10900 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10901 {
10902   int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10903   if (left)
10904     return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10905   else
10906     return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10907 }
10908
10909 /* Return true if X is a uniform vector where all elements
10910    are either the floating-point constant 0.0 or the
10911    integer constant 0.  */
10912 bool
10913 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10914 {
10915   return x == CONST0_RTX (mode);
10916 }
10917
10918
10919 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10920    operation of width WIDTH at bit position POS.  */
10921
10922 rtx
10923 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10924 {
10925   gcc_assert (CONST_INT_P (width));
10926   gcc_assert (CONST_INT_P (pos));
10927
10928   unsigned HOST_WIDE_INT mask
10929     = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10930   return GEN_INT (mask << UINTVAL (pos));
10931 }
10932
10933 bool
10934 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10935 {
10936   HOST_WIDE_INT imm = INTVAL (x);
10937   int i;
10938
10939   for (i = 0; i < 8; i++)
10940     {
10941       unsigned int byte = imm & 0xff;
10942       if (byte != 0xff && byte != 0)
10943        return false;
10944       imm >>= 8;
10945     }
10946
10947   return true;
10948 }
10949
10950 bool
10951 aarch64_mov_operand_p (rtx x, machine_mode mode)
10952 {
10953   if (GET_CODE (x) == HIGH
10954       && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10955     return true;
10956
10957   if (CONST_INT_P (x))
10958     return true;
10959
10960   if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10961     return true;
10962
10963   return aarch64_classify_symbolic_expression (x)
10964     == SYMBOL_TINY_ABSOLUTE;
10965 }
10966
10967 /* Return a const_int vector of VAL.  */
10968 rtx
10969 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10970 {
10971   int nunits = GET_MODE_NUNITS (mode);
10972   rtvec v = rtvec_alloc (nunits);
10973   int i;
10974
10975   for (i=0; i < nunits; i++)
10976     RTVEC_ELT (v, i) = GEN_INT (val);
10977
10978   return gen_rtx_CONST_VECTOR (mode, v);
10979 }
10980
10981 /* Check OP is a legal scalar immediate for the MOVI instruction.  */
10982
10983 bool
10984 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10985 {
10986   machine_mode vmode;
10987
10988   gcc_assert (!VECTOR_MODE_P (mode));
10989   vmode = aarch64_preferred_simd_mode (mode);
10990   rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10991   return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10992 }
10993
10994 /* Construct and return a PARALLEL RTX vector with elements numbering the
10995    lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10996    the vector - from the perspective of the architecture.  This does not
10997    line up with GCC's perspective on lane numbers, so we end up with
10998    different masks depending on our target endian-ness.  The diagram
10999    below may help.  We must draw the distinction when building masks
11000    which select one half of the vector.  An instruction selecting
11001    architectural low-lanes for a big-endian target, must be described using
11002    a mask selecting GCC high-lanes.
11003
11004                  Big-Endian             Little-Endian
11005
11006 GCC             0   1   2   3           3   2   1   0
11007               | x | x | x | x |       | x | x | x | x |
11008 Architecture    3   2   1   0           3   2   1   0
11009
11010 Low Mask:         { 2, 3 }                { 0, 1 }
11011 High Mask:        { 0, 1 }                { 2, 3 }
11012 */
11013
11014 rtx
11015 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11016 {
11017   int nunits = GET_MODE_NUNITS (mode);
11018   rtvec v = rtvec_alloc (nunits / 2);
11019   int high_base = nunits / 2;
11020   int low_base = 0;
11021   int base;
11022   rtx t1;
11023   int i;
11024
11025   if (BYTES_BIG_ENDIAN)
11026     base = high ? low_base : high_base;
11027   else
11028     base = high ? high_base : low_base;
11029
11030   for (i = 0; i < nunits / 2; i++)
11031     RTVEC_ELT (v, i) = GEN_INT (base + i);
11032
11033   t1 = gen_rtx_PARALLEL (mode, v);
11034   return t1;
11035 }
11036
11037 /* Check OP for validity as a PARALLEL RTX vector with elements
11038    numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11039    from the perspective of the architecture.  See the diagram above
11040    aarch64_simd_vect_par_cnst_half for more details.  */
11041
11042 bool
11043 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11044                                        bool high)
11045 {
11046   rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11047   HOST_WIDE_INT count_op = XVECLEN (op, 0);
11048   HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11049   int i = 0;
11050
11051   if (!VECTOR_MODE_P (mode))
11052     return false;
11053
11054   if (count_op != count_ideal)
11055     return false;
11056
11057   for (i = 0; i < count_ideal; i++)
11058     {
11059       rtx elt_op = XVECEXP (op, 0, i);
11060       rtx elt_ideal = XVECEXP (ideal, 0, i);
11061
11062       if (!CONST_INT_P (elt_op)
11063           || INTVAL (elt_ideal) != INTVAL (elt_op))
11064         return false;
11065     }
11066   return true;
11067 }
11068
11069 /* Bounds-check lanes.  Ensure OPERAND lies between LOW (inclusive) and
11070    HIGH (exclusive).  */
11071 void
11072 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11073                           const_tree exp)
11074 {
11075   HOST_WIDE_INT lane;
11076   gcc_assert (CONST_INT_P (operand));
11077   lane = INTVAL (operand);
11078
11079   if (lane < low || lane >= high)
11080   {
11081     if (exp)
11082       error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11083     else
11084       error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11085   }
11086 }
11087
11088 /* Return TRUE if OP is a valid vector addressing mode.  */
11089 bool
11090 aarch64_simd_mem_operand_p (rtx op)
11091 {
11092   return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11093                         || REG_P (XEXP (op, 0)));
11094 }
11095
11096 /* Emit a register copy from operand to operand, taking care not to
11097    early-clobber source registers in the process.
11098
11099    COUNT is the number of components into which the copy needs to be
11100    decomposed.  */
11101 void
11102 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11103                                 unsigned int count)
11104 {
11105   unsigned int i;
11106   int rdest = REGNO (operands[0]);
11107   int rsrc = REGNO (operands[1]);
11108
11109   if (!reg_overlap_mentioned_p (operands[0], operands[1])
11110       || rdest < rsrc)
11111     for (i = 0; i < count; i++)
11112       emit_move_insn (gen_rtx_REG (mode, rdest + i),
11113                       gen_rtx_REG (mode, rsrc + i));
11114   else
11115     for (i = 0; i < count; i++)
11116       emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11117                       gen_rtx_REG (mode, rsrc + count - i - 1));
11118 }
11119
11120 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11121    one of VSTRUCT modes: OI, CI, or XI.  */
11122 int
11123 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11124 {
11125   return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11126 }
11127
11128 /* Implement target hook TARGET_VECTOR_ALIGNMENT.  The AAPCS64 sets the maximum
11129    alignment of a vector to 128 bits.  */
11130 static HOST_WIDE_INT
11131 aarch64_simd_vector_alignment (const_tree type)
11132 {
11133   HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11134   return MIN (align, 128);
11135 }
11136
11137 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE.  */
11138 static bool
11139 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11140 {
11141   if (is_packed)
11142     return false;
11143
11144   /* We guarantee alignment for vectors up to 128-bits.  */
11145   if (tree_int_cst_compare (TYPE_SIZE (type),
11146                             bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11147     return false;
11148
11149   /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned.  */
11150   return true;
11151 }
11152
11153 /* If VALS is a vector constant that can be loaded into a register
11154    using DUP, generate instructions to do so and return an RTX to
11155    assign to the register.  Otherwise return NULL_RTX.  */
11156 static rtx
11157 aarch64_simd_dup_constant (rtx vals)
11158 {
11159   machine_mode mode = GET_MODE (vals);
11160   machine_mode inner_mode = GET_MODE_INNER (mode);
11161   rtx x;
11162
11163   if (!const_vec_duplicate_p (vals, &x))
11164     return NULL_RTX;
11165
11166   /* We can load this constant by using DUP and a constant in a
11167      single ARM register.  This will be cheaper than a vector
11168      load.  */
11169   x = copy_to_mode_reg (inner_mode, x);
11170   return gen_rtx_VEC_DUPLICATE (mode, x);
11171 }
11172
11173
11174 /* Generate code to load VALS, which is a PARALLEL containing only
11175    constants (for vec_init) or CONST_VECTOR, efficiently into a
11176    register.  Returns an RTX to copy into the register, or NULL_RTX
11177    for a PARALLEL that can not be converted into a CONST_VECTOR.  */
11178 static rtx
11179 aarch64_simd_make_constant (rtx vals)
11180 {
11181   machine_mode mode = GET_MODE (vals);
11182   rtx const_dup;
11183   rtx const_vec = NULL_RTX;
11184   int n_elts = GET_MODE_NUNITS (mode);
11185   int n_const = 0;
11186   int i;
11187
11188   if (GET_CODE (vals) == CONST_VECTOR)
11189     const_vec = vals;
11190   else if (GET_CODE (vals) == PARALLEL)
11191     {
11192       /* A CONST_VECTOR must contain only CONST_INTs and
11193          CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11194          Only store valid constants in a CONST_VECTOR.  */
11195       for (i = 0; i < n_elts; ++i)
11196         {
11197           rtx x = XVECEXP (vals, 0, i);
11198           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11199             n_const++;
11200         }
11201       if (n_const == n_elts)
11202         const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11203     }
11204   else
11205     gcc_unreachable ();
11206
11207   if (const_vec != NULL_RTX
11208       && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11209     /* Load using MOVI/MVNI.  */
11210     return const_vec;
11211   else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11212     /* Loaded using DUP.  */
11213     return const_dup;
11214   else if (const_vec != NULL_RTX)
11215     /* Load from constant pool. We can not take advantage of single-cycle
11216        LD1 because we need a PC-relative addressing mode.  */
11217     return const_vec;
11218   else
11219     /* A PARALLEL containing something not valid inside CONST_VECTOR.
11220        We can not construct an initializer.  */
11221     return NULL_RTX;
11222 }
11223
11224 /* Expand a vector initialisation sequence, such that TARGET is
11225    initialised to contain VALS.  */
11226
11227 void
11228 aarch64_expand_vector_init (rtx target, rtx vals)
11229 {
11230   machine_mode mode = GET_MODE (target);
11231   machine_mode inner_mode = GET_MODE_INNER (mode);
11232   /* The number of vector elements.  */
11233   int n_elts = GET_MODE_NUNITS (mode);
11234   /* The number of vector elements which are not constant.  */
11235   int n_var = 0;
11236   rtx any_const = NULL_RTX;
11237   /* The first element of vals.  */
11238   rtx v0 = XVECEXP (vals, 0, 0);
11239   bool all_same = true;
11240
11241   /* Count the number of variable elements to initialise.  */
11242   for (int i = 0; i < n_elts; ++i)
11243     {
11244       rtx x = XVECEXP (vals, 0, i);
11245       if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11246         ++n_var;
11247       else
11248         any_const = x;
11249
11250       all_same &= rtx_equal_p (x, v0);
11251     }
11252
11253   /* No variable elements, hand off to aarch64_simd_make_constant which knows
11254      how best to handle this.  */
11255   if (n_var == 0)
11256     {
11257       rtx constant = aarch64_simd_make_constant (vals);
11258       if (constant != NULL_RTX)
11259         {
11260           emit_move_insn (target, constant);
11261           return;
11262         }
11263     }
11264
11265   /* Splat a single non-constant element if we can.  */
11266   if (all_same)
11267     {
11268       rtx x = copy_to_mode_reg (inner_mode, v0);
11269       aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11270       return;
11271     }
11272
11273   /* Initialise a vector which is part-variable.  We want to first try
11274      to build those lanes which are constant in the most efficient way we
11275      can.  */
11276   if (n_var != n_elts)
11277     {
11278       rtx copy = copy_rtx (vals);
11279
11280       /* Load constant part of vector.  We really don't care what goes into the
11281          parts we will overwrite, but we're more likely to be able to load the
11282          constant efficiently if it has fewer, larger, repeating parts
11283          (see aarch64_simd_valid_immediate).  */
11284       for (int i = 0; i < n_elts; i++)
11285         {
11286           rtx x = XVECEXP (vals, 0, i);
11287           if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11288             continue;
11289           rtx subst = any_const;
11290           for (int bit = n_elts / 2; bit > 0; bit /= 2)
11291             {
11292               /* Look in the copied vector, as more elements are const.  */
11293               rtx test = XVECEXP (copy, 0, i ^ bit);
11294               if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11295                 {
11296                   subst = test;
11297                   break;
11298                 }
11299             }
11300           XVECEXP (copy, 0, i) = subst;
11301         }
11302       aarch64_expand_vector_init (target, copy);
11303     }
11304
11305   /* Insert the variable lanes directly.  */
11306
11307   enum insn_code icode = optab_handler (vec_set_optab, mode);
11308   gcc_assert (icode != CODE_FOR_nothing);
11309
11310   for (int i = 0; i < n_elts; i++)
11311     {
11312       rtx x = XVECEXP (vals, 0, i);
11313       if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11314         continue;
11315       x = copy_to_mode_reg (inner_mode, x);
11316       emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
11317     }
11318 }
11319
11320 static unsigned HOST_WIDE_INT
11321 aarch64_shift_truncation_mask (machine_mode mode)
11322 {
11323   return
11324     (!SHIFT_COUNT_TRUNCATED
11325      || aarch64_vector_mode_supported_p (mode)
11326      || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11327 }
11328
11329 /* Select a format to encode pointers in exception handling data.  */
11330 int
11331 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11332 {
11333    int type;
11334    switch (aarch64_cmodel)
11335      {
11336      case AARCH64_CMODEL_TINY:
11337      case AARCH64_CMODEL_TINY_PIC:
11338      case AARCH64_CMODEL_SMALL:
11339      case AARCH64_CMODEL_SMALL_PIC:
11340      case AARCH64_CMODEL_SMALL_SPIC:
11341        /* text+got+data < 4Gb.  4-byte signed relocs are sufficient
11342           for everything.  */
11343        type = DW_EH_PE_sdata4;
11344        break;
11345      default:
11346        /* No assumptions here.  8-byte relocs required.  */
11347        type = DW_EH_PE_sdata8;
11348        break;
11349      }
11350    return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11351 }
11352
11353 /* The last .arch and .tune assembly strings that we printed.  */
11354 static std::string aarch64_last_printed_arch_string;
11355 static std::string aarch64_last_printed_tune_string;
11356
11357 /* Implement ASM_DECLARE_FUNCTION_NAME.  Output the ISA features used
11358    by the function fndecl.  */
11359
11360 void
11361 aarch64_declare_function_name (FILE *stream, const char* name,
11362                                 tree fndecl)
11363 {
11364   tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11365
11366   struct cl_target_option *targ_options;
11367   if (target_parts)
11368     targ_options = TREE_TARGET_OPTION (target_parts);
11369   else
11370     targ_options = TREE_TARGET_OPTION (target_option_current_node);
11371   gcc_assert (targ_options);
11372
11373   const struct processor *this_arch
11374     = aarch64_get_arch (targ_options->x_explicit_arch);
11375
11376   unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11377   std::string extension
11378     = aarch64_get_extension_string_for_isa_flags (isa_flags,
11379                                                   this_arch->flags);
11380   /* Only update the assembler .arch string if it is distinct from the last
11381      such string we printed.  */
11382   std::string to_print = this_arch->name + extension;
11383   if (to_print != aarch64_last_printed_arch_string)
11384     {
11385       asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11386       aarch64_last_printed_arch_string = to_print;
11387     }
11388
11389   /* Print the cpu name we're tuning for in the comments, might be
11390      useful to readers of the generated asm.  Do it only when it changes
11391      from function to function and verbose assembly is requested.  */
11392   const struct processor *this_tune
11393     = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11394
11395   if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11396     {
11397       asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11398                    this_tune->name);
11399       aarch64_last_printed_tune_string = this_tune->name;
11400     }
11401
11402   /* Don't forget the type directive for ELF.  */
11403   ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11404   ASM_OUTPUT_LABEL (stream, name);
11405 }
11406
11407 /* Implements TARGET_ASM_FILE_START.  Output the assembly header.  */
11408
11409 static void
11410 aarch64_start_file (void)
11411 {
11412   struct cl_target_option *default_options
11413     = TREE_TARGET_OPTION (target_option_default_node);
11414
11415   const struct processor *default_arch
11416     = aarch64_get_arch (default_options->x_explicit_arch);
11417   unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11418   std::string extension
11419     = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11420                                                   default_arch->flags);
11421
11422    aarch64_last_printed_arch_string = default_arch->name + extension;
11423    aarch64_last_printed_tune_string = "";
11424    asm_fprintf (asm_out_file, "\t.arch %s\n",
11425                 aarch64_last_printed_arch_string.c_str ());
11426
11427    default_file_start ();
11428 }
11429
11430 /* Emit load exclusive.  */
11431
11432 static void
11433 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11434                              rtx mem, rtx model_rtx)
11435 {
11436   rtx (*gen) (rtx, rtx, rtx);
11437
11438   switch (mode)
11439     {
11440     case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11441     case HImode: gen = gen_aarch64_load_exclusivehi; break;
11442     case SImode: gen = gen_aarch64_load_exclusivesi; break;
11443     case DImode: gen = gen_aarch64_load_exclusivedi; break;
11444     default:
11445       gcc_unreachable ();
11446     }
11447
11448   emit_insn (gen (rval, mem, model_rtx));
11449 }
11450
11451 /* Emit store exclusive.  */
11452
11453 static void
11454 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11455                               rtx rval, rtx mem, rtx model_rtx)
11456 {
11457   rtx (*gen) (rtx, rtx, rtx, rtx);
11458
11459   switch (mode)
11460     {
11461     case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11462     case HImode: gen = gen_aarch64_store_exclusivehi; break;
11463     case SImode: gen = gen_aarch64_store_exclusivesi; break;
11464     case DImode: gen = gen_aarch64_store_exclusivedi; break;
11465     default:
11466       gcc_unreachable ();
11467     }
11468
11469   emit_insn (gen (bval, rval, mem, model_rtx));
11470 }
11471
11472 /* Mark the previous jump instruction as unlikely.  */
11473
11474 static void
11475 aarch64_emit_unlikely_jump (rtx insn)
11476 {
11477   int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11478
11479   insn = emit_jump_insn (insn);
11480   add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11481 }
11482
11483 /* Expand a compare and swap pattern.  */
11484
11485 void
11486 aarch64_expand_compare_and_swap (rtx operands[])
11487 {
11488   rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11489   machine_mode mode, cmp_mode;
11490   typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11491   int idx;
11492   gen_cas_fn gen;
11493   const gen_cas_fn split_cas[] =
11494   {
11495     gen_aarch64_compare_and_swapqi,
11496     gen_aarch64_compare_and_swaphi,
11497     gen_aarch64_compare_and_swapsi,
11498     gen_aarch64_compare_and_swapdi
11499   };
11500   const gen_cas_fn atomic_cas[] =
11501   {
11502     gen_aarch64_compare_and_swapqi_lse,
11503     gen_aarch64_compare_and_swaphi_lse,
11504     gen_aarch64_compare_and_swapsi_lse,
11505     gen_aarch64_compare_and_swapdi_lse
11506   };
11507
11508   bval = operands[0];
11509   rval = operands[1];
11510   mem = operands[2];
11511   oldval = operands[3];
11512   newval = operands[4];
11513   is_weak = operands[5];
11514   mod_s = operands[6];
11515   mod_f = operands[7];
11516   mode = GET_MODE (mem);
11517   cmp_mode = mode;
11518
11519   /* Normally the succ memory model must be stronger than fail, but in the
11520      unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11521      promote succ to ACQ_REL so that we don't lose the acquire semantics.  */
11522
11523   if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11524       && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11525     mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11526
11527   switch (mode)
11528     {
11529     case QImode:
11530     case HImode:
11531       /* For short modes, we're going to perform the comparison in SImode,
11532          so do the zero-extension now.  */
11533       cmp_mode = SImode;
11534       rval = gen_reg_rtx (SImode);
11535       oldval = convert_modes (SImode, mode, oldval, true);
11536       /* Fall through.  */
11537
11538     case SImode:
11539     case DImode:
11540       /* Force the value into a register if needed.  */
11541       if (!aarch64_plus_operand (oldval, mode))
11542         oldval = force_reg (cmp_mode, oldval);
11543       break;
11544
11545     default:
11546       gcc_unreachable ();
11547     }
11548
11549   switch (mode)
11550     {
11551     case QImode: idx = 0; break;
11552     case HImode: idx = 1; break;
11553     case SImode: idx = 2; break;
11554     case DImode: idx = 3; break;
11555     default:
11556       gcc_unreachable ();
11557     }
11558   if (TARGET_LSE)
11559     gen = atomic_cas[idx];
11560   else
11561     gen = split_cas[idx];
11562
11563   emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11564
11565   if (mode == QImode || mode == HImode)
11566     emit_move_insn (operands[1], gen_lowpart (mode, rval));
11567
11568   x = gen_rtx_REG (CCmode, CC_REGNUM);
11569   x = gen_rtx_EQ (SImode, x, const0_rtx);
11570   emit_insn (gen_rtx_SET (bval, x));
11571 }
11572
11573 /* Test whether the target supports using a atomic load-operate instruction.
11574    CODE is the operation and AFTER is TRUE if the data in memory after the
11575    operation should be returned and FALSE if the data before the operation
11576    should be returned.  Returns FALSE if the operation isn't supported by the
11577    architecture.  */
11578
11579 bool
11580 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11581 {
11582   if (!TARGET_LSE)
11583     return false;
11584
11585   switch (code)
11586     {
11587     case SET:
11588     case AND:
11589     case IOR:
11590     case XOR:
11591     case MINUS:
11592     case PLUS:
11593       return true;
11594     default:
11595       return false;
11596     }
11597 }
11598
11599 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
11600    sequence implementing an atomic operation.  */
11601
11602 static void
11603 aarch64_emit_post_barrier (enum memmodel model)
11604 {
11605   const enum memmodel base_model = memmodel_base (model);
11606
11607   if (is_mm_sync (model)
11608       && (base_model == MEMMODEL_ACQUIRE
11609           || base_model == MEMMODEL_ACQ_REL
11610           || base_model == MEMMODEL_SEQ_CST))
11611     {
11612       emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11613     }
11614 }
11615
11616 /* Emit an atomic compare-and-swap operation.  RVAL is the destination register
11617    for the data in memory.  EXPECTED is the value expected to be in memory.
11618    DESIRED is the value to store to memory.  MEM is the memory location.  MODEL
11619    is the memory ordering to use.  */
11620
11621 void
11622 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11623                         rtx expected, rtx desired,
11624                         rtx model)
11625 {
11626   rtx (*gen) (rtx, rtx, rtx, rtx);
11627   machine_mode mode;
11628
11629   mode = GET_MODE (mem);
11630
11631   switch (mode)
11632     {
11633     case QImode: gen = gen_aarch64_atomic_casqi; break;
11634     case HImode: gen = gen_aarch64_atomic_cashi; break;
11635     case SImode: gen = gen_aarch64_atomic_cassi; break;
11636     case DImode: gen = gen_aarch64_atomic_casdi; break;
11637     default:
11638       gcc_unreachable ();
11639     }
11640
11641   /* Move the expected value into the CAS destination register.  */
11642   emit_insn (gen_rtx_SET (rval, expected));
11643
11644   /* Emit the CAS.  */
11645   emit_insn (gen (rval, mem, desired, model));
11646
11647   /* Compare the expected value with the value loaded by the CAS, to establish
11648      whether the swap was made.  */
11649   aarch64_gen_compare_reg (EQ, rval, expected);
11650 }
11651
11652 /* Split a compare and swap pattern.  */
11653
11654 void
11655 aarch64_split_compare_and_swap (rtx operands[])
11656 {
11657   rtx rval, mem, oldval, newval, scratch;
11658   machine_mode mode;
11659   bool is_weak;
11660   rtx_code_label *label1, *label2;
11661   rtx x, cond;
11662   enum memmodel model;
11663   rtx model_rtx;
11664
11665   rval = operands[0];
11666   mem = operands[1];
11667   oldval = operands[2];
11668   newval = operands[3];
11669   is_weak = (operands[4] != const0_rtx);
11670   model_rtx = operands[5];
11671   scratch = operands[7];
11672   mode = GET_MODE (mem);
11673   model = memmodel_from_int (INTVAL (model_rtx));
11674
11675   label1 = NULL;
11676   if (!is_weak)
11677     {
11678       label1 = gen_label_rtx ();
11679       emit_label (label1);
11680     }
11681   label2 = gen_label_rtx ();
11682
11683   /* The initial load can be relaxed for a __sync operation since a final
11684      barrier will be emitted to stop code hoisting.  */
11685   if (is_mm_sync (model))
11686     aarch64_emit_load_exclusive (mode, rval, mem,
11687                                  GEN_INT (MEMMODEL_RELAXED));
11688   else
11689     aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11690
11691   cond = aarch64_gen_compare_reg (NE, rval, oldval);
11692   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11693   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11694                             gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11695   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11696
11697   aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11698
11699   if (!is_weak)
11700     {
11701       x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11702       x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11703                                 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11704       aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11705     }
11706   else
11707     {
11708       cond = gen_rtx_REG (CCmode, CC_REGNUM);
11709       x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11710       emit_insn (gen_rtx_SET (cond, x));
11711     }
11712
11713   emit_label (label2);
11714
11715   /* Emit any final barrier needed for a __sync operation.  */
11716   if (is_mm_sync (model))
11717     aarch64_emit_post_barrier (model);
11718 }
11719
11720 /* Emit a BIC instruction.  */
11721
11722 static void
11723 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11724 {
11725   rtx shift_rtx = GEN_INT (shift);
11726   rtx (*gen) (rtx, rtx, rtx, rtx);
11727
11728   switch (mode)
11729     {
11730     case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11731     case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11732     default:
11733       gcc_unreachable ();
11734     }
11735
11736   emit_insn (gen (dst, s2, shift_rtx, s1));
11737 }
11738
11739 /* Emit an atomic swap.  */
11740
11741 static void
11742 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11743                           rtx mem, rtx model)
11744 {
11745   rtx (*gen) (rtx, rtx, rtx, rtx);
11746
11747   switch (mode)
11748     {
11749     case QImode: gen = gen_aarch64_atomic_swpqi; break;
11750     case HImode: gen = gen_aarch64_atomic_swphi; break;
11751     case SImode: gen = gen_aarch64_atomic_swpsi; break;
11752     case DImode: gen = gen_aarch64_atomic_swpdi; break;
11753     default:
11754       gcc_unreachable ();
11755     }
11756
11757   emit_insn (gen (dst, mem, value, model));
11758 }
11759
11760 /* Operations supported by aarch64_emit_atomic_load_op.  */
11761
11762 enum aarch64_atomic_load_op_code
11763 {
11764   AARCH64_LDOP_PLUS,    /* A + B  */
11765   AARCH64_LDOP_XOR,     /* A ^ B  */
11766   AARCH64_LDOP_OR,      /* A | B  */
11767   AARCH64_LDOP_BIC      /* A & ~B  */
11768 };
11769
11770 /* Emit an atomic load-operate.  */
11771
11772 static void
11773 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11774                              machine_mode mode, rtx dst, rtx src,
11775                              rtx mem, rtx model)
11776 {
11777   typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11778   const aarch64_atomic_load_op_fn plus[] =
11779   {
11780     gen_aarch64_atomic_loadaddqi,
11781     gen_aarch64_atomic_loadaddhi,
11782     gen_aarch64_atomic_loadaddsi,
11783     gen_aarch64_atomic_loadadddi
11784   };
11785   const aarch64_atomic_load_op_fn eor[] =
11786   {
11787     gen_aarch64_atomic_loadeorqi,
11788     gen_aarch64_atomic_loadeorhi,
11789     gen_aarch64_atomic_loadeorsi,
11790     gen_aarch64_atomic_loadeordi
11791   };
11792   const aarch64_atomic_load_op_fn ior[] =
11793   {
11794     gen_aarch64_atomic_loadsetqi,
11795     gen_aarch64_atomic_loadsethi,
11796     gen_aarch64_atomic_loadsetsi,
11797     gen_aarch64_atomic_loadsetdi
11798   };
11799   const aarch64_atomic_load_op_fn bic[] =
11800   {
11801     gen_aarch64_atomic_loadclrqi,
11802     gen_aarch64_atomic_loadclrhi,
11803     gen_aarch64_atomic_loadclrsi,
11804     gen_aarch64_atomic_loadclrdi
11805   };
11806   aarch64_atomic_load_op_fn gen;
11807   int idx = 0;
11808
11809   switch (mode)
11810     {
11811     case QImode: idx = 0; break;
11812     case HImode: idx = 1; break;
11813     case SImode: idx = 2; break;
11814     case DImode: idx = 3; break;
11815     default:
11816       gcc_unreachable ();
11817     }
11818
11819   switch (code)
11820     {
11821     case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11822     case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11823     case AARCH64_LDOP_OR: gen = ior[idx]; break;
11824     case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11825     default:
11826       gcc_unreachable ();
11827     }
11828
11829   emit_insn (gen (dst, mem, src, model));
11830 }
11831
11832 /* Emit an atomic load+operate.  CODE is the operation.  OUT_DATA is the
11833    location to store the data read from memory.  OUT_RESULT is the location to
11834    store the result of the operation.  MEM is the memory location to read and
11835    modify.  MODEL_RTX is the memory ordering to use.  VALUE is the second
11836    operand for the operation.  Either OUT_DATA or OUT_RESULT, but not both, can
11837    be NULL.  */
11838
11839 void
11840 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11841                          rtx mem, rtx value, rtx model_rtx)
11842 {
11843   machine_mode mode = GET_MODE (mem);
11844   machine_mode wmode = (mode == DImode ? DImode : SImode);
11845   const bool short_mode = (mode < SImode);
11846   aarch64_atomic_load_op_code ldop_code;
11847   rtx src;
11848   rtx x;
11849
11850   if (out_data)
11851     out_data = gen_lowpart (mode, out_data);
11852
11853   if (out_result)
11854     out_result = gen_lowpart (mode, out_result);
11855
11856   /* Make sure the value is in a register, putting it into a destination
11857      register if it needs to be manipulated.  */
11858   if (!register_operand (value, mode)
11859       || code == AND || code == MINUS)
11860     {
11861       src = out_result ? out_result : out_data;
11862       emit_move_insn (src, gen_lowpart (mode, value));
11863     }
11864   else
11865     src = value;
11866   gcc_assert (register_operand (src, mode));
11867
11868   /* Preprocess the data for the operation as necessary.  If the operation is
11869      a SET then emit a swap instruction and finish.  */
11870   switch (code)
11871     {
11872     case SET:
11873       aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11874       return;
11875
11876     case MINUS:
11877       /* Negate the value and treat it as a PLUS.  */
11878       {
11879         rtx neg_src;
11880
11881         /* Resize the value if necessary.  */
11882         if (short_mode)
11883           src = gen_lowpart (wmode, src);
11884
11885         neg_src = gen_rtx_NEG (wmode, src);
11886         emit_insn (gen_rtx_SET (src, neg_src));
11887
11888         if (short_mode)
11889           src = gen_lowpart (mode, src);
11890       }
11891       /* Fall-through.  */
11892     case PLUS:
11893       ldop_code = AARCH64_LDOP_PLUS;
11894       break;
11895
11896     case IOR:
11897       ldop_code = AARCH64_LDOP_OR;
11898       break;
11899
11900     case XOR:
11901       ldop_code = AARCH64_LDOP_XOR;
11902       break;
11903
11904     case AND:
11905       {
11906         rtx not_src;
11907
11908         /* Resize the value if necessary.  */
11909         if (short_mode)
11910           src = gen_lowpart (wmode, src);
11911
11912         not_src = gen_rtx_NOT (wmode, src);
11913         emit_insn (gen_rtx_SET (src, not_src));
11914
11915         if (short_mode)
11916           src = gen_lowpart (mode, src);
11917       }
11918       ldop_code = AARCH64_LDOP_BIC;
11919       break;
11920
11921     default:
11922       /* The operation can't be done with atomic instructions.  */
11923       gcc_unreachable ();
11924     }
11925
11926   aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11927
11928   /* If necessary, calculate the data in memory after the update by redoing the
11929      operation from values in registers.  */
11930   if (!out_result)
11931     return;
11932
11933   if (short_mode)
11934     {
11935       src = gen_lowpart (wmode, src);
11936       out_data = gen_lowpart (wmode, out_data);
11937       out_result = gen_lowpart (wmode, out_result);
11938     }
11939
11940   x = NULL_RTX;
11941
11942   switch (code)
11943     {
11944     case MINUS:
11945     case PLUS:
11946       x = gen_rtx_PLUS (wmode, out_data, src);
11947       break;
11948     case IOR:
11949       x = gen_rtx_IOR (wmode, out_data, src);
11950       break;
11951     case XOR:
11952       x = gen_rtx_XOR (wmode, out_data, src);
11953       break;
11954     case AND:
11955       aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11956       return;
11957     default:
11958       gcc_unreachable ();
11959     }
11960
11961   emit_set_insn (out_result, x);
11962
11963   return;
11964 }
11965
11966 /* Split an atomic operation.  */
11967
11968 void
11969 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11970                          rtx value, rtx model_rtx, rtx cond)
11971 {
11972   machine_mode mode = GET_MODE (mem);
11973   machine_mode wmode = (mode == DImode ? DImode : SImode);
11974   const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11975   const bool is_sync = is_mm_sync (model);
11976   rtx_code_label *label;
11977   rtx x;
11978
11979   /* Split the atomic operation into a sequence.  */
11980   label = gen_label_rtx ();
11981   emit_label (label);
11982
11983   if (new_out)
11984     new_out = gen_lowpart (wmode, new_out);
11985   if (old_out)
11986     old_out = gen_lowpart (wmode, old_out);
11987   else
11988     old_out = new_out;
11989   value = simplify_gen_subreg (wmode, value, mode, 0);
11990
11991   /* The initial load can be relaxed for a __sync operation since a final
11992      barrier will be emitted to stop code hoisting.  */
11993  if (is_sync)
11994     aarch64_emit_load_exclusive (mode, old_out, mem,
11995                                  GEN_INT (MEMMODEL_RELAXED));
11996   else
11997     aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11998
11999   switch (code)
12000     {
12001     case SET:
12002       new_out = value;
12003       break;
12004
12005     case NOT:
12006       x = gen_rtx_AND (wmode, old_out, value);
12007       emit_insn (gen_rtx_SET (new_out, x));
12008       x = gen_rtx_NOT (wmode, new_out);
12009       emit_insn (gen_rtx_SET (new_out, x));
12010       break;
12011
12012     case MINUS:
12013       if (CONST_INT_P (value))
12014         {
12015           value = GEN_INT (-INTVAL (value));
12016           code = PLUS;
12017         }
12018       /* Fall through.  */
12019
12020     default:
12021       x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12022       emit_insn (gen_rtx_SET (new_out, x));
12023       break;
12024     }
12025
12026   aarch64_emit_store_exclusive (mode, cond, mem,
12027                                 gen_lowpart (mode, new_out), model_rtx);
12028
12029   x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12030   x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12031                             gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12032   aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12033
12034   /* Emit any final barrier needed for a __sync operation.  */
12035   if (is_sync)
12036     aarch64_emit_post_barrier (model);
12037 }
12038
12039 static void
12040 aarch64_init_libfuncs (void)
12041 {
12042    /* Half-precision float operations.  The compiler handles all operations
12043      with NULL libfuncs by converting to SFmode.  */
12044
12045   /* Conversions.  */
12046   set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12047   set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12048
12049   /* Arithmetic.  */
12050   set_optab_libfunc (add_optab, HFmode, NULL);
12051   set_optab_libfunc (sdiv_optab, HFmode, NULL);
12052   set_optab_libfunc (smul_optab, HFmode, NULL);
12053   set_optab_libfunc (neg_optab, HFmode, NULL);
12054   set_optab_libfunc (sub_optab, HFmode, NULL);
12055
12056   /* Comparisons.  */
12057   set_optab_libfunc (eq_optab, HFmode, NULL);
12058   set_optab_libfunc (ne_optab, HFmode, NULL);
12059   set_optab_libfunc (lt_optab, HFmode, NULL);
12060   set_optab_libfunc (le_optab, HFmode, NULL);
12061   set_optab_libfunc (ge_optab, HFmode, NULL);
12062   set_optab_libfunc (gt_optab, HFmode, NULL);
12063   set_optab_libfunc (unord_optab, HFmode, NULL);
12064 }
12065
12066 /* Target hook for c_mode_for_suffix.  */
12067 static machine_mode
12068 aarch64_c_mode_for_suffix (char suffix)
12069 {
12070   if (suffix == 'q')
12071     return TFmode;
12072
12073   return VOIDmode;
12074 }
12075
12076 /* We can only represent floating point constants which will fit in
12077    "quarter-precision" values.  These values are characterised by
12078    a sign bit, a 4-bit mantissa and a 3-bit exponent.  And are given
12079    by:
12080
12081    (-1)^s * (n/16) * 2^r
12082
12083    Where:
12084      's' is the sign bit.
12085      'n' is an integer in the range 16 <= n <= 31.
12086      'r' is an integer in the range -3 <= r <= 4.  */
12087
12088 /* Return true iff X can be represented by a quarter-precision
12089    floating point immediate operand X.  Note, we cannot represent 0.0.  */
12090 bool
12091 aarch64_float_const_representable_p (rtx x)
12092 {
12093   /* This represents our current view of how many bits
12094      make up the mantissa.  */
12095   int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12096   int exponent;
12097   unsigned HOST_WIDE_INT mantissa, mask;
12098   REAL_VALUE_TYPE r, m;
12099   bool fail;
12100
12101   if (!CONST_DOUBLE_P (x))
12102     return false;
12103
12104   /* We don't support HFmode constants yet.  */
12105   if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12106     return false;
12107
12108   r = *CONST_DOUBLE_REAL_VALUE (x);
12109
12110   /* We cannot represent infinities, NaNs or +/-zero.  We won't
12111      know if we have +zero until we analyse the mantissa, but we
12112      can reject the other invalid values.  */
12113   if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12114       || REAL_VALUE_MINUS_ZERO (r))
12115     return false;
12116
12117   /* Extract exponent.  */
12118   r = real_value_abs (&r);
12119   exponent = REAL_EXP (&r);
12120
12121   /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12122      highest (sign) bit, with a fixed binary point at bit point_pos.
12123      m1 holds the low part of the mantissa, m2 the high part.
12124      WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12125      bits for the mantissa, this can fail (low bits will be lost).  */
12126   real_ldexp (&m, &r, point_pos - exponent);
12127   wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12128
12129   /* If the low part of the mantissa has bits set we cannot represent
12130      the value.  */
12131   if (w.elt (0) != 0)
12132     return false;
12133   /* We have rejected the lower HOST_WIDE_INT, so update our
12134      understanding of how many bits lie in the mantissa and
12135      look only at the high HOST_WIDE_INT.  */
12136   mantissa = w.elt (1);
12137   point_pos -= HOST_BITS_PER_WIDE_INT;
12138
12139   /* We can only represent values with a mantissa of the form 1.xxxx.  */
12140   mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12141   if ((mantissa & mask) != 0)
12142     return false;
12143
12144   /* Having filtered unrepresentable values, we may now remove all
12145      but the highest 5 bits.  */
12146   mantissa >>= point_pos - 5;
12147
12148   /* We cannot represent the value 0.0, so reject it.  This is handled
12149      elsewhere.  */
12150   if (mantissa == 0)
12151     return false;
12152
12153   /* Then, as bit 4 is always set, we can mask it off, leaving
12154      the mantissa in the range [0, 15].  */
12155   mantissa &= ~(1 << 4);
12156   gcc_assert (mantissa <= 15);
12157
12158   /* GCC internally does not use IEEE754-like encoding (where normalized
12159      significands are in the range [1, 2).  GCC uses [0.5, 1) (see real.c).
12160      Our mantissa values are shifted 4 places to the left relative to
12161      normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12162      by 5 places to correct for GCC's representation.  */
12163   exponent = 5 - exponent;
12164
12165   return (exponent >= 0 && exponent <= 7);
12166 }
12167
12168 char*
12169 aarch64_output_simd_mov_immediate (rtx const_vector,
12170                                    machine_mode mode,
12171                                    unsigned width)
12172 {
12173   bool is_valid;
12174   static char templ[40];
12175   const char *mnemonic;
12176   const char *shift_op;
12177   unsigned int lane_count = 0;
12178   char element_char;
12179
12180   struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12181
12182   /* This will return true to show const_vector is legal for use as either
12183      a AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
12184      also update INFO to show how the immediate should be generated.  */
12185   is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12186   gcc_assert (is_valid);
12187
12188   element_char = sizetochar (info.element_width);
12189   lane_count = width / info.element_width;
12190
12191   mode = GET_MODE_INNER (mode);
12192   if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12193     {
12194       gcc_assert (info.shift == 0 && ! info.mvn);
12195       /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12196          move immediate path.  */
12197       if (aarch64_float_const_zero_rtx_p (info.value))
12198         info.value = GEN_INT (0);
12199       else
12200         {
12201           const unsigned int buf_size = 20;
12202           char float_buf[buf_size] = {'\0'};
12203           real_to_decimal_for_mode (float_buf,
12204                                     CONST_DOUBLE_REAL_VALUE (info.value),
12205                                     buf_size, buf_size, 1, mode);
12206
12207           if (lane_count == 1)
12208             snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12209           else
12210             snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12211                       lane_count, element_char, float_buf);
12212           return templ;
12213         }
12214     }
12215
12216   mnemonic = info.mvn ? "mvni" : "movi";
12217   shift_op = info.msl ? "msl" : "lsl";
12218
12219   gcc_assert (CONST_INT_P (info.value));
12220   if (lane_count == 1)
12221     snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12222               mnemonic, UINTVAL (info.value));
12223   else if (info.shift)
12224     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12225               ", %s %d", mnemonic, lane_count, element_char,
12226               UINTVAL (info.value), shift_op, info.shift);
12227   else
12228     snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12229               mnemonic, lane_count, element_char, UINTVAL (info.value));
12230   return templ;
12231 }
12232
12233 char*
12234 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12235                                           machine_mode mode)
12236 {
12237   machine_mode vmode;
12238
12239   gcc_assert (!VECTOR_MODE_P (mode));
12240   vmode = aarch64_simd_container_mode (mode, 64);
12241   rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12242   return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12243 }
12244
12245 /* Split operands into moves from op[1] + op[2] into op[0].  */
12246
12247 void
12248 aarch64_split_combinev16qi (rtx operands[3])
12249 {
12250   unsigned int dest = REGNO (operands[0]);
12251   unsigned int src1 = REGNO (operands[1]);
12252   unsigned int src2 = REGNO (operands[2]);
12253   machine_mode halfmode = GET_MODE (operands[1]);
12254   unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12255   rtx destlo, desthi;
12256
12257   gcc_assert (halfmode == V16QImode);
12258
12259   if (src1 == dest && src2 == dest + halfregs)
12260     {
12261       /* No-op move.  Can't split to nothing; emit something.  */
12262       emit_note (NOTE_INSN_DELETED);
12263       return;
12264     }
12265
12266   /* Preserve register attributes for variable tracking.  */
12267   destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12268   desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12269                                GET_MODE_SIZE (halfmode));
12270
12271   /* Special case of reversed high/low parts.  */
12272   if (reg_overlap_mentioned_p (operands[2], destlo)
12273       && reg_overlap_mentioned_p (operands[1], desthi))
12274     {
12275       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12276       emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12277       emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12278     }
12279   else if (!reg_overlap_mentioned_p (operands[2], destlo))
12280     {
12281       /* Try to avoid unnecessary moves if part of the result
12282          is in the right place already.  */
12283       if (src1 != dest)
12284         emit_move_insn (destlo, operands[1]);
12285       if (src2 != dest + halfregs)
12286         emit_move_insn (desthi, operands[2]);
12287     }
12288   else
12289     {
12290       if (src2 != dest + halfregs)
12291         emit_move_insn (desthi, operands[2]);
12292       if (src1 != dest)
12293         emit_move_insn (destlo, operands[1]);
12294     }
12295 }
12296
12297 /* vec_perm support.  */
12298
12299 #define MAX_VECT_LEN 16
12300
12301 struct expand_vec_perm_d
12302 {
12303   rtx target, op0, op1;
12304   unsigned char perm[MAX_VECT_LEN];
12305   machine_mode vmode;
12306   unsigned char nelt;
12307   bool one_vector_p;
12308   bool testing_p;
12309 };
12310
12311 /* Generate a variable permutation.  */
12312
12313 static void
12314 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12315 {
12316   machine_mode vmode = GET_MODE (target);
12317   bool one_vector_p = rtx_equal_p (op0, op1);
12318
12319   gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12320   gcc_checking_assert (GET_MODE (op0) == vmode);
12321   gcc_checking_assert (GET_MODE (op1) == vmode);
12322   gcc_checking_assert (GET_MODE (sel) == vmode);
12323   gcc_checking_assert (TARGET_SIMD);
12324
12325   if (one_vector_p)
12326     {
12327       if (vmode == V8QImode)
12328         {
12329           /* Expand the argument to a V16QI mode by duplicating it.  */
12330           rtx pair = gen_reg_rtx (V16QImode);
12331           emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12332           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12333         }
12334       else
12335         {
12336           emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12337         }
12338     }
12339   else
12340     {
12341       rtx pair;
12342
12343       if (vmode == V8QImode)
12344         {
12345           pair = gen_reg_rtx (V16QImode);
12346           emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12347           emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12348         }
12349       else
12350         {
12351           pair = gen_reg_rtx (OImode);
12352           emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12353           emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12354         }
12355     }
12356 }
12357
12358 void
12359 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12360 {
12361   machine_mode vmode = GET_MODE (target);
12362   unsigned int nelt = GET_MODE_NUNITS (vmode);
12363   bool one_vector_p = rtx_equal_p (op0, op1);
12364   rtx mask;
12365
12366   /* The TBL instruction does not use a modulo index, so we must take care
12367      of that ourselves.  */
12368   mask = aarch64_simd_gen_const_vector_dup (vmode,
12369       one_vector_p ? nelt - 1 : 2 * nelt - 1);
12370   sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12371
12372   /* For big-endian, we also need to reverse the index within the vector
12373      (but not which vector).  */
12374   if (BYTES_BIG_ENDIAN)
12375     {
12376       /* If one_vector_p, mask is a vector of (nelt - 1)'s already.  */
12377       if (!one_vector_p)
12378         mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12379       sel = expand_simple_binop (vmode, XOR, sel, mask,
12380                                  NULL, 0, OPTAB_LIB_WIDEN);
12381     }
12382   aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12383 }
12384
12385 /* Recognize patterns suitable for the TRN instructions.  */
12386 static bool
12387 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12388 {
12389   unsigned int i, odd, mask, nelt = d->nelt;
12390   rtx out, in0, in1, x;
12391   rtx (*gen) (rtx, rtx, rtx);
12392   machine_mode vmode = d->vmode;
12393
12394   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12395     return false;
12396
12397   /* Note that these are little-endian tests.
12398      We correct for big-endian later.  */
12399   if (d->perm[0] == 0)
12400     odd = 0;
12401   else if (d->perm[0] == 1)
12402     odd = 1;
12403   else
12404     return false;
12405   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12406
12407   for (i = 0; i < nelt; i += 2)
12408     {
12409       if (d->perm[i] != i + odd)
12410         return false;
12411       if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12412         return false;
12413     }
12414
12415   /* Success!  */
12416   if (d->testing_p)
12417     return true;
12418
12419   in0 = d->op0;
12420   in1 = d->op1;
12421   if (BYTES_BIG_ENDIAN)
12422     {
12423       x = in0, in0 = in1, in1 = x;
12424       odd = !odd;
12425     }
12426   out = d->target;
12427
12428   if (odd)
12429     {
12430       switch (vmode)
12431         {
12432         case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12433         case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12434         case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12435         case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12436         case V4SImode: gen = gen_aarch64_trn2v4si; break;
12437         case V2SImode: gen = gen_aarch64_trn2v2si; break;
12438         case V2DImode: gen = gen_aarch64_trn2v2di; break;
12439         case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12440         case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12441         case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12442         case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12443         case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12444         default:
12445           return false;
12446         }
12447     }
12448   else
12449     {
12450       switch (vmode)
12451         {
12452         case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12453         case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12454         case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12455         case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12456         case V4SImode: gen = gen_aarch64_trn1v4si; break;
12457         case V2SImode: gen = gen_aarch64_trn1v2si; break;
12458         case V2DImode: gen = gen_aarch64_trn1v2di; break;
12459         case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12460         case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12461         case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12462         case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12463         case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12464         default:
12465           return false;
12466         }
12467     }
12468
12469   emit_insn (gen (out, in0, in1));
12470   return true;
12471 }
12472
12473 /* Recognize patterns suitable for the UZP instructions.  */
12474 static bool
12475 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12476 {
12477   unsigned int i, odd, mask, nelt = d->nelt;
12478   rtx out, in0, in1, x;
12479   rtx (*gen) (rtx, rtx, rtx);
12480   machine_mode vmode = d->vmode;
12481
12482   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12483     return false;
12484
12485   /* Note that these are little-endian tests.
12486      We correct for big-endian later.  */
12487   if (d->perm[0] == 0)
12488     odd = 0;
12489   else if (d->perm[0] == 1)
12490     odd = 1;
12491   else
12492     return false;
12493   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12494
12495   for (i = 0; i < nelt; i++)
12496     {
12497       unsigned elt = (i * 2 + odd) & mask;
12498       if (d->perm[i] != elt)
12499         return false;
12500     }
12501
12502   /* Success!  */
12503   if (d->testing_p)
12504     return true;
12505
12506   in0 = d->op0;
12507   in1 = d->op1;
12508   if (BYTES_BIG_ENDIAN)
12509     {
12510       x = in0, in0 = in1, in1 = x;
12511       odd = !odd;
12512     }
12513   out = d->target;
12514
12515   if (odd)
12516     {
12517       switch (vmode)
12518         {
12519         case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12520         case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12521         case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12522         case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12523         case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12524         case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12525         case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12526         case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12527         case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12528         case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12529         case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12530         case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12531         default:
12532           return false;
12533         }
12534     }
12535   else
12536     {
12537       switch (vmode)
12538         {
12539         case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12540         case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12541         case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12542         case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12543         case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12544         case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12545         case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12546         case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12547         case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12548         case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12549         case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12550         case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12551         default:
12552           return false;
12553         }
12554     }
12555
12556   emit_insn (gen (out, in0, in1));
12557   return true;
12558 }
12559
12560 /* Recognize patterns suitable for the ZIP instructions.  */
12561 static bool
12562 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12563 {
12564   unsigned int i, high, mask, nelt = d->nelt;
12565   rtx out, in0, in1, x;
12566   rtx (*gen) (rtx, rtx, rtx);
12567   machine_mode vmode = d->vmode;
12568
12569   if (GET_MODE_UNIT_SIZE (vmode) > 8)
12570     return false;
12571
12572   /* Note that these are little-endian tests.
12573      We correct for big-endian later.  */
12574   high = nelt / 2;
12575   if (d->perm[0] == high)
12576     /* Do Nothing.  */
12577     ;
12578   else if (d->perm[0] == 0)
12579     high = 0;
12580   else
12581     return false;
12582   mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12583
12584   for (i = 0; i < nelt / 2; i++)
12585     {
12586       unsigned elt = (i + high) & mask;
12587       if (d->perm[i * 2] != elt)
12588         return false;
12589       elt = (elt + nelt) & mask;
12590       if (d->perm[i * 2 + 1] != elt)
12591         return false;
12592     }
12593
12594   /* Success!  */
12595   if (d->testing_p)
12596     return true;
12597
12598   in0 = d->op0;
12599   in1 = d->op1;
12600   if (BYTES_BIG_ENDIAN)
12601     {
12602       x = in0, in0 = in1, in1 = x;
12603       high = !high;
12604     }
12605   out = d->target;
12606
12607   if (high)
12608     {
12609       switch (vmode)
12610         {
12611         case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12612         case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12613         case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12614         case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12615         case V4SImode: gen = gen_aarch64_zip2v4si; break;
12616         case V2SImode: gen = gen_aarch64_zip2v2si; break;
12617         case V2DImode: gen = gen_aarch64_zip2v2di; break;
12618         case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12619         case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12620         case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12621         case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12622         case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12623         default:
12624           return false;
12625         }
12626     }
12627   else
12628     {
12629       switch (vmode)
12630         {
12631         case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12632         case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12633         case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12634         case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12635         case V4SImode: gen = gen_aarch64_zip1v4si; break;
12636         case V2SImode: gen = gen_aarch64_zip1v2si; break;
12637         case V2DImode: gen = gen_aarch64_zip1v2di; break;
12638         case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12639         case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12640         case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12641         case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12642         case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12643         default:
12644           return false;
12645         }
12646     }
12647
12648   emit_insn (gen (out, in0, in1));
12649   return true;
12650 }
12651
12652 /* Recognize patterns for the EXT insn.  */
12653
12654 static bool
12655 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12656 {
12657   unsigned int i, nelt = d->nelt;
12658   rtx (*gen) (rtx, rtx, rtx, rtx);
12659   rtx offset;
12660
12661   unsigned int location = d->perm[0]; /* Always < nelt.  */
12662
12663   /* Check if the extracted indices are increasing by one.  */
12664   for (i = 1; i < nelt; i++)
12665     {
12666       unsigned int required = location + i;
12667       if (d->one_vector_p)
12668         {
12669           /* We'll pass the same vector in twice, so allow indices to wrap.  */
12670           required &= (nelt - 1);
12671         }
12672       if (d->perm[i] != required)
12673         return false;
12674     }
12675
12676   switch (d->vmode)
12677     {
12678     case V16QImode: gen = gen_aarch64_extv16qi; break;
12679     case V8QImode: gen = gen_aarch64_extv8qi; break;
12680     case V4HImode: gen = gen_aarch64_extv4hi; break;
12681     case V8HImode: gen = gen_aarch64_extv8hi; break;
12682     case V2SImode: gen = gen_aarch64_extv2si; break;
12683     case V4SImode: gen = gen_aarch64_extv4si; break;
12684     case V4HFmode: gen = gen_aarch64_extv4hf; break;
12685     case V8HFmode: gen = gen_aarch64_extv8hf; break;
12686     case V2SFmode: gen = gen_aarch64_extv2sf; break;
12687     case V4SFmode: gen = gen_aarch64_extv4sf; break;
12688     case V2DImode: gen = gen_aarch64_extv2di; break;
12689     case V2DFmode: gen = gen_aarch64_extv2df; break;
12690     default:
12691       return false;
12692     }
12693
12694   /* Success! */
12695   if (d->testing_p)
12696     return true;
12697
12698   /* The case where (location == 0) is a no-op for both big- and little-endian,
12699      and is removed by the mid-end at optimization levels -O1 and higher.  */
12700
12701   if (BYTES_BIG_ENDIAN && (location != 0))
12702     {
12703       /* After setup, we want the high elements of the first vector (stored
12704          at the LSB end of the register), and the low elements of the second
12705          vector (stored at the MSB end of the register). So swap.  */
12706       std::swap (d->op0, d->op1);
12707       /* location != 0 (above), so safe to assume (nelt - location) < nelt.  */
12708       location = nelt - location;
12709     }
12710
12711   offset = GEN_INT (location);
12712   emit_insn (gen (d->target, d->op0, d->op1, offset));
12713   return true;
12714 }
12715
12716 /* Recognize patterns for the REV insns.  */
12717
12718 static bool
12719 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12720 {
12721   unsigned int i, j, diff, nelt = d->nelt;
12722   rtx (*gen) (rtx, rtx);
12723
12724   if (!d->one_vector_p)
12725     return false;
12726
12727   diff = d->perm[0];
12728   switch (diff)
12729     {
12730     case 7:
12731       switch (d->vmode)
12732         {
12733         case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12734         case V8QImode: gen = gen_aarch64_rev64v8qi;  break;
12735         default:
12736           return false;
12737         }
12738       break;
12739     case 3:
12740       switch (d->vmode)
12741         {
12742         case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12743         case V8QImode: gen = gen_aarch64_rev32v8qi;  break;
12744         case V8HImode: gen = gen_aarch64_rev64v8hi;  break;
12745         case V4HImode: gen = gen_aarch64_rev64v4hi;  break;
12746         default:
12747           return false;
12748         }
12749       break;
12750     case 1:
12751       switch (d->vmode)
12752         {
12753         case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12754         case V8QImode: gen = gen_aarch64_rev16v8qi;  break;
12755         case V8HImode: gen = gen_aarch64_rev32v8hi;  break;
12756         case V4HImode: gen = gen_aarch64_rev32v4hi;  break;
12757         case V4SImode: gen = gen_aarch64_rev64v4si;  break;
12758         case V2SImode: gen = gen_aarch64_rev64v2si;  break;
12759         case V4SFmode: gen = gen_aarch64_rev64v4sf;  break;
12760         case V2SFmode: gen = gen_aarch64_rev64v2sf;  break;
12761         case V8HFmode: gen = gen_aarch64_rev64v8hf;  break;
12762         case V4HFmode: gen = gen_aarch64_rev64v4hf;  break;
12763         default:
12764           return false;
12765         }
12766       break;
12767     default:
12768       return false;
12769     }
12770
12771   for (i = 0; i < nelt ; i += diff + 1)
12772     for (j = 0; j <= diff; j += 1)
12773       {
12774         /* This is guaranteed to be true as the value of diff
12775            is 7, 3, 1 and we should have enough elements in the
12776            queue to generate this.  Getting a vector mask with a
12777            value of diff other than these values implies that
12778            something is wrong by the time we get here.  */
12779         gcc_assert (i + j < nelt);
12780         if (d->perm[i + j] != i + diff - j)
12781           return false;
12782       }
12783
12784   /* Success! */
12785   if (d->testing_p)
12786     return true;
12787
12788   emit_insn (gen (d->target, d->op0));
12789   return true;
12790 }
12791
12792 static bool
12793 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12794 {
12795   rtx (*gen) (rtx, rtx, rtx);
12796   rtx out = d->target;
12797   rtx in0;
12798   machine_mode vmode = d->vmode;
12799   unsigned int i, elt, nelt = d->nelt;
12800   rtx lane;
12801
12802   elt = d->perm[0];
12803   for (i = 1; i < nelt; i++)
12804     {
12805       if (elt != d->perm[i])
12806         return false;
12807     }
12808
12809   /* The generic preparation in aarch64_expand_vec_perm_const_1
12810      swaps the operand order and the permute indices if it finds
12811      d->perm[0] to be in the second operand.  Thus, we can always
12812      use d->op0 and need not do any extra arithmetic to get the
12813      correct lane number.  */
12814   in0 = d->op0;
12815   lane = GEN_INT (elt); /* The pattern corrects for big-endian.  */
12816
12817   switch (vmode)
12818     {
12819     case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12820     case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12821     case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12822     case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12823     case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12824     case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12825     case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12826     case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12827     case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12828     case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12829     case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12830     case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12831     default:
12832       return false;
12833     }
12834
12835   emit_insn (gen (out, in0, lane));
12836   return true;
12837 }
12838
12839 static bool
12840 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12841 {
12842   rtx rperm[MAX_VECT_LEN], sel;
12843   machine_mode vmode = d->vmode;
12844   unsigned int i, nelt = d->nelt;
12845
12846   if (d->testing_p)
12847     return true;
12848
12849   /* Generic code will try constant permutation twice.  Once with the
12850      original mode and again with the elements lowered to QImode.
12851      So wait and don't do the selector expansion ourselves.  */
12852   if (vmode != V8QImode && vmode != V16QImode)
12853     return false;
12854
12855   for (i = 0; i < nelt; ++i)
12856     {
12857       int nunits = GET_MODE_NUNITS (vmode);
12858
12859       /* If big-endian and two vectors we end up with a weird mixed-endian
12860          mode on NEON.  Reverse the index within each word but not the word
12861          itself.  */
12862       rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12863                                            : d->perm[i]);
12864     }
12865   sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12866   sel = force_reg (vmode, sel);
12867
12868   aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12869   return true;
12870 }
12871
12872 static bool
12873 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12874 {
12875   /* The pattern matching functions above are written to look for a small
12876      number to begin the sequence (0, 1, N/2).  If we begin with an index
12877      from the second operand, we can swap the operands.  */
12878   if (d->perm[0] >= d->nelt)
12879     {
12880       unsigned i, nelt = d->nelt;
12881
12882       gcc_assert (nelt == (nelt & -nelt));
12883       for (i = 0; i < nelt; ++i)
12884         d->perm[i] ^= nelt; /* Keep the same index, but in the other vector.  */
12885
12886       std::swap (d->op0, d->op1);
12887     }
12888
12889   if (TARGET_SIMD)
12890     {
12891       if (aarch64_evpc_rev (d))
12892         return true;
12893       else if (aarch64_evpc_ext (d))
12894         return true;
12895       else if (aarch64_evpc_dup (d))
12896         return true;
12897       else if (aarch64_evpc_zip (d))
12898         return true;
12899       else if (aarch64_evpc_uzp (d))
12900         return true;
12901       else if (aarch64_evpc_trn (d))
12902         return true;
12903       return aarch64_evpc_tbl (d);
12904     }
12905   return false;
12906 }
12907
12908 /* Expand a vec_perm_const pattern.  */
12909
12910 bool
12911 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12912 {
12913   struct expand_vec_perm_d d;
12914   int i, nelt, which;
12915
12916   d.target = target;
12917   d.op0 = op0;
12918   d.op1 = op1;
12919
12920   d.vmode = GET_MODE (target);
12921   gcc_assert (VECTOR_MODE_P (d.vmode));
12922   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12923   d.testing_p = false;
12924
12925   for (i = which = 0; i < nelt; ++i)
12926     {
12927       rtx e = XVECEXP (sel, 0, i);
12928       int ei = INTVAL (e) & (2 * nelt - 1);
12929       which |= (ei < nelt ? 1 : 2);
12930       d.perm[i] = ei;
12931     }
12932
12933   switch (which)
12934     {
12935     default:
12936       gcc_unreachable ();
12937
12938     case 3:
12939       d.one_vector_p = false;
12940       if (!rtx_equal_p (op0, op1))
12941         break;
12942
12943       /* The elements of PERM do not suggest that only the first operand
12944          is used, but both operands are identical.  Allow easier matching
12945          of the permutation by folding the permutation into the single
12946          input vector.  */
12947       /* Fall Through.  */
12948     case 2:
12949       for (i = 0; i < nelt; ++i)
12950         d.perm[i] &= nelt - 1;
12951       d.op0 = op1;
12952       d.one_vector_p = true;
12953       break;
12954
12955     case 1:
12956       d.op1 = op0;
12957       d.one_vector_p = true;
12958       break;
12959     }
12960
12961   return aarch64_expand_vec_perm_const_1 (&d);
12962 }
12963
12964 static bool
12965 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12966                                      const unsigned char *sel)
12967 {
12968   struct expand_vec_perm_d d;
12969   unsigned int i, nelt, which;
12970   bool ret;
12971
12972   d.vmode = vmode;
12973   d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12974   d.testing_p = true;
12975   memcpy (d.perm, sel, nelt);
12976
12977   /* Calculate whether all elements are in one vector.  */
12978   for (i = which = 0; i < nelt; ++i)
12979     {
12980       unsigned char e = d.perm[i];
12981       gcc_assert (e < 2 * nelt);
12982       which |= (e < nelt ? 1 : 2);
12983     }
12984
12985   /* If all elements are from the second vector, reindex as if from the
12986      first vector.  */
12987   if (which == 2)
12988     for (i = 0; i < nelt; ++i)
12989       d.perm[i] -= nelt;
12990
12991   /* Check whether the mask can be applied to a single vector.  */
12992   d.one_vector_p = (which != 3);
12993
12994   d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12995   d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12996   if (!d.one_vector_p)
12997     d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12998
12999   start_sequence ();
13000   ret = aarch64_expand_vec_perm_const_1 (&d);
13001   end_sequence ();
13002
13003   return ret;
13004 }
13005
13006 rtx
13007 aarch64_reverse_mask (enum machine_mode mode)
13008 {
13009   /* We have to reverse each vector because we dont have
13010      a permuted load that can reverse-load according to ABI rules.  */
13011   rtx mask;
13012   rtvec v = rtvec_alloc (16);
13013   int i, j;
13014   int nunits = GET_MODE_NUNITS (mode);
13015   int usize = GET_MODE_UNIT_SIZE (mode);
13016
13017   gcc_assert (BYTES_BIG_ENDIAN);
13018   gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13019
13020   for (i = 0; i < nunits; i++)
13021     for (j = 0; j < usize; j++)
13022       RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13023   mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13024   return force_reg (V16QImode, mask);
13025 }
13026
13027 /* Implement MODES_TIEABLE_P.  In principle we should always return true.
13028    However due to issues with register allocation it is preferable to avoid
13029    tieing integer scalar and FP scalar modes.  Executing integer operations
13030    in general registers is better than treating them as scalar vector
13031    operations.  This reduces latency and avoids redundant int<->FP moves.
13032    So tie modes if they are either the same class, or vector modes with
13033    other vector modes, vector structs or any scalar mode.
13034 */
13035
13036 bool
13037 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13038 {
13039   if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13040     return true;
13041
13042   /* We specifically want to allow elements of "structure" modes to
13043      be tieable to the structure.  This more general condition allows
13044      other rarer situations too.  */
13045   if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13046     return true;
13047
13048   /* Also allow any scalar modes with vectors.  */
13049   if (aarch64_vector_mode_supported_p (mode1)
13050       || aarch64_vector_mode_supported_p (mode2))
13051     return true;
13052
13053   return false;
13054 }
13055
13056 /* Return a new RTX holding the result of moving POINTER forward by
13057    AMOUNT bytes.  */
13058
13059 static rtx
13060 aarch64_move_pointer (rtx pointer, int amount)
13061 {
13062   rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13063
13064   return adjust_automodify_address (pointer, GET_MODE (pointer),
13065                                     next, amount);
13066 }
13067
13068 /* Return a new RTX holding the result of moving POINTER forward by the
13069    size of the mode it points to.  */
13070
13071 static rtx
13072 aarch64_progress_pointer (rtx pointer)
13073 {
13074   HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13075
13076   return aarch64_move_pointer (pointer, amount);
13077 }
13078
13079 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13080    MODE bytes.  */
13081
13082 static void
13083 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13084                                               machine_mode mode)
13085 {
13086   rtx reg = gen_reg_rtx (mode);
13087
13088   /* "Cast" the pointers to the correct mode.  */
13089   *src = adjust_address (*src, mode, 0);
13090   *dst = adjust_address (*dst, mode, 0);
13091   /* Emit the memcpy.  */
13092   emit_move_insn (reg, *src);
13093   emit_move_insn (*dst, reg);
13094   /* Move the pointers forward.  */
13095   *src = aarch64_progress_pointer (*src);
13096   *dst = aarch64_progress_pointer (*dst);
13097 }
13098
13099 /* Expand movmem, as if from a __builtin_memcpy.  Return true if
13100    we succeed, otherwise return false.  */
13101
13102 bool
13103 aarch64_expand_movmem (rtx *operands)
13104 {
13105   unsigned int n;
13106   rtx dst = operands[0];
13107   rtx src = operands[1];
13108   rtx base;
13109   bool speed_p = !optimize_function_for_size_p (cfun);
13110
13111   /* When optimizing for size, give a better estimate of the length of a
13112      memcpy call, but use the default otherwise.  */
13113   unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13114
13115   /* We can't do anything smart if the amount to copy is not constant.  */
13116   if (!CONST_INT_P (operands[2]))
13117     return false;
13118
13119   n = UINTVAL (operands[2]);
13120
13121   /* Try to keep the number of instructions low.  For cases below 16 bytes we
13122      need to make at most two moves.  For cases above 16 bytes it will be one
13123      move for each 16 byte chunk, then at most two additional moves.  */
13124   if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13125     return false;
13126
13127   base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13128   dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13129
13130   base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13131   src = adjust_automodify_address (src, VOIDmode, base, 0);
13132
13133   /* Simple cases.  Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13134      1-byte chunk.  */
13135   if (n < 4)
13136     {
13137       if (n >= 2)
13138         {
13139           aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13140           n -= 2;
13141         }
13142
13143       if (n == 1)
13144         aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13145
13146       return true;
13147     }
13148
13149   /* Copy 4-8 bytes.  First a 4-byte chunk, then (if applicable) a second
13150      4-byte chunk, partially overlapping with the previously copied chunk.  */
13151   if (n < 8)
13152     {
13153       aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13154       n -= 4;
13155       if (n > 0)
13156         {
13157           int move = n - 4;
13158
13159           src = aarch64_move_pointer (src, move);
13160           dst = aarch64_move_pointer (dst, move);
13161           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13162         }
13163       return true;
13164     }
13165
13166   /* Copy more than 8 bytes.  Copy chunks of 16 bytes until we run out of
13167      them, then (if applicable) an 8-byte chunk.  */
13168   while (n >= 8)
13169     {
13170       if (n / 16)
13171         {
13172           aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13173           n -= 16;
13174         }
13175       else
13176         {
13177           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13178           n -= 8;
13179         }
13180     }
13181
13182   /* Finish the final bytes of the copy.  We can always do this in one
13183      instruction.  We either copy the exact amount we need, or partially
13184      overlap with the previous chunk we copied and copy 8-bytes.  */
13185   if (n == 0)
13186     return true;
13187   else if (n == 1)
13188     aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13189   else if (n == 2)
13190     aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13191   else if (n == 4)
13192     aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13193   else
13194     {
13195       if (n == 3)
13196         {
13197           src = aarch64_move_pointer (src, -1);
13198           dst = aarch64_move_pointer (dst, -1);
13199           aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13200         }
13201       else
13202         {
13203           int move = n - 8;
13204
13205           src = aarch64_move_pointer (src, move);
13206           dst = aarch64_move_pointer (dst, move);
13207           aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13208         }
13209     }
13210
13211   return true;
13212 }
13213
13214 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13215    SImode stores.  Handle the case when the constant has identical
13216    bottom and top halves.  This is beneficial when the two stores can be
13217    merged into an STP and we avoid synthesising potentially expensive
13218    immediates twice.  Return true if such a split is possible.  */
13219
13220 bool
13221 aarch64_split_dimode_const_store (rtx dst, rtx src)
13222 {
13223   rtx lo = gen_lowpart (SImode, src);
13224   rtx hi = gen_highpart_mode (SImode, DImode, src);
13225
13226   bool size_p = optimize_function_for_size_p (cfun);
13227
13228   if (!rtx_equal_p (lo, hi))
13229     return false;
13230
13231   unsigned int orig_cost
13232     = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13233   unsigned int lo_cost
13234     = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13235
13236   /* We want to transform:
13237      MOV        x1, 49370
13238      MOVK       x1, 0x140, lsl 16
13239      MOVK       x1, 0xc0da, lsl 32
13240      MOVK       x1, 0x140, lsl 48
13241      STR        x1, [x0]
13242    into:
13243      MOV        w1, 49370
13244      MOVK       w1, 0x140, lsl 16
13245      STP        w1, w1, [x0]
13246    So we want to perform this only when we save two instructions
13247    or more.  When optimizing for size, however, accept any code size
13248    savings we can.  */
13249   if (size_p && orig_cost <= lo_cost)
13250     return false;
13251
13252   if (!size_p
13253       && (orig_cost <= lo_cost + 1))
13254     return false;
13255
13256   rtx mem_lo = adjust_address (dst, SImode, 0);
13257   if (!aarch64_mem_pair_operand (mem_lo, SImode))
13258     return false;
13259
13260   rtx tmp_reg = gen_reg_rtx (SImode);
13261   aarch64_expand_mov_immediate (tmp_reg, lo);
13262   rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13263   /* Don't emit an explicit store pair as this may not be always profitable.
13264      Let the sched-fusion logic decide whether to merge them.  */
13265   emit_move_insn (mem_lo, tmp_reg);
13266   emit_move_insn (mem_hi, tmp_reg);
13267
13268   return true;
13269 }
13270
13271 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
13272
13273 static unsigned HOST_WIDE_INT
13274 aarch64_asan_shadow_offset (void)
13275 {
13276   return (HOST_WIDE_INT_1 << 36);
13277 }
13278
13279 static bool
13280 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13281                                         unsigned int align,
13282                                         enum by_pieces_operation op,
13283                                         bool speed_p)
13284 {
13285   /* STORE_BY_PIECES can be used when copying a constant string, but
13286      in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13287      For now we always fail this and let the move_by_pieces code copy
13288      the string from read-only memory.  */
13289   if (op == STORE_BY_PIECES)
13290     return false;
13291
13292   return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13293 }
13294
13295 static rtx
13296 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13297                         int code, tree treeop0, tree treeop1)
13298 {
13299   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13300   rtx op0, op1;
13301   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13302   insn_code icode;
13303   struct expand_operand ops[4];
13304
13305   start_sequence ();
13306   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13307
13308   op_mode = GET_MODE (op0);
13309   if (op_mode == VOIDmode)
13310     op_mode = GET_MODE (op1);
13311
13312   switch (op_mode)
13313     {
13314     case QImode:
13315     case HImode:
13316     case SImode:
13317       cmp_mode = SImode;
13318       icode = CODE_FOR_cmpsi;
13319       break;
13320
13321     case DImode:
13322       cmp_mode = DImode;
13323       icode = CODE_FOR_cmpdi;
13324       break;
13325
13326     case SFmode:
13327       cmp_mode = SFmode;
13328       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13329       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13330       break;
13331
13332     case DFmode:
13333       cmp_mode = DFmode;
13334       cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13335       icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13336       break;
13337
13338     default:
13339       end_sequence ();
13340       return NULL_RTX;
13341     }
13342
13343   op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13344   op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13345   if (!op0 || !op1)
13346     {
13347       end_sequence ();
13348       return NULL_RTX;
13349     }
13350   *prep_seq = get_insns ();
13351   end_sequence ();
13352
13353   create_fixed_operand (&ops[0], op0);
13354   create_fixed_operand (&ops[1], op1);
13355
13356   start_sequence ();
13357   if (!maybe_expand_insn (icode, 2, ops))
13358     {
13359       end_sequence ();
13360       return NULL_RTX;
13361     }
13362   *gen_seq = get_insns ();
13363   end_sequence ();
13364
13365   return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13366                          gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13367 }
13368
13369 static rtx
13370 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13371                        int cmp_code, tree treeop0, tree treeop1, int bit_code)
13372 {
13373   rtx op0, op1, target;
13374   machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13375   int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13376   insn_code icode;
13377   struct expand_operand ops[6];
13378   int aarch64_cond;
13379
13380   push_to_sequence (*prep_seq);
13381   expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13382
13383   op_mode = GET_MODE (op0);
13384   if (op_mode == VOIDmode)
13385     op_mode = GET_MODE (op1);
13386
13387   switch (op_mode)
13388     {
13389     case QImode:
13390     case HImode:
13391     case SImode:
13392       cmp_mode = SImode;
13393       icode = CODE_FOR_ccmpsi;
13394       break;
13395
13396     case DImode:
13397       cmp_mode = DImode;
13398       icode = CODE_FOR_ccmpdi;
13399       break;
13400
13401     case SFmode:
13402       cmp_mode = SFmode;
13403       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13404       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13405       break;
13406
13407     case DFmode:
13408       cmp_mode = DFmode;
13409       cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13410       icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13411       break;
13412
13413     default:
13414       end_sequence ();
13415       return NULL_RTX;
13416     }
13417
13418   op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13419   op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13420   if (!op0 || !op1)
13421     {
13422       end_sequence ();
13423       return NULL_RTX;
13424     }
13425   *prep_seq = get_insns ();
13426   end_sequence ();
13427
13428   target = gen_rtx_REG (cc_mode, CC_REGNUM);
13429   aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13430
13431   if (bit_code != AND)
13432     {
13433       prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13434                                                 GET_MODE (XEXP (prev, 0))),
13435                              VOIDmode, XEXP (prev, 0), const0_rtx);
13436       aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13437     }
13438
13439   create_fixed_operand (&ops[0], XEXP (prev, 0));
13440   create_fixed_operand (&ops[1], target);
13441   create_fixed_operand (&ops[2], op0);
13442   create_fixed_operand (&ops[3], op1);
13443   create_fixed_operand (&ops[4], prev);
13444   create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13445
13446   push_to_sequence (*gen_seq);
13447   if (!maybe_expand_insn (icode, 6, ops))
13448     {
13449       end_sequence ();
13450       return NULL_RTX;
13451     }
13452
13453   *gen_seq = get_insns ();
13454   end_sequence ();
13455
13456   return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13457 }
13458
13459 #undef TARGET_GEN_CCMP_FIRST
13460 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13461
13462 #undef TARGET_GEN_CCMP_NEXT
13463 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13464
13465 /* Implement TARGET_SCHED_MACRO_FUSION_P.  Return true if target supports
13466    instruction fusion of some sort.  */
13467
13468 static bool
13469 aarch64_macro_fusion_p (void)
13470 {
13471   return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13472 }
13473
13474
13475 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P.  Return true if PREV and CURR
13476    should be kept together during scheduling.  */
13477
13478 static bool
13479 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13480 {
13481   rtx set_dest;
13482   rtx prev_set = single_set (prev);
13483   rtx curr_set = single_set (curr);
13484   /* prev and curr are simple SET insns i.e. no flag setting or branching.  */
13485   bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13486
13487   if (!aarch64_macro_fusion_p ())
13488     return false;
13489
13490   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13491     {
13492       /* We are trying to match:
13493          prev (mov)  == (set (reg r0) (const_int imm16))
13494          curr (movk) == (set (zero_extract (reg r0)
13495                                            (const_int 16)
13496                                            (const_int 16))
13497                              (const_int imm16_1))  */
13498
13499       set_dest = SET_DEST (curr_set);
13500
13501       if (GET_CODE (set_dest) == ZERO_EXTRACT
13502           && CONST_INT_P (SET_SRC (curr_set))
13503           && CONST_INT_P (SET_SRC (prev_set))
13504           && CONST_INT_P (XEXP (set_dest, 2))
13505           && INTVAL (XEXP (set_dest, 2)) == 16
13506           && REG_P (XEXP (set_dest, 0))
13507           && REG_P (SET_DEST (prev_set))
13508           && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13509         {
13510           return true;
13511         }
13512     }
13513
13514   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13515     {
13516
13517       /*  We're trying to match:
13518           prev (adrp) == (set (reg r1)
13519                               (high (symbol_ref ("SYM"))))
13520           curr (add) == (set (reg r0)
13521                              (lo_sum (reg r1)
13522                                      (symbol_ref ("SYM"))))
13523           Note that r0 need not necessarily be the same as r1, especially
13524           during pre-regalloc scheduling.  */
13525
13526       if (satisfies_constraint_Ush (SET_SRC (prev_set))
13527           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13528         {
13529           if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13530               && REG_P (XEXP (SET_SRC (curr_set), 0))
13531               && REGNO (XEXP (SET_SRC (curr_set), 0))
13532                  == REGNO (SET_DEST (prev_set))
13533               && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13534                               XEXP (SET_SRC (curr_set), 1)))
13535             return true;
13536         }
13537     }
13538
13539   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13540     {
13541
13542       /* We're trying to match:
13543          prev (movk) == (set (zero_extract (reg r0)
13544                                            (const_int 16)
13545                                            (const_int 32))
13546                              (const_int imm16_1))
13547          curr (movk) == (set (zero_extract (reg r0)
13548                                            (const_int 16)
13549                                            (const_int 48))
13550                              (const_int imm16_2))  */
13551
13552       if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13553           && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13554           && REG_P (XEXP (SET_DEST (prev_set), 0))
13555           && REG_P (XEXP (SET_DEST (curr_set), 0))
13556           && REGNO (XEXP (SET_DEST (prev_set), 0))
13557              == REGNO (XEXP (SET_DEST (curr_set), 0))
13558           && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13559           && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13560           && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13561           && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13562           && CONST_INT_P (SET_SRC (prev_set))
13563           && CONST_INT_P (SET_SRC (curr_set)))
13564         return true;
13565
13566     }
13567   if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13568     {
13569       /* We're trying to match:
13570           prev (adrp) == (set (reg r0)
13571                               (high (symbol_ref ("SYM"))))
13572           curr (ldr) == (set (reg r1)
13573                              (mem (lo_sum (reg r0)
13574                                              (symbol_ref ("SYM")))))
13575                  or
13576           curr (ldr) == (set (reg r1)
13577                              (zero_extend (mem
13578                                            (lo_sum (reg r0)
13579                                                    (symbol_ref ("SYM"))))))  */
13580       if (satisfies_constraint_Ush (SET_SRC (prev_set))
13581           && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13582         {
13583           rtx curr_src = SET_SRC (curr_set);
13584
13585           if (GET_CODE (curr_src) == ZERO_EXTEND)
13586             curr_src = XEXP (curr_src, 0);
13587
13588           if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13589               && REG_P (XEXP (XEXP (curr_src, 0), 0))
13590               && REGNO (XEXP (XEXP (curr_src, 0), 0))
13591                  == REGNO (SET_DEST (prev_set))
13592               && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13593                               XEXP (SET_SRC (prev_set), 0)))
13594               return true;
13595         }
13596     }
13597
13598   if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13599        && aarch_crypto_can_dual_issue (prev, curr))
13600     return true;
13601
13602   if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13603       && any_condjump_p (curr))
13604     {
13605       enum attr_type prev_type = get_attr_type (prev);
13606
13607       /* FIXME: this misses some which is considered simple arthematic
13608          instructions for ThunderX.  Simple shifts are missed here.  */
13609       if (prev_type == TYPE_ALUS_SREG
13610           || prev_type == TYPE_ALUS_IMM
13611           || prev_type == TYPE_LOGICS_REG
13612           || prev_type == TYPE_LOGICS_IMM)
13613         return true;
13614     }
13615
13616   return false;
13617 }
13618
13619 /* Return true iff the instruction fusion described by OP is enabled.  */
13620
13621 bool
13622 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13623 {
13624   return (aarch64_tune_params.fusible_ops & op) != 0;
13625 }
13626
13627 /* If MEM is in the form of [base+offset], extract the two parts
13628    of address and set to BASE and OFFSET, otherwise return false
13629    after clearing BASE and OFFSET.  */
13630
13631 bool
13632 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13633 {
13634   rtx addr;
13635
13636   gcc_assert (MEM_P (mem));
13637
13638   addr = XEXP (mem, 0);
13639
13640   if (REG_P (addr))
13641     {
13642       *base = addr;
13643       *offset = const0_rtx;
13644       return true;
13645     }
13646
13647   if (GET_CODE (addr) == PLUS
13648       && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13649     {
13650       *base = XEXP (addr, 0);
13651       *offset = XEXP (addr, 1);
13652       return true;
13653     }
13654
13655   *base = NULL_RTX;
13656   *offset = NULL_RTX;
13657
13658   return false;
13659 }
13660
13661 /* Types for scheduling fusion.  */
13662 enum sched_fusion_type
13663 {
13664   SCHED_FUSION_NONE = 0,
13665   SCHED_FUSION_LD_SIGN_EXTEND,
13666   SCHED_FUSION_LD_ZERO_EXTEND,
13667   SCHED_FUSION_LD,
13668   SCHED_FUSION_ST,
13669   SCHED_FUSION_NUM
13670 };
13671
13672 /* If INSN is a load or store of address in the form of [base+offset],
13673    extract the two parts and set to BASE and OFFSET.  Return scheduling
13674    fusion type this INSN is.  */
13675
13676 static enum sched_fusion_type
13677 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13678 {
13679   rtx x, dest, src;
13680   enum sched_fusion_type fusion = SCHED_FUSION_LD;
13681
13682   gcc_assert (INSN_P (insn));
13683   x = PATTERN (insn);
13684   if (GET_CODE (x) != SET)
13685     return SCHED_FUSION_NONE;
13686
13687   src = SET_SRC (x);
13688   dest = SET_DEST (x);
13689
13690   machine_mode dest_mode = GET_MODE (dest);
13691
13692   if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13693     return SCHED_FUSION_NONE;
13694
13695   if (GET_CODE (src) == SIGN_EXTEND)
13696     {
13697       fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13698       src = XEXP (src, 0);
13699       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13700         return SCHED_FUSION_NONE;
13701     }
13702   else if (GET_CODE (src) == ZERO_EXTEND)
13703     {
13704       fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13705       src = XEXP (src, 0);
13706       if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13707         return SCHED_FUSION_NONE;
13708     }
13709
13710   if (GET_CODE (src) == MEM && REG_P (dest))
13711     extract_base_offset_in_addr (src, base, offset);
13712   else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13713     {
13714       fusion = SCHED_FUSION_ST;
13715       extract_base_offset_in_addr (dest, base, offset);
13716     }
13717   else
13718     return SCHED_FUSION_NONE;
13719
13720   if (*base == NULL_RTX || *offset == NULL_RTX)
13721     fusion = SCHED_FUSION_NONE;
13722
13723   return fusion;
13724 }
13725
13726 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13727
13728    Currently we only support to fuse ldr or str instructions, so FUSION_PRI
13729    and PRI are only calculated for these instructions.  For other instruction,
13730    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
13731    type instruction fusion can be added by returning different priorities.
13732
13733    It's important that irrelevant instructions get the largest FUSION_PRI.  */
13734
13735 static void
13736 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13737                                int *fusion_pri, int *pri)
13738 {
13739   int tmp, off_val;
13740   rtx base, offset;
13741   enum sched_fusion_type fusion;
13742
13743   gcc_assert (INSN_P (insn));
13744
13745   tmp = max_pri - 1;
13746   fusion = fusion_load_store (insn, &base, &offset);
13747   if (fusion == SCHED_FUSION_NONE)
13748     {
13749       *pri = tmp;
13750       *fusion_pri = tmp;
13751       return;
13752     }
13753
13754   /* Set FUSION_PRI according to fusion type and base register.  */
13755   *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13756
13757   /* Calculate PRI.  */
13758   tmp /= 2;
13759
13760   /* INSN with smaller offset goes first.  */
13761   off_val = (int)(INTVAL (offset));
13762   if (off_val >= 0)
13763     tmp -= (off_val & 0xfffff);
13764   else
13765     tmp += ((- off_val) & 0xfffff);
13766
13767   *pri = tmp;
13768   return;
13769 }
13770
13771 /* Given OPERANDS of consecutive load/store, check if we can merge
13772    them into ldp/stp.  LOAD is true if they are load instructions.
13773    MODE is the mode of memory operands.  */
13774
13775 bool
13776 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13777                                 enum machine_mode mode)
13778 {
13779   HOST_WIDE_INT offval_1, offval_2, msize;
13780   enum reg_class rclass_1, rclass_2;
13781   rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13782
13783   if (load)
13784     {
13785       mem_1 = operands[1];
13786       mem_2 = operands[3];
13787       reg_1 = operands[0];
13788       reg_2 = operands[2];
13789       gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13790       if (REGNO (reg_1) == REGNO (reg_2))
13791         return false;
13792     }
13793   else
13794     {
13795       mem_1 = operands[0];
13796       mem_2 = operands[2];
13797       reg_1 = operands[1];
13798       reg_2 = operands[3];
13799     }
13800
13801   /* The mems cannot be volatile.  */
13802   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13803     return false;
13804
13805   /* If we have SImode and slow unaligned ldp,
13806      check the alignment to be at least 8 byte. */
13807   if (mode == SImode
13808       && (aarch64_tune_params.extra_tuning_flags
13809           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
13810       && !optimize_size
13811       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
13812     return false;
13813
13814   /* Check if the addresses are in the form of [base+offset].  */
13815   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13816   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13817     return false;
13818   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13819   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13820     return false;
13821
13822   /* Check if the bases are same.  */
13823   if (!rtx_equal_p (base_1, base_2))
13824     return false;
13825
13826   offval_1 = INTVAL (offset_1);
13827   offval_2 = INTVAL (offset_2);
13828   msize = GET_MODE_SIZE (mode);
13829   /* Check if the offsets are consecutive.  */
13830   if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13831     return false;
13832
13833   /* Check if the addresses are clobbered by load.  */
13834   if (load)
13835     {
13836       if (reg_mentioned_p (reg_1, mem_1))
13837         return false;
13838
13839       /* In increasing order, the last load can clobber the address.  */
13840       if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13841       return false;
13842     }
13843
13844   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13845     rclass_1 = FP_REGS;
13846   else
13847     rclass_1 = GENERAL_REGS;
13848
13849   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13850     rclass_2 = FP_REGS;
13851   else
13852     rclass_2 = GENERAL_REGS;
13853
13854   /* Check if the registers are of same class.  */
13855   if (rclass_1 != rclass_2)
13856     return false;
13857
13858   return true;
13859 }
13860
13861 /* Given OPERANDS of consecutive load/store, check if we can merge
13862    them into ldp/stp by adjusting the offset.  LOAD is true if they
13863    are load instructions.  MODE is the mode of memory operands.
13864
13865    Given below consecutive stores:
13866
13867      str  w1, [xb, 0x100]
13868      str  w1, [xb, 0x104]
13869      str  w1, [xb, 0x108]
13870      str  w1, [xb, 0x10c]
13871
13872    Though the offsets are out of the range supported by stp, we can
13873    still pair them after adjusting the offset, like:
13874
13875      add  scratch, xb, 0x100
13876      stp  w1, w1, [scratch]
13877      stp  w1, w1, [scratch, 0x8]
13878
13879    The peephole patterns detecting this opportunity should guarantee
13880    the scratch register is avaliable.  */
13881
13882 bool
13883 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13884                                        enum machine_mode mode)
13885 {
13886   enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13887   HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13888   rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13889   rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13890
13891   if (load)
13892     {
13893       reg_1 = operands[0];
13894       mem_1 = operands[1];
13895       reg_2 = operands[2];
13896       mem_2 = operands[3];
13897       reg_3 = operands[4];
13898       mem_3 = operands[5];
13899       reg_4 = operands[6];
13900       mem_4 = operands[7];
13901       gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13902                   && REG_P (reg_3) && REG_P (reg_4));
13903       if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13904         return false;
13905     }
13906   else
13907     {
13908       mem_1 = operands[0];
13909       reg_1 = operands[1];
13910       mem_2 = operands[2];
13911       reg_2 = operands[3];
13912       mem_3 = operands[4];
13913       reg_3 = operands[5];
13914       mem_4 = operands[6];
13915       reg_4 = operands[7];
13916     }
13917   /* Skip if memory operand is by itslef valid for ldp/stp.  */
13918   if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13919     return false;
13920
13921   /* The mems cannot be volatile.  */
13922   if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13923       || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
13924     return false;
13925
13926   /* Check if the addresses are in the form of [base+offset].  */
13927   extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13928   if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13929     return false;
13930   extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13931   if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13932     return false;
13933   extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13934   if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13935     return false;
13936   extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13937   if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13938     return false;
13939
13940   /* Check if the bases are same.  */
13941   if (!rtx_equal_p (base_1, base_2)
13942       || !rtx_equal_p (base_2, base_3)
13943       || !rtx_equal_p (base_3, base_4))
13944     return false;
13945
13946   offval_1 = INTVAL (offset_1);
13947   offval_2 = INTVAL (offset_2);
13948   offval_3 = INTVAL (offset_3);
13949   offval_4 = INTVAL (offset_4);
13950   msize = GET_MODE_SIZE (mode);
13951   /* Check if the offsets are consecutive.  */
13952   if ((offval_1 != (offval_2 + msize)
13953        || offval_1 != (offval_3 + msize * 2)
13954        || offval_1 != (offval_4 + msize * 3))
13955       && (offval_4 != (offval_3 + msize)
13956           || offval_4 != (offval_2 + msize * 2)
13957           || offval_4 != (offval_1 + msize * 3)))
13958     return false;
13959
13960   /* Check if the addresses are clobbered by load.  */
13961   if (load)
13962     {
13963       if (reg_mentioned_p (reg_1, mem_1)
13964           || reg_mentioned_p (reg_2, mem_2)
13965           || reg_mentioned_p (reg_3, mem_3))
13966         return false;
13967
13968       /* In increasing order, the last load can clobber the address.  */
13969       if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13970         return false;
13971     }
13972
13973   /* If we have SImode and slow unaligned ldp,
13974      check the alignment to be at least 8 byte. */
13975   if (mode == SImode
13976       && (aarch64_tune_params.extra_tuning_flags
13977           & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
13978       && !optimize_size
13979       && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
13980     return false;
13981
13982   if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13983     rclass_1 = FP_REGS;
13984   else
13985     rclass_1 = GENERAL_REGS;
13986
13987   if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13988     rclass_2 = FP_REGS;
13989   else
13990     rclass_2 = GENERAL_REGS;
13991
13992   if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13993     rclass_3 = FP_REGS;
13994   else
13995     rclass_3 = GENERAL_REGS;
13996
13997   if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13998     rclass_4 = FP_REGS;
13999   else
14000     rclass_4 = GENERAL_REGS;
14001
14002   /* Check if the registers are of same class.  */
14003   if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14004     return false;
14005
14006   return true;
14007 }
14008
14009 /* Given OPERANDS of consecutive load/store, this function pairs them
14010    into ldp/stp after adjusting the offset.  It depends on the fact
14011    that addresses of load/store instructions are in increasing order.
14012    MODE is the mode of memory operands.  CODE is the rtl operator
14013    which should be applied to all memory operands, it's SIGN_EXTEND,
14014    ZERO_EXTEND or UNKNOWN.  */
14015
14016 bool
14017 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14018                              enum machine_mode mode, RTX_CODE code)
14019 {
14020   rtx base, offset, t1, t2;
14021   rtx mem_1, mem_2, mem_3, mem_4;
14022   HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14023
14024   if (load)
14025     {
14026       mem_1 = operands[1];
14027       mem_2 = operands[3];
14028       mem_3 = operands[5];
14029       mem_4 = operands[7];
14030     }
14031   else
14032     {
14033       mem_1 = operands[0];
14034       mem_2 = operands[2];
14035       mem_3 = operands[4];
14036       mem_4 = operands[6];
14037       gcc_assert (code == UNKNOWN);
14038     }
14039
14040   extract_base_offset_in_addr (mem_1, &base, &offset);
14041   gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14042
14043   /* Adjust offset thus it can fit in ldp/stp instruction.  */
14044   msize = GET_MODE_SIZE (mode);
14045   stp_off_limit = msize * 0x40;
14046   off_val = INTVAL (offset);
14047   abs_off = (off_val < 0) ? -off_val : off_val;
14048   new_off = abs_off % stp_off_limit;
14049   adj_off = abs_off - new_off;
14050
14051   /* Further adjust to make sure all offsets are OK.  */
14052   if ((new_off + msize * 2) >= stp_off_limit)
14053     {
14054       adj_off += stp_off_limit;
14055       new_off -= stp_off_limit;
14056     }
14057
14058   /* Make sure the adjustment can be done with ADD/SUB instructions.  */
14059   if (adj_off >= 0x1000)
14060     return false;
14061
14062   if (off_val < 0)
14063     {
14064       adj_off = -adj_off;
14065       new_off = -new_off;
14066     }
14067
14068   /* Create new memory references.  */
14069   mem_1 = change_address (mem_1, VOIDmode,
14070                           plus_constant (DImode, operands[8], new_off));
14071
14072   /* Check if the adjusted address is OK for ldp/stp.  */
14073   if (!aarch64_mem_pair_operand (mem_1, mode))
14074     return false;
14075
14076   msize = GET_MODE_SIZE (mode);
14077   mem_2 = change_address (mem_2, VOIDmode,
14078                           plus_constant (DImode,
14079                                          operands[8],
14080                                          new_off + msize));
14081   mem_3 = change_address (mem_3, VOIDmode,
14082                           plus_constant (DImode,
14083                                          operands[8],
14084                                          new_off + msize * 2));
14085   mem_4 = change_address (mem_4, VOIDmode,
14086                           plus_constant (DImode,
14087                                          operands[8],
14088                                          new_off + msize * 3));
14089
14090   if (code == ZERO_EXTEND)
14091     {
14092       mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14093       mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14094       mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14095       mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14096     }
14097   else if (code == SIGN_EXTEND)
14098     {
14099       mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14100       mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14101       mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14102       mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14103     }
14104
14105   if (load)
14106     {
14107       operands[1] = mem_1;
14108       operands[3] = mem_2;
14109       operands[5] = mem_3;
14110       operands[7] = mem_4;
14111     }
14112   else
14113     {
14114       operands[0] = mem_1;
14115       operands[2] = mem_2;
14116       operands[4] = mem_3;
14117       operands[6] = mem_4;
14118     }
14119
14120   /* Emit adjusting instruction.  */
14121   emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14122   /* Emit ldp/stp instructions.  */
14123   t1 = gen_rtx_SET (operands[0], operands[1]);
14124   t2 = gen_rtx_SET (operands[2], operands[3]);
14125   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14126   t1 = gen_rtx_SET (operands[4], operands[5]);
14127   t2 = gen_rtx_SET (operands[6], operands[7]);
14128   emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14129   return true;
14130 }
14131
14132 /* Return 1 if pseudo register should be created and used to hold
14133    GOT address for PIC code.  */
14134
14135 bool
14136 aarch64_use_pseudo_pic_reg (void)
14137 {
14138   return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14139 }
14140
14141 /* Implement TARGET_UNSPEC_MAY_TRAP_P.  */
14142
14143 static int
14144 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14145 {
14146   switch (XINT (x, 1))
14147     {
14148     case UNSPEC_GOTSMALLPIC:
14149     case UNSPEC_GOTSMALLPIC28K:
14150     case UNSPEC_GOTTINYPIC:
14151       return 0;
14152     default:
14153       break;
14154     }
14155
14156   return default_unspec_may_trap_p (x, flags);
14157 }
14158
14159
14160 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14161    return the log2 of that value.  Otherwise return -1.  */
14162
14163 int
14164 aarch64_fpconst_pow_of_2 (rtx x)
14165 {
14166   const REAL_VALUE_TYPE *r;
14167
14168   if (!CONST_DOUBLE_P (x))
14169     return -1;
14170
14171   r = CONST_DOUBLE_REAL_VALUE (x);
14172
14173   if (REAL_VALUE_NEGATIVE (*r)
14174       || REAL_VALUE_ISNAN (*r)
14175       || REAL_VALUE_ISINF (*r)
14176       || !real_isinteger (r, DFmode))
14177     return -1;
14178
14179   return exact_log2 (real_to_integer (r));
14180 }
14181
14182 /* If X is a vector of equal CONST_DOUBLE values and that value is
14183    Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */
14184
14185 int
14186 aarch64_vec_fpconst_pow_of_2 (rtx x)
14187 {
14188   if (GET_CODE (x) != CONST_VECTOR)
14189     return -1;
14190
14191   if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14192     return -1;
14193
14194   int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14195   if (firstval <= 0)
14196     return -1;
14197
14198   for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14199     if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14200       return -1;
14201
14202   return firstval;
14203 }
14204
14205 /* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float.  */
14206 static tree
14207 aarch64_promoted_type (const_tree t)
14208 {
14209   if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
14210     return float_type_node;
14211   return NULL_TREE;
14212 }
14213
14214 /* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */
14215
14216 static bool
14217 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14218                            optimization_type opt_type)
14219 {
14220   switch (op)
14221     {
14222     case rsqrt_optab:
14223       return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14224
14225     default:
14226       return true;
14227     }
14228 }
14229
14230 #undef TARGET_ADDRESS_COST
14231 #define TARGET_ADDRESS_COST aarch64_address_cost
14232
14233 /* This hook will determines whether unnamed bitfields affect the alignment
14234    of the containing structure.  The hook returns true if the structure
14235    should inherit the alignment requirements of an unnamed bitfield's
14236    type.  */
14237 #undef TARGET_ALIGN_ANON_BITFIELD
14238 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14239
14240 #undef TARGET_ASM_ALIGNED_DI_OP
14241 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14242
14243 #undef TARGET_ASM_ALIGNED_HI_OP
14244 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14245
14246 #undef TARGET_ASM_ALIGNED_SI_OP
14247 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14248
14249 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14250 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14251   hook_bool_const_tree_hwi_hwi_const_tree_true
14252
14253 #undef TARGET_ASM_FILE_START
14254 #define TARGET_ASM_FILE_START aarch64_start_file
14255
14256 #undef TARGET_ASM_OUTPUT_MI_THUNK
14257 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14258
14259 #undef TARGET_ASM_SELECT_RTX_SECTION
14260 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14261
14262 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14263 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14264
14265 #undef TARGET_BUILD_BUILTIN_VA_LIST
14266 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14267
14268 #undef TARGET_CALLEE_COPIES
14269 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14270
14271 #undef TARGET_CAN_ELIMINATE
14272 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14273
14274 #undef TARGET_CAN_INLINE_P
14275 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14276
14277 #undef TARGET_CANNOT_FORCE_CONST_MEM
14278 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14279
14280 #undef TARGET_CASE_VALUES_THRESHOLD
14281 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14282
14283 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14284 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14285
14286 /* Only the least significant bit is used for initialization guard
14287    variables.  */
14288 #undef TARGET_CXX_GUARD_MASK_BIT
14289 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14290
14291 #undef TARGET_C_MODE_FOR_SUFFIX
14292 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14293
14294 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14295 #undef  TARGET_DEFAULT_TARGET_FLAGS
14296 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14297 #endif
14298
14299 #undef TARGET_CLASS_MAX_NREGS
14300 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14301
14302 #undef TARGET_BUILTIN_DECL
14303 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14304
14305 #undef TARGET_BUILTIN_RECIPROCAL
14306 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14307
14308 #undef  TARGET_EXPAND_BUILTIN
14309 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14310
14311 #undef TARGET_EXPAND_BUILTIN_VA_START
14312 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14313
14314 #undef TARGET_FOLD_BUILTIN
14315 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14316
14317 #undef TARGET_FUNCTION_ARG
14318 #define TARGET_FUNCTION_ARG aarch64_function_arg
14319
14320 #undef TARGET_FUNCTION_ARG_ADVANCE
14321 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14322
14323 #undef TARGET_FUNCTION_ARG_BOUNDARY
14324 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14325
14326 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14327 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14328
14329 #undef TARGET_FUNCTION_VALUE
14330 #define TARGET_FUNCTION_VALUE aarch64_function_value
14331
14332 #undef TARGET_FUNCTION_VALUE_REGNO_P
14333 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14334
14335 #undef TARGET_FRAME_POINTER_REQUIRED
14336 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14337
14338 #undef TARGET_GIMPLE_FOLD_BUILTIN
14339 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14340
14341 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14342 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14343
14344 #undef  TARGET_INIT_BUILTINS
14345 #define TARGET_INIT_BUILTINS  aarch64_init_builtins
14346
14347 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14348 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14349   aarch64_ira_change_pseudo_allocno_class
14350
14351 #undef TARGET_LEGITIMATE_ADDRESS_P
14352 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14353
14354 #undef TARGET_LEGITIMATE_CONSTANT_P
14355 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14356
14357 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14358 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14359   aarch64_legitimize_address_displacement
14360
14361 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14362 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14363
14364 #undef TARGET_MANGLE_TYPE
14365 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14366
14367 #undef TARGET_MEMORY_MOVE_COST
14368 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14369
14370 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14371 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14372
14373 #undef TARGET_MUST_PASS_IN_STACK
14374 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14375
14376 /* This target hook should return true if accesses to volatile bitfields
14377    should use the narrowest mode possible.  It should return false if these
14378    accesses should use the bitfield container type.  */
14379 #undef TARGET_NARROW_VOLATILE_BITFIELD
14380 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14381
14382 #undef  TARGET_OPTION_OVERRIDE
14383 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14384
14385 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14386 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14387   aarch64_override_options_after_change
14388
14389 #undef TARGET_OPTION_SAVE
14390 #define TARGET_OPTION_SAVE aarch64_option_save
14391
14392 #undef TARGET_OPTION_RESTORE
14393 #define TARGET_OPTION_RESTORE aarch64_option_restore
14394
14395 #undef TARGET_OPTION_PRINT
14396 #define TARGET_OPTION_PRINT aarch64_option_print
14397
14398 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14399 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14400
14401 #undef TARGET_SET_CURRENT_FUNCTION
14402 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14403
14404 #undef TARGET_PASS_BY_REFERENCE
14405 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14406
14407 #undef TARGET_PREFERRED_RELOAD_CLASS
14408 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14409
14410 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14411 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14412
14413 #undef TARGET_PROMOTED_TYPE
14414 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14415
14416 #undef TARGET_SECONDARY_RELOAD
14417 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14418
14419 #undef TARGET_SHIFT_TRUNCATION_MASK
14420 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14421
14422 #undef TARGET_SETUP_INCOMING_VARARGS
14423 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14424
14425 #undef TARGET_STRUCT_VALUE_RTX
14426 #define TARGET_STRUCT_VALUE_RTX   aarch64_struct_value_rtx
14427
14428 #undef TARGET_REGISTER_MOVE_COST
14429 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14430
14431 #undef TARGET_RETURN_IN_MEMORY
14432 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14433
14434 #undef TARGET_RETURN_IN_MSB
14435 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14436
14437 #undef TARGET_RTX_COSTS
14438 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14439
14440 #undef TARGET_SCHED_ISSUE_RATE
14441 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14442
14443 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14444 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14445   aarch64_sched_first_cycle_multipass_dfa_lookahead
14446
14447 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14448 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14449   aarch64_first_cycle_multipass_dfa_lookahead_guard
14450
14451 #undef TARGET_TRAMPOLINE_INIT
14452 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14453
14454 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14455 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14456
14457 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14458 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14459
14460 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14461 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14462
14463 #undef TARGET_VECTORIZE_ADD_STMT_COST
14464 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14465
14466 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14467 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14468   aarch64_builtin_vectorization_cost
14469
14470 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14471 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14472
14473 #undef TARGET_VECTORIZE_BUILTINS
14474 #define TARGET_VECTORIZE_BUILTINS
14475
14476 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14477 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14478   aarch64_builtin_vectorized_function
14479
14480 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14481 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14482   aarch64_autovectorize_vector_sizes
14483
14484 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14485 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14486   aarch64_atomic_assign_expand_fenv
14487
14488 /* Section anchor support.  */
14489
14490 #undef TARGET_MIN_ANCHOR_OFFSET
14491 #define TARGET_MIN_ANCHOR_OFFSET -256
14492
14493 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14494    byte offset; we can do much more for larger data types, but have no way
14495    to determine the size of the access.  We assume accesses are aligned.  */
14496 #undef TARGET_MAX_ANCHOR_OFFSET
14497 #define TARGET_MAX_ANCHOR_OFFSET 4095
14498
14499 #undef TARGET_VECTOR_ALIGNMENT
14500 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14501
14502 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14503 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14504   aarch64_simd_vector_alignment_reachable
14505
14506 /* vec_perm support.  */
14507
14508 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14509 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14510   aarch64_vectorize_vec_perm_const_ok
14511
14512 #undef TARGET_INIT_LIBFUNCS
14513 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14514
14515 #undef TARGET_FIXED_CONDITION_CODE_REGS
14516 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14517
14518 #undef TARGET_FLAGS_REGNUM
14519 #define TARGET_FLAGS_REGNUM CC_REGNUM
14520
14521 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14522 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14523
14524 #undef TARGET_ASAN_SHADOW_OFFSET
14525 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14526
14527 #undef TARGET_LEGITIMIZE_ADDRESS
14528 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14529
14530 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14531 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14532   aarch64_use_by_pieces_infrastructure_p
14533
14534 #undef TARGET_CAN_USE_DOLOOP_P
14535 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14536
14537 #undef TARGET_SCHED_MACRO_FUSION_P
14538 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14539
14540 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14541 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14542
14543 #undef TARGET_SCHED_FUSION_PRIORITY
14544 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14545
14546 #undef TARGET_UNSPEC_MAY_TRAP_P
14547 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14548
14549 #undef TARGET_USE_PSEUDO_PIC_REG
14550 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14551
14552 #undef TARGET_PRINT_OPERAND
14553 #define TARGET_PRINT_OPERAND aarch64_print_operand
14554
14555 #undef TARGET_PRINT_OPERAND_ADDRESS
14556 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14557
14558 #undef TARGET_OPTAB_SUPPORTED_P
14559 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14560
14561 #undef TARGET_OMIT_STRUCT_RETURN_REG
14562 #define TARGET_OMIT_STRUCT_RETURN_REG true
14563
14564 struct gcc_target targetm = TARGET_INITIALIZER;
14565
14566 #include "gt-aarch64.h"