[AArch64] PR84114: Avoid reassociating FMA
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 07c55b132a7a8c4ceeaf9f406ac8c3c9d7b6bb20
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Support for command line parsing of boolean flags in the tuning
224 structures. */
225 struct aarch64_flag_desc
227 const char* name;
228 unsigned int flag;
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
235 { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL },
238 { NULL, AARCH64_FUSE_NOTHING }
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
245 { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL },
248 { NULL, AARCH64_EXTRA_TUNE_NONE }
251 /* Tuning parameters. */
253 static const struct cpu_addrcost_table generic_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
288 1, /* hi */
289 0, /* si */
290 0, /* di */
291 1, /* ti */
293 1, /* pre_modify */
294 0, /* post_modify */
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
298 0, /* imm_offset */
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
304 1, /* hi */
305 1, /* si */
306 1, /* di */
307 2, /* ti */
309 0, /* pre_modify */
310 0, /* post_modify */
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
314 0, /* imm_offset */
317 static const struct cpu_regmove_cost generic_regmove_cost =
319 1, /* GP2GP */
320 /* Avoid the use of slow int<->fp moves for spilling by setting
321 their cost higher than memmov_cost. */
322 5, /* GP2FP */
323 5, /* FP2GP */
324 2 /* FP2FP */
327 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of slow int<->fp moves for spilling by setting
331 their cost higher than memmov_cost. */
332 5, /* GP2FP */
333 5, /* FP2GP */
334 2 /* FP2FP */
337 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost (the actual costs are 4 and 9). */
352 9, /* GP2FP */
353 9, /* FP2GP */
354 1 /* FP2FP */
357 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 2, /* GP2GP */
360 2, /* GP2FP */
361 6, /* FP2GP */
362 4 /* FP2FP */
365 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 8, /* GP2FP */
371 8, /* FP2GP */
372 2 /* FP2FP */
375 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 2, /* GP2GP */
378 /* Avoid the use of int<->fp moves for spilling. */
379 6, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of int<->fp moves for spilling. */
388 8, /* GP2FP */
389 8, /* FP2GP */
390 4 /* FP2FP */
393 /* Generic costs for vector insn classes. */
394 static const struct cpu_vector_cost generic_vector_cost =
396 1, /* scalar_int_stmt_cost */
397 1, /* scalar_fp_stmt_cost */
398 1, /* scalar_load_cost */
399 1, /* scalar_store_cost */
400 1, /* vec_int_stmt_cost */
401 1, /* vec_fp_stmt_cost */
402 2, /* vec_permute_cost */
403 1, /* vec_to_scalar_cost */
404 1, /* scalar_to_vec_cost */
405 1, /* vec_align_load_cost */
406 1, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 3, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 /* ThunderX costs for vector insn classes. */
414 static const struct cpu_vector_cost thunderx_vector_cost =
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 3, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 4, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 4, /* vec_permute_cost */
423 2, /* vec_to_scalar_cost */
424 2, /* scalar_to_vec_cost */
425 3, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 5, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 3 /* cond_not_taken_branch_cost */
433 /* Generic costs for vector insn classes. */
434 static const struct cpu_vector_cost cortexa57_vector_cost =
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 4, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 2, /* vec_int_stmt_cost */
441 2, /* vec_fp_stmt_cost */
442 3, /* vec_permute_cost */
443 8, /* vec_to_scalar_cost */
444 8, /* scalar_to_vec_cost */
445 4, /* vec_align_load_cost */
446 4, /* vec_unalign_load_cost */
447 1, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 1, /* cond_taken_branch_cost */
450 1 /* cond_not_taken_branch_cost */
453 static const struct cpu_vector_cost exynosm1_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 5, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 3, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 3, /* vec_to_scalar_cost */
463 3, /* scalar_to_vec_cost */
464 5, /* vec_align_load_cost */
465 5, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* Generic costs for vector insn classes. */
473 static const struct cpu_vector_cost xgene1_vector_cost =
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 2, /* vec_int_stmt_cost */
480 2, /* vec_fp_stmt_cost */
481 2, /* vec_permute_cost */
482 4, /* vec_to_scalar_cost */
483 4, /* scalar_to_vec_cost */
484 10, /* vec_align_load_cost */
485 10, /* vec_unalign_load_cost */
486 2, /* vec_unalign_store_cost */
487 2, /* vec_store_cost */
488 2, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Costs for vector insn classes for Vulcan. */
493 static const struct cpu_vector_cost thunderx2t99_vector_cost =
495 1, /* scalar_int_stmt_cost */
496 6, /* scalar_fp_stmt_cost */
497 4, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 5, /* vec_int_stmt_cost */
500 6, /* vec_fp_stmt_cost */
501 3, /* vec_permute_cost */
502 6, /* vec_to_scalar_cost */
503 5, /* scalar_to_vec_cost */
504 8, /* vec_align_load_cost */
505 8, /* vec_unalign_load_cost */
506 4, /* vec_unalign_store_cost */
507 4, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Generic costs for branch instructions. */
513 static const struct cpu_branch_cost generic_branch_cost =
515 1, /* Predictable. */
516 3 /* Unpredictable. */
519 /* Generic approximation modes. */
520 static const cpu_approx_modes generic_approx_modes =
522 AARCH64_APPROX_NONE, /* division */
523 AARCH64_APPROX_NONE, /* sqrt */
524 AARCH64_APPROX_NONE /* recip_sqrt */
527 /* Approximation modes for Exynos M1. */
528 static const cpu_approx_modes exynosm1_approx_modes =
530 AARCH64_APPROX_NONE, /* division */
531 AARCH64_APPROX_ALL, /* sqrt */
532 AARCH64_APPROX_ALL /* recip_sqrt */
535 /* Approximation modes for X-Gene 1. */
536 static const cpu_approx_modes xgene1_approx_modes =
538 AARCH64_APPROX_NONE, /* division */
539 AARCH64_APPROX_NONE, /* sqrt */
540 AARCH64_APPROX_ALL /* recip_sqrt */
543 /* Generic prefetch settings (which disable prefetch). */
544 static const cpu_prefetch_tune generic_prefetch_tune =
546 0, /* num_slots */
547 -1, /* l1_cache_size */
548 -1, /* l1_cache_line_size */
549 -1, /* l2_cache_size */
550 -1 /* default_opt_level */
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 -1 /* default_opt_level */
562 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
564 4, /* num_slots */
565 32, /* l1_cache_size */
566 64, /* l1_cache_line_size */
567 1024, /* l2_cache_size */
568 -1 /* default_opt_level */
571 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
573 8, /* num_slots */
574 32, /* l1_cache_size */
575 128, /* l1_cache_line_size */
576 16*1024, /* l2_cache_size */
577 3 /* default_opt_level */
580 static const cpu_prefetch_tune thunderx_prefetch_tune =
582 8, /* num_slots */
583 32, /* l1_cache_size */
584 128, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
591 8, /* num_slots */
592 32, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 256, /* l2_cache_size */
595 -1 /* default_opt_level */
598 static const struct tune_params generic_tunings =
600 &cortexa57_extra_costs,
601 &generic_addrcost_table,
602 &generic_regmove_cost,
603 &generic_vector_cost,
604 &generic_branch_cost,
605 &generic_approx_modes,
606 4, /* memmov_cost */
607 2, /* issue_rate */
608 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
609 8, /* function_align. */
610 4, /* jump_align. */
611 8, /* loop_align. */
612 2, /* int_reassoc_width. */
613 4, /* fp_reassoc_width. */
614 1, /* vec_reassoc_width. */
615 2, /* min_div_recip_mul_sf. */
616 2, /* min_div_recip_mul_df. */
617 0, /* max_case_values. */
618 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
619 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
620 &generic_prefetch_tune
623 static const struct tune_params cortexa35_tunings =
625 &cortexa53_extra_costs,
626 &generic_addrcost_table,
627 &cortexa53_regmove_cost,
628 &generic_vector_cost,
629 &generic_branch_cost,
630 &generic_approx_modes,
631 4, /* memmov_cost */
632 1, /* issue_rate */
633 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
634 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
635 16, /* function_align. */
636 4, /* jump_align. */
637 8, /* loop_align. */
638 2, /* int_reassoc_width. */
639 4, /* fp_reassoc_width. */
640 1, /* vec_reassoc_width. */
641 2, /* min_div_recip_mul_sf. */
642 2, /* min_div_recip_mul_df. */
643 0, /* max_case_values. */
644 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
645 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
646 &generic_prefetch_tune
649 static const struct tune_params cortexa53_tunings =
651 &cortexa53_extra_costs,
652 &generic_addrcost_table,
653 &cortexa53_regmove_cost,
654 &generic_vector_cost,
655 &generic_branch_cost,
656 &generic_approx_modes,
657 4, /* memmov_cost */
658 2, /* issue_rate */
659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
660 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
661 16, /* function_align. */
662 4, /* jump_align. */
663 8, /* loop_align. */
664 2, /* int_reassoc_width. */
665 4, /* fp_reassoc_width. */
666 1, /* vec_reassoc_width. */
667 2, /* min_div_recip_mul_sf. */
668 2, /* min_div_recip_mul_df. */
669 0, /* max_case_values. */
670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
671 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
672 &generic_prefetch_tune
675 static const struct tune_params cortexa57_tunings =
677 &cortexa57_extra_costs,
678 &generic_addrcost_table,
679 &cortexa57_regmove_cost,
680 &cortexa57_vector_cost,
681 &generic_branch_cost,
682 &generic_approx_modes,
683 4, /* memmov_cost */
684 3, /* issue_rate */
685 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
686 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
687 16, /* function_align. */
688 4, /* jump_align. */
689 8, /* loop_align. */
690 2, /* int_reassoc_width. */
691 4, /* fp_reassoc_width. */
692 1, /* vec_reassoc_width. */
693 2, /* min_div_recip_mul_sf. */
694 2, /* min_div_recip_mul_df. */
695 0, /* max_case_values. */
696 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
697 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
698 &generic_prefetch_tune
701 static const struct tune_params cortexa72_tunings =
703 &cortexa57_extra_costs,
704 &generic_addrcost_table,
705 &cortexa57_regmove_cost,
706 &cortexa57_vector_cost,
707 &generic_branch_cost,
708 &generic_approx_modes,
709 4, /* memmov_cost */
710 3, /* issue_rate */
711 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
712 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
713 16, /* function_align. */
714 4, /* jump_align. */
715 8, /* loop_align. */
716 2, /* int_reassoc_width. */
717 4, /* fp_reassoc_width. */
718 1, /* vec_reassoc_width. */
719 2, /* min_div_recip_mul_sf. */
720 2, /* min_div_recip_mul_df. */
721 0, /* max_case_values. */
722 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
723 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
724 &generic_prefetch_tune
727 static const struct tune_params cortexa73_tunings =
729 &cortexa57_extra_costs,
730 &generic_addrcost_table,
731 &cortexa57_regmove_cost,
732 &cortexa57_vector_cost,
733 &generic_branch_cost,
734 &generic_approx_modes,
735 4, /* memmov_cost. */
736 2, /* issue_rate. */
737 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
738 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
739 16, /* function_align. */
740 4, /* jump_align. */
741 8, /* loop_align. */
742 2, /* int_reassoc_width. */
743 4, /* fp_reassoc_width. */
744 1, /* vec_reassoc_width. */
745 2, /* min_div_recip_mul_sf. */
746 2, /* min_div_recip_mul_df. */
747 0, /* max_case_values. */
748 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
749 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
750 &generic_prefetch_tune
755 static const struct tune_params exynosm1_tunings =
757 &exynosm1_extra_costs,
758 &exynosm1_addrcost_table,
759 &exynosm1_regmove_cost,
760 &exynosm1_vector_cost,
761 &generic_branch_cost,
762 &exynosm1_approx_modes,
763 4, /* memmov_cost */
764 3, /* issue_rate */
765 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
766 4, /* function_align. */
767 4, /* jump_align. */
768 4, /* loop_align. */
769 2, /* int_reassoc_width. */
770 4, /* fp_reassoc_width. */
771 1, /* vec_reassoc_width. */
772 2, /* min_div_recip_mul_sf. */
773 2, /* min_div_recip_mul_df. */
774 48, /* max_case_values. */
775 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
776 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
777 &exynosm1_prefetch_tune
780 static const struct tune_params thunderxt88_tunings =
782 &thunderx_extra_costs,
783 &generic_addrcost_table,
784 &thunderx_regmove_cost,
785 &thunderx_vector_cost,
786 &generic_branch_cost,
787 &generic_approx_modes,
788 6, /* memmov_cost */
789 2, /* issue_rate */
790 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
791 8, /* function_align. */
792 8, /* jump_align. */
793 8, /* loop_align. */
794 2, /* int_reassoc_width. */
795 4, /* fp_reassoc_width. */
796 1, /* vec_reassoc_width. */
797 2, /* min_div_recip_mul_sf. */
798 2, /* min_div_recip_mul_df. */
799 0, /* max_case_values. */
800 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
801 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
802 &thunderxt88_prefetch_tune
805 static const struct tune_params thunderx_tunings =
807 &thunderx_extra_costs,
808 &generic_addrcost_table,
809 &thunderx_regmove_cost,
810 &thunderx_vector_cost,
811 &generic_branch_cost,
812 &generic_approx_modes,
813 6, /* memmov_cost */
814 2, /* issue_rate */
815 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
816 8, /* function_align. */
817 8, /* jump_align. */
818 8, /* loop_align. */
819 2, /* int_reassoc_width. */
820 4, /* fp_reassoc_width. */
821 1, /* vec_reassoc_width. */
822 2, /* min_div_recip_mul_sf. */
823 2, /* min_div_recip_mul_df. */
824 0, /* max_case_values. */
825 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
826 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
827 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
828 &thunderx_prefetch_tune
831 static const struct tune_params xgene1_tunings =
833 &xgene1_extra_costs,
834 &xgene1_addrcost_table,
835 &xgene1_regmove_cost,
836 &xgene1_vector_cost,
837 &generic_branch_cost,
838 &xgene1_approx_modes,
839 6, /* memmov_cost */
840 4, /* issue_rate */
841 AARCH64_FUSE_NOTHING, /* fusible_ops */
842 16, /* function_align. */
843 8, /* jump_align. */
844 16, /* loop_align. */
845 2, /* int_reassoc_width. */
846 4, /* fp_reassoc_width. */
847 1, /* vec_reassoc_width. */
848 2, /* min_div_recip_mul_sf. */
849 2, /* min_div_recip_mul_df. */
850 0, /* max_case_values. */
851 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
853 &generic_prefetch_tune
856 static const struct tune_params qdf24xx_tunings =
858 &qdf24xx_extra_costs,
859 &generic_addrcost_table,
860 &qdf24xx_regmove_cost,
861 &generic_vector_cost,
862 &generic_branch_cost,
863 &generic_approx_modes,
864 4, /* memmov_cost */
865 4, /* issue_rate */
866 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
867 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
868 16, /* function_align. */
869 8, /* jump_align. */
870 16, /* loop_align. */
871 2, /* int_reassoc_width. */
872 4, /* fp_reassoc_width. */
873 1, /* vec_reassoc_width. */
874 2, /* min_div_recip_mul_sf. */
875 2, /* min_div_recip_mul_df. */
876 0, /* max_case_values. */
877 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
878 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
879 &qdf24xx_prefetch_tune
882 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
883 for now. */
884 static const struct tune_params saphira_tunings =
886 &generic_extra_costs,
887 &generic_addrcost_table,
888 &generic_regmove_cost,
889 &generic_vector_cost,
890 &generic_branch_cost,
891 &generic_approx_modes,
892 4, /* memmov_cost */
893 4, /* issue_rate */
894 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
895 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
896 16, /* function_align. */
897 8, /* jump_align. */
898 16, /* loop_align. */
899 2, /* int_reassoc_width. */
900 4, /* fp_reassoc_width. */
901 1, /* vec_reassoc_width. */
902 2, /* min_div_recip_mul_sf. */
903 2, /* min_div_recip_mul_df. */
904 0, /* max_case_values. */
905 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
906 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
907 &generic_prefetch_tune
910 static const struct tune_params thunderx2t99_tunings =
912 &thunderx2t99_extra_costs,
913 &thunderx2t99_addrcost_table,
914 &thunderx2t99_regmove_cost,
915 &thunderx2t99_vector_cost,
916 &generic_branch_cost,
917 &generic_approx_modes,
918 4, /* memmov_cost. */
919 4, /* issue_rate. */
920 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
921 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
922 16, /* function_align. */
923 8, /* jump_align. */
924 16, /* loop_align. */
925 3, /* int_reassoc_width. */
926 2, /* fp_reassoc_width. */
927 2, /* vec_reassoc_width. */
928 2, /* min_div_recip_mul_sf. */
929 2, /* min_div_recip_mul_df. */
930 0, /* max_case_values. */
931 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
932 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
933 &thunderx2t99_prefetch_tune
936 /* Support for fine-grained override of the tuning structures. */
937 struct aarch64_tuning_override_function
939 const char* name;
940 void (*parse_override)(const char*, struct tune_params*);
943 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
944 static void aarch64_parse_tune_string (const char*, struct tune_params*);
946 static const struct aarch64_tuning_override_function
947 aarch64_tuning_override_functions[] =
949 { "fuse", aarch64_parse_fuse_string },
950 { "tune", aarch64_parse_tune_string },
951 { NULL, NULL }
954 /* A processor implementing AArch64. */
955 struct processor
957 const char *const name;
958 enum aarch64_processor ident;
959 enum aarch64_processor sched_core;
960 enum aarch64_arch arch;
961 unsigned architecture_version;
962 const unsigned long flags;
963 const struct tune_params *const tune;
966 /* Architectures implementing AArch64. */
967 static const struct processor all_architectures[] =
969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
970 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
971 #include "aarch64-arches.def"
972 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
975 /* Processor cores implementing AArch64. */
976 static const struct processor all_cores[] =
978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
979 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
980 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
981 FLAGS, &COSTS##_tunings},
982 #include "aarch64-cores.def"
983 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
984 AARCH64_FL_FOR_ARCH8, &generic_tunings},
985 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
989 /* Target specification. These are populated by the -march, -mtune, -mcpu
990 handling code or by target attributes. */
991 static const struct processor *selected_arch;
992 static const struct processor *selected_cpu;
993 static const struct processor *selected_tune;
995 /* The current tuning set. */
996 struct tune_params aarch64_tune_params = generic_tunings;
998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1000 /* An ISA extension in the co-processor and main instruction set space. */
1001 struct aarch64_option_extension
1003 const char *const name;
1004 const unsigned long flags_on;
1005 const unsigned long flags_off;
1008 typedef enum aarch64_cond_code
1010 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1011 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1012 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1014 aarch64_cc;
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1018 /* The condition codes of the processor, and the inverse function. */
1019 static const char * const aarch64_condition_codes[] =
1021 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1025 /* Generate code to enable conditional branches in functions over 1 MiB. */
1026 const char *
1027 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1028 const char * branch_format)
1030 rtx_code_label * tmp_label = gen_label_rtx ();
1031 char label_buf[256];
1032 char buffer[128];
1033 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1034 CODE_LABEL_NUMBER (tmp_label));
1035 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1036 rtx dest_label = operands[pos_label];
1037 operands[pos_label] = tmp_label;
1039 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1040 output_asm_insn (buffer, operands);
1042 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1043 operands[pos_label] = dest_label;
1044 output_asm_insn (buffer, operands);
1045 return "";
1048 void
1049 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1051 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1052 if (TARGET_GENERAL_REGS_ONLY)
1053 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1054 else
1055 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1061 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1063 irrespective of its cost results in bad allocations with many redundant
1064 int<->FP moves which are expensive on various cores.
1065 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1067 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1068 Otherwise set the allocno class depending on the mode.
1069 The result of this is that it is no longer inefficient to have a higher
1070 memory move cost than the register move cost.
1073 static reg_class_t
1074 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1075 reg_class_t best_class)
1077 machine_mode mode;
1079 if (allocno_class != ALL_REGS)
1080 return allocno_class;
1082 if (best_class != ALL_REGS)
1083 return best_class;
1085 mode = PSEUDO_REGNO_MODE (regno);
1086 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1089 static unsigned int
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1092 if (GET_MODE_UNIT_SIZE (mode) == 4)
1093 return aarch64_tune_params.min_div_recip_mul_sf;
1094 return aarch64_tune_params.min_div_recip_mul_df;
1097 /* Return the reassociation width of treeop OPC with mode MODE. */
1098 static int
1099 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1101 if (VECTOR_MODE_P (mode))
1102 return aarch64_tune_params.vec_reassoc_width;
1103 if (INTEGRAL_MODE_P (mode))
1104 return aarch64_tune_params.int_reassoc_width;
1105 /* Avoid reassociating floating point addition so we emit more FMAs. */
1106 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1107 return aarch64_tune_params.fp_reassoc_width;
1108 return 1;
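/* Editor's illustrative sketch (not part of the original source): with
   -Ofast, a sum of products such as

     double f (double a, double b, double c, double d,
               double e, double g, double h, double i)
     {
       return a * b + c * d + e * g + h * i;
     }

   is best handled as a serial chain of additions, which can be emitted as
   one fmul followed by a sequence of fmadd instructions.  Reassociating the
   floating-point additions into parallel sub-chains tends to leave separate
   fmul/fadd instructions that cannot be fused, which is why a reassociation
   width of 1 is used for floating-point PLUS_EXPR above.  */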
1111 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1112 unsigned
1113 aarch64_dbx_register_number (unsigned regno)
1115 if (GP_REGNUM_P (regno))
1116 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1117 else if (regno == SP_REGNUM)
1118 return AARCH64_DWARF_SP;
1119 else if (FP_REGNUM_P (regno))
1120 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1121 else if (PR_REGNUM_P (regno))
1122 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1123 else if (regno == VG_REGNUM)
1124 return AARCH64_DWARF_VG;
1126 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1127 equivalent DWARF register. */
1128 return DWARF_FRAME_REGISTERS;
1131 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1132 static bool
1133 aarch64_advsimd_struct_mode_p (machine_mode mode)
1135 return (TARGET_SIMD
1136 && (mode == OImode || mode == CImode || mode == XImode));
1139 /* Return true if MODE is an SVE predicate mode. */
1140 static bool
1141 aarch64_sve_pred_mode_p (machine_mode mode)
1143 return (TARGET_SVE
1144 && (mode == VNx16BImode
1145 || mode == VNx8BImode
1146 || mode == VNx4BImode
1147 || mode == VNx2BImode));
1150 /* Three mutually-exclusive flags describing a vector or predicate type. */
1151 const unsigned int VEC_ADVSIMD = 1;
1152 const unsigned int VEC_SVE_DATA = 2;
1153 const unsigned int VEC_SVE_PRED = 4;
1154 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1155 a structure of 2, 3 or 4 vectors. */
1156 const unsigned int VEC_STRUCT = 8;
1157 /* Useful combinations of the above. */
1158 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1159 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1161 /* Return a set of flags describing the vector properties of mode MODE.
1162 Ignore modes that are not supported by the current target. */
1163 static unsigned int
1164 aarch64_classify_vector_mode (machine_mode mode)
1166 if (aarch64_advsimd_struct_mode_p (mode))
1167 return VEC_ADVSIMD | VEC_STRUCT;
1169 if (aarch64_sve_pred_mode_p (mode))
1170 return VEC_SVE_PRED;
1172 scalar_mode inner = GET_MODE_INNER (mode);
1173 if (VECTOR_MODE_P (mode)
1174 && (inner == QImode
1175 || inner == HImode
1176 || inner == HFmode
1177 || inner == SImode
1178 || inner == SFmode
1179 || inner == DImode
1180 || inner == DFmode))
1182 if (TARGET_SVE)
1184 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1185 return VEC_SVE_DATA;
1186 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1187 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1188 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1189 return VEC_SVE_DATA | VEC_STRUCT;
1192 /* This includes V1DF but not V1DI (which doesn't exist). */
1193 if (TARGET_SIMD
1194 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1195 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1196 return VEC_ADVSIMD;
1199 return 0;
1202 /* Return true if MODE is any of the data vector modes, including
1203 structure modes. */
1204 static bool
1205 aarch64_vector_data_mode_p (machine_mode mode)
1207 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1210 /* Return true if MODE is an SVE data vector mode; either a single vector
1211 or a structure of vectors. */
1212 static bool
1213 aarch64_sve_data_mode_p (machine_mode mode)
1215 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1218 /* Implement target hook TARGET_ARRAY_MODE. */
1219 static opt_machine_mode
1220 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1222 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1223 && IN_RANGE (nelems, 2, 4))
1224 return mode_for_vector (GET_MODE_INNER (mode),
1225 GET_MODE_NUNITS (mode) * nelems);
1227 return opt_machine_mode ();
1230 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1231 static bool
1232 aarch64_array_mode_supported_p (machine_mode mode,
1233 unsigned HOST_WIDE_INT nelems)
1235 if (TARGET_SIMD
1236 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1237 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1238 && (nelems >= 2 && nelems <= 4))
1239 return true;
1241 return false;
1244 /* Return the SVE predicate mode to use for elements that have
1245 ELEM_NBYTES bytes, if such a mode exists. */
1247 opt_machine_mode
1248 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1250 if (TARGET_SVE)
1252 if (elem_nbytes == 1)
1253 return VNx16BImode;
1254 if (elem_nbytes == 2)
1255 return VNx8BImode;
1256 if (elem_nbytes == 4)
1257 return VNx4BImode;
1258 if (elem_nbytes == 8)
1259 return VNx2BImode;
1261 return opt_machine_mode ();
1264 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1266 static opt_machine_mode
1267 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1269 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1271 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1272 machine_mode pred_mode;
1273 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1274 return pred_mode;
1277 return default_get_mask_mode (nunits, nbytes);
1280 /* Implement TARGET_HARD_REGNO_NREGS. */
1282 static unsigned int
1283 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1285 /* ??? Logically we should only need to provide a value when
1286 HARD_REGNO_MODE_OK says that the combination is valid,
1287 but at the moment we need to handle all modes. Just ignore
1288 any runtime parts for registers that can't store them. */
1289 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1290 switch (aarch64_regno_regclass (regno))
1292 case FP_REGS:
1293 case FP_LO_REGS:
1294 if (aarch64_sve_data_mode_p (mode))
1295 return exact_div (GET_MODE_SIZE (mode),
1296 BYTES_PER_SVE_VECTOR).to_constant ();
1297 return CEIL (lowest_size, UNITS_PER_VREG);
1298 case PR_REGS:
1299 case PR_LO_REGS:
1300 case PR_HI_REGS:
1301 return 1;
1302 default:
1303 return CEIL (lowest_size, UNITS_PER_WORD);
1305 gcc_unreachable ();
1308 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1310 static bool
1311 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1313 if (GET_MODE_CLASS (mode) == MODE_CC)
1314 return regno == CC_REGNUM;
1316 if (regno == VG_REGNUM)
1317 /* This must have the same size as _Unwind_Word. */
1318 return mode == DImode;
1320 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1321 if (vec_flags & VEC_SVE_PRED)
1322 return PR_REGNUM_P (regno);
1324 if (PR_REGNUM_P (regno))
1325 return 0;
1327 if (regno == SP_REGNUM)
1328 /* The purpose of comparing with ptr_mode is to support the
1329 global register variable associated with the stack pointer
1330 register via the syntax of asm ("wsp") in ILP32. */
1331 return mode == Pmode || mode == ptr_mode;
1333 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1334 return mode == Pmode;
1336 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1337 return true;
1339 if (FP_REGNUM_P (regno))
1341 if (vec_flags & VEC_STRUCT)
1342 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1343 else
1344 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1347 return false;
1350 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1351 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1352 clobbers the top 64 bits when restoring the bottom 64 bits. */
1354 static bool
1355 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1357 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1360 /* Implement REGMODE_NATURAL_SIZE. */
1361 poly_uint64
1362 aarch64_regmode_natural_size (machine_mode mode)
1364 /* The natural size for SVE data modes is one SVE data vector,
1365 and similarly for predicates. We can't independently modify
1366 anything smaller than that. */
1367 /* ??? For now, only do this for variable-width SVE registers.
1368 Doing it for constant-sized registers breaks lower-subreg.c. */
1369 /* ??? And once that's fixed, we should probably have similar
1370 code for Advanced SIMD. */
1371 if (!aarch64_sve_vg.is_constant ())
1373 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1374 if (vec_flags & VEC_SVE_PRED)
1375 return BYTES_PER_SVE_PRED;
1376 if (vec_flags & VEC_SVE_DATA)
1377 return BYTES_PER_SVE_VECTOR;
1379 return UNITS_PER_WORD;
1382 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1383 machine_mode
1384 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1385 machine_mode mode)
1387 /* The predicate mode determines which bits are significant and
1388 which are "don't care". Decreasing the number of lanes would
1389 lose data while increasing the number of lanes would make bits
1390 unnecessarily significant. */
1391 if (PR_REGNUM_P (regno))
1392 return mode;
1393 if (known_ge (GET_MODE_SIZE (mode), 4))
1394 return mode;
1395 else
1396 return SImode;
1399 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1400 that strcpy from constants will be faster. */
1402 static HOST_WIDE_INT
1403 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1405 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1406 return MAX (align, BITS_PER_WORD);
1407 return align;
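/* Editor's illustrative note: a string constant such as "abc" would
   otherwise only need byte alignment; when not optimizing for size the hook
   above raises that to BITS_PER_WORD (64 bits on AArch64), so that copies
   from the constant can use word-sized accesses.  */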
1410 /* Return true if calls to DECL should be treated as
1411 long-calls (i.e. called via a register). */
1412 static bool
1413 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1415 return false;
1418 /* Return true if calls to symbol-ref SYM should be treated as
1419 long-calls (i.e. called via a register). */
1420 bool
1421 aarch64_is_long_call_p (rtx sym)
1423 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1426 /* Return true if calls to symbol-ref SYM should not go through
1427 plt stubs. */
1429 bool
1430 aarch64_is_noplt_call_p (rtx sym)
1432 const_tree decl = SYMBOL_REF_DECL (sym);
1434 if (flag_pic
1435 && decl
1436 && (!flag_plt
1437 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1438 && !targetm.binds_local_p (decl))
1439 return true;
1441 return false;
1444 /* Return true if the offsets to a zero/sign-extract operation
1445 represent an expression that matches an extend operation. The
1446 operands represent the parameters from
1448 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1449 bool
1450 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1451 rtx extract_imm)
1453 HOST_WIDE_INT mult_val, extract_val;
1455 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1456 return false;
1458 mult_val = INTVAL (mult_imm);
1459 extract_val = INTVAL (extract_imm);
1461 if (extract_val > 8
1462 && extract_val < GET_MODE_BITSIZE (mode)
1463 && exact_log2 (extract_val & ~7) > 0
1464 && (extract_val & 7) <= 4
1465 && mult_val == (1 << (extract_val & 7)))
1466 return true;
1468 return false;
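/* Editor's illustrative note: in DImode, MULT_IMM == 4 and EXTRACT_IMM == 34
   pass the checks above.  Extracting the low 34 bits of (reg * 4) is
   equivalent to extending the low 32 bits of the register and shifting the
   result left by 2, i.e. an extend-and-shift operand such as UXTW #2.  */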
1471 /* Emit an insn that's a simple single-set. Both the operands must be
1472 known to be valid. */
1473 inline static rtx_insn *
1474 emit_set_insn (rtx x, rtx y)
1476 return emit_insn (gen_rtx_SET (x, y));
1479 /* X and Y are two things to compare using CODE. Emit the compare insn and
1480 return the rtx for the CC register in the proper mode. */
1482 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1484 machine_mode mode = SELECT_CC_MODE (code, x, y);
1485 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1487 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1488 return cc_reg;
1491 /* Build the SYMBOL_REF for __tls_get_addr. */
1493 static GTY(()) rtx tls_get_addr_libfunc;
1496 aarch64_tls_get_addr (void)
1498 if (!tls_get_addr_libfunc)
1499 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1500 return tls_get_addr_libfunc;
1503 /* Return the TLS model to use for ADDR. */
1505 static enum tls_model
1506 tls_symbolic_operand_type (rtx addr)
1508 enum tls_model tls_kind = TLS_MODEL_NONE;
1509 if (GET_CODE (addr) == CONST)
1511 poly_int64 addend;
1512 rtx sym = strip_offset (addr, &addend);
1513 if (GET_CODE (sym) == SYMBOL_REF)
1514 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1516 else if (GET_CODE (addr) == SYMBOL_REF)
1517 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1519 return tls_kind;
1522 /* We allow LO_SUMs in our legitimate addresses so that combine
1523 can take care of combining addresses where necessary, but for
1524 generation purposes we generate the address
1525 as:
1526 RTL Absolute
1527 tmp = hi (symbol_ref); adrp x1, foo
1528 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1531 PIC TLS
1532 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1533 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1534 bl __tls_get_addr
1537 Load TLS symbol, depending on TLS mechanism and TLS access model.
1539 Global Dynamic - Traditional TLS:
1540 adrp tmp, :tlsgd:imm
1541 add dest, tmp, #:tlsgd_lo12:imm
1542 bl __tls_get_addr
1544 Global Dynamic - TLS Descriptors:
1545 adrp dest, :tlsdesc:imm
1546 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1547 add dest, dest, #:tlsdesc_lo12:imm
1548 blr tmp
1549 mrs tp, tpidr_el0
1550 add dest, dest, tp
1552 Initial Exec:
1553 mrs tp, tpidr_el0
1554 adrp tmp, :gottprel:imm
1555 ldr dest, [tmp, #:gottprel_lo12:imm]
1556 add dest, dest, tp
1558 Local Exec:
1559 mrs tp, tpidr_el0
1560 add t0, tp, #:tprel_hi12:imm, lsl #12
1561 add t0, t0, #:tprel_lo12_nc:imm
1564 static void
1565 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1566 enum aarch64_symbol_type type)
1568 switch (type)
1570 case SYMBOL_SMALL_ABSOLUTE:
1572 /* In ILP32, the mode of dest can be either SImode or DImode. */
1573 rtx tmp_reg = dest;
1574 machine_mode mode = GET_MODE (dest);
1576 gcc_assert (mode == Pmode || mode == ptr_mode);
1578 if (can_create_pseudo_p ())
1579 tmp_reg = gen_reg_rtx (mode);
1581 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1582 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1583 return;
1586 case SYMBOL_TINY_ABSOLUTE:
1587 emit_insn (gen_rtx_SET (dest, imm));
1588 return;
1590 case SYMBOL_SMALL_GOT_28K:
1592 machine_mode mode = GET_MODE (dest);
1593 rtx gp_rtx = pic_offset_table_rtx;
1594 rtx insn;
1595 rtx mem;
1597 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1598 here before RTL expansion. Tree IVOPTS will generate RTL patterns to
1599 decide rtx costs, in which case pic_offset_table_rtx is not
1600 initialized. In that case there is no need to generate the first adrp
1601 instruction, as the final cost for global variable access is
1602 one instruction. */
1603 if (gp_rtx != NULL)
1605 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1606 use the page base as the GOT base, the first page may be wasted;
1607 in the worst case only 28K of space is left for the GOT).
1609 The generated instruction sequence for accessing a global variable is:
1612 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1614 Only one instruction is needed, but we must initialize
1615 pic_offset_table_rtx properly. We generate an initialization insn for
1616 every global access and let CSE remove all the redundant ones.
1618 The final instruction sequence will look like the following
1619 for multiple global variable accesses.
1621 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1623 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1624 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1625 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1626 ... */
1628 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1629 crtl->uses_pic_offset_table = 1;
1630 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1632 if (mode != GET_MODE (gp_rtx))
1633 gp_rtx = gen_lowpart (mode, gp_rtx);
1637 if (mode == ptr_mode)
1639 if (mode == DImode)
1640 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1641 else
1642 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1644 mem = XVECEXP (SET_SRC (insn), 0, 0);
1646 else
1648 gcc_assert (mode == Pmode);
1650 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1651 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1654 /* The operand is expected to be a MEM. Whenever the related insn
1655 pattern changes, the code above which calculates MEM should be
1656 updated. */
1657 gcc_assert (GET_CODE (mem) == MEM);
1658 MEM_READONLY_P (mem) = 1;
1659 MEM_NOTRAP_P (mem) = 1;
1660 emit_insn (insn);
1661 return;
1664 case SYMBOL_SMALL_GOT_4G:
1666 /* In ILP32, the mode of dest can be either SImode or DImode,
1667 while the got entry is always of SImode size. The mode of
1668 dest depends on how dest is used: if dest is assigned to a
1669 pointer (e.g. in the memory), it has SImode; it may have
1670 DImode if dest is dereferenced to access the memory.
1671 This is why we have to handle three different ldr_got_small
1672 patterns here (two patterns for ILP32). */
1674 rtx insn;
1675 rtx mem;
1676 rtx tmp_reg = dest;
1677 machine_mode mode = GET_MODE (dest);
1679 if (can_create_pseudo_p ())
1680 tmp_reg = gen_reg_rtx (mode);
1682 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1683 if (mode == ptr_mode)
1685 if (mode == DImode)
1686 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1687 else
1688 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1690 mem = XVECEXP (SET_SRC (insn), 0, 0);
1692 else
1694 gcc_assert (mode == Pmode);
1696 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1697 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1700 gcc_assert (GET_CODE (mem) == MEM);
1701 MEM_READONLY_P (mem) = 1;
1702 MEM_NOTRAP_P (mem) = 1;
1703 emit_insn (insn);
1704 return;
1707 case SYMBOL_SMALL_TLSGD:
1709 rtx_insn *insns;
1710 machine_mode mode = GET_MODE (dest);
1711 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1713 start_sequence ();
1714 if (TARGET_ILP32)
1715 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1716 else
1717 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1718 insns = get_insns ();
1719 end_sequence ();
1721 RTL_CONST_CALL_P (insns) = 1;
1722 emit_libcall_block (insns, dest, result, imm);
1723 return;
1726 case SYMBOL_SMALL_TLSDESC:
1728 machine_mode mode = GET_MODE (dest);
1729 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1730 rtx tp;
1732 gcc_assert (mode == Pmode || mode == ptr_mode);
1734 /* In ILP32, the got entry is always of SImode size. Unlike
1735 small GOT, the dest is fixed at reg 0. */
1736 if (TARGET_ILP32)
1737 emit_insn (gen_tlsdesc_small_si (imm));
1738 else
1739 emit_insn (gen_tlsdesc_small_di (imm));
1740 tp = aarch64_load_tp (NULL);
1742 if (mode != Pmode)
1743 tp = gen_lowpart (mode, tp);
1745 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1746 if (REG_P (dest))
1747 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1748 return;
1751 case SYMBOL_SMALL_TLSIE:
1753 /* In ILP32, the mode of dest can be either SImode or DImode,
1754 while the got entry is always of SImode size. The mode of
1755 dest depends on how dest is used: if dest is assigned to a
1756 pointer (e.g. in the memory), it has SImode; it may have
1757 DImode if dest is dereferenced to access the memory.
1758 This is why we have to handle three different tlsie_small
1759 patterns here (two patterns for ILP32). */
1760 machine_mode mode = GET_MODE (dest);
1761 rtx tmp_reg = gen_reg_rtx (mode);
1762 rtx tp = aarch64_load_tp (NULL);
1764 if (mode == ptr_mode)
1766 if (mode == DImode)
1767 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1768 else
1770 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1771 tp = gen_lowpart (mode, tp);
1774 else
1776 gcc_assert (mode == Pmode);
1777 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1780 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1781 if (REG_P (dest))
1782 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1783 return;
1786 case SYMBOL_TLSLE12:
1787 case SYMBOL_TLSLE24:
1788 case SYMBOL_TLSLE32:
1789 case SYMBOL_TLSLE48:
1791 machine_mode mode = GET_MODE (dest);
1792 rtx tp = aarch64_load_tp (NULL);
1794 if (mode != Pmode)
1795 tp = gen_lowpart (mode, tp);
1797 switch (type)
1799 case SYMBOL_TLSLE12:
1800 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1801 (dest, tp, imm));
1802 break;
1803 case SYMBOL_TLSLE24:
1804 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1805 (dest, tp, imm));
1806 break;
1807 case SYMBOL_TLSLE32:
1808 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1809 (dest, imm));
1810 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1811 (dest, dest, tp));
1812 break;
1813 case SYMBOL_TLSLE48:
1814 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1815 (dest, imm));
1816 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1817 (dest, dest, tp));
1818 break;
1819 default:
1820 gcc_unreachable ();
1823 if (REG_P (dest))
1824 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1825 return;
1828 case SYMBOL_TINY_GOT:
1829 emit_insn (gen_ldr_got_tiny (dest, imm));
1830 return;
1832 case SYMBOL_TINY_TLSIE:
1834 machine_mode mode = GET_MODE (dest);
1835 rtx tp = aarch64_load_tp (NULL);
1837 if (mode == ptr_mode)
1839 if (mode == DImode)
1840 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1841 else
1843 tp = gen_lowpart (mode, tp);
1844 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1847 else
1849 gcc_assert (mode == Pmode);
1850 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1853 if (REG_P (dest))
1854 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1855 return;
1858 default:
1859 gcc_unreachable ();
1863 /* Emit a move from SRC to DEST. Assume that the move expanders can
1864 handle all moves if !can_create_pseudo_p (). The distinction is
1865 important because, unlike emit_move_insn, the move expanders know
1866 how to force Pmode objects into the constant pool even when the
1867 constant pool address is not itself legitimate. */
1868 static rtx
1869 aarch64_emit_move (rtx dest, rtx src)
1871 return (can_create_pseudo_p ()
1872 ? emit_move_insn (dest, src)
1873 : emit_move_insn_1 (dest, src));
1876 /* Split a 128-bit move operation into two 64-bit move operations,
1877 taking care to handle partial overlap of register to register
1878 copies. Special cases are needed when moving between GP regs and
1879 FP regs. SRC can be a register, constant or memory; DST a register
1880 or memory. If either operand is memory it must not have any side
1881 effects. */
1882 void
1883 aarch64_split_128bit_move (rtx dst, rtx src)
1885 rtx dst_lo, dst_hi;
1886 rtx src_lo, src_hi;
1888 machine_mode mode = GET_MODE (dst);
1890 gcc_assert (mode == TImode || mode == TFmode);
1891 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1892 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1894 if (REG_P (dst) && REG_P (src))
1896 int src_regno = REGNO (src);
1897 int dst_regno = REGNO (dst);
1899 /* Handle FP <-> GP regs. */
1900 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1902 src_lo = gen_lowpart (word_mode, src);
1903 src_hi = gen_highpart (word_mode, src);
1905 if (mode == TImode)
1907 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1908 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1910 else
1912 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1913 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1915 return;
1917 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1919 dst_lo = gen_lowpart (word_mode, dst);
1920 dst_hi = gen_highpart (word_mode, dst);
1922 if (mode == TImode)
1924 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1925 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1927 else
1929 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1930 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1932 return;
1936 dst_lo = gen_lowpart (word_mode, dst);
1937 dst_hi = gen_highpart (word_mode, dst);
1938 src_lo = gen_lowpart (word_mode, src);
1939 src_hi = gen_highpart_mode (word_mode, mode, src);
1941 /* At most one pairing may overlap. */
1942 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1944 aarch64_emit_move (dst_hi, src_hi);
1945 aarch64_emit_move (dst_lo, src_lo);
1947 else
1949 aarch64_emit_move (dst_lo, src_lo);
1950 aarch64_emit_move (dst_hi, src_hi);
1954 bool
1955 aarch64_split_128bit_move_p (rtx dst, rtx src)
1957 return (! REG_P (src)
1958 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1961 /* Split a complex SIMD combine. */
1963 void
1964 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1966 machine_mode src_mode = GET_MODE (src1);
1967 machine_mode dst_mode = GET_MODE (dst);
1969 gcc_assert (VECTOR_MODE_P (dst_mode));
1970 gcc_assert (register_operand (dst, dst_mode)
1971 && register_operand (src1, src_mode)
1972 && register_operand (src2, src_mode));
1974 rtx (*gen) (rtx, rtx, rtx);
1976 switch (src_mode)
1978 case E_V8QImode:
1979 gen = gen_aarch64_simd_combinev8qi;
1980 break;
1981 case E_V4HImode:
1982 gen = gen_aarch64_simd_combinev4hi;
1983 break;
1984 case E_V2SImode:
1985 gen = gen_aarch64_simd_combinev2si;
1986 break;
1987 case E_V4HFmode:
1988 gen = gen_aarch64_simd_combinev4hf;
1989 break;
1990 case E_V2SFmode:
1991 gen = gen_aarch64_simd_combinev2sf;
1992 break;
1993 case E_DImode:
1994 gen = gen_aarch64_simd_combinedi;
1995 break;
1996 case E_DFmode:
1997 gen = gen_aarch64_simd_combinedf;
1998 break;
1999 default:
2000 gcc_unreachable ();
2003 emit_insn (gen (dst, src1, src2));
2004 return;
2007 /* Split a complex SIMD move. */
2009 void
2010 aarch64_split_simd_move (rtx dst, rtx src)
2012 machine_mode src_mode = GET_MODE (src);
2013 machine_mode dst_mode = GET_MODE (dst);
2015 gcc_assert (VECTOR_MODE_P (dst_mode));
2017 if (REG_P (dst) && REG_P (src))
2019 rtx (*gen) (rtx, rtx);
2021 gcc_assert (VECTOR_MODE_P (src_mode));
2023 switch (src_mode)
2025 case E_V16QImode:
2026 gen = gen_aarch64_split_simd_movv16qi;
2027 break;
2028 case E_V8HImode:
2029 gen = gen_aarch64_split_simd_movv8hi;
2030 break;
2031 case E_V4SImode:
2032 gen = gen_aarch64_split_simd_movv4si;
2033 break;
2034 case E_V2DImode:
2035 gen = gen_aarch64_split_simd_movv2di;
2036 break;
2037 case E_V8HFmode:
2038 gen = gen_aarch64_split_simd_movv8hf;
2039 break;
2040 case E_V4SFmode:
2041 gen = gen_aarch64_split_simd_movv4sf;
2042 break;
2043 case E_V2DFmode:
2044 gen = gen_aarch64_split_simd_movv2df;
2045 break;
2046 default:
2047 gcc_unreachable ();
2050 emit_insn (gen (dst, src));
2051 return;
2055 bool
2056 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2057 machine_mode ymode, rtx y)
2059 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2060 gcc_assert (r != NULL);
2061 return rtx_equal_p (x, r);
2065 static rtx
2066 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2068 if (can_create_pseudo_p ())
2069 return force_reg (mode, value);
2070 else
2072 gcc_assert (x);
2073 aarch64_emit_move (x, value);
2074 return x;
2078 /* Return true if we can move VALUE into a register using a single
2079 CNT[BHWD] instruction. */
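/* For example (illustrative): a poly_int64 whose coefficients are both 4
   is the number of 32-bit elements in a vector and can be built with a
   single "cntw"; coefficients of 32 correspond to "cntb ..., all, mul #2";
   coefficients of 34 are rejected because they would need a multiplier
   of 17 with the smallest usable element size.  */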
2081 static bool
2082 aarch64_sve_cnt_immediate_p (poly_int64 value)
2084 HOST_WIDE_INT factor = value.coeffs[0];
2085 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2086 return (value.coeffs[1] == factor
2087 && IN_RANGE (factor, 2, 16 * 16)
2088 && (factor & 1) == 0
2089 && factor <= 16 * (factor & -factor));
2092 /* Likewise for rtx X. */
2094 bool
2095 aarch64_sve_cnt_immediate_p (rtx x)
2097 poly_int64 value;
2098 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2101 /* Return the asm string for an instruction with a CNT-like vector size
2102 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2103 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2104 first part of the operands template (the part that comes before the
2105 vector size itself). FACTOR is the number of quadwords.
2106 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2107 If it is zero, we can use any element size. */
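/* For instance (illustrative), a FACTOR of 8 with NELTS_PER_VQ of 0
   prints as "inch\t%x0" when PREFIX is "inc" and OPERANDS is "%x0",
   while a FACTOR of 64 prints as "incb\t%x0, all, mul #4".  */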
2109 static char *
2110 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2111 unsigned int factor,
2112 unsigned int nelts_per_vq)
2114 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2116 if (nelts_per_vq == 0)
2117 /* There is some overlap in the ranges of the four CNT instructions.
2118 Here we always use the smallest possible element size, so that the
2119 multiplier is 1 wherever possible. */
2120 nelts_per_vq = factor & -factor;
2121 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2122 gcc_assert (IN_RANGE (shift, 1, 4));
2123 char suffix = "dwhb"[shift - 1];
2125 factor >>= shift;
2126 unsigned int written;
2127 if (factor == 1)
2128 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2129 prefix, suffix, operands);
2130 else
2131 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2132 prefix, suffix, operands, factor);
2133 gcc_assert (written < sizeof (buffer));
2134 return buffer;
2137 /* Return the asm string for an instruction with a CNT-like vector size
2138 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2139 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2140 first part of the operands template (the part that comes before the
2141 vector size itself). X is the value of the vector size operand,
2142 as a polynomial integer rtx. */
2144 char *
2145 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2146 rtx x)
2148 poly_int64 value = rtx_to_poly_int64 (x);
2149 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2150 return aarch64_output_sve_cnt_immediate (prefix, operands,
2151 value.coeffs[1], 0);
2154 /* Return true if we can add VALUE to a register using a single ADDVL
2155 or ADDPL instruction. */
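/* Roughly: ADDVL covers [-32, 31] multiples of the vector length
   (FACTOR a multiple of 16 in [-512, 496]) and ADDPL covers [-32, 31]
   multiples of the predicate length (FACTOR a multiple of 2 in
   [-64, 62]).  */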
2157 static bool
2158 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2160 HOST_WIDE_INT factor = value.coeffs[0];
2161 if (factor == 0 || value.coeffs[1] != factor)
2162 return false;
2163 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2164 and a value of 16 is one vector width. */
2165 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2166 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2169 /* Likewise for rtx X. */
2171 bool
2172 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2174 poly_int64 value;
2175 return (poly_int_rtx_p (x, &value)
2176 && aarch64_sve_addvl_addpl_immediate_p (value));
2179 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2180 and storing the result in operand 0. */
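/* For example (illustrative), an OFFSET whose coefficients are both 32
   prints as "addvl\t%x0, %x1, #2" and coefficients of 6 print as
   "addpl\t%x0, %x1, #3", unless DEST equals BASE and an INC/DEC form
   can be used instead.  */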
2182 char *
2183 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2185 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2186 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2187 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2189 /* Use INC or DEC if possible. */
2190 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2192 if (aarch64_sve_cnt_immediate_p (offset_value))
2193 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2194 offset_value.coeffs[1], 0);
2195 if (aarch64_sve_cnt_immediate_p (-offset_value))
2196 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2197 -offset_value.coeffs[1], 0);
2200 int factor = offset_value.coeffs[1];
2201 if ((factor & 15) == 0)
2202 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2203 else
2204 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2205 return buffer;
2208 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2209 instruction. If it is, store the number of elements in each vector
2210 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2211 factor in *FACTOR_OUT (if nonnull). */
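/* For a vector of 32-bit elements (illustrative), NELTS_PER_VQ is 4,
   so a valid duplicated value has both coefficients equal to a multiple
   of 4 whose absolute value is between 4 and 64.  */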
2213 bool
2214 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2215 unsigned int *nelts_per_vq_out)
2217 rtx elt;
2218 poly_int64 value;
2220 if (!const_vec_duplicate_p (x, &elt)
2221 || !poly_int_rtx_p (elt, &value))
2222 return false;
2224 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2225 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2226 /* There's no vector INCB. */
2227 return false;
2229 HOST_WIDE_INT factor = value.coeffs[0];
2230 if (value.coeffs[1] != factor)
2231 return false;
2233 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2234 if ((factor % nelts_per_vq) != 0
2235 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2236 return false;
2238 if (factor_out)
2239 *factor_out = factor;
2240 if (nelts_per_vq_out)
2241 *nelts_per_vq_out = nelts_per_vq;
2242 return true;
2245 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2246 instruction. */
2248 bool
2249 aarch64_sve_inc_dec_immediate_p (rtx x)
2251 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2254 /* Return the asm template for an SVE vector INC or DEC instruction.
2255 OPERANDS gives the operands before the vector count and X is the
2256 value of the vector count operand itself. */
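/* E.g. a duplicated value of 8 in a vector of 32-bit elements prints
   as "incw" with ", all, mul #2" appended after OPERANDS
   (illustrative).  */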
2258 char *
2259 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2261 int factor;
2262 unsigned int nelts_per_vq;
2263 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2264 gcc_unreachable ();
2265 if (factor < 0)
2266 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2267 nelts_per_vq);
2268 else
2269 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2270 nelts_per_vq);
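/* Return the number of instructions needed to build integer immediate
   IMM of mode MODE in DEST, emitting them if GENERATE is true.  For
   example (illustrative), 0x12345678 in SImode needs two instructions:
   a MOVZ of 0x5678 followed by a MOVK of 0x1234 shifted left by 16.  */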
2273 static int
2274 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2275 scalar_int_mode mode)
2277 int i;
2278 unsigned HOST_WIDE_INT val, val2, mask;
2279 int one_match, zero_match;
2280 int num_insns;
2282 val = INTVAL (imm);
2284 if (aarch64_move_imm (val, mode))
2286 if (generate)
2287 emit_insn (gen_rtx_SET (dest, imm));
2288 return 1;
2291 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2292 (with XXXX non-zero). In that case check to see if the move can be done in
2293 a smaller mode. */
2294 val2 = val & 0xffffffff;
2295 if (mode == DImode
2296 && aarch64_move_imm (val2, SImode)
2297 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2299 if (generate)
2300 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2302 /* Check if we have to emit a second instruction by checking to see
2303 if any of the upper 32 bits of the original DI mode value is set. */
2304 if (val == val2)
2305 return 1;
2307 i = (val >> 48) ? 48 : 32;
2309 if (generate)
2310 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2311 GEN_INT ((val >> i) & 0xffff)));
2313 return 2;
2316 if ((val >> 32) == 0 || mode == SImode)
2318 if (generate)
2320 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2321 if (mode == SImode)
2322 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2323 GEN_INT ((val >> 16) & 0xffff)));
2324 else
2325 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2326 GEN_INT ((val >> 16) & 0xffff)));
2328 return 2;
2331 /* Remaining cases are all for DImode. */
2333 mask = 0xffff;
2334 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2335 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2336 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2337 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2339 if (zero_match != 2 && one_match != 2)
2341 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2342 For a 64-bit bitmask try whether changing 16 bits to all ones or
2343 zeroes creates a valid bitmask. To check any repeated bitmask,
2344 try using 16 bits from the other 32-bit half of val. */
2346 for (i = 0; i < 64; i += 16, mask <<= 16)
2348 val2 = val & ~mask;
2349 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2350 break;
2351 val2 = val | mask;
2352 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2353 break;
2354 val2 = val2 & ~mask;
2355 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2356 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2357 break;
2359 if (i != 64)
2361 if (generate)
2363 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2364 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2365 GEN_INT ((val >> i) & 0xffff)));
2367 return 2;
2371 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2372 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2373 otherwise skip zero bits. */
2375 num_insns = 1;
2376 mask = 0xffff;
2377 val2 = one_match > zero_match ? ~val : val;
2378 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2380 if (generate)
2381 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2382 ? (val | ~(mask << i))
2383 : (val & (mask << i)))));
2384 for (i += 16; i < 64; i += 16)
2386 if ((val2 & (mask << i)) == 0)
2387 continue;
2388 if (generate)
2389 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2390 GEN_INT ((val >> i) & 0xffff)));
2391 num_insns ++;
2394 return num_insns;
2397 /* Return whether imm is a 128-bit immediate which is simple enough to
2398 expand inline. */
2399 bool
2400 aarch64_mov128_immediate (rtx imm)
2402 if (GET_CODE (imm) == CONST_INT)
2403 return true;
2405 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2407 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2408 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2410 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2411 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2415 /* Return the number of temporary registers that aarch64_add_offset_1
2416 would need to add OFFSET to a register. */
2418 static unsigned int
2419 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2421 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2424 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2425 a non-polynomial OFFSET. MODE is the mode of the addition.
2426 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2427 be set and CFA adjustments added to the generated instructions.
2429 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2430 temporary if register allocation is already complete. This temporary
2431 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2432 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2433 the immediate again.
2435 Since this function may be used to adjust the stack pointer, we must
2436 ensure that it cannot cause transient stack deallocation (for example
2437 by first incrementing SP and then decrementing when adjusting by a
2438 large immediate). */
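/* For example (illustrative), an OFFSET of 0x123456 that cannot be
   loaded with a single move immediate is split into
   "add dest, src, #0x456" followed by "add dest, dest, #0x123000",
   both of which fit the shifted 12-bit immediate form.  */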
2440 static void
2441 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2442 rtx src, HOST_WIDE_INT offset, rtx temp1,
2443 bool frame_related_p, bool emit_move_imm)
2445 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2446 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2448 HOST_WIDE_INT moffset = abs_hwi (offset);
2449 rtx_insn *insn;
2451 if (!moffset)
2453 if (!rtx_equal_p (dest, src))
2455 insn = emit_insn (gen_rtx_SET (dest, src));
2456 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2458 return;
2461 /* Single instruction adjustment. */
2462 if (aarch64_uimm12_shift (moffset))
2464 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2465 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2466 return;
2469 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2470 and either:
2472 a) the offset cannot be loaded by a 16-bit move or
2473 b) there is no spare register into which we can move it. */
2474 if (moffset < 0x1000000
2475 && ((!temp1 && !can_create_pseudo_p ())
2476 || !aarch64_move_imm (moffset, mode)))
2478 HOST_WIDE_INT low_off = moffset & 0xfff;
2480 low_off = offset < 0 ? -low_off : low_off;
2481 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2482 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2483 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2484 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2485 return;
2488 /* Emit a move immediate if required and an addition/subtraction. */
2489 if (emit_move_imm)
2491 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2492 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2494 insn = emit_insn (offset < 0
2495 ? gen_sub3_insn (dest, src, temp1)
2496 : gen_add3_insn (dest, src, temp1));
2497 if (frame_related_p)
2499 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2500 rtx adj = plus_constant (mode, src, offset);
2501 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2505 /* Return the number of temporary registers that aarch64_add_offset
2506 would need to move OFFSET into a register or add OFFSET to a register;
2507 ADD_P is true if we want the latter rather than the former. */
2509 static unsigned int
2510 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2512 /* This follows the same structure as aarch64_add_offset. */
2513 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2514 return 0;
2516 unsigned int count = 0;
2517 HOST_WIDE_INT factor = offset.coeffs[1];
2518 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2519 poly_int64 poly_offset (factor, factor);
2520 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2521 /* Need one register for the ADDVL/ADDPL result. */
2522 count += 1;
2523 else if (factor != 0)
2525 factor = abs (factor);
2526 if (factor > 16 * (factor & -factor))
2527 /* Need one register for the CNT result and one for the multiplication
2528 factor. If necessary, the second temporary can be reused for the
2529 constant part of the offset. */
2530 return 2;
2531 /* Need one register for the CNT result (which might then
2532 be shifted). */
2533 count += 1;
2535 return count + aarch64_add_offset_1_temporaries (constant);
2538 /* If X can be represented as a poly_int64, return the number
2539 of temporaries that are required to add it to a register.
2540 Return -1 otherwise. */
2543 aarch64_add_offset_temporaries (rtx x)
2545 poly_int64 offset;
2546 if (!poly_int_rtx_p (x, &offset))
2547 return -1;
2548 return aarch64_offset_temporaries (true, offset);
2551 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2552 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2553 be set and CFA adjustments added to the generated instructions.
2555 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2556 temporary if register allocation is already complete. This temporary
2557 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2558 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2559 false to avoid emitting the immediate again.
2561 TEMP2, if nonnull, is a second temporary register that doesn't
2562 overlap either DEST or SRC.
2564 Since this function may be used to adjust the stack pointer, we must
2565 ensure that it cannot cause transient stack deallocation (for example
2566 by first incrementing SP and then decrementing when adjusting by a
2567 large immediate). */
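/* For example (illustrative, assuming SRC is a register rather than
   constant zero): adding an OFFSET equal to the size in bytes of one
   SVE vector plus 1 uses an ADDVL #1 for the variable part followed by
   an ADD #1 for the constant part.  */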
2569 static void
2570 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2571 poly_int64 offset, rtx temp1, rtx temp2,
2572 bool frame_related_p, bool emit_move_imm = true)
2574 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2575 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2576 gcc_assert (temp1 == NULL_RTX
2577 || !frame_related_p
2578 || !reg_overlap_mentioned_p (temp1, dest));
2579 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2581 /* Try using ADDVL or ADDPL to add the whole value. */
2582 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2584 rtx offset_rtx = gen_int_mode (offset, mode);
2585 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2586 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2587 return;
2590 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2591 SVE vector register, over and above the minimum size of 128 bits.
2592 This is equivalent to half the value returned by CNTD with a
2593 vector shape of ALL. */
2594 HOST_WIDE_INT factor = offset.coeffs[1];
2595 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2597 /* Try using ADDVL or ADDPL to add the VG-based part. */
2598 poly_int64 poly_offset (factor, factor);
2599 if (src != const0_rtx
2600 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2602 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2603 if (frame_related_p)
2605 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2606 RTX_FRAME_RELATED_P (insn) = true;
2607 src = dest;
2609 else
2611 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2612 src = aarch64_force_temporary (mode, temp1, addr);
2613 temp1 = temp2;
2614 temp2 = NULL_RTX;
2617 /* Otherwise use a CNT-based sequence. */
2618 else if (factor != 0)
2620 /* Use a subtraction if we have a negative factor. */
2621 rtx_code code = PLUS;
2622 if (factor < 0)
2624 factor = -factor;
2625 code = MINUS;
2628 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2629 into the multiplication. */
2630 rtx val;
2631 int shift = 0;
2632 if (factor & 1)
2633 /* Use a right shift by 1. */
2634 shift = -1;
2635 else
2636 factor /= 2;
2637 HOST_WIDE_INT low_bit = factor & -factor;
2638 if (factor <= 16 * low_bit)
2640 if (factor > 16 * 8)
2642 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2643 the value with the minimum multiplier and shift it into
2644 position. */
2645 int extra_shift = exact_log2 (low_bit);
2646 shift += extra_shift;
2647 factor >>= extra_shift;
2649 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2651 else
2653 /* Use CNTD, then multiply it by FACTOR. */
2654 val = gen_int_mode (poly_int64 (2, 2), mode);
2655 val = aarch64_force_temporary (mode, temp1, val);
2657 /* Go back to using a negative multiplication factor if we have
2658 no register from which to subtract. */
2659 if (code == MINUS && src == const0_rtx)
2661 factor = -factor;
2662 code = PLUS;
2664 rtx coeff1 = gen_int_mode (factor, mode);
2665 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2666 val = gen_rtx_MULT (mode, val, coeff1);
2669 if (shift > 0)
2671 /* Multiply by 1 << SHIFT. */
2672 val = aarch64_force_temporary (mode, temp1, val);
2673 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2675 else if (shift == -1)
2677 /* Divide by 2. */
2678 val = aarch64_force_temporary (mode, temp1, val);
2679 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2682 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2683 if (src != const0_rtx)
2685 val = aarch64_force_temporary (mode, temp1, val);
2686 val = gen_rtx_fmt_ee (code, mode, src, val);
2688 else if (code == MINUS)
2690 val = aarch64_force_temporary (mode, temp1, val);
2691 val = gen_rtx_NEG (mode, val);
2694 if (constant == 0 || frame_related_p)
2696 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2697 if (frame_related_p)
2699 RTX_FRAME_RELATED_P (insn) = true;
2700 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2701 gen_rtx_SET (dest, plus_constant (Pmode, src,
2702 poly_offset)));
2704 src = dest;
2705 if (constant == 0)
2706 return;
2708 else
2710 src = aarch64_force_temporary (mode, temp1, val);
2711 temp1 = temp2;
2712 temp2 = NULL_RTX;
2715 emit_move_imm = true;
2718 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2719 frame_related_p, emit_move_imm);
2722 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2723 than a poly_int64. */
2725 void
2726 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2727 rtx offset_rtx, rtx temp1, rtx temp2)
2729 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2730 temp1, temp2, false);
2733 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2734 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2735 if TEMP1 already contains abs (DELTA). */
2737 static inline void
2738 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2740 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2741 temp1, temp2, true, emit_move_imm);
2744 /* Subtract DELTA from the stack pointer, marking the instructions
2745 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2746 if nonnull. */
2748 static inline void
2749 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2751 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2752 temp1, temp2, frame_related_p);
2755 /* Set DEST to (vec_series BASE STEP). */
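/* Typically this maps onto the SVE INDEX instruction; e.g. BASE 0 and
   STEP 1 for a vector of 32-bit elements can be generated with
   "index z0.s, #0, #1" (illustrative register).  */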
2757 static void
2758 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2760 machine_mode mode = GET_MODE (dest);
2761 scalar_mode inner = GET_MODE_INNER (mode);
2763 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2764 if (!aarch64_sve_index_immediate_p (base))
2765 base = force_reg (inner, base);
2766 if (!aarch64_sve_index_immediate_p (step))
2767 step = force_reg (inner, step);
2769 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2772 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2773 integer of mode SRC_MODE. Return true on success. */
2775 static bool
2776 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2777 rtx src)
2779 /* If the constant is smaller than 128 bits, we can do the move
2780 using a vector of SRC_MODEs. */
2781 if (src_mode != TImode)
2783 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2784 GET_MODE_SIZE (src_mode));
2785 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2786 emit_move_insn (gen_lowpart (dup_mode, dest),
2787 gen_const_vec_duplicate (dup_mode, src));
2788 return true;
2791 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2792 src = force_const_mem (src_mode, src);
2793 if (!src)
2794 return false;
2796 /* Make sure that the address is legitimate. */
2797 if (!aarch64_sve_ld1r_operand_p (src))
2799 rtx addr = force_reg (Pmode, XEXP (src, 0));
2800 src = replace_equiv_address (src, addr);
2803 machine_mode mode = GET_MODE (dest);
2804 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2805 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2806 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2807 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2808 emit_insn (gen_rtx_SET (dest, src));
2809 return true;
2812 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2813 isn't a simple duplicate or series. */
2815 static void
2816 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2818 machine_mode mode = GET_MODE (src);
2819 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2820 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2821 gcc_assert (npatterns > 1);
2823 if (nelts_per_pattern == 1)
2825 /* The constant is a repeating sequence of at least two elements,
2826 where the repeating elements occupy no more than 128 bits.
2827 Get an integer representation of the replicated value. */
2828 scalar_int_mode int_mode;
2829 if (BYTES_BIG_ENDIAN)
2830 /* For now, always use LD1RQ to load the value on big-endian
2831 targets, since the handling of smaller integers includes a
2832 subreg that is semantically an element reverse. */
2833 int_mode = TImode;
2834 else
2836 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2837 gcc_assert (int_bits <= 128);
2838 int_mode = int_mode_for_size (int_bits, 0).require ();
2840 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2841 if (int_value
2842 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2843 return;
2846 /* Expand each pattern individually. */
2847 rtx_vector_builder builder;
2848 auto_vec<rtx, 16> vectors (npatterns);
2849 for (unsigned int i = 0; i < npatterns; ++i)
2851 builder.new_vector (mode, 1, nelts_per_pattern);
2852 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2853 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2854 vectors.quick_push (force_reg (mode, builder.build ()));
2857 /* Use permutes to interleave the separate vectors. */
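/* For example (illustrative), with four patterns replicated into
   vectors A, B, C and D, the first round computes ZIP1 (A, C) and
   ZIP1 (B, D) and the final round zips those two results, leaving the
   elements interleaved in their original A, B, C, D order.  */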
2858 while (npatterns > 1)
2860 npatterns /= 2;
2861 for (unsigned int i = 0; i < npatterns; ++i)
2863 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2864 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2865 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2866 vectors[i] = tmp;
2869 gcc_assert (vectors[0] == dest);
2872 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2873 is a pattern that can be used to set DEST to a replicated scalar
2874 element. */
2876 void
2877 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2878 rtx (*gen_vec_duplicate) (rtx, rtx))
2880 machine_mode mode = GET_MODE (dest);
2882 /* Check on what type of symbol it is. */
2883 scalar_int_mode int_mode;
2884 if ((GET_CODE (imm) == SYMBOL_REF
2885 || GET_CODE (imm) == LABEL_REF
2886 || GET_CODE (imm) == CONST
2887 || GET_CODE (imm) == CONST_POLY_INT)
2888 && is_a <scalar_int_mode> (mode, &int_mode))
2890 rtx mem;
2891 poly_int64 offset;
2892 HOST_WIDE_INT const_offset;
2893 enum aarch64_symbol_type sty;
2895 /* If we have (const (plus symbol offset)), separate out the offset
2896 before we start classifying the symbol. */
2897 rtx base = strip_offset (imm, &offset);
2899 /* We must always add an offset involving VL separately, rather than
2900 folding it into the relocation. */
2901 if (!offset.is_constant (&const_offset))
2903 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2904 emit_insn (gen_rtx_SET (dest, imm));
2905 else
2907 /* Do arithmetic on 32-bit values if the result is smaller
2908 than that. */
2909 if (partial_subreg_p (int_mode, SImode))
2911 /* It is invalid to do symbol calculations in modes
2912 narrower than SImode. */
2913 gcc_assert (base == const0_rtx);
2914 dest = gen_lowpart (SImode, dest);
2915 int_mode = SImode;
2917 if (base != const0_rtx)
2919 base = aarch64_force_temporary (int_mode, dest, base);
2920 aarch64_add_offset (int_mode, dest, base, offset,
2921 NULL_RTX, NULL_RTX, false);
2923 else
2924 aarch64_add_offset (int_mode, dest, base, offset,
2925 dest, NULL_RTX, false);
2927 return;
2930 sty = aarch64_classify_symbol (base, const_offset);
2931 switch (sty)
2933 case SYMBOL_FORCE_TO_MEM:
2934 if (const_offset != 0
2935 && targetm.cannot_force_const_mem (int_mode, imm))
2937 gcc_assert (can_create_pseudo_p ());
2938 base = aarch64_force_temporary (int_mode, dest, base);
2939 aarch64_add_offset (int_mode, dest, base, const_offset,
2940 NULL_RTX, NULL_RTX, false);
2941 return;
2944 mem = force_const_mem (ptr_mode, imm);
2945 gcc_assert (mem);
2947 /* If we aren't generating PC relative literals, then
2948 we need to expand the literal pool access carefully.
2949 This is something that needs to be done in a number
2950 of places, so could well live as a separate function. */
2951 if (!aarch64_pcrelative_literal_loads)
2953 gcc_assert (can_create_pseudo_p ());
2954 base = gen_reg_rtx (ptr_mode);
2955 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2956 if (ptr_mode != Pmode)
2957 base = convert_memory_address (Pmode, base);
2958 mem = gen_rtx_MEM (ptr_mode, base);
2961 if (int_mode != ptr_mode)
2962 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2964 emit_insn (gen_rtx_SET (dest, mem));
2966 return;
2968 case SYMBOL_SMALL_TLSGD:
2969 case SYMBOL_SMALL_TLSDESC:
2970 case SYMBOL_SMALL_TLSIE:
2971 case SYMBOL_SMALL_GOT_28K:
2972 case SYMBOL_SMALL_GOT_4G:
2973 case SYMBOL_TINY_GOT:
2974 case SYMBOL_TINY_TLSIE:
2975 if (const_offset != 0)
2977 gcc_assert(can_create_pseudo_p ());
2978 base = aarch64_force_temporary (int_mode, dest, base);
2979 aarch64_add_offset (int_mode, dest, base, const_offset,
2980 NULL_RTX, NULL_RTX, false);
2981 return;
2983 /* FALLTHRU */
2985 case SYMBOL_SMALL_ABSOLUTE:
2986 case SYMBOL_TINY_ABSOLUTE:
2987 case SYMBOL_TLSLE12:
2988 case SYMBOL_TLSLE24:
2989 case SYMBOL_TLSLE32:
2990 case SYMBOL_TLSLE48:
2991 aarch64_load_symref_appropriately (dest, imm, sty);
2992 return;
2994 default:
2995 gcc_unreachable ();
2999 if (!CONST_INT_P (imm))
3001 rtx base, step, value;
3002 if (GET_CODE (imm) == HIGH
3003 || aarch64_simd_valid_immediate (imm, NULL))
3004 emit_insn (gen_rtx_SET (dest, imm));
3005 else if (const_vec_series_p (imm, &base, &step))
3006 aarch64_expand_vec_series (dest, base, step);
3007 else if (const_vec_duplicate_p (imm, &value))
3009 /* If the constant is out of range of an SVE vector move,
3010 load it from memory if we can, otherwise move it into
3011 a register and use a DUP. */
3012 scalar_mode inner_mode = GET_MODE_INNER (mode);
3013 rtx op = force_const_mem (inner_mode, value);
3014 if (!op)
3015 op = force_reg (inner_mode, value);
3016 else if (!aarch64_sve_ld1r_operand_p (op))
3018 rtx addr = force_reg (Pmode, XEXP (op, 0));
3019 op = replace_equiv_address (op, addr);
3021 emit_insn (gen_vec_duplicate (dest, op));
3023 else if (GET_CODE (imm) == CONST_VECTOR
3024 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3025 aarch64_expand_sve_const_vector (dest, imm);
3026 else
3028 rtx mem = force_const_mem (mode, imm);
3029 gcc_assert (mem);
3030 emit_move_insn (dest, mem);
3033 return;
3036 aarch64_internal_mov_immediate (dest, imm, true,
3037 as_a <scalar_int_mode> (mode));
3040 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3041 that is known to contain PTRUE. */
3043 void
3044 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3046 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3047 gen_rtvec (2, pred, src),
3048 UNSPEC_MERGE_PTRUE)));
3051 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3052 operand is in memory. In this case we need to use the predicated LD1
3053 and ST1 instead of LDR and STR, both for correctness on big-endian
3054 targets and because LD1 and ST1 support a wider range of addressing modes.
3055 PRED_MODE is the mode of the predicate.
3057 See the comment at the head of aarch64-sve.md for details about the
3058 big-endian handling. */
3060 void
3061 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3063 machine_mode mode = GET_MODE (dest);
3064 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3065 if (!register_operand (src, mode)
3066 && !register_operand (dest, mode))
3068 rtx tmp = gen_reg_rtx (mode);
3069 if (MEM_P (src))
3070 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3071 else
3072 emit_move_insn (tmp, src);
3073 src = tmp;
3075 aarch64_emit_sve_pred_move (dest, ptrue, src);
3078 /* Called only on big-endian targets. See whether an SVE vector move
3079 from SRC to DEST is effectively a REV[BHW] instruction, because at
3080 least one operand is a subreg of an SVE vector that has wider or
3081 narrower elements. Return true and emit the instruction if so.
3083 For example:
3085 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3087 represents a VIEW_CONVERT between the following vectors, viewed
3088 in memory order:
3090 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3091 R1: { [0], [1], [2], [3], ... }
3093 The high part of lane X in R2 should therefore correspond to lane X*2
3094 of R1, but the register representations are:
3096 msb lsb
3097 R2: ...... [1].high [1].low [0].high [0].low
3098 R1: ...... [3] [2] [1] [0]
3100 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3101 We therefore need a reverse operation to swap the high and low values
3102 around.
3104 This is purely an optimization. Without it we would spill the
3105 subreg operand to the stack in one mode and reload it in the
3106 other mode, which has the same effect as the REV. */
3108 bool
3109 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3111 gcc_assert (BYTES_BIG_ENDIAN);
3112 if (GET_CODE (dest) == SUBREG)
3113 dest = SUBREG_REG (dest);
3114 if (GET_CODE (src) == SUBREG)
3115 src = SUBREG_REG (src);
3117 /* The optimization handles two single SVE REGs with different element
3118 sizes. */
3119 if (!REG_P (dest)
3120 || !REG_P (src)
3121 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3122 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3123 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3124 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3125 return false;
3127 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3128 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3129 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3130 UNSPEC_REV_SUBREG);
3131 emit_insn (gen_rtx_SET (dest, unspec));
3132 return true;
3135 /* Return a copy of X with mode MODE, without changing its other
3136 attributes. Unlike gen_lowpart, this doesn't care whether the
3137 mode change is valid. */
3139 static rtx
3140 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3142 if (GET_MODE (x) == mode)
3143 return x;
3145 x = shallow_copy_rtx (x);
3146 set_mode_and_regno (x, mode, REGNO (x));
3147 return x;
3150 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3151 operands. */
3153 void
3154 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3156 /* Decide which REV operation we need. The mode with narrower elements
3157 determines the mode of the operands and the mode with the wider
3158 elements determines the reverse width. */
3159 machine_mode mode_with_wider_elts = GET_MODE (dest);
3160 machine_mode mode_with_narrower_elts = GET_MODE (src);
3161 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3162 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3163 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3165 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3166 unsigned int unspec;
3167 if (wider_bytes == 8)
3168 unspec = UNSPEC_REV64;
3169 else if (wider_bytes == 4)
3170 unspec = UNSPEC_REV32;
3171 else if (wider_bytes == 2)
3172 unspec = UNSPEC_REV16;
3173 else
3174 gcc_unreachable ();
3175 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3177 /* Emit:
3179 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3180 UNSPEC_MERGE_PTRUE))
3182 with the appropriate modes. */
3183 ptrue = gen_lowpart (pred_mode, ptrue);
3184 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3185 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3186 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3187 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3188 UNSPEC_MERGE_PTRUE);
3189 emit_insn (gen_rtx_SET (dest, src));
3192 static bool
3193 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3194 tree exp ATTRIBUTE_UNUSED)
3196 /* Currently, always true. */
3197 return true;
3200 /* Implement TARGET_PASS_BY_REFERENCE. */
3202 static bool
3203 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3204 machine_mode mode,
3205 const_tree type,
3206 bool named ATTRIBUTE_UNUSED)
3208 HOST_WIDE_INT size;
3209 machine_mode dummymode;
3210 int nregs;
3212 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3213 if (mode == BLKmode && type)
3214 size = int_size_in_bytes (type);
3215 else
3216 /* No frontends can create types with variable-sized modes, so we
3217 shouldn't be asked to pass or return them. */
3218 size = GET_MODE_SIZE (mode).to_constant ();
3220 /* Aggregates are passed by reference based on their size. */
3221 if (type && AGGREGATE_TYPE_P (type))
3223 size = int_size_in_bytes (type);
3226 /* Variable sized arguments are always returned by reference. */
3227 if (size < 0)
3228 return true;
3230 /* Can this be a candidate to be passed in fp/simd register(s)? */
3231 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3232 &dummymode, &nregs,
3233 NULL))
3234 return false;
3236 /* Arguments which are variable sized or larger than 2 registers are
3237 passed by reference unless they are a homogeneous floating-point
3238 aggregate. */
3239 return size > 2 * UNITS_PER_WORD;
3242 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3243 static bool
3244 aarch64_return_in_msb (const_tree valtype)
3246 machine_mode dummy_mode;
3247 int dummy_int;
3249 /* Never happens in little-endian mode. */
3250 if (!BYTES_BIG_ENDIAN)
3251 return false;
3253 /* Only composite types smaller than or equal to 16 bytes can
3254 be potentially returned in registers. */
3255 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3256 || int_size_in_bytes (valtype) <= 0
3257 || int_size_in_bytes (valtype) > 16)
3258 return false;
3260 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3261 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3262 is always passed/returned in the least significant bits of fp/simd
3263 register(s). */
3264 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3265 &dummy_mode, &dummy_int, NULL))
3266 return false;
3268 return true;
3271 /* Implement TARGET_FUNCTION_VALUE.
3272 Define how to find the value returned by a function. */
3274 static rtx
3275 aarch64_function_value (const_tree type, const_tree func,
3276 bool outgoing ATTRIBUTE_UNUSED)
3278 machine_mode mode;
3279 int unsignedp;
3280 int count;
3281 machine_mode ag_mode;
3283 mode = TYPE_MODE (type);
3284 if (INTEGRAL_TYPE_P (type))
3285 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3287 if (aarch64_return_in_msb (type))
3289 HOST_WIDE_INT size = int_size_in_bytes (type);
3291 if (size % UNITS_PER_WORD != 0)
3293 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3294 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3298 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3299 &ag_mode, &count, NULL))
3301 if (!aarch64_composite_type_p (type, mode))
3303 gcc_assert (count == 1 && mode == ag_mode);
3304 return gen_rtx_REG (mode, V0_REGNUM);
3306 else
3308 int i;
3309 rtx par;
3311 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3312 for (i = 0; i < count; i++)
3314 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3315 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3316 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3317 XVECEXP (par, 0, i) = tmp;
3319 return par;
3322 else
3323 return gen_rtx_REG (mode, R0_REGNUM);
3326 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3327 Return true if REGNO is the number of a hard register in which the values
3328 of called function may come back. */
3330 static bool
3331 aarch64_function_value_regno_p (const unsigned int regno)
3333 /* A maximum of 16 bytes can be returned in the general registers. Examples
3334 of 16-byte return values are: 128-bit integers and 16-byte small
3335 structures (excluding homogeneous floating-point aggregates). */
3336 if (regno == R0_REGNUM || regno == R1_REGNUM)
3337 return true;
3339 /* Up to four fp/simd registers can return a function value, e.g. a
3340 homogeneous floating-point aggregate having four members. */
3341 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3342 return TARGET_FLOAT;
3344 return false;
3347 /* Implement TARGET_RETURN_IN_MEMORY.
3349 If the type T of the result of a function is such that
3350 void func (T arg)
3351 would require that arg be passed as a value in a register (or set of
3352 registers) according to the parameter passing rules, then the result
3353 is returned in the same registers as would be used for such an
3354 argument. */
3356 static bool
3357 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3359 HOST_WIDE_INT size;
3360 machine_mode ag_mode;
3361 int count;
3363 if (!AGGREGATE_TYPE_P (type)
3364 && TREE_CODE (type) != COMPLEX_TYPE
3365 && TREE_CODE (type) != VECTOR_TYPE)
3366 /* Simple scalar types are always returned in registers. */
3367 return false;
3369 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3370 type,
3371 &ag_mode,
3372 &count,
3373 NULL))
3374 return false;
3376 /* Types larger than 2 registers are returned in memory. */
3377 size = int_size_in_bytes (type);
3378 return (size < 0 || size > 2 * UNITS_PER_WORD);
3381 static bool
3382 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3383 const_tree type, int *nregs)
3385 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3386 return aarch64_vfp_is_call_or_return_candidate (mode,
3387 type,
3388 &pcum->aapcs_vfp_rmode,
3389 nregs,
3390 NULL);
3393 /* Given MODE and TYPE of a function argument, return the alignment in
3394 bits. The idea is to suppress any stronger alignment requested by
3395 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3396 This is a helper function for local use only. */
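/* Roughly: a struct whose most strictly aligned field is a double is
   treated as having 64-bit alignment here, even if the struct type
   itself carries a larger user-specified alignment (illustrative).  */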
3398 static unsigned int
3399 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3401 if (!type)
3402 return GET_MODE_ALIGNMENT (mode);
3404 if (integer_zerop (TYPE_SIZE (type)))
3405 return 0;
3407 gcc_assert (TYPE_MODE (type) == mode);
3409 if (!AGGREGATE_TYPE_P (type))
3410 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3412 if (TREE_CODE (type) == ARRAY_TYPE)
3413 return TYPE_ALIGN (TREE_TYPE (type));
3415 unsigned int alignment = 0;
3416 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3417 if (TREE_CODE (field) == FIELD_DECL)
3418 alignment = std::max (alignment, DECL_ALIGN (field));
3420 return alignment;
3423 /* Layout a function argument according to the AAPCS64 rules. The rule
3424 numbers refer to the rule numbers in the AAPCS64. */
3426 static void
3427 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3428 const_tree type,
3429 bool named ATTRIBUTE_UNUSED)
3431 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3432 int ncrn, nvrn, nregs;
3433 bool allocate_ncrn, allocate_nvrn;
3434 HOST_WIDE_INT size;
3436 /* We need to do this once per argument. */
3437 if (pcum->aapcs_arg_processed)
3438 return;
3440 pcum->aapcs_arg_processed = true;
3442 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3443 if (type)
3444 size = int_size_in_bytes (type);
3445 else
3446 /* No frontends can create types with variable-sized modes, so we
3447 shouldn't be asked to pass or return them. */
3448 size = GET_MODE_SIZE (mode).to_constant ();
3449 size = ROUND_UP (size, UNITS_PER_WORD);
3451 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3452 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3453 mode,
3454 type,
3455 &nregs);
3457 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3458 The following code thus handles passing by SIMD/FP registers first. */
3460 nvrn = pcum->aapcs_nvrn;
3462 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3463 and homogeneous short-vector aggregates (HVA). */
3464 if (allocate_nvrn)
3466 if (!TARGET_FLOAT)
3467 aarch64_err_no_fpadvsimd (mode, "argument");
3469 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3471 pcum->aapcs_nextnvrn = nvrn + nregs;
3472 if (!aarch64_composite_type_p (type, mode))
3474 gcc_assert (nregs == 1);
3475 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3477 else
3479 rtx par;
3480 int i;
3481 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3482 for (i = 0; i < nregs; i++)
3484 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3485 V0_REGNUM + nvrn + i);
3486 rtx offset = gen_int_mode
3487 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3488 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3489 XVECEXP (par, 0, i) = tmp;
3491 pcum->aapcs_reg = par;
3493 return;
3495 else
3497 /* C.3 NSRN is set to 8. */
3498 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3499 goto on_stack;
3503 ncrn = pcum->aapcs_ncrn;
3504 nregs = size / UNITS_PER_WORD;
3506 /* C6 - C9, though the sign and zero extension semantics are
3507 handled elsewhere. This is the case where the argument fits
3508 entirely in general registers. */
3509 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3512 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3514 /* C.8 if the argument has an alignment of 16 then the NGRN is
3515 rounded up to the next even number. */
3516 if (nregs == 2
3517 && ncrn % 2
3518 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3519 comparison is there because for > 16 * BITS_PER_UNIT
3520 alignment nregs should be > 2 and therefore it should be
3521 passed by reference rather than value. */
3522 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3524 ++ncrn;
3525 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3528 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3529 A reg is still generated for it, but the caller should be smart
3530 enough not to use it. */
3531 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3532 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3533 else
3535 rtx par;
3536 int i;
3538 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3539 for (i = 0; i < nregs; i++)
3541 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3542 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3543 GEN_INT (i * UNITS_PER_WORD));
3544 XVECEXP (par, 0, i) = tmp;
3546 pcum->aapcs_reg = par;
3549 pcum->aapcs_nextncrn = ncrn + nregs;
3550 return;
3553 /* C.11 */
3554 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3556 /* The argument is passed on stack; record the needed number of words for
3557 this argument and align the total size if necessary. */
3558 on_stack:
3559 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3561 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3562 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3563 16 / UNITS_PER_WORD);
3564 return;
3567 /* Implement TARGET_FUNCTION_ARG. */
3569 static rtx
3570 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3571 const_tree type, bool named)
3573 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3574 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3576 if (mode == VOIDmode)
3577 return NULL_RTX;
3579 aarch64_layout_arg (pcum_v, mode, type, named);
3580 return pcum->aapcs_reg;
3583 void
3584 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3585 const_tree fntype ATTRIBUTE_UNUSED,
3586 rtx libname ATTRIBUTE_UNUSED,
3587 const_tree fndecl ATTRIBUTE_UNUSED,
3588 unsigned n_named ATTRIBUTE_UNUSED)
3590 pcum->aapcs_ncrn = 0;
3591 pcum->aapcs_nvrn = 0;
3592 pcum->aapcs_nextncrn = 0;
3593 pcum->aapcs_nextnvrn = 0;
3594 pcum->pcs_variant = ARM_PCS_AAPCS64;
3595 pcum->aapcs_reg = NULL_RTX;
3596 pcum->aapcs_arg_processed = false;
3597 pcum->aapcs_stack_words = 0;
3598 pcum->aapcs_stack_size = 0;
3600 if (!TARGET_FLOAT
3601 && fndecl && TREE_PUBLIC (fndecl)
3602 && fntype && fntype != error_mark_node)
3604 const_tree type = TREE_TYPE (fntype);
3605 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3606 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3607 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3608 &mode, &nregs, NULL))
3609 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3611 return;
3614 static void
3615 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3616 machine_mode mode,
3617 const_tree type,
3618 bool named)
3620 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3621 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3623 aarch64_layout_arg (pcum_v, mode, type, named);
3624 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3625 != (pcum->aapcs_stack_words != 0));
3626 pcum->aapcs_arg_processed = false;
3627 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3628 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3629 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3630 pcum->aapcs_stack_words = 0;
3631 pcum->aapcs_reg = NULL_RTX;
3635 bool
3636 aarch64_function_arg_regno_p (unsigned regno)
3638 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3639 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3642 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3643 PARM_BOUNDARY bits of alignment, but will be given anything up
3644 to STACK_BOUNDARY bits if the type requires it. This makes sure
3645 that both before and after the layout of each argument, the Next
3646 Stacked Argument Address (NSAA) will have a minimum alignment of
3647 8 bytes. */
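/* With the usual 64-bit PARM_BOUNDARY and 128-bit STACK_BOUNDARY
   (illustrative), a plain int argument gets 64 bits of alignment while
   a 16-byte-aligned aggregate gets 128 bits.  */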
3649 static unsigned int
3650 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3652 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3653 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3656 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3658 static fixed_size_mode
3659 aarch64_get_reg_raw_mode (int regno)
3661 if (TARGET_SVE && FP_REGNUM_P (regno))
3662 /* Don't use the SVE part of the register for __builtin_apply and
3663 __builtin_return. The SVE registers aren't used by the normal PCS,
3664 so using them there would be a waste of time. The PCS extensions
3665 for SVE types are fundamentally incompatible with the
3666 __builtin_return/__builtin_apply interface. */
3667 return as_a <fixed_size_mode> (V16QImode);
3668 return default_get_reg_raw_mode (regno);
3671 /* Implement TARGET_FUNCTION_ARG_PADDING.
3673 Small aggregate types are placed in the lowest memory address.
3675 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3677 static pad_direction
3678 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3680 /* On little-endian targets, the least significant byte of every stack
3681 argument is passed at the lowest byte address of the stack slot. */
3682 if (!BYTES_BIG_ENDIAN)
3683 return PAD_UPWARD;
3685 /* Otherwise, integral, floating-point and pointer types are padded downward:
3686 the least significant byte of a stack argument is passed at the highest
3687 byte address of the stack slot. */
3688 if (type
3689 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3690 || POINTER_TYPE_P (type))
3691 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3692 return PAD_DOWNWARD;
3694 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3695 return PAD_UPWARD;
3698 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3700 It specifies padding for the last (may also be the only)
3701 element of a block move between registers and memory. Assuming
3702 the block is in memory, padding upward means that the last
3703 element is padded after its most significant byte, while with
3704 downward padding the last element is padded at its least
3705 significant byte side.
3707 Small aggregates and small complex types are always padded
3708 upwards.
3710 We don't need to worry about homogeneous floating-point or
3711 short-vector aggregates; their move is not affected by the
3712 padding direction determined here. Regardless of endianness,
3713 each element of such an aggregate is put in the least
3714 significant bits of a fp/simd register.
3716 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3717 register has useful data, and return the opposite if the most
3718 significant byte does. */
3720 bool
3721 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3722 bool first ATTRIBUTE_UNUSED)
3725 /* Small composite types are always padded upward. */
3726 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3728 HOST_WIDE_INT size;
3729 if (type)
3730 size = int_size_in_bytes (type);
3731 else
3732 /* No frontends can create types with variable-sized modes, so we
3733 shouldn't be asked to pass or return them. */
3734 size = GET_MODE_SIZE (mode).to_constant ();
3735 if (size < 2 * UNITS_PER_WORD)
3736 return true;
3739 /* Otherwise, use the default padding. */
3740 return !BYTES_BIG_ENDIAN;
3743 static scalar_int_mode
3744 aarch64_libgcc_cmp_return_mode (void)
3746 return SImode;
3749 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3751 /* We use the 12-bit shifted immediate arithmetic instructions so values
3752 must be multiple of (1 << 12), i.e. 4096. */
3753 #define ARITH_FACTOR 4096
3755 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3756 #error Cannot use simple address calculation for stack probing
3757 #endif
3759 /* The pair of scratch registers used for stack probing. */
3760 #define PROBE_STACK_FIRST_REG 9
3761 #define PROBE_STACK_SECOND_REG 10
3763 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3764 inclusive. These are offsets from the current stack pointer. */
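/* For example (illustrative), with a 4 KiB probe interval, a FIRST of
   0 and a SIZE of 10 KiB, probes are emitted 4 KiB, 8 KiB and 10 KiB
   below the stack pointer.  */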
3766 static void
3767 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3769 HOST_WIDE_INT size;
3770 if (!poly_size.is_constant (&size))
3772 sorry ("stack probes for SVE frames");
3773 return;
3776 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3778 /* See the same assertion on PROBE_INTERVAL above. */
3779 gcc_assert ((first % ARITH_FACTOR) == 0);
3781 /* See if we have a constant small number of probes to generate. If so,
3782 that's the easy case. */
3783 if (size <= PROBE_INTERVAL)
3785 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3787 emit_set_insn (reg1,
3788 plus_constant (Pmode,
3789 stack_pointer_rtx, -(first + base)));
3790 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3793 /* The run-time loop is made up of 8 insns in the generic case while the
3794 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
3795 else if (size <= 4 * PROBE_INTERVAL)
3797 HOST_WIDE_INT i, rem;
3799 emit_set_insn (reg1,
3800 plus_constant (Pmode,
3801 stack_pointer_rtx,
3802 -(first + PROBE_INTERVAL)));
3803 emit_stack_probe (reg1);
3805 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3806 it exceeds SIZE. If only two probes are needed, this will not
3807 generate any code. Then probe at FIRST + SIZE. */
3808 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3810 emit_set_insn (reg1,
3811 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3812 emit_stack_probe (reg1);
3815 rem = size - (i - PROBE_INTERVAL);
3816 if (rem > 256)
3818 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3820 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3821 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3823 else
3824 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3827 /* Otherwise, do the same as above, but in a loop. Note that we must be
3828 extra careful with variables wrapping around because we might be at
3829 the very top (or the very bottom) of the address space and we have
3830 to be able to handle this case properly; in particular, we use an
3831 equality test for the loop condition. */
3832 else
3834 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3836 /* Step 1: round SIZE to the previous multiple of the interval. */
3838 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3841 /* Step 2: compute initial and final value of the loop counter. */
3843 /* TEST_ADDR = SP + FIRST. */
3844 emit_set_insn (reg1,
3845 plus_constant (Pmode, stack_pointer_rtx, -first));
3847 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3848 HOST_WIDE_INT adjustment = - (first + rounded_size);
3849 if (! aarch64_uimm12_shift (adjustment))
3851 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3852 true, Pmode);
3853 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3855 else
3856 emit_set_insn (reg2,
3857 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3859 /* Step 3: the loop
3863 do { TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3864 probe at TEST_ADDR }
3866 while (TEST_ADDR != LAST_ADDR)
3868 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3869 until it is equal to ROUNDED_SIZE. */
3871 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3874 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3875 that SIZE is equal to ROUNDED_SIZE. */
3877 if (size != rounded_size)
3879 HOST_WIDE_INT rem = size - rounded_size;
3881 if (rem > 256)
3883 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3885 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3886 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3888 else
3889 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3893 /* Make sure nothing is scheduled before we are done. */
3894 emit_insn (gen_blockage ());
3897 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3898 absolute addresses. */
3900 const char *
3901 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3903 static int labelno = 0;
3904 char loop_lab[32];
3905 rtx xops[2];
3907 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3909 /* Loop. */
3910 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3912 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3913 xops[0] = reg1;
3914 xops[1] = GEN_INT (PROBE_INTERVAL);
3915 output_asm_insn ("sub\t%0, %0, %1", xops);
3917 /* Probe at TEST_ADDR. */
3918 output_asm_insn ("str\txzr, [%0]", xops);
3920 /* Test if TEST_ADDR == LAST_ADDR. */
3921 xops[1] = reg2;
3922 output_asm_insn ("cmp\t%0, %1", xops);
3924 /* Branch. */
3925 fputs ("\tb.ne\t", asm_out_file);
3926 assemble_name_raw (asm_out_file, loop_lab);
3927 fputc ('\n', asm_out_file);
3929 return "";
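
/* A sketch of the assembly the routine above emits, assuming PROBE_INTERVAL
   is 4096 and the scratch registers are x9/x10 (PROBE_STACK_FIRST_REG and
   PROBE_STACK_SECOND_REG); the label name is illustrative only:

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0
*/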
3932 /* Mark the registers that need to be saved by the callee and calculate
3933 the size of the callee-saved registers area and frame record (both FP
3934 and LR may be omitted). */
3935 static void
3936 aarch64_layout_frame (void)
3938 HOST_WIDE_INT offset = 0;
3939 int regno, last_fp_reg = INVALID_REGNUM;
3941 if (reload_completed && cfun->machine->frame.laid_out)
3942 return;
3944 /* Force a frame chain for EH returns so the return address is at FP+8. */
3945 cfun->machine->frame.emit_frame_chain
3946 = frame_pointer_needed || crtl->calls_eh_return;
3948 /* Emit a frame chain if the frame pointer is enabled.
3949 If -momit-leaf-frame-pointer is used, do not use a frame chain
3950 in leaf functions which do not use LR. */
3951 if (flag_omit_frame_pointer == 2
3952 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3953 && !df_regs_ever_live_p (LR_REGNUM)))
3954 cfun->machine->frame.emit_frame_chain = true;
3956 #define SLOT_NOT_REQUIRED (-2)
3957 #define SLOT_REQUIRED (-1)
3959 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3960 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3962 /* First mark all the registers that really need to be saved... */
3963 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3964 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3966 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3967 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3969 /* ... that includes the eh data registers (if needed)... */
3970 if (crtl->calls_eh_return)
3971 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3972 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3973 = SLOT_REQUIRED;
3975 /* ... and any callee saved register that dataflow says is live. */
3976 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3977 if (df_regs_ever_live_p (regno)
3978 && (regno == R30_REGNUM
3979 || !call_used_regs[regno]))
3980 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3982 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3983 if (df_regs_ever_live_p (regno)
3984 && !call_used_regs[regno])
3986 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3987 last_fp_reg = regno;
3990 if (cfun->machine->frame.emit_frame_chain)
3992 /* FP and LR are placed in the linkage record. */
3993 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
3994 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
3995 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
3996 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
3997 offset = 2 * UNITS_PER_WORD;
4000 /* Now assign stack slots for them. */
4001 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4002 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4004 cfun->machine->frame.reg_offset[regno] = offset;
4005 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4006 cfun->machine->frame.wb_candidate1 = regno;
4007 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4008 cfun->machine->frame.wb_candidate2 = regno;
4009 offset += UNITS_PER_WORD;
4012 HOST_WIDE_INT max_int_offset = offset;
4013 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4014 bool has_align_gap = offset != max_int_offset;
4016 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4017 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4019 /* If there is an alignment gap between integer and fp callee-saves,
4020 allocate the last fp register to it if possible. */
4021 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4023 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4024 break;
4027 cfun->machine->frame.reg_offset[regno] = offset;
4028 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4029 cfun->machine->frame.wb_candidate1 = regno;
4030 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4031 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4032 cfun->machine->frame.wb_candidate2 = regno;
4033 offset += UNITS_PER_WORD;
4036 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4038 cfun->machine->frame.saved_regs_size = offset;
4040 HOST_WIDE_INT varargs_and_saved_regs_size
4041 = offset + cfun->machine->frame.saved_varargs_size;
4043 cfun->machine->frame.hard_fp_offset
4044 = aligned_upper_bound (varargs_and_saved_regs_size
4045 + get_frame_size (),
4046 STACK_BOUNDARY / BITS_PER_UNIT);
4048 /* Both these values are already aligned. */
4049 gcc_assert (multiple_p (crtl->outgoing_args_size,
4050 STACK_BOUNDARY / BITS_PER_UNIT));
4051 cfun->machine->frame.frame_size
4052 = (cfun->machine->frame.hard_fp_offset
4053 + crtl->outgoing_args_size);
4055 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4057 cfun->machine->frame.initial_adjust = 0;
4058 cfun->machine->frame.final_adjust = 0;
4059 cfun->machine->frame.callee_adjust = 0;
4060 cfun->machine->frame.callee_offset = 0;
4062 HOST_WIDE_INT max_push_offset = 0;
4063 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4064 max_push_offset = 512;
4065 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4066 max_push_offset = 256;
4068 HOST_WIDE_INT const_size, const_fp_offset;
4069 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4070 && const_size < max_push_offset
4071 && known_eq (crtl->outgoing_args_size, 0))
4073 /* Simple, small frame with no outgoing arguments:
4074 stp reg1, reg2, [sp, -frame_size]!
4075 stp reg3, reg4, [sp, 16] */
4076 cfun->machine->frame.callee_adjust = const_size;
4078 else if (known_lt (crtl->outgoing_args_size
4079 + cfun->machine->frame.saved_regs_size, 512)
4080 && !(cfun->calls_alloca
4081 && known_lt (cfun->machine->frame.hard_fp_offset,
4082 max_push_offset)))
4084 /* Frame with small outgoing arguments:
4085 sub sp, sp, frame_size
4086 stp reg1, reg2, [sp, outgoing_args_size]
4087 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4088 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4089 cfun->machine->frame.callee_offset
4090 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4092 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4093 && const_fp_offset < max_push_offset)
4095 /* Frame with large outgoing arguments but a small local area:
4096 stp reg1, reg2, [sp, -hard_fp_offset]!
4097 stp reg3, reg4, [sp, 16]
4098 sub sp, sp, outgoing_args_size */
4099 cfun->machine->frame.callee_adjust = const_fp_offset;
4100 cfun->machine->frame.final_adjust
4101 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4103 else
4105 /* Frame with large local area and outgoing arguments using frame pointer:
4106 sub sp, sp, hard_fp_offset
4107 stp x29, x30, [sp, 0]
4108 add x29, sp, 0
4109 stp reg3, reg4, [sp, 16]
4110 sub sp, sp, outgoing_args_size */
4111 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4112 cfun->machine->frame.final_adjust
4113 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4116 cfun->machine->frame.laid_out = true;
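
/* A worked example of the case selection above, with illustrative numbers:
   a function that needs a frame chain, saves x19/x20, has 16 bytes of
   locals and no outgoing arguments gets reg_offset[x29] = 0,
   reg_offset[x30] = 8, reg_offset[x19] = 16, reg_offset[x20] = 24,
   saved_regs_size = 32 and hard_fp_offset = frame_size = 48.  Since
   48 < 512 and there are no outgoing arguments, callee_adjust = 48 and
   the prologue comes out roughly as
	stp	x29, x30, [sp, -48]!
	stp	x19, x20, [sp, 16]
   plus the frame-chain set-up of x29.  */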
4119 /* Return true if the register REGNO is saved on entry to
4120 the current function. */
4122 static bool
4123 aarch64_register_saved_on_entry (int regno)
4125 return cfun->machine->frame.reg_offset[regno] >= 0;
4128 /* Return the next register up from REGNO up to LIMIT for the callee
4129 to save. */
4131 static unsigned
4132 aarch64_next_callee_save (unsigned regno, unsigned limit)
4134 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4135 regno ++;
4136 return regno;
4139 /* Push the register number REGNO of mode MODE to the stack with write-back
4140 adjusting the stack by ADJUSTMENT. */
4142 static void
4143 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4144 HOST_WIDE_INT adjustment)
4146 rtx base_rtx = stack_pointer_rtx;
4147 rtx insn, reg, mem;
4149 reg = gen_rtx_REG (mode, regno);
4150 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4151 plus_constant (Pmode, base_rtx, -adjustment));
4152 mem = gen_frame_mem (mode, mem);
4154 insn = emit_move_insn (mem, reg);
4155 RTX_FRAME_RELATED_P (insn) = 1;
4158 /* Generate and return an instruction to store the pair of registers
4159 REG and REG2 of mode MODE to location BASE with write-back adjusting
4160 the stack location BASE by ADJUSTMENT. */
4162 static rtx
4163 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4164 HOST_WIDE_INT adjustment)
4166 switch (mode)
4168 case E_DImode:
4169 return gen_storewb_pairdi_di (base, base, reg, reg2,
4170 GEN_INT (-adjustment),
4171 GEN_INT (UNITS_PER_WORD - adjustment));
4172 case E_DFmode:
4173 return gen_storewb_pairdf_di (base, base, reg, reg2,
4174 GEN_INT (-adjustment),
4175 GEN_INT (UNITS_PER_WORD - adjustment));
4176 default:
4177 gcc_unreachable ();
4181 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4182 stack pointer by ADJUSTMENT. */
4184 static void
4185 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4187 rtx_insn *insn;
4188 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4190 if (regno2 == INVALID_REGNUM)
4191 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4193 rtx reg1 = gen_rtx_REG (mode, regno1);
4194 rtx reg2 = gen_rtx_REG (mode, regno2);
4196 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4197 reg2, adjustment));
4198 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4199 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4200 RTX_FRAME_RELATED_P (insn) = 1;
4203 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4204 adjusting it by ADJUSTMENT afterwards. */
4206 static rtx
4207 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4208 HOST_WIDE_INT adjustment)
4210 switch (mode)
4212 case E_DImode:
4213 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4214 GEN_INT (UNITS_PER_WORD));
4215 case E_DFmode:
4216 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4217 GEN_INT (UNITS_PER_WORD));
4218 default:
4219 gcc_unreachable ();
4223 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4224 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4225 into CFI_OPS. */
4227 static void
4228 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4229 rtx *cfi_ops)
4231 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4232 rtx reg1 = gen_rtx_REG (mode, regno1);
4234 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4236 if (regno2 == INVALID_REGNUM)
4238 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4239 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4240 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4242 else
4244 rtx reg2 = gen_rtx_REG (mode, regno2);
4245 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4246 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4247 reg2, adjustment));
4251 /* Generate and return a store pair instruction of mode MODE to store
4252 register REG1 to MEM1 and register REG2 to MEM2. */
4254 static rtx
4255 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4256 rtx reg2)
4258 switch (mode)
4260 case E_DImode:
4261 return gen_store_pairdi (mem1, reg1, mem2, reg2);
4263 case E_DFmode:
4264 return gen_store_pairdf (mem1, reg1, mem2, reg2);
4266 default:
4267 gcc_unreachable ();
4271 /* Generate and return a load pair instruction of mode MODE to load register
4272 REG1 from MEM1 and register REG2 from MEM2. */
4274 static rtx
4275 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4276 rtx mem2)
4278 switch (mode)
4280 case E_DImode:
4281 return gen_load_pairdi (reg1, mem1, reg2, mem2);
4283 case E_DFmode:
4284 return gen_load_pairdf (reg1, mem1, reg2, mem2);
4286 default:
4287 gcc_unreachable ();
4291 /* Return TRUE if return address signing should be enabled for the current
4292 function, otherwise return FALSE. */
4294 bool
4295 aarch64_return_address_signing_enabled (void)
4297 /* This function should only be called after the frame is laid out. */
4298 gcc_assert (cfun->machine->frame.laid_out);
4300 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4301 function if its LR is pushed onto the stack. */
4302 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4303 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4304 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4307 /* Emit code to save the callee-saved registers from register number START
4308 to LIMIT to the stack at the location starting at offset START_OFFSET,
4309 skipping any write-back candidates if SKIP_WB is true. */
4311 static void
4312 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4313 unsigned start, unsigned limit, bool skip_wb)
4315 rtx_insn *insn;
4316 unsigned regno;
4317 unsigned regno2;
4319 for (regno = aarch64_next_callee_save (start, limit);
4320 regno <= limit;
4321 regno = aarch64_next_callee_save (regno + 1, limit))
4323 rtx reg, mem;
4324 poly_int64 offset;
4326 if (skip_wb
4327 && (regno == cfun->machine->frame.wb_candidate1
4328 || regno == cfun->machine->frame.wb_candidate2))
4329 continue;
4331 if (cfun->machine->reg_is_wrapped_separately[regno])
4332 continue;
4334 reg = gen_rtx_REG (mode, regno);
4335 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4336 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4337 offset));
4339 regno2 = aarch64_next_callee_save (regno + 1, limit);
4341 if (regno2 <= limit
4342 && !cfun->machine->reg_is_wrapped_separately[regno2]
4343 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4344 == cfun->machine->frame.reg_offset[regno2]))
4347 rtx reg2 = gen_rtx_REG (mode, regno2);
4348 rtx mem2;
4350 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4351 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4352 offset));
4353 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4354 reg2));
4356 /* The first part of a frame-related parallel insn is
4357 always assumed to be relevant to the frame
4358 calculations; subsequent parts are only
4359 frame-related if explicitly marked. */
4360 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4361 regno = regno2;
4363 else
4364 insn = emit_move_insn (mem, reg);
4366 RTX_FRAME_RELATED_P (insn) = 1;
4370 /* Emit code to restore the callee registers of mode MODE from register
4371 number START up to and including LIMIT. Restore from the stack offset
4372 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4373 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4375 static void
4376 aarch64_restore_callee_saves (machine_mode mode,
4377 poly_int64 start_offset, unsigned start,
4378 unsigned limit, bool skip_wb, rtx *cfi_ops)
4380 rtx base_rtx = stack_pointer_rtx;
4381 unsigned regno;
4382 unsigned regno2;
4383 poly_int64 offset;
4385 for (regno = aarch64_next_callee_save (start, limit);
4386 regno <= limit;
4387 regno = aarch64_next_callee_save (regno + 1, limit))
4389 if (cfun->machine->reg_is_wrapped_separately[regno])
4390 continue;
4392 rtx reg, mem;
4394 if (skip_wb
4395 && (regno == cfun->machine->frame.wb_candidate1
4396 || regno == cfun->machine->frame.wb_candidate2))
4397 continue;
4399 reg = gen_rtx_REG (mode, regno);
4400 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4401 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4403 regno2 = aarch64_next_callee_save (regno + 1, limit);
4405 if (regno2 <= limit
4406 && !cfun->machine->reg_is_wrapped_separately[regno2]
4407 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4408 == cfun->machine->frame.reg_offset[regno2]))
4410 rtx reg2 = gen_rtx_REG (mode, regno2);
4411 rtx mem2;
4413 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4414 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4415 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4417 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4418 regno = regno2;
4420 else
4421 emit_move_insn (reg, mem);
4422 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4426 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4427 of MODE. */
4429 static inline bool
4430 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4432 HOST_WIDE_INT multiple;
4433 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4434 && IN_RANGE (multiple, -8, 7));
4437 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4438 of MODE. */
4440 static inline bool
4441 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4443 HOST_WIDE_INT multiple;
4444 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4445 && IN_RANGE (multiple, 0, 63));
4448 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4449 of MODE. */
4451 bool
4452 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4454 HOST_WIDE_INT multiple;
4455 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4456 && IN_RANGE (multiple, -64, 63));
4459 /* Return true if OFFSET is a signed 9-bit value. */
4461 static inline bool
4462 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4463 poly_int64 offset)
4465 HOST_WIDE_INT const_offset;
4466 return (offset.is_constant (&const_offset)
4467 && IN_RANGE (const_offset, -256, 255));
4470 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4471 of MODE. */
4473 static inline bool
4474 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4476 HOST_WIDE_INT multiple;
4477 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4478 && IN_RANGE (multiple, -256, 255));
4481 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4482 of MODE. */
4484 static inline bool
4485 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4487 HOST_WIDE_INT multiple;
4488 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4489 && IN_RANGE (multiple, 0, 4095));
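
/* A quick numeric check of the predicates above: with MODE == DImode
   (8-byte units), offset_4bit_signed_scaled_p accepts -64..56,
   offset_6bit_unsigned_scaled_p accepts 0..504,
   aarch64_offset_7bit_signed_scaled_p accepts -512..504,
   offset_9bit_signed_unscaled_p accepts any byte offset in -256..255,
   and offset_12bit_unsigned_scaled_p accepts 0..32760, all scaled
   ranges in steps of 8.  */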
4492 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4494 static sbitmap
4495 aarch64_get_separate_components (void)
4497 aarch64_layout_frame ();
4499 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4500 bitmap_clear (components);
4502 /* The registers we need saved to the frame. */
4503 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4504 if (aarch64_register_saved_on_entry (regno))
4506 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4507 if (!frame_pointer_needed)
4508 offset += cfun->machine->frame.frame_size
4509 - cfun->machine->frame.hard_fp_offset;
4510 /* Check that we can access the stack slot of the register with one
4511 direct load with no adjustments needed. */
4512 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4513 bitmap_set_bit (components, regno);
4516 /* Don't mess with the hard frame pointer. */
4517 if (frame_pointer_needed)
4518 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4520 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4521 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4522 /* If aarch64_layout_frame has chosen registers to store/restore with
4523 writeback, don't interfere with them, to avoid having to output explicit
4524 stack adjustment instructions. */
4525 if (reg2 != INVALID_REGNUM)
4526 bitmap_clear_bit (components, reg2);
4527 if (reg1 != INVALID_REGNUM)
4528 bitmap_clear_bit (components, reg1);
4530 bitmap_clear_bit (components, LR_REGNUM);
4531 bitmap_clear_bit (components, SP_REGNUM);
4533 return components;
4536 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4538 static sbitmap
4539 aarch64_components_for_bb (basic_block bb)
4541 bitmap in = DF_LIVE_IN (bb);
4542 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4543 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4545 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4546 bitmap_clear (components);
4548 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4549 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4550 if ((!call_used_regs[regno])
4551 && (bitmap_bit_p (in, regno)
4552 || bitmap_bit_p (gen, regno)
4553 || bitmap_bit_p (kill, regno)))
4555 unsigned regno2, offset, offset2;
4556 bitmap_set_bit (components, regno);
4558 /* If there is a callee-save at an adjacent offset, add it as well,
4559 to increase the chance of using LDP/STP. */
4560 offset = cfun->machine->frame.reg_offset[regno];
4561 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4563 if (regno2 <= LAST_SAVED_REGNUM)
4565 offset2 = cfun->machine->frame.reg_offset[regno2];
4566 if ((offset & ~8) == (offset2 & ~8))
4567 bitmap_set_bit (components, regno2);
4571 return components;
4574 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4575 Nothing to do for aarch64. */
4577 static void
4578 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4582 /* Return the next set bit in BMP from START onwards. Return the total number
4583 of bits in BMP if no set bit is found at or after START. */
4585 static unsigned int
4586 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4588 unsigned int nbits = SBITMAP_SIZE (bmp);
4589 if (start == nbits)
4590 return start;
4592 gcc_assert (start < nbits);
4593 for (unsigned int i = start; i < nbits; i++)
4594 if (bitmap_bit_p (bmp, i))
4595 return i;
4597 return nbits;
4600 /* Do the work for aarch64_emit_prologue_components and
4601 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4602 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4603 for these components or the epilogue sequence. That is, it determines
4604 whether we should emit stores or loads and what kind of CFA notes to attach
4605 to the insns. Otherwise the logic for the two sequences is very
4606 similar. */
4608 static void
4609 aarch64_process_components (sbitmap components, bool prologue_p)
4611 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4612 ? HARD_FRAME_POINTER_REGNUM
4613 : STACK_POINTER_REGNUM);
4615 unsigned last_regno = SBITMAP_SIZE (components);
4616 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4617 rtx_insn *insn = NULL;
4619 while (regno != last_regno)
4621 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
4622 so DFmode for the vector registers is enough. */
4623 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4624 rtx reg = gen_rtx_REG (mode, regno);
4625 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4626 if (!frame_pointer_needed)
4627 offset += cfun->machine->frame.frame_size
4628 - cfun->machine->frame.hard_fp_offset;
4629 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4630 rtx mem = gen_frame_mem (mode, addr);
4632 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4633 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4634 /* No more registers to handle after REGNO.
4635 Emit a single save/restore and exit. */
4636 if (regno2 == last_regno)
4638 insn = emit_insn (set);
4639 RTX_FRAME_RELATED_P (insn) = 1;
4640 if (prologue_p)
4641 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4642 else
4643 add_reg_note (insn, REG_CFA_RESTORE, reg);
4644 break;
4647 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4648 /* The next register is not of the same class or its offset is not
4649 mergeable with the current one into a pair. */
4650 if (!satisfies_constraint_Ump (mem)
4651 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4652 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4653 GET_MODE_SIZE (mode)))
4655 insn = emit_insn (set);
4656 RTX_FRAME_RELATED_P (insn) = 1;
4657 if (prologue_p)
4658 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4659 else
4660 add_reg_note (insn, REG_CFA_RESTORE, reg);
4662 regno = regno2;
4663 continue;
4666 /* REGNO2 can be saved/restored in a pair with REGNO. */
4667 rtx reg2 = gen_rtx_REG (mode, regno2);
4668 if (!frame_pointer_needed)
4669 offset2 += cfun->machine->frame.frame_size
4670 - cfun->machine->frame.hard_fp_offset;
4671 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4672 rtx mem2 = gen_frame_mem (mode, addr2);
4673 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4674 : gen_rtx_SET (reg2, mem2);
4676 if (prologue_p)
4677 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4678 else
4679 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4681 RTX_FRAME_RELATED_P (insn) = 1;
4682 if (prologue_p)
4684 add_reg_note (insn, REG_CFA_OFFSET, set);
4685 add_reg_note (insn, REG_CFA_OFFSET, set2);
4687 else
4689 add_reg_note (insn, REG_CFA_RESTORE, reg);
4690 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4693 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4697 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4699 static void
4700 aarch64_emit_prologue_components (sbitmap components)
4702 aarch64_process_components (components, true);
4705 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4707 static void
4708 aarch64_emit_epilogue_components (sbitmap components)
4710 aarch64_process_components (components, false);
4713 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4715 static void
4716 aarch64_set_handled_components (sbitmap components)
4718 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4719 if (bitmap_bit_p (components, regno))
4720 cfun->machine->reg_is_wrapped_separately[regno] = true;
4723 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4724 is saved at BASE + OFFSET. */
4726 static void
4727 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4728 rtx base, poly_int64 offset)
4730 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4731 add_reg_note (insn, REG_CFA_EXPRESSION,
4732 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4735 /* AArch64 stack frames generated by this compiler look like:
4737 +-------------------------------+
4739 | incoming stack arguments |
4741 +-------------------------------+
4742 | | <-- incoming stack pointer (aligned)
4743 | callee-allocated save area |
4744 | for register varargs |
4746 +-------------------------------+
4747 | local variables | <-- frame_pointer_rtx
4749 +-------------------------------+
4750 | padding0 | \
4751 +-------------------------------+ |
4752 | callee-saved registers | | frame.saved_regs_size
4753 +-------------------------------+ |
4754 | LR' | |
4755 +-------------------------------+ |
4756 | FP' | / <- hard_frame_pointer_rtx (aligned)
4757 +-------------------------------+
4758 | dynamic allocation |
4759 +-------------------------------+
4760 | padding |
4761 +-------------------------------+
4762 | outgoing stack arguments | <-- arg_pointer
4764 +-------------------------------+
4765 | | <-- stack_pointer_rtx (aligned)
4767 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4768 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4769 unchanged. */
4771 /* Generate the prologue instructions for entry into a function.
4772 Establish the stack frame by decreasing the stack pointer with a
4773 properly calculated size and, if necessary, create a frame record
4774 filled with the values of LR and previous frame pointer. The
4775 current FP is also set up if it is in use. */
4777 void
4778 aarch64_expand_prologue (void)
4780 aarch64_layout_frame ();
4782 poly_int64 frame_size = cfun->machine->frame.frame_size;
4783 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4784 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4785 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4786 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4787 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4788 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4789 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4790 rtx_insn *insn;
4792 /* Sign return address for functions. */
4793 if (aarch64_return_address_signing_enabled ())
4795 insn = emit_insn (gen_pacisp ());
4796 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4797 RTX_FRAME_RELATED_P (insn) = 1;
4800 if (flag_stack_usage_info)
4801 current_function_static_stack_size = constant_lower_bound (frame_size);
4803 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4805 if (crtl->is_leaf && !cfun->calls_alloca)
4807 if (maybe_gt (frame_size, PROBE_INTERVAL)
4808 && maybe_gt (frame_size, get_stack_check_protect ()))
4809 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4810 (frame_size
4811 - get_stack_check_protect ()));
4813 else if (maybe_gt (frame_size, 0))
4814 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4817 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4818 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4820 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4822 if (callee_adjust != 0)
4823 aarch64_push_regs (reg1, reg2, callee_adjust);
4825 if (emit_frame_chain)
4827 poly_int64 reg_offset = callee_adjust;
4828 if (callee_adjust == 0)
4830 reg1 = R29_REGNUM;
4831 reg2 = R30_REGNUM;
4832 reg_offset = callee_offset;
4833 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4835 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4836 stack_pointer_rtx, callee_offset,
4837 ip1_rtx, ip0_rtx, frame_pointer_needed);
4838 if (frame_pointer_needed && !frame_size.is_constant ())
4840 /* Variable-sized frames need to describe the save slot
4841 address using DW_CFA_expression rather than DW_CFA_offset.
4842 This means that, without taking further action, the
4843 locations of the registers that we've already saved would
4844 remain based on the stack pointer even after we redefine
4845 the CFA based on the frame pointer. We therefore need new
4846 DW_CFA_expressions to re-express the save slots with addresses
4847 based on the frame pointer. */
4848 rtx_insn *insn = get_last_insn ();
4849 gcc_assert (RTX_FRAME_RELATED_P (insn));
4851 /* Add an explicit CFA definition if this was previously
4852 implicit. */
4853 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4855 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4856 callee_offset);
4857 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4858 gen_rtx_SET (hard_frame_pointer_rtx, src));
4861 /* Change the save slot expressions for the registers that
4862 we've already saved. */
4863 reg_offset -= callee_offset;
4864 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4865 reg_offset + UNITS_PER_WORD);
4866 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4867 reg_offset);
4869 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4872 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4873 callee_adjust != 0 || emit_frame_chain);
4874 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4875 callee_adjust != 0 || emit_frame_chain);
4876 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
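
/* As an illustration only: for the simple frame case described in
   aarch64_layout_frame (no outgoing arguments, callee_adjust == frame_size,
   frame chain needed), the expansion above typically produces something like
	stp	x29, x30, [sp, -frame_size]!
	mov	x29, sp
	stp	x19, x20, [sp, 16]
   with initial_adjust and final_adjust both zero, so no separate
   sub-sp instructions are emitted.  */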
4879 /* Return TRUE if we can use a simple_return insn.
4881 This function checks whether the callee-saved stack is empty, which
4882 means that no restore actions are needed. The pro_and_epilogue pass will
4883 use this to check whether the shrink-wrapping optimization is feasible. */
4885 bool
4886 aarch64_use_return_insn_p (void)
4888 if (!reload_completed)
4889 return false;
4891 if (crtl->profile)
4892 return false;
4894 aarch64_layout_frame ();
4896 return known_eq (cfun->machine->frame.frame_size, 0);
4899 /* Generate the epilogue instructions for returning from a function.
4900 This is almost exactly the reverse of the prolog sequence, except
4901 that we need to insert barriers to avoid scheduling loads that read
4902 from a deallocated stack, and we optimize the unwind records by
4903 emitting them all together if possible. */
4904 void
4905 aarch64_expand_epilogue (bool for_sibcall)
4907 aarch64_layout_frame ();
4909 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4910 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4911 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4912 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4913 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4914 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4915 rtx cfi_ops = NULL;
4916 rtx_insn *insn;
4917 /* A stack clash protection prologue may not have left IP0_REGNUM or
4918 IP1_REGNUM in a usable state. The same is true for allocations
4919 with an SVE component, since we then need both temporary registers
4920 for each allocation. */
4921 bool can_inherit_p = (initial_adjust.is_constant ()
4922 && final_adjust.is_constant ()
4923 && !flag_stack_clash_protection);
4925 /* We need a memory barrier to prevent reads from the deallocated stack. */
4926 bool need_barrier_p
4927 = maybe_ne (get_frame_size ()
4928 + cfun->machine->frame.saved_varargs_size, 0);
4930 /* Emit a barrier to prevent loads from a deallocated stack. */
4931 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4932 || cfun->calls_alloca
4933 || crtl->calls_eh_return)
4935 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4936 need_barrier_p = false;
4939 /* Restore the stack pointer from the frame pointer if it may not
4940 be the same as the stack pointer. */
4941 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4942 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4943 if (frame_pointer_needed
4944 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4945 /* If writeback is used when restoring callee-saves, the CFA
4946 is restored on the instruction doing the writeback. */
4947 aarch64_add_offset (Pmode, stack_pointer_rtx,
4948 hard_frame_pointer_rtx, -callee_offset,
4949 ip1_rtx, ip0_rtx, callee_adjust == 0);
4950 else
4951 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4952 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4954 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4955 callee_adjust != 0, &cfi_ops);
4956 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4957 callee_adjust != 0, &cfi_ops);
4959 if (need_barrier_p)
4960 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4962 if (callee_adjust != 0)
4963 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4965 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4967 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4968 insn = get_last_insn ();
4969 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4970 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4971 RTX_FRAME_RELATED_P (insn) = 1;
4972 cfi_ops = NULL;
4975 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4976 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4978 if (cfi_ops)
4980 /* Emit delayed restores and reset the CFA to be SP. */
4981 insn = get_last_insn ();
4982 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
4983 REG_NOTES (insn) = cfi_ops;
4984 RTX_FRAME_RELATED_P (insn) = 1;
4987 /* We prefer to emit the combined return/authenticate instruction RETAA,
4988 however there are three cases in which we must instead emit an explicit
4989 authentication instruction.
4991 1) Sibcalls don't return in a normal way, so if we're about to call one
4992 we must authenticate.
4994 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
4995 generating code for !TARGET_ARMV8_3 we can't use it and must
4996 explicitly authenticate.
4998 3) On an eh_return path we make extra stack adjustments to update the
4999 canonical frame address to be the exception handler's CFA. We want
5000 to authenticate using the CFA of the function which calls eh_return.
5002 if (aarch64_return_address_signing_enabled ()
5003 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5005 insn = emit_insn (gen_autisp ());
5006 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5007 RTX_FRAME_RELATED_P (insn) = 1;
5010 /* Stack adjustment for exception handler. */
5011 if (crtl->calls_eh_return)
5013 /* We need to unwind the stack by the offset computed by
5014 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5015 to be SP; letting the CFA move during this adjustment
5016 is just as correct as retaining the CFA from the body
5017 of the function. Therefore, do nothing special. */
5018 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5021 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5022 if (!for_sibcall)
5023 emit_jump_insn (ret_rtx);
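
/* Again purely as an illustration, the matching epilogue for the simple
   frame sketched before aarch64_use_return_insn_p would typically be
	ldp	x19, x20, [sp, 16]
	ldp	x29, x30, [sp], frame_size
	ret
   i.e. the reverse of the prologue, with the write-back pop also
   deallocating the frame.  */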
5026 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5027 normally or return to a previous frame after unwinding.
5029 An EH return uses a single shared return sequence. The epilogue is
5030 exactly like a normal epilogue except that it has an extra input
5031 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5032 that must be applied after the frame has been destroyed. An extra label
5033 is inserted before the epilogue which initializes this register to zero,
5034 and this is the entry point for a normal return.
5036 An actual EH return updates the return address, initializes the stack
5037 adjustment and jumps directly into the epilogue (bypassing the zeroing
5038 of the adjustment). Since the return address is typically saved on the
5039 stack when a function makes a call, the saved LR must be updated outside
5040 the epilogue.
5042 This poses problems as the store is generated well before the epilogue,
5043 so the offset of LR is not known yet. Also optimizations will remove the
5044 store as it appears dead, even after the epilogue is generated (as the
5045 base or offset for loading LR is different in many cases).
5047 To avoid these problems this implementation forces the frame pointer
5048 in eh_return functions so that the location of LR is fixed and known early.
5049 It also marks the store volatile, so no optimization is permitted to
5050 remove the store. */
5052 aarch64_eh_return_handler_rtx (void)
5054 rtx tmp = gen_frame_mem (Pmode,
5055 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5057 /* Mark the store volatile, so no optimization is permitted to remove it. */
5058 MEM_VOLATILE_P (tmp) = true;
5059 return tmp;
5062 /* Output code to add DELTA to the first argument, and then jump
5063 to FUNCTION. Used for C++ multiple inheritance. */
5064 static void
5065 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5066 HOST_WIDE_INT delta,
5067 HOST_WIDE_INT vcall_offset,
5068 tree function)
5070 /* The this pointer is always in x0. Note that this differs from
5071 Arm where the this pointer may be bumped to r1 if r0 is required
5072 to return a pointer to an aggregate. On AArch64 a result value
5073 pointer will be in x8. */
5074 int this_regno = R0_REGNUM;
5075 rtx this_rtx, temp0, temp1, addr, funexp;
5076 rtx_insn *insn;
5078 reload_completed = 1;
5079 emit_note (NOTE_INSN_PROLOGUE_END);
5081 this_rtx = gen_rtx_REG (Pmode, this_regno);
5082 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5083 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5085 if (vcall_offset == 0)
5086 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5087 else
5089 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5091 addr = this_rtx;
5092 if (delta != 0)
5094 if (delta >= -256 && delta < 256)
5095 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5096 plus_constant (Pmode, this_rtx, delta));
5097 else
5098 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5099 temp1, temp0, false);
5102 if (Pmode == ptr_mode)
5103 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5104 else
5105 aarch64_emit_move (temp0,
5106 gen_rtx_ZERO_EXTEND (Pmode,
5107 gen_rtx_MEM (ptr_mode, addr)));
5109 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5110 addr = plus_constant (Pmode, temp0, vcall_offset);
5111 else
5113 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5114 Pmode);
5115 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5118 if (Pmode == ptr_mode)
5119 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5120 else
5121 aarch64_emit_move (temp1,
5122 gen_rtx_SIGN_EXTEND (Pmode,
5123 gen_rtx_MEM (ptr_mode, addr)));
5125 emit_insn (gen_add2_insn (this_rtx, temp1));
5128 /* Generate a tail call to the target function. */
5129 if (!TREE_USED (function))
5131 assemble_external (function);
5132 TREE_USED (function) = 1;
5134 funexp = XEXP (DECL_RTL (function), 0);
5135 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5136 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5137 SIBLING_CALL_P (insn) = 1;
5139 insn = get_insns ();
5140 shorten_branches (insn);
5141 final_start_function (insn, file, 1);
5142 final (insn, file, 1);
5143 final_end_function ();
5145 /* Stop pretending to be a post-reload pass. */
5146 reload_completed = 0;
5149 static bool
5150 aarch64_tls_referenced_p (rtx x)
5152 if (!TARGET_HAVE_TLS)
5153 return false;
5154 subrtx_iterator::array_type array;
5155 FOR_EACH_SUBRTX (iter, array, x, ALL)
5157 const_rtx x = *iter;
5158 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5159 return true;
5160 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5161 TLS offsets, not real symbol references. */
5162 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5163 iter.skip_subrtxes ();
5165 return false;
5169 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5170 a left shift of 0 or 12 bits. */
5171 bool
5172 aarch64_uimm12_shift (HOST_WIDE_INT val)
5174 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5175 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
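
/* For example, 0xabc and 0xabc000 satisfy the test above (they fit in
   bits 0-11 or bits 12-23 respectively), whereas 0xabc00 does not,
   since its set bits straddle the two 12-bit fields.  */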
5180 /* Return true if val is an immediate that can be loaded into a
5181 register by a MOVZ instruction. */
5182 static bool
5183 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5185 if (GET_MODE_SIZE (mode) > 4)
5187 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5188 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5189 return 1;
5191 else
5193 /* Ignore sign extension. */
5194 val &= (HOST_WIDE_INT) 0xffffffff;
5196 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5197 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5200 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5201 64-bit (DImode) integer. */
5203 static unsigned HOST_WIDE_INT
5204 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5206 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5207 while (size < 64)
5209 val &= (HOST_WIDE_INT_1U << size) - 1;
5210 val |= val << size;
5211 size *= 2;
5213 return val;
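
/* For instance, replicating the QImode value 0xa5 yields
   0xa5a5a5a5a5a5a5a5, and replicating the HImode value 0x00ff yields
   0x00ff00ff00ff00ff.  */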
5216 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5218 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5220 0x0000000100000001ull,
5221 0x0001000100010001ull,
5222 0x0101010101010101ull,
5223 0x1111111111111111ull,
5224 0x5555555555555555ull,
5228 /* Return true if val is a valid bitmask immediate. */
5230 bool
5231 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5233 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5234 int bits;
5236 /* Check for a single sequence of one bits and return quickly if so.
5237 The special cases of all ones and all zeroes return false. */
5238 val = aarch64_replicate_bitmask_imm (val_in, mode);
5239 tmp = val + (val & -val);
5241 if (tmp == (tmp & -tmp))
5242 return (val + 1) > 1;
5244 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5245 if (mode == SImode)
5246 val = (val << 32) | (val & 0xffffffff);
5248 /* Invert if the immediate doesn't start with a zero bit - this means we
5249 only need to search for sequences of one bits. */
5250 if (val & 1)
5251 val = ~val;
5253 /* Find the first set bit and set tmp to val with the first sequence of one
5254 bits removed. Return success if there is a single sequence of ones. */
5255 first_one = val & -val;
5256 tmp = val & (val + first_one);
5258 if (tmp == 0)
5259 return true;
5261 /* Find the next set bit and compute the difference in bit position. */
5262 next_one = tmp & -tmp;
5263 bits = clz_hwi (first_one) - clz_hwi (next_one);
5264 mask = val ^ tmp;
5266 /* Check the bit position difference is a power of 2, and that the first
5267 sequence of one bits fits within 'bits' bits. */
5268 if ((mask >> bits) != 0 || bits != (bits & -bits))
5269 return false;
5271 /* Check the sequence of one bits is repeated 64/bits times. */
5272 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
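
/* A short worked example of the fast path above: for val == 0x3c (a single
   run of four ones), val & -val == 0x4, so tmp == 0x40, which is a power of
   two, and (val + 1) > 1 holds, so the value is accepted.  For val == 0 the
   same test yields (0 + 1) > 1 == false, and for val == ~0 the addition
   wraps tmp to 0 and (~0 + 1) > 1 == false, so the all-zeros and all-ones
   cases are rejected as documented.  */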
5275 /* Create a mask of ones covering the range from the lowest to the highest
5276 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
5278 unsigned HOST_WIDE_INT
5279 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5281 int lowest_bit_set = ctz_hwi (val_in);
5282 int highest_bit_set = floor_log2 (val_in);
5283 gcc_assert (val_in != 0);
5285 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5286 (HOST_WIDE_INT_1U << lowest_bit_set));
5289 /* Create a constant in which all bits outside the range from the lowest set
5290 bit to the highest set bit of VAL_IN are set to 1. */
5292 unsigned HOST_WIDE_INT
5293 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5295 return val_in | ~aarch64_and_split_imm1 (val_in);
5298 /* Return true if the AND of a register with VAL_IN can be performed as two AND instructions with valid bitmask immediates (see the split helpers above). */
5300 bool
5301 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5303 scalar_int_mode int_mode;
5304 if (!is_a <scalar_int_mode> (mode, &int_mode))
5305 return false;
5307 if (aarch64_bitmask_imm (val_in, int_mode))
5308 return false;
5310 if (aarch64_move_imm (val_in, int_mode))
5311 return false;
5313 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5315 return aarch64_bitmask_imm (imm2, int_mode);
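
/* An illustrative example of the split: for VAL_IN == 0x0ff000f0 (SImode),
   which is neither a bitmask nor a MOV immediate, aarch64_and_split_imm1
   gives 0x0ffffff0 (a single run of ones) and aarch64_and_split_imm2 gives
   a value whose low 32 bits are 0xfff000ff (a rotated run of ones).  Since
   imm1 & imm2 == VAL_IN by construction, "x & VAL_IN" can be done as two
   AND-immediate instructions.  */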
5318 /* Return true if val is an immediate that can be loaded into a
5319 register in a single instruction. */
5320 bool
5321 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5323 scalar_int_mode int_mode;
5324 if (!is_a <scalar_int_mode> (mode, &int_mode))
5325 return false;
5327 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5328 return 1;
5329 return aarch64_bitmask_imm (val, int_mode);
5332 static bool
5333 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5335 rtx base, offset;
5337 if (GET_CODE (x) == HIGH)
5338 return true;
5340 /* There's no way to calculate VL-based values using relocations. */
5341 subrtx_iterator::array_type array;
5342 FOR_EACH_SUBRTX (iter, array, x, ALL)
5343 if (GET_CODE (*iter) == CONST_POLY_INT)
5344 return true;
5346 split_const (x, &base, &offset);
5347 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5349 if (aarch64_classify_symbol (base, INTVAL (offset))
5350 != SYMBOL_FORCE_TO_MEM)
5351 return true;
5352 else
5353 /* Avoid generating a 64-bit relocation in ILP32; leave it
5354 to aarch64_expand_mov_immediate to handle properly. */
5355 return mode != ptr_mode;
5358 return aarch64_tls_referenced_p (x);
5361 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5362 The expansion for a table switch is quite expensive due to the number
5363 of instructions, the table lookup and the hard-to-predict indirect jump.
5364 When optimizing for speed at -O3 and above, use the per-core tuning if
5365 set; otherwise use tables for more than 16 cases as a tradeoff between size and
5366 performance. When optimizing for size, use the default setting. */
5368 static unsigned int
5369 aarch64_case_values_threshold (void)
5371 /* Use the specified limit for the number of cases before using jump
5372 tables at higher optimization levels. */
5373 if (optimize > 2
5374 && selected_cpu->tune->max_case_values != 0)
5375 return selected_cpu->tune->max_case_values;
5376 else
5377 return optimize_size ? default_case_values_threshold () : 17;
5380 /* Return true if register REGNO is a valid index register.
5381 STRICT_P is true if REG_OK_STRICT is in effect. */
5383 bool
5384 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5386 if (!HARD_REGISTER_NUM_P (regno))
5388 if (!strict_p)
5389 return true;
5391 if (!reg_renumber)
5392 return false;
5394 regno = reg_renumber[regno];
5396 return GP_REGNUM_P (regno);
5399 /* Return true if register REGNO is a valid base register.
5400 STRICT_P is true if REG_OK_STRICT is in effect. */
5402 bool
5403 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5405 if (!HARD_REGISTER_NUM_P (regno))
5407 if (!strict_p)
5408 return true;
5410 if (!reg_renumber)
5411 return false;
5413 regno = reg_renumber[regno];
5416 /* The fake registers will be eliminated to either the stack or
5417 hard frame pointer, both of which are usually valid base registers.
5418 Reload deals with the cases where the eliminated form isn't valid. */
5419 return (GP_REGNUM_P (regno)
5420 || regno == SP_REGNUM
5421 || regno == FRAME_POINTER_REGNUM
5422 || regno == ARG_POINTER_REGNUM);
5425 /* Return true if X is a valid base register.
5426 STRICT_P is true if REG_OK_STRICT is in effect. */
5428 static bool
5429 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5431 if (!strict_p
5432 && GET_CODE (x) == SUBREG
5433 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5434 x = SUBREG_REG (x);
5436 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5439 /* Return true if address offset is a valid index. If it is, fill in INFO
5440 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5442 static bool
5443 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5444 machine_mode mode, bool strict_p)
5446 enum aarch64_address_type type;
5447 rtx index;
5448 int shift;
5450 /* (reg:P) */
5451 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5452 && GET_MODE (x) == Pmode)
5454 type = ADDRESS_REG_REG;
5455 index = x;
5456 shift = 0;
5458 /* (sign_extend:DI (reg:SI)) */
5459 else if ((GET_CODE (x) == SIGN_EXTEND
5460 || GET_CODE (x) == ZERO_EXTEND)
5461 && GET_MODE (x) == DImode
5462 && GET_MODE (XEXP (x, 0)) == SImode)
5464 type = (GET_CODE (x) == SIGN_EXTEND)
5465 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5466 index = XEXP (x, 0);
5467 shift = 0;
5469 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5470 else if (GET_CODE (x) == MULT
5471 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5472 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5473 && GET_MODE (XEXP (x, 0)) == DImode
5474 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5475 && CONST_INT_P (XEXP (x, 1)))
5477 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5478 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5479 index = XEXP (XEXP (x, 0), 0);
5480 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5482 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5483 else if (GET_CODE (x) == ASHIFT
5484 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5485 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5486 && GET_MODE (XEXP (x, 0)) == DImode
5487 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5488 && CONST_INT_P (XEXP (x, 1)))
5490 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5491 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5492 index = XEXP (XEXP (x, 0), 0);
5493 shift = INTVAL (XEXP (x, 1));
5495 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5496 else if ((GET_CODE (x) == SIGN_EXTRACT
5497 || GET_CODE (x) == ZERO_EXTRACT)
5498 && GET_MODE (x) == DImode
5499 && GET_CODE (XEXP (x, 0)) == MULT
5500 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5501 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5503 type = (GET_CODE (x) == SIGN_EXTRACT)
5504 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5505 index = XEXP (XEXP (x, 0), 0);
5506 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5507 if (INTVAL (XEXP (x, 1)) != 32 + shift
5508 || INTVAL (XEXP (x, 2)) != 0)
5509 shift = -1;
5511 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5512 (const_int 0xffffffff<<shift)) */
5513 else if (GET_CODE (x) == AND
5514 && GET_MODE (x) == DImode
5515 && GET_CODE (XEXP (x, 0)) == MULT
5516 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5517 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5518 && CONST_INT_P (XEXP (x, 1)))
5520 type = ADDRESS_REG_UXTW;
5521 index = XEXP (XEXP (x, 0), 0);
5522 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5523 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5524 shift = -1;
5526 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5527 else if ((GET_CODE (x) == SIGN_EXTRACT
5528 || GET_CODE (x) == ZERO_EXTRACT)
5529 && GET_MODE (x) == DImode
5530 && GET_CODE (XEXP (x, 0)) == ASHIFT
5531 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5532 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5534 type = (GET_CODE (x) == SIGN_EXTRACT)
5535 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5536 index = XEXP (XEXP (x, 0), 0);
5537 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5538 if (INTVAL (XEXP (x, 1)) != 32 + shift
5539 || INTVAL (XEXP (x, 2)) != 0)
5540 shift = -1;
5542 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5543 (const_int 0xffffffff<<shift)) */
5544 else if (GET_CODE (x) == AND
5545 && GET_MODE (x) == DImode
5546 && GET_CODE (XEXP (x, 0)) == ASHIFT
5547 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5548 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5549 && CONST_INT_P (XEXP (x, 1)))
5551 type = ADDRESS_REG_UXTW;
5552 index = XEXP (XEXP (x, 0), 0);
5553 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5554 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5555 shift = -1;
5557 /* (mult:P (reg:P) (const_int scale)) */
5558 else if (GET_CODE (x) == MULT
5559 && GET_MODE (x) == Pmode
5560 && GET_MODE (XEXP (x, 0)) == Pmode
5561 && CONST_INT_P (XEXP (x, 1)))
5563 type = ADDRESS_REG_REG;
5564 index = XEXP (x, 0);
5565 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5567 /* (ashift:P (reg:P) (const_int shift)) */
5568 else if (GET_CODE (x) == ASHIFT
5569 && GET_MODE (x) == Pmode
5570 && GET_MODE (XEXP (x, 0)) == Pmode
5571 && CONST_INT_P (XEXP (x, 1)))
5573 type = ADDRESS_REG_REG;
5574 index = XEXP (x, 0);
5575 shift = INTVAL (XEXP (x, 1));
5577 else
5578 return false;
5580 if (!strict_p
5581 && GET_CODE (index) == SUBREG
5582 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5583 index = SUBREG_REG (index);
5585 if (aarch64_sve_data_mode_p (mode))
5587 if (type != ADDRESS_REG_REG
5588 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5589 return false;
5591 else
5593 if (shift != 0
5594 && !(IN_RANGE (shift, 1, 3)
5595 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5596 return false;
5599 if (REG_P (index)
5600 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5602 info->type = type;
5603 info->offset = index;
5604 info->shift = shift;
5605 return true;
5608 return false;
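/* For example (illustrative): the index rtx
   (mult:DI (sign_extend:DI (reg:SI)) (const_int 4)) used in an SImode access
   is classified above as ADDRESS_REG_SXTW with shift == 2, matching the
   [Xn, Wm, SXTW #2] addressing form.  */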
5611 /* Return true if MODE is one of the modes for which we
5612 support LDP/STP operations. */
5614 static bool
5615 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5617 return mode == SImode || mode == DImode
5618 || mode == SFmode || mode == DFmode
5619 || (aarch64_vector_mode_supported_p (mode)
5620 && known_eq (GET_MODE_SIZE (mode), 8));
5623 /* Return true if REGNO is a virtual pointer register, or an eliminable
5624 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5625 include stack_pointer or hard_frame_pointer. */
5626 static bool
5627 virt_or_elim_regno_p (unsigned regno)
5629 return ((regno >= FIRST_VIRTUAL_REGISTER
5630 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5631 || regno == FRAME_POINTER_REGNUM
5632 || regno == ARG_POINTER_REGNUM);
5635 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5636 If it is, fill in INFO appropriately. STRICT_P is true if
5637 REG_OK_STRICT is in effect. */
5639 static bool
5640 aarch64_classify_address (struct aarch64_address_info *info,
5641 rtx x, machine_mode mode, bool strict_p,
5642 aarch64_addr_query_type type = ADDR_QUERY_M)
5644 enum rtx_code code = GET_CODE (x);
5645 rtx op0, op1;
5646 poly_int64 offset;
5648 HOST_WIDE_INT const_size;
5650 /* On BE, we use load/store pair for all large int mode load/stores.
5651 TI/TFmode may also use a load/store pair. */
5652 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5653 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5654 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5655 || mode == TImode
5656 || mode == TFmode
5657 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5659 bool allow_reg_index_p = (!load_store_pair_p
5660 && (known_lt (GET_MODE_SIZE (mode), 16)
5661 || vec_flags == VEC_ADVSIMD
5662 || vec_flags == VEC_SVE_DATA));
5664 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5665 [Rn, #offset, MUL VL]. */
5666 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5667 && (code != REG && code != PLUS))
5668 return false;
5670 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5671 REG addressing. */
5672 if (advsimd_struct_p
5673 && !BYTES_BIG_ENDIAN
5674 && (code != POST_INC && code != REG))
5675 return false;
5677 gcc_checking_assert (GET_MODE (x) == VOIDmode
5678 || SCALAR_INT_MODE_P (GET_MODE (x)));
5680 switch (code)
5682 case REG:
5683 case SUBREG:
5684 info->type = ADDRESS_REG_IMM;
5685 info->base = x;
5686 info->offset = const0_rtx;
5687 info->const_offset = 0;
5688 return aarch64_base_register_rtx_p (x, strict_p);
5690 case PLUS:
5691 op0 = XEXP (x, 0);
5692 op1 = XEXP (x, 1);
5694 if (! strict_p
5695 && REG_P (op0)
5696 && virt_or_elim_regno_p (REGNO (op0))
5697 && poly_int_rtx_p (op1, &offset))
5699 info->type = ADDRESS_REG_IMM;
5700 info->base = op0;
5701 info->offset = op1;
5702 info->const_offset = offset;
5704 return true;
5707 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5708 && aarch64_base_register_rtx_p (op0, strict_p)
5709 && poly_int_rtx_p (op1, &offset))
5711 info->type = ADDRESS_REG_IMM;
5712 info->base = op0;
5713 info->offset = op1;
5714 info->const_offset = offset;
5716 /* TImode and TFmode values are allowed in both pairs of X
5717 registers and individual Q registers. The available
5718 address modes are:
5719 X,X: 7-bit signed scaled offset
5720 Q: 9-bit signed offset
5721 We conservatively require an offset representable in either mode.
5722 When performing the check for pairs of X registers i.e. LDP/STP
5723 pass down DImode since that is the natural size of the LDP/STP
5724 instruction memory accesses. */
5725 if (mode == TImode || mode == TFmode)
5726 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5727 && (offset_9bit_signed_unscaled_p (mode, offset)
5728 || offset_12bit_unsigned_scaled_p (mode, offset)));
5730 /* A 7-bit offset check because OImode will emit an ldp/stp
5731 instruction (only big endian will get here).
5732 For ldp/stp instructions, the offset is scaled for the size of a
5733 single element of the pair. */
5734 if (mode == OImode)
5735 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5737 /* Three 9/12-bit offset checks because CImode will emit three
5738 ldr/str instructions (only big endian will get here). */
5739 if (mode == CImode)
5740 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5741 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5742 || offset_12bit_unsigned_scaled_p (V16QImode,
5743 offset + 32)));
5745 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5746 instructions (only big endian will get here). */
5747 if (mode == XImode)
5748 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5749 && aarch64_offset_7bit_signed_scaled_p (TImode,
5750 offset + 32));
5752 /* Make "m" use the LD1 offset range for SVE data modes, so
5753 that pre-RTL optimizers like ivopts will optimize for that range
5754 instead of the wider LDR/STR range. */
5755 if (vec_flags == VEC_SVE_DATA)
5756 return (type == ADDR_QUERY_M
5757 ? offset_4bit_signed_scaled_p (mode, offset)
5758 : offset_9bit_signed_scaled_p (mode, offset));
5760 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5762 poly_int64 end_offset = (offset
5763 + GET_MODE_SIZE (mode)
5764 - BYTES_PER_SVE_VECTOR);
5765 return (type == ADDR_QUERY_M
5766 ? offset_4bit_signed_scaled_p (mode, offset)
5767 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5768 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5769 end_offset)));
5772 if (vec_flags == VEC_SVE_PRED)
5773 return offset_9bit_signed_scaled_p (mode, offset);
5775 if (load_store_pair_p)
5776 return ((known_eq (GET_MODE_SIZE (mode), 4)
5777 || known_eq (GET_MODE_SIZE (mode), 8))
5778 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5779 else
5780 return (offset_9bit_signed_unscaled_p (mode, offset)
5781 || offset_12bit_unsigned_scaled_p (mode, offset));
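/* For example, for a DImode access the unscaled 9-bit form accepts offsets
   in [-256, 255] (LDUR/STUR), while the scaled 12-bit form accepts multiples
   of 8 in [0, 32760] (LDR/STR with an unsigned immediate).  */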
5784 if (allow_reg_index_p)
5786 /* Look for base + (scaled/extended) index register. */
5787 if (aarch64_base_register_rtx_p (op0, strict_p)
5788 && aarch64_classify_index (info, op1, mode, strict_p))
5790 info->base = op0;
5791 return true;
5793 if (aarch64_base_register_rtx_p (op1, strict_p)
5794 && aarch64_classify_index (info, op0, mode, strict_p))
5796 info->base = op1;
5797 return true;
5801 return false;
5803 case POST_INC:
5804 case POST_DEC:
5805 case PRE_INC:
5806 case PRE_DEC:
5807 info->type = ADDRESS_REG_WB;
5808 info->base = XEXP (x, 0);
5809 info->offset = NULL_RTX;
5810 return aarch64_base_register_rtx_p (info->base, strict_p);
5812 case POST_MODIFY:
5813 case PRE_MODIFY:
5814 info->type = ADDRESS_REG_WB;
5815 info->base = XEXP (x, 0);
5816 if (GET_CODE (XEXP (x, 1)) == PLUS
5817 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5818 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5819 && aarch64_base_register_rtx_p (info->base, strict_p))
5821 info->offset = XEXP (XEXP (x, 1), 1);
5822 info->const_offset = offset;
5824 /* TImode and TFmode values are allowed in both pairs of X
5825 registers and individual Q registers. The available
5826 address modes are:
5827 X,X: 7-bit signed scaled offset
5828 Q: 9-bit signed offset
5829 We conservatively require an offset representable in either mode. */
5831 if (mode == TImode || mode == TFmode)
5832 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5833 && offset_9bit_signed_unscaled_p (mode, offset));
5835 if (load_store_pair_p)
5836 return ((known_eq (GET_MODE_SIZE (mode), 4)
5837 || known_eq (GET_MODE_SIZE (mode), 8))
5838 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5839 else
5840 return offset_9bit_signed_unscaled_p (mode, offset);
5842 return false;
5844 case CONST:
5845 case SYMBOL_REF:
5846 case LABEL_REF:
5847 /* load literal: pc-relative constant pool entry. Only supported
5848 for SI mode or larger. */
5849 info->type = ADDRESS_SYMBOLIC;
5851 if (!load_store_pair_p
5852 && GET_MODE_SIZE (mode).is_constant (&const_size)
5853 && const_size >= 4)
5855 rtx sym, addend;
5857 split_const (x, &sym, &addend);
5858 return ((GET_CODE (sym) == LABEL_REF
5859 || (GET_CODE (sym) == SYMBOL_REF
5860 && CONSTANT_POOL_ADDRESS_P (sym)
5861 && aarch64_pcrelative_literal_loads)));
5863 return false;
5865 case LO_SUM:
5866 info->type = ADDRESS_LO_SUM;
5867 info->base = XEXP (x, 0);
5868 info->offset = XEXP (x, 1);
5869 if (allow_reg_index_p
5870 && aarch64_base_register_rtx_p (info->base, strict_p))
5872 rtx sym, offs;
5873 split_const (info->offset, &sym, &offs);
5874 if (GET_CODE (sym) == SYMBOL_REF
5875 && (aarch64_classify_symbol (sym, INTVAL (offs))
5876 == SYMBOL_SMALL_ABSOLUTE))
5878 /* The symbol and offset must be aligned to the access size. */
5879 unsigned int align;
5881 if (CONSTANT_POOL_ADDRESS_P (sym))
5882 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5883 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5885 tree exp = SYMBOL_REF_DECL (sym);
5886 align = TYPE_ALIGN (TREE_TYPE (exp));
5887 align = aarch64_constant_alignment (exp, align);
5889 else if (SYMBOL_REF_DECL (sym))
5890 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5891 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5892 && SYMBOL_REF_BLOCK (sym) != NULL)
5893 align = SYMBOL_REF_BLOCK (sym)->alignment;
5894 else
5895 align = BITS_PER_UNIT;
5897 poly_int64 ref_size = GET_MODE_SIZE (mode);
5898 if (known_eq (ref_size, 0))
5899 ref_size = GET_MODE_SIZE (DImode);
5901 return (multiple_p (INTVAL (offs), ref_size)
5902 && multiple_p (align / BITS_PER_UNIT, ref_size));
5905 return false;
5907 default:
5908 return false;
5912 /* Return true if the address X is valid for a PRFM instruction.
5913 STRICT_P is true if we should do strict checking with
5914 aarch64_classify_address. */
5916 bool
5917 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5919 struct aarch64_address_info addr;
5921 /* PRFM accepts the same addresses as DImode... */
5922 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5923 if (!res)
5924 return false;
5926 /* ... except writeback forms. */
5927 return addr.type != ADDRESS_REG_WB;
5930 bool
5931 aarch64_symbolic_address_p (rtx x)
5933 rtx offset;
5935 split_const (x, &x, &offset);
5936 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5939 /* Classify the base of symbolic expression X. */
5941 enum aarch64_symbol_type
5942 aarch64_classify_symbolic_expression (rtx x)
5944 rtx offset;
5946 split_const (x, &x, &offset);
5947 return aarch64_classify_symbol (x, INTVAL (offset));
5951 /* Return TRUE if X is a legitimate address for accessing memory in
5952 mode MODE. */
5953 static bool
5954 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5956 struct aarch64_address_info addr;
5958 return aarch64_classify_address (&addr, x, mode, strict_p);
5961 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5962 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5963 bool
5964 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5965 aarch64_addr_query_type type)
5967 struct aarch64_address_info addr;
5969 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5972 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5974 static bool
5975 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
5976 poly_int64 orig_offset,
5977 machine_mode mode)
5979 HOST_WIDE_INT size;
5980 if (GET_MODE_SIZE (mode).is_constant (&size))
5982 HOST_WIDE_INT const_offset, second_offset;
5984 /* A general SVE offset is A * VQ + B. Remove the A component from
5985 coefficient 0 in order to get the constant B. */
5986 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
5988 /* Split an out-of-range address displacement into a base and
5989 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
5990 range otherwise, to increase opportunities for sharing the base
5991 address between accesses of different sizes. Unaligned accesses
5992 use the signed 9-bit range; TImode/TFmode use the intersection of
5993 the signed scaled 7-bit and the signed 9-bit offset ranges. */
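/* Worked example (illustrative): for DImode with an aligned const_offset of
   0x10010, second_offset = 0x10010 & 0x3ffc = 0x10, so the address is split
   into a shareable base at +0x10000 plus an in-range offset of +0x10.  */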
5994 if (mode == TImode || mode == TFmode)
5995 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
5996 else if ((const_offset & (size - 1)) != 0)
5997 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
5998 else
5999 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6001 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6002 return false;
6004 /* Split the offset into second_offset and the rest. */
6005 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6006 *offset2 = gen_int_mode (second_offset, Pmode);
6007 return true;
6009 else
6011 /* Get the mode we should use as the basis of the range. For structure
6012 modes this is the mode of one vector. */
6013 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6014 machine_mode step_mode
6015 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6017 /* Get the "mul vl" multiplier we'd like to use. */
6018 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6019 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6020 if (vec_flags & VEC_SVE_DATA)
6021 /* LDR supports a 9-bit range, but the move patterns for
6022 structure modes require all vectors to be in range of the
6023 same base. The simplest way of accommodating that while still
6024 promoting reuse of anchor points between different modes is
6025 to use an 8-bit range unconditionally. */
6026 vnum = ((vnum + 128) & 255) - 128;
6027 else
6028 /* Predicates are only handled singly, so we might as well use
6029 the full range. */
6030 vnum = ((vnum + 256) & 511) - 256;
6031 if (vnum == 0)
6032 return false;
6034 /* Convert the "mul vl" multiplier into a byte offset. */
6035 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6036 if (known_eq (second_offset, orig_offset))
6037 return false;
6039 /* Split the offset into second_offset and the rest. */
6040 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6041 *offset2 = gen_int_mode (second_offset, Pmode);
6042 return true;
6046 /* Return the binary representation of floating point constant VALUE in INTVAL.
6047 If the value cannot be converted, return false without setting INTVAL.
6048 The conversion is done in the given MODE. */
6049 bool
6050 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6053 /* We make a general exception for 0. */
6054 if (aarch64_float_const_zero_rtx_p (value))
6056 *intval = 0;
6057 return true;
6060 scalar_float_mode mode;
6061 if (GET_CODE (value) != CONST_DOUBLE
6062 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6063 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6064 /* Only support up to DF mode. */
6065 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6066 return false;
6068 unsigned HOST_WIDE_INT ival = 0;
6070 long res[2];
6071 real_to_target (res,
6072 CONST_DOUBLE_REAL_VALUE (value),
6073 REAL_MODE_FORMAT (mode));
6075 if (mode == DFmode)
6077 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6078 ival = zext_hwi (res[order], 32);
6079 ival |= (zext_hwi (res[1 - order], 32) << 32);
6081 else
6082 ival = zext_hwi (res[0], 32);
6084 *intval = ival;
6085 return true;
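/* For example, the DFmode constant 1.0 is returned as 0x3ff0000000000000
   and the SFmode constant 1.0 as 0x3f800000 (zero-extended to 64 bits).  */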
6088 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6089 single MOV(+MOVK) followed by an FMOV. */
6090 bool
6091 aarch64_float_const_rtx_p (rtx x)
6093 machine_mode mode = GET_MODE (x);
6094 if (mode == VOIDmode)
6095 return false;
6097 /* Determine whether it's cheaper to write float constants as
6098 mov/movk pairs rather than ldr/adrp pairs. */
6099 unsigned HOST_WIDE_INT ival;
6101 if (GET_CODE (x) == CONST_DOUBLE
6102 && SCALAR_FLOAT_MODE_P (mode)
6103 && aarch64_reinterpret_float_as_int (x, &ival))
6105 scalar_int_mode imode = (mode == HFmode
6106 ? SImode
6107 : int_mode_for_mode (mode).require ());
6108 int num_instr = aarch64_internal_mov_immediate
6109 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6110 return num_instr < 3;
6113 return false;
6116 /* Return TRUE if rtx X is immediate constant 0.0 */
6117 bool
6118 aarch64_float_const_zero_rtx_p (rtx x)
6120 if (GET_MODE (x) == VOIDmode)
6121 return false;
6123 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6124 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6125 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6128 /* Return TRUE if rtx X is immediate constant that fits in a single
6129 MOVI immediate operation. */
6130 bool
6131 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6133 if (!TARGET_SIMD)
6134 return false;
6136 machine_mode vmode;
6137 scalar_int_mode imode;
6138 unsigned HOST_WIDE_INT ival;
6140 if (GET_CODE (x) == CONST_DOUBLE
6141 && SCALAR_FLOAT_MODE_P (mode))
6143 if (!aarch64_reinterpret_float_as_int (x, &ival))
6144 return false;
6146 /* We make a general exception for 0. */
6147 if (aarch64_float_const_zero_rtx_p (x))
6148 return true;
6150 imode = int_mode_for_mode (mode).require ();
6152 else if (GET_CODE (x) == CONST_INT
6153 && is_a <scalar_int_mode> (mode, &imode))
6154 ival = INTVAL (x);
6155 else
6156 return false;
6158 /* Use a 64-bit vector mode for everything except DImode/DFmode, where
6159 we use a 128-bit vector mode. */
6160 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6162 vmode = aarch64_simd_container_mode (imode, width);
6163 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6165 return aarch64_simd_valid_immediate (v_op, NULL);
6169 /* Return the fixed registers used for condition codes. */
6171 static bool
6172 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6174 *p1 = CC_REGNUM;
6175 *p2 = INVALID_REGNUM;
6176 return true;
6179 /* This function is used by the call expanders of the machine description.
6180 RESULT is the register in which the result is returned. It's NULL for
6181 "call" and "sibcall".
6182 MEM is the location of the function call.
6183 SIBCALL indicates whether this function call is normal call or sibling call.
6184 It will generate different pattern accordingly. */
6186 void
6187 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6189 rtx call, callee, tmp;
6190 rtvec vec;
6191 machine_mode mode;
6193 gcc_assert (MEM_P (mem));
6194 callee = XEXP (mem, 0);
6195 mode = GET_MODE (callee);
6196 gcc_assert (mode == Pmode);
6198 /* Decide if we should generate indirect calls by loading the
6199 address of the callee into a register before performing
6200 the branch-and-link. */
6201 if (SYMBOL_REF_P (callee)
6202 ? (aarch64_is_long_call_p (callee)
6203 || aarch64_is_noplt_call_p (callee))
6204 : !REG_P (callee))
6205 XEXP (mem, 0) = force_reg (mode, callee);
6207 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6209 if (result != NULL_RTX)
6210 call = gen_rtx_SET (result, call);
6212 if (sibcall)
6213 tmp = ret_rtx;
6214 else
6215 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6217 vec = gen_rtvec (2, call, tmp);
6218 call = gen_rtx_PARALLEL (VOIDmode, vec);
6220 aarch64_emit_call_insn (call);
6223 /* Emit call insn with PAT and do aarch64-specific handling. */
6225 void
6226 aarch64_emit_call_insn (rtx pat)
6228 rtx insn = emit_call_insn (pat);
6230 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6231 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6232 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6235 machine_mode
6236 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6238 /* All floating point compares return CCFP if it is an equality
6239 comparison, and CCFPE otherwise. */
6240 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6242 switch (code)
6244 case EQ:
6245 case NE:
6246 case UNORDERED:
6247 case ORDERED:
6248 case UNLT:
6249 case UNLE:
6250 case UNGT:
6251 case UNGE:
6252 case UNEQ:
6253 return CCFPmode;
6255 case LT:
6256 case LE:
6257 case GT:
6258 case GE:
6259 case LTGT:
6260 return CCFPEmode;
6262 default:
6263 gcc_unreachable ();
6267 /* Equality comparisons of short modes against zero can be performed
6268 using the TST instruction with the appropriate bitmask. */
6269 if (y == const0_rtx && REG_P (x)
6270 && (code == EQ || code == NE)
6271 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6272 return CC_NZmode;
6274 /* Similarly, comparisons of zero_extends from shorter modes can
6275 be performed using an ANDS with an immediate mask. */
6276 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6277 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6278 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6279 && (code == EQ || code == NE))
6280 return CC_NZmode;
6282 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6283 && y == const0_rtx
6284 && (code == EQ || code == NE || code == LT || code == GE)
6285 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6286 || GET_CODE (x) == NEG
6287 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6288 && CONST_INT_P (XEXP (x, 2)))))
6289 return CC_NZmode;
6291 /* A compare with a shifted operand. Because of canonicalization,
6292 the comparison will have to be swapped when we emit the assembly
6293 code. */
6294 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6295 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6296 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6297 || GET_CODE (x) == LSHIFTRT
6298 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6299 return CC_SWPmode;
6301 /* Similarly for a negated operand, but we can only do this for
6302 equalities. */
6303 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6304 && (REG_P (y) || GET_CODE (y) == SUBREG)
6305 && (code == EQ || code == NE)
6306 && GET_CODE (x) == NEG)
6307 return CC_Zmode;
6309 /* A test for unsigned overflow. */
6310 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6311 && code == NE
6312 && GET_CODE (x) == PLUS
6313 && GET_CODE (y) == ZERO_EXTEND)
6314 return CC_Cmode;
6316 /* For everything else, return CCmode. */
6317 return CCmode;
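/* For example, comparing (plus:DI a b) against zero with code NE selects
   CC_NZmode above, allowing the addition and the comparison to be
   implemented as a single ADDS followed by a B.NE.  */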
6320 static int
6321 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6324 aarch64_get_condition_code (rtx x)
6326 machine_mode mode = GET_MODE (XEXP (x, 0));
6327 enum rtx_code comp_code = GET_CODE (x);
6329 if (GET_MODE_CLASS (mode) != MODE_CC)
6330 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6331 return aarch64_get_condition_code_1 (mode, comp_code);
6334 static int
6335 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6337 switch (mode)
6339 case E_CCFPmode:
6340 case E_CCFPEmode:
6341 switch (comp_code)
6343 case GE: return AARCH64_GE;
6344 case GT: return AARCH64_GT;
6345 case LE: return AARCH64_LS;
6346 case LT: return AARCH64_MI;
6347 case NE: return AARCH64_NE;
6348 case EQ: return AARCH64_EQ;
6349 case ORDERED: return AARCH64_VC;
6350 case UNORDERED: return AARCH64_VS;
6351 case UNLT: return AARCH64_LT;
6352 case UNLE: return AARCH64_LE;
6353 case UNGT: return AARCH64_HI;
6354 case UNGE: return AARCH64_PL;
6355 default: return -1;
6357 break;
6359 case E_CCmode:
6360 switch (comp_code)
6362 case NE: return AARCH64_NE;
6363 case EQ: return AARCH64_EQ;
6364 case GE: return AARCH64_GE;
6365 case GT: return AARCH64_GT;
6366 case LE: return AARCH64_LE;
6367 case LT: return AARCH64_LT;
6368 case GEU: return AARCH64_CS;
6369 case GTU: return AARCH64_HI;
6370 case LEU: return AARCH64_LS;
6371 case LTU: return AARCH64_CC;
6372 default: return -1;
6374 break;
6376 case E_CC_SWPmode:
6377 switch (comp_code)
6379 case NE: return AARCH64_NE;
6380 case EQ: return AARCH64_EQ;
6381 case GE: return AARCH64_LE;
6382 case GT: return AARCH64_LT;
6383 case LE: return AARCH64_GE;
6384 case LT: return AARCH64_GT;
6385 case GEU: return AARCH64_LS;
6386 case GTU: return AARCH64_CC;
6387 case LEU: return AARCH64_CS;
6388 case LTU: return AARCH64_HI;
6389 default: return -1;
6391 break;
6393 case E_CC_NZmode:
6394 switch (comp_code)
6396 case NE: return AARCH64_NE;
6397 case EQ: return AARCH64_EQ;
6398 case GE: return AARCH64_PL;
6399 case LT: return AARCH64_MI;
6400 default: return -1;
6402 break;
6404 case E_CC_Zmode:
6405 switch (comp_code)
6407 case NE: return AARCH64_NE;
6408 case EQ: return AARCH64_EQ;
6409 default: return -1;
6411 break;
6413 case E_CC_Cmode:
6414 switch (comp_code)
6416 case NE: return AARCH64_CS;
6417 case EQ: return AARCH64_CC;
6418 default: return -1;
6420 break;
6422 default:
6423 return -1;
6426 return -1;
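/* Note the swapped mappings for CC_SWPmode: the comparison was canonicalized
   with a shifted or extended operand first, so the assembly output compares
   the operands in the opposite order and GT must be tested as LT, GE as LE,
   and so on.  */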
6429 bool
6430 aarch64_const_vec_all_same_in_range_p (rtx x,
6431 HOST_WIDE_INT minval,
6432 HOST_WIDE_INT maxval)
6434 rtx elt;
6435 return (const_vec_duplicate_p (x, &elt)
6436 && CONST_INT_P (elt)
6437 && IN_RANGE (INTVAL (elt), minval, maxval));
6440 bool
6441 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6443 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6446 /* Return true if VEC is a constant in which every element is in the range
6447 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6449 static bool
6450 aarch64_const_vec_all_in_range_p (rtx vec,
6451 HOST_WIDE_INT minval,
6452 HOST_WIDE_INT maxval)
6454 if (GET_CODE (vec) != CONST_VECTOR
6455 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6456 return false;
6458 int nunits;
6459 if (!CONST_VECTOR_STEPPED_P (vec))
6460 nunits = const_vector_encoded_nelts (vec);
6461 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6462 return false;
6464 for (int i = 0; i < nunits; i++)
6466 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6467 if (!CONST_INT_P (vec_elem)
6468 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6469 return false;
6471 return true;
6474 /* N Z C V. */
6475 #define AARCH64_CC_V 1
6476 #define AARCH64_CC_C (1 << 1)
6477 #define AARCH64_CC_Z (1 << 2)
6478 #define AARCH64_CC_N (1 << 3)
6480 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6481 static const int aarch64_nzcv_codes[] =
6483 0, /* EQ, Z == 1. */
6484 AARCH64_CC_Z, /* NE, Z == 0. */
6485 0, /* CS, C == 1. */
6486 AARCH64_CC_C, /* CC, C == 0. */
6487 0, /* MI, N == 1. */
6488 AARCH64_CC_N, /* PL, N == 0. */
6489 0, /* VS, V == 1. */
6490 AARCH64_CC_V, /* VC, V == 0. */
6491 0, /* HI, C == 1 && Z == 0. */
6492 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6493 AARCH64_CC_V, /* GE, N == V. */
6494 0, /* LT, N != V. */
6495 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6496 0, /* LE, !(Z == 0 && N == V). */
6497 0, /* AL, Any. */
6498 0 /* NV, Any. */
6501 /* Print floating-point vector immediate operand X to F, negating it
6502 first if NEGATE is true. Return true on success, false if it isn't
6503 a constant we can handle. */
6505 static bool
6506 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6508 rtx elt;
6510 if (!const_vec_duplicate_p (x, &elt))
6511 return false;
6513 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6514 if (negate)
6515 r = real_value_negate (&r);
6517 /* We only handle the SVE single-bit immediates here. */
6518 if (real_equal (&r, &dconst0))
6519 asm_fprintf (f, "0.0");
6520 else if (real_equal (&r, &dconst1))
6521 asm_fprintf (f, "1.0");
6522 else if (real_equal (&r, &dconsthalf))
6523 asm_fprintf (f, "0.5");
6524 else
6525 return false;
6527 return true;
6530 /* Return the equivalent letter for size. */
6531 static char
6532 sizetochar (int size)
6534 switch (size)
6536 case 64: return 'd';
6537 case 32: return 's';
6538 case 16: return 'h';
6539 case 8 : return 'b';
6540 default: gcc_unreachable ();
6544 /* Print operand X to file F in a target-specific manner according to CODE.
6545 The acceptable formatting commands given by CODE are:
6546 'c': An integer or symbol address without a preceding #
6547 sign.
6548 'C': Take the duplicated element in a vector constant
6549 and print it in hex.
6550 'D': Take the duplicated element in a vector constant
6551 and print it as an unsigned integer, in decimal.
6552 'e': Print the sign/zero-extend size as a character 8->b,
6553 16->h, 32->w.
6554 'p': Prints N such that 2^N == X (X must be power of 2 and
6555 const int).
6556 'P': Print the number of non-zero bits in X (a const_int).
6557 'H': Print the higher numbered register of a pair (TImode)
6558 of regs.
6559 'm': Print a condition (eq, ne, etc.).
6560 'M': Same as 'm', but invert condition.
6561 'N': Take the duplicated element in a vector constant
6562 and print the negative of it in decimal.
6563 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6564 'S/T/U/V': Print a FP/SIMD register name for a register list.
6565 The register printed is the FP/SIMD register name
6566 of X + 0/1/2/3 for S/T/U/V.
6567 'R': Print a scalar FP/SIMD register name + 1.
6568 'X': Print bottom 16 bits of integer constant in hex.
6569 'w/x': Print a general register name or the zero register
6570 (32-bit or 64-bit).
6571 '0': Print a normal operand, if it's a general register,
6572 then we assume DImode.
6573 'k': Print NZCV for conditional compare instructions.
6574 'A': Output address constant representing the first
6575 argument of X, specifying a relocation offset
6576 if appropriate.
6577 'L': Output constant address specified by X
6578 with a relocation offset if appropriate.
6579 'G': Prints address of X, specifying a PC relative
6580 relocation mode if appropriate.
6581 'y': Output address of LDP or STP - this is used for
6582 some LDP/STPs which don't use a PARALLEL in their
6583 pattern (so the mode needs to be adjusted).
6584 'z': Output address of a typical LDP or STP. */
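/* For example, an insn template such as "add\t%w0, %w1, %w2" uses the 'w'
   code above to print 32-bit register names (or wzr for a zero operand),
   while "%x0" prints the corresponding 64-bit name.  */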
6586 static void
6587 aarch64_print_operand (FILE *f, rtx x, int code)
6589 rtx elt;
6590 switch (code)
6592 case 'c':
6593 switch (GET_CODE (x))
6595 case CONST_INT:
6596 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6597 break;
6599 case SYMBOL_REF:
6600 output_addr_const (f, x);
6601 break;
6603 case CONST:
6604 if (GET_CODE (XEXP (x, 0)) == PLUS
6605 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6607 output_addr_const (f, x);
6608 break;
6610 /* Fall through. */
6612 default:
6613 output_operand_lossage ("unsupported operand for code '%c'", code);
6615 break;
6617 case 'e':
6619 int n;
6621 if (!CONST_INT_P (x)
6622 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6624 output_operand_lossage ("invalid operand for '%%%c'", code);
6625 return;
6628 switch (n)
6630 case 3:
6631 fputc ('b', f);
6632 break;
6633 case 4:
6634 fputc ('h', f);
6635 break;
6636 case 5:
6637 fputc ('w', f);
6638 break;
6639 default:
6640 output_operand_lossage ("invalid operand for '%%%c'", code);
6641 return;
6644 break;
6646 case 'p':
6648 int n;
6650 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6652 output_operand_lossage ("invalid operand for '%%%c'", code);
6653 return;
6656 asm_fprintf (f, "%d", n);
6658 break;
6660 case 'P':
6661 if (!CONST_INT_P (x))
6663 output_operand_lossage ("invalid operand for '%%%c'", code);
6664 return;
6667 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6668 break;
6670 case 'H':
6671 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6673 output_operand_lossage ("invalid operand for '%%%c'", code);
6674 return;
6677 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6678 break;
6680 case 'M':
6681 case 'm':
6683 int cond_code;
6684 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6685 if (x == const_true_rtx)
6687 if (code == 'M')
6688 fputs ("nv", f);
6689 return;
6692 if (!COMPARISON_P (x))
6694 output_operand_lossage ("invalid operand for '%%%c'", code);
6695 return;
6698 cond_code = aarch64_get_condition_code (x);
6699 gcc_assert (cond_code >= 0);
6700 if (code == 'M')
6701 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6702 fputs (aarch64_condition_codes[cond_code], f);
6704 break;
6706 case 'N':
6707 if (!const_vec_duplicate_p (x, &elt))
6709 output_operand_lossage ("invalid vector constant");
6710 return;
6713 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6714 asm_fprintf (f, "%wd", -INTVAL (elt));
6715 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6716 && aarch64_print_vector_float_operand (f, x, true))
6718 else
6720 output_operand_lossage ("invalid vector constant");
6721 return;
6723 break;
6725 case 'b':
6726 case 'h':
6727 case 's':
6728 case 'd':
6729 case 'q':
6730 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6732 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6733 return;
6735 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6736 break;
6738 case 'S':
6739 case 'T':
6740 case 'U':
6741 case 'V':
6742 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6744 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6745 return;
6747 asm_fprintf (f, "%c%d",
6748 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6749 REGNO (x) - V0_REGNUM + (code - 'S'));
6750 break;
6752 case 'R':
6753 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6755 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6756 return;
6758 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6759 break;
6761 case 'X':
6762 if (!CONST_INT_P (x))
6764 output_operand_lossage ("invalid operand for '%%%c'", code);
6765 return;
6767 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6768 break;
6770 case 'C':
6772 /* Print a replicated constant in hex. */
6773 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6775 output_operand_lossage ("invalid operand for '%%%c'", code);
6776 return;
6778 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6779 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6781 break;
6783 case 'D':
6785 /* Print a replicated constant in decimal, treating it as
6786 unsigned. */
6787 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6789 output_operand_lossage ("invalid operand for '%%%c'", code);
6790 return;
6792 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6793 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6795 break;
6797 case 'w':
6798 case 'x':
6799 if (x == const0_rtx
6800 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6802 asm_fprintf (f, "%czr", code);
6803 break;
6806 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6808 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6809 break;
6812 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6814 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6815 break;
6818 /* Fall through */
6820 case 0:
6821 if (x == NULL)
6823 output_operand_lossage ("missing operand");
6824 return;
6827 switch (GET_CODE (x))
6829 case REG:
6830 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6832 if (REG_NREGS (x) == 1)
6833 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6834 else
6836 char suffix
6837 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6838 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6839 REGNO (x) - V0_REGNUM, suffix,
6840 END_REGNO (x) - V0_REGNUM - 1, suffix);
6843 else
6844 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6845 break;
6847 case MEM:
6848 output_address (GET_MODE (x), XEXP (x, 0));
6849 break;
6851 case LABEL_REF:
6852 case SYMBOL_REF:
6853 output_addr_const (asm_out_file, x);
6854 break;
6856 case CONST_INT:
6857 asm_fprintf (f, "%wd", INTVAL (x));
6858 break;
6860 case CONST:
6861 if (!VECTOR_MODE_P (GET_MODE (x)))
6863 output_addr_const (asm_out_file, x);
6864 break;
6866 /* fall through */
6868 case CONST_VECTOR:
6869 if (!const_vec_duplicate_p (x, &elt))
6871 output_operand_lossage ("invalid vector constant");
6872 return;
6875 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6876 asm_fprintf (f, "%wd", INTVAL (elt));
6877 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6878 && aarch64_print_vector_float_operand (f, x, false))
6880 else
6882 output_operand_lossage ("invalid vector constant");
6883 return;
6885 break;
6887 case CONST_DOUBLE:
6888 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6889 be getting CONST_DOUBLEs holding integers. */
6890 gcc_assert (GET_MODE (x) != VOIDmode);
6891 if (aarch64_float_const_zero_rtx_p (x))
6893 fputc ('0', f);
6894 break;
6896 else if (aarch64_float_const_representable_p (x))
6898 #define buf_size 20
6899 char float_buf[buf_size] = {'\0'};
6900 real_to_decimal_for_mode (float_buf,
6901 CONST_DOUBLE_REAL_VALUE (x),
6902 buf_size, buf_size,
6903 1, GET_MODE (x));
6904 asm_fprintf (asm_out_file, "%s", float_buf);
6905 break;
6906 #undef buf_size
6908 output_operand_lossage ("invalid constant");
6909 return;
6910 default:
6911 output_operand_lossage ("invalid operand");
6912 return;
6914 break;
6916 case 'A':
6917 if (GET_CODE (x) == HIGH)
6918 x = XEXP (x, 0);
6920 switch (aarch64_classify_symbolic_expression (x))
6922 case SYMBOL_SMALL_GOT_4G:
6923 asm_fprintf (asm_out_file, ":got:");
6924 break;
6926 case SYMBOL_SMALL_TLSGD:
6927 asm_fprintf (asm_out_file, ":tlsgd:");
6928 break;
6930 case SYMBOL_SMALL_TLSDESC:
6931 asm_fprintf (asm_out_file, ":tlsdesc:");
6932 break;
6934 case SYMBOL_SMALL_TLSIE:
6935 asm_fprintf (asm_out_file, ":gottprel:");
6936 break;
6938 case SYMBOL_TLSLE24:
6939 asm_fprintf (asm_out_file, ":tprel:");
6940 break;
6942 case SYMBOL_TINY_GOT:
6943 gcc_unreachable ();
6944 break;
6946 default:
6947 break;
6949 output_addr_const (asm_out_file, x);
6950 break;
6952 case 'L':
6953 switch (aarch64_classify_symbolic_expression (x))
6955 case SYMBOL_SMALL_GOT_4G:
6956 asm_fprintf (asm_out_file, ":lo12:");
6957 break;
6959 case SYMBOL_SMALL_TLSGD:
6960 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6961 break;
6963 case SYMBOL_SMALL_TLSDESC:
6964 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6965 break;
6967 case SYMBOL_SMALL_TLSIE:
6968 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6969 break;
6971 case SYMBOL_TLSLE12:
6972 asm_fprintf (asm_out_file, ":tprel_lo12:");
6973 break;
6975 case SYMBOL_TLSLE24:
6976 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6977 break;
6979 case SYMBOL_TINY_GOT:
6980 asm_fprintf (asm_out_file, ":got:");
6981 break;
6983 case SYMBOL_TINY_TLSIE:
6984 asm_fprintf (asm_out_file, ":gottprel:");
6985 break;
6987 default:
6988 break;
6990 output_addr_const (asm_out_file, x);
6991 break;
6993 case 'G':
6994 switch (aarch64_classify_symbolic_expression (x))
6996 case SYMBOL_TLSLE24:
6997 asm_fprintf (asm_out_file, ":tprel_hi12:");
6998 break;
6999 default:
7000 break;
7002 output_addr_const (asm_out_file, x);
7003 break;
7005 case 'k':
7007 HOST_WIDE_INT cond_code;
7009 if (!CONST_INT_P (x))
7011 output_operand_lossage ("invalid operand for '%%%c'", code);
7012 return;
7015 cond_code = INTVAL (x);
7016 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7017 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7019 break;
7021 case 'y':
7022 case 'z':
7024 machine_mode mode = GET_MODE (x);
7026 if (GET_CODE (x) != MEM
7027 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7029 output_operand_lossage ("invalid operand for '%%%c'", code);
7030 return;
7033 if (code == 'y')
7034 /* LDP/STP which uses a single double-width memory operand.
7035 Adjust the mode to appear like a typical LDP/STP.
7036 Currently this is supported for 16-byte accesses only. */
7037 mode = DFmode;
7039 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7040 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7042 break;
7044 default:
7045 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7046 return;
7050 /* Print address 'x' of a memory access with mode 'mode'.
7051 'type' is the aarch64_addr_query_type context passed to
7052 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP address. */
7053 static bool
7054 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7055 aarch64_addr_query_type type)
7057 struct aarch64_address_info addr;
7058 unsigned int size;
7060 /* Check all addresses are Pmode - including ILP32. */
7061 if (GET_MODE (x) != Pmode)
7062 output_operand_lossage ("invalid address mode");
7064 if (aarch64_classify_address (&addr, x, mode, true, type))
7065 switch (addr.type)
7067 case ADDRESS_REG_IMM:
7068 if (known_eq (addr.const_offset, 0))
7069 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7070 else if (aarch64_sve_data_mode_p (mode))
7072 HOST_WIDE_INT vnum
7073 = exact_div (addr.const_offset,
7074 BYTES_PER_SVE_VECTOR).to_constant ();
7075 asm_fprintf (f, "[%s, #%wd, mul vl]",
7076 reg_names[REGNO (addr.base)], vnum);
7078 else if (aarch64_sve_pred_mode_p (mode))
7080 HOST_WIDE_INT vnum
7081 = exact_div (addr.const_offset,
7082 BYTES_PER_SVE_PRED).to_constant ();
7083 asm_fprintf (f, "[%s, #%wd, mul vl]",
7084 reg_names[REGNO (addr.base)], vnum);
7086 else
7087 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7088 INTVAL (addr.offset));
7089 return true;
7091 case ADDRESS_REG_REG:
7092 if (addr.shift == 0)
7093 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7094 reg_names [REGNO (addr.offset)]);
7095 else
7096 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7097 reg_names [REGNO (addr.offset)], addr.shift);
7098 return true;
7100 case ADDRESS_REG_UXTW:
7101 if (addr.shift == 0)
7102 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7103 REGNO (addr.offset) - R0_REGNUM);
7104 else
7105 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7106 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7107 return true;
7109 case ADDRESS_REG_SXTW:
7110 if (addr.shift == 0)
7111 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7112 REGNO (addr.offset) - R0_REGNUM);
7113 else
7114 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7115 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7116 return true;
7118 case ADDRESS_REG_WB:
7119 /* Writeback is only supported for fixed-width modes. */
7120 size = GET_MODE_SIZE (mode).to_constant ();
7121 switch (GET_CODE (x))
7123 case PRE_INC:
7124 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7125 return true;
7126 case POST_INC:
7127 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7128 return true;
7129 case PRE_DEC:
7130 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7131 return true;
7132 case POST_DEC:
7133 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7134 return true;
7135 case PRE_MODIFY:
7136 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7137 INTVAL (addr.offset));
7138 return true;
7139 case POST_MODIFY:
7140 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7141 INTVAL (addr.offset));
7142 return true;
7143 default:
7144 break;
7146 break;
7148 case ADDRESS_LO_SUM:
7149 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7150 output_addr_const (f, addr.offset);
7151 asm_fprintf (f, "]");
7152 return true;
7154 case ADDRESS_SYMBOLIC:
7155 output_addr_const (f, x);
7156 return true;
7159 return false;
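/* For example, a DImode PRE_INC address based on x1 is printed as
   "[x1, 8]!" and the corresponding POST_INC form as "[x1], 8".  */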
7162 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7163 static bool
7164 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7166 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7169 /* Print address 'x' of a memory access with mode 'mode'. */
7170 static void
7171 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7173 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7174 output_addr_const (f, x);
7177 bool
7178 aarch64_label_mentioned_p (rtx x)
7180 const char *fmt;
7181 int i;
7183 if (GET_CODE (x) == LABEL_REF)
7184 return true;
7186 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7187 referencing instruction, but they are constant offsets, not
7188 symbols. */
7189 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7190 return false;
7192 fmt = GET_RTX_FORMAT (GET_CODE (x));
7193 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7195 if (fmt[i] == 'E')
7197 int j;
7199 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7200 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7201 return 1;
7203 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7204 return 1;
7207 return 0;
7210 /* Implement REGNO_REG_CLASS. */
7212 enum reg_class
7213 aarch64_regno_regclass (unsigned regno)
7215 if (GP_REGNUM_P (regno))
7216 return GENERAL_REGS;
7218 if (regno == SP_REGNUM)
7219 return STACK_REG;
7221 if (regno == FRAME_POINTER_REGNUM
7222 || regno == ARG_POINTER_REGNUM)
7223 return POINTER_REGS;
7225 if (FP_REGNUM_P (regno))
7226 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7228 if (PR_REGNUM_P (regno))
7229 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7231 return NO_REGS;
7234 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7235 If OFFSET is out of range, return an offset of an anchor point
7236 that is in range. Return 0 otherwise. */
7238 static HOST_WIDE_INT
7239 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7240 machine_mode mode)
7242 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7243 if (size > 16)
7244 return (offset + 0x400) & ~0x7f0;
7246 /* For offsets that aren't a multiple of the access size, the limit is
7247 -256...255. */
7248 if (offset & (size - 1))
7250 /* BLKmode typically uses LDP of X-registers. */
7251 if (mode == BLKmode)
7252 return (offset + 512) & ~0x3ff;
7253 return (offset + 0x100) & ~0x1ff;
7256 /* Small negative offsets are supported. */
7257 if (IN_RANGE (offset, -256, 0))
7258 return 0;
7260 if (mode == TImode || mode == TFmode)
7261 return (offset + 0x100) & ~0x1ff;
7263 /* Use 12-bit offset by access size. */
7264 return offset & (~0xfff * size);
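/* Worked example (illustrative): for a DImode access at offset 40000 none of
   the earlier cases apply, so the final case returns
   40000 & (~0xfff * 8) = 32768; the remaining 7232 is a multiple of 8 and
   fits the unsigned scaled 12-bit LDR/STR range.  */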
7267 static rtx
7268 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7270 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7271 where mask is selected by alignment and size of the offset.
7272 We try to pick as large a range for the offset as possible to
7273 maximize the chance of a CSE. However, for aligned addresses
7274 we limit the range to 4k so that structures with different sized
7275 elements are likely to use the same base. We need to be careful
7276 not to split a CONST for some forms of address expression, otherwise
7277 it will generate sub-optimal code. */
7279 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7281 rtx base = XEXP (x, 0);
7282 rtx offset_rtx = XEXP (x, 1);
7283 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7285 if (GET_CODE (base) == PLUS)
7287 rtx op0 = XEXP (base, 0);
7288 rtx op1 = XEXP (base, 1);
7290 /* Force any scaling into a temp for CSE. */
7291 op0 = force_reg (Pmode, op0);
7292 op1 = force_reg (Pmode, op1);
7294 /* Let the pointer register be in op0. */
7295 if (REG_POINTER (op1))
7296 std::swap (op0, op1);
7298 /* If the pointer is virtual or frame related, then we know that
7299 virtual register instantiation or register elimination is going
7300 to apply a second constant. We want the two constants folded
7301 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7302 if (virt_or_elim_regno_p (REGNO (op0)))
7304 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7305 NULL_RTX, true, OPTAB_DIRECT);
7306 return gen_rtx_PLUS (Pmode, base, op1);
7309 /* Otherwise, in order to encourage CSE (and thence loop strength
7310 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
7311 base = expand_binop (Pmode, add_optab, op0, op1,
7312 NULL_RTX, true, OPTAB_DIRECT);
7313 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7316 HOST_WIDE_INT size;
7317 if (GET_MODE_SIZE (mode).is_constant (&size))
7319 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7320 mode);
7321 if (base_offset != 0)
7323 base = plus_constant (Pmode, base, base_offset);
7324 base = force_operand (base, NULL_RTX);
7325 return plus_constant (Pmode, base, offset - base_offset);
7330 return x;
7333 /* Return the icode of the reload pattern used to load a constant-pool entry of mode MODE. */
7334 static enum insn_code
7335 aarch64_constant_pool_reload_icode (machine_mode mode)
7337 switch (mode)
7339 case E_SFmode:
7340 return CODE_FOR_aarch64_reload_movcpsfdi;
7342 case E_DFmode:
7343 return CODE_FOR_aarch64_reload_movcpdfdi;
7345 case E_TFmode:
7346 return CODE_FOR_aarch64_reload_movcptfdi;
7348 case E_V8QImode:
7349 return CODE_FOR_aarch64_reload_movcpv8qidi;
7351 case E_V16QImode:
7352 return CODE_FOR_aarch64_reload_movcpv16qidi;
7354 case E_V4HImode:
7355 return CODE_FOR_aarch64_reload_movcpv4hidi;
7357 case E_V8HImode:
7358 return CODE_FOR_aarch64_reload_movcpv8hidi;
7360 case E_V2SImode:
7361 return CODE_FOR_aarch64_reload_movcpv2sidi;
7363 case E_V4SImode:
7364 return CODE_FOR_aarch64_reload_movcpv4sidi;
7366 case E_V2DImode:
7367 return CODE_FOR_aarch64_reload_movcpv2didi;
7369 case E_V2DFmode:
7370 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7372 default:
7373 gcc_unreachable ();
7376 gcc_unreachable ();
7378 static reg_class_t
7379 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7380 reg_class_t rclass,
7381 machine_mode mode,
7382 secondary_reload_info *sri)
7384 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7385 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7386 comment at the head of aarch64-sve.md for more details about the
7387 big-endian handling. */
7388 if (BYTES_BIG_ENDIAN
7389 && reg_class_subset_p (rclass, FP_REGS)
7390 && !((REG_P (x) && HARD_REGISTER_P (x))
7391 || aarch64_simd_valid_immediate (x, NULL))
7392 && aarch64_sve_data_mode_p (mode))
7394 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7395 return NO_REGS;
7398 /* If we have to disable direct literal pool loads and stores because the
7399 function is too big, then we need a scratch register. */
7400 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7401 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7402 || targetm.vector_mode_supported_p (GET_MODE (x)))
7403 && !aarch64_pcrelative_literal_loads)
7405 sri->icode = aarch64_constant_pool_reload_icode (mode);
7406 return NO_REGS;
7409 /* Without the TARGET_SIMD instructions we cannot move a Q register
7410 to a Q register directly. We need a scratch. */
7411 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7412 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7413 && reg_class_subset_p (rclass, FP_REGS))
7415 if (mode == TFmode)
7416 sri->icode = CODE_FOR_aarch64_reload_movtf;
7417 else if (mode == TImode)
7418 sri->icode = CODE_FOR_aarch64_reload_movti;
7419 return NO_REGS;
7422 /* A TFmode or TImode memory access should be handled via FP_REGS
7423 because AArch64 has richer addressing modes for LDR/STR instructions
7424 than LDP/STP instructions. */
7425 if (TARGET_FLOAT && rclass == GENERAL_REGS
7426 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7427 return FP_REGS;
7429 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7430 return GENERAL_REGS;
7432 return NO_REGS;
7435 static bool
7436 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7438 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7440 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7441 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7442 if (frame_pointer_needed)
7443 return to == HARD_FRAME_POINTER_REGNUM;
7444 return true;
7447 poly_int64
7448 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7450 aarch64_layout_frame ();
7452 if (to == HARD_FRAME_POINTER_REGNUM)
7454 if (from == ARG_POINTER_REGNUM)
7455 return cfun->machine->frame.hard_fp_offset;
7457 if (from == FRAME_POINTER_REGNUM)
7458 return cfun->machine->frame.hard_fp_offset
7459 - cfun->machine->frame.locals_offset;
7462 if (to == STACK_POINTER_REGNUM)
7464 if (from == FRAME_POINTER_REGNUM)
7465 return cfun->machine->frame.frame_size
7466 - cfun->machine->frame.locals_offset;
7469 return cfun->machine->frame.frame_size;
7472 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7473 previous frame. */
7476 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7478 if (count != 0)
7479 return const0_rtx;
7480 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7484 static void
7485 aarch64_asm_trampoline_template (FILE *f)
7487 if (TARGET_ILP32)
7489 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7490 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7492 else
7494 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7495 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7497 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7498 assemble_aligned_integer (4, const0_rtx);
7499 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7500 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7503 static void
7504 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7506 rtx fnaddr, mem, a_tramp;
7507 const int tramp_code_sz = 16;
7509 /* Don't need to copy the trailing D-words; we fill those in below. */
7510 emit_block_move (m_tramp, assemble_trampoline_template (),
7511 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7512 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7513 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7514 if (GET_MODE (fnaddr) != ptr_mode)
7515 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7516 emit_move_insn (mem, fnaddr);
7518 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7519 emit_move_insn (mem, chain_value);
7521 /* XXX We should really define a "clear_cache" pattern and use
7522 gen_clear_cache(). */
7523 a_tramp = XEXP (m_tramp, 0);
7524 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7525 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7526 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7527 ptr_mode);
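/* The resulting trampoline layout for LP64 is therefore: bytes 0-15 hold the
   code from aarch64_asm_trampoline_template, bytes 16-23 the target function
   address and bytes 24-31 the static chain value.  */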
7530 static unsigned char
7531 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7533 /* ??? Logically we should only need to provide a value when
7534 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7535 can hold MODE, but at the moment we need to handle all modes.
7536 Just ignore any runtime parts for registers that can't store them. */
7537 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7538 unsigned int nregs;
7539 switch (regclass)
7541 case TAILCALL_ADDR_REGS:
7542 case POINTER_REGS:
7543 case GENERAL_REGS:
7544 case ALL_REGS:
7545 case POINTER_AND_FP_REGS:
7546 case FP_REGS:
7547 case FP_LO_REGS:
7548 if (aarch64_sve_data_mode_p (mode)
7549 && constant_multiple_p (GET_MODE_SIZE (mode),
7550 BYTES_PER_SVE_VECTOR, &nregs))
7551 return nregs;
7552 return (aarch64_vector_data_mode_p (mode)
7553 ? CEIL (lowest_size, UNITS_PER_VREG)
7554 : CEIL (lowest_size, UNITS_PER_WORD));
7555 case STACK_REG:
7556 case PR_REGS:
7557 case PR_LO_REGS:
7558 case PR_HI_REGS:
7559 return 1;
7561 case NO_REGS:
7562 return 0;
7564 default:
7565 break;
7567 gcc_unreachable ();
7570 static reg_class_t
7571 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7573 if (regclass == POINTER_REGS)
7574 return GENERAL_REGS;
7576 if (regclass == STACK_REG)
7578 if (REG_P(x)
7579 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7580 return regclass;
7582 return NO_REGS;
7585 /* Register elimination can result in a request for
7586 SP+constant->FP_REGS. We cannot support such operations, which
7587 use SP as source and an FP_REG as destination, so reject them
7588 right away. */
7589 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7591 rtx lhs = XEXP (x, 0);
7593 /* Look through a possible SUBREG introduced by ILP32. */
7594 if (GET_CODE (lhs) == SUBREG)
7595 lhs = SUBREG_REG (lhs);
7597 gcc_assert (REG_P (lhs));
7598 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7599 POINTER_REGS));
7600 return NO_REGS;
7603 return regclass;
7606 void
7607 aarch64_asm_output_labelref (FILE* f, const char *name)
7609 asm_fprintf (f, "%U%s", name);
7612 static void
7613 aarch64_elf_asm_constructor (rtx symbol, int priority)
7615 if (priority == DEFAULT_INIT_PRIORITY)
7616 default_ctor_section_asm_out_constructor (symbol, priority);
7617 else
7619 section *s;
7620 /* The priority is known to be in the range [0, 65535], so 18 bytes
7621 would be enough, but the compiler might not know that. To avoid
7622 a -Wformat-truncation false positive, use a larger size. */
7623 char buf[23];
7624 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7625 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7626 switch_to_section (s);
7627 assemble_align (POINTER_SIZE);
7628 assemble_aligned_integer (POINTER_BYTES, symbol);
7632 static void
7633 aarch64_elf_asm_destructor (rtx symbol, int priority)
7635 if (priority == DEFAULT_INIT_PRIORITY)
7636 default_dtor_section_asm_out_destructor (symbol, priority);
7637 else
7639 section *s;
7640 /* The priority is known to be in the range [0, 65535], so 18 bytes
7641 would be enough, but the compiler might not know that. To avoid
7642 a -Wformat-truncation false positive, use a larger size. */
7643 char buf[23];
7644 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7645 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7646 switch_to_section (s);
7647 assemble_align (POINTER_SIZE);
7648 assemble_aligned_integer (POINTER_BYTES, symbol);
7652 const char*
7653 aarch64_output_casesi (rtx *operands)
7655 char buf[100];
7656 char label[100];
7657 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7658 int index;
7659 static const char *const patterns[4][2] =
7662 "ldrb\t%w3, [%0,%w1,uxtw]",
7663 "add\t%3, %4, %w3, sxtb #2"
7666 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7667 "add\t%3, %4, %w3, sxth #2"
7670 "ldr\t%w3, [%0,%w1,uxtw #2]",
7671 "add\t%3, %4, %w3, sxtw #2"
7673 /* We assume that DImode is only generated when not optimizing and
7674 that we don't really need 64-bit address offsets. That would
7675 imply an object file with 8GB of code in a single function! */
7677 "ldr\t%w3, [%0,%w1,uxtw #2]",
7678 "add\t%3, %4, %w3, sxtw #2"
7682 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7684 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7685 index = exact_log2 (GET_MODE_SIZE (mode));
7687 gcc_assert (index >= 0 && index <= 3);
7689 /* Need to implement table size reduction, by changing the code below. */
7690 output_asm_insn (patterns[index][0], operands);
7691 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7692 snprintf (buf, sizeof (buf),
7693 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7694 output_asm_insn (buf, operands);
7695 output_asm_insn (patterns[index][1], operands);
7696 output_asm_insn ("br\t%3", operands);
7697 assemble_label (asm_out_file, label);
7698 return "";
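/* As a rough illustration, for a QImode dispatch table the code emitted
   above looks something like

	ldrb	w3, [x0, w1, uxtw]	// load the table entry
	adr	x4, .Lrtx<N>		// anchor label emitted just below
	add	x3, x4, w3, sxtb #2	// scale the entry and add the anchor
	br	x3
   .Lrtx<N>:

   where operand 0 is the table address, operand 1 the index, operand 2 the
   label and operands 3 and 4 are scratch registers; the register names used
   here are only examples.  */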
7702 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7703 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7704 operator. */
7707 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7709 if (shift >= 0 && shift <= 3)
7711 int size;
7712 for (size = 8; size <= 32; size *= 2)
7714 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7715 if (mask == bits << shift)
7716 return size;
7719 return 0;
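/* Worked examples (illustrative): aarch64_uxt_size (0, 0xff) and
   aarch64_uxt_size (1, 0x1fe) both return 8 (a UXTB operand, the second
   with LSL #1), while aarch64_uxt_size (2, 0x3fffffffc) returns 32 (UXTW
   with LSL #2).  Any mask that is not an 8/16/32-bit field shifted left
   by 0..3 yields 0.  */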
7722 /* Constant pools are per-function only when PC-relative
7723 literal loads are enabled or we are using the large memory
7724 model. */
7726 static inline bool
7727 aarch64_can_use_per_function_literal_pools_p (void)
7729 return (aarch64_pcrelative_literal_loads
7730 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7733 static bool
7734 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7736 /* FIXME: In an ideal world this would work similarly
7737 to the logic in aarch64_select_rtx_section, but this
7738 breaks bootstrap in gccgo. For now we work around
7739 this by returning false here. */
7740 return false;
7743 /* Select appropriate section for constants depending
7744 on where we place literal pools. */
7746 static section *
7747 aarch64_select_rtx_section (machine_mode mode,
7748 rtx x,
7749 unsigned HOST_WIDE_INT align)
7751 if (aarch64_can_use_per_function_literal_pools_p ())
7752 return function_section (current_function_decl);
7754 return default_elf_select_rtx_section (mode, x, align);
7757 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7758 void
7759 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7760 HOST_WIDE_INT offset)
7762 /* When using per-function literal pools, we must ensure that any code
7763 section is aligned to the minimal instruction length, lest we get
7764 errors from the assembler re "unaligned instructions". */
7765 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7766 ASM_OUTPUT_ALIGN (f, 2);
7769 /* Costs. */
7771 /* Helper function for rtx cost calculation. Strip a shift expression
7772 from X. Returns the inner operand if successful, or the original
7773 expression on failure. */
7774 static rtx
7775 aarch64_strip_shift (rtx x)
7777 rtx op = x;
7779 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7780 we can convert both to ROR during final output. */
7781 if ((GET_CODE (op) == ASHIFT
7782 || GET_CODE (op) == ASHIFTRT
7783 || GET_CODE (op) == LSHIFTRT
7784 || GET_CODE (op) == ROTATERT
7785 || GET_CODE (op) == ROTATE)
7786 && CONST_INT_P (XEXP (op, 1)))
7787 return XEXP (op, 0);
7789 if (GET_CODE (op) == MULT
7790 && CONST_INT_P (XEXP (op, 1))
7791 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7792 return XEXP (op, 0);
7794 return x;
7797 /* Helper function for rtx cost calculation. Strip an extend
7798 expression from X. Returns the inner operand if successful, or the
7799 original expression on failure. We deal with a number of possible
7800 canonicalization variations here. If STRIP_SHIFT is true, then
7801 we can strip off a shift also. */
7802 static rtx
7803 aarch64_strip_extend (rtx x, bool strip_shift)
7805 scalar_int_mode mode;
7806 rtx op = x;
7808 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7809 return op;
7811 /* Zero and sign extraction of a widened value. */
7812 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7813 && XEXP (op, 2) == const0_rtx
7814 && GET_CODE (XEXP (op, 0)) == MULT
7815 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7816 XEXP (op, 1)))
7817 return XEXP (XEXP (op, 0), 0);
7819 /* It can also be represented (for zero-extend) as an AND with an
7820 immediate. */
7821 if (GET_CODE (op) == AND
7822 && GET_CODE (XEXP (op, 0)) == MULT
7823 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7824 && CONST_INT_P (XEXP (op, 1))
7825 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7826 INTVAL (XEXP (op, 1))) != 0)
7827 return XEXP (XEXP (op, 0), 0);
7829 /* Now handle extended register, as this may also have an optional
7830 left shift by 1..4. */
7831 if (strip_shift
7832 && GET_CODE (op) == ASHIFT
7833 && CONST_INT_P (XEXP (op, 1))
7834 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7835 op = XEXP (op, 0);
7837 if (GET_CODE (op) == ZERO_EXTEND
7838 || GET_CODE (op) == SIGN_EXTEND)
7839 op = XEXP (op, 0);
7841 if (op != x)
7842 return op;
7844 return x;
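/* For example (illustrative), (zero_extend:DI (reg:SI x0)) is stripped to
   (reg:SI x0), and with STRIP_SHIFT set,
   (ashift:DI (sign_extend:DI (reg:SI x0)) (const_int 2)) is stripped down
   to (reg:SI x0) as well; anything that does not look like an
   extended-register operand is returned unchanged.  */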
7847 /* Return true iff CODE is a shift supported in combination
7848 with arithmetic instructions. */
7850 static bool
7851 aarch64_shift_p (enum rtx_code code)
7853 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7857 /* Return true iff X is a cheap shift without a sign extend. */
7859 static bool
7860 aarch64_cheap_mult_shift_p (rtx x)
7862 rtx op0, op1;
7864 op0 = XEXP (x, 0);
7865 op1 = XEXP (x, 1);
7867 if (!(aarch64_tune_params.extra_tuning_flags
7868 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7869 return false;
7871 if (GET_CODE (op0) == SIGN_EXTEND)
7872 return false;
7874 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7875 && UINTVAL (op1) <= 4)
7876 return true;
7878 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7879 return false;
7881 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7883 if (l2 > 0 && l2 <= 4)
7884 return true;
7886 return false;
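/* For example, on a core with AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND set,
   (mult:DI (reg:DI) (const_int 8)) is considered cheap, since log2 (8) == 3
   falls in the 1..4 extended-register shift range; shifts of sign-extended
   values, or shift amounts greater than 4, are not.  */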
7889 /* Helper function for rtx cost calculation. Calculate the cost of
7890 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7891 Return the calculated cost of the expression, recursing manually into
7892 operands where needed. */
7894 static int
7895 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7897 rtx op0, op1;
7898 const struct cpu_cost_table *extra_cost
7899 = aarch64_tune_params.insn_extra_cost;
7900 int cost = 0;
7901 bool compound_p = (outer == PLUS || outer == MINUS);
7902 machine_mode mode = GET_MODE (x);
7904 gcc_checking_assert (code == MULT);
7906 op0 = XEXP (x, 0);
7907 op1 = XEXP (x, 1);
7909 if (VECTOR_MODE_P (mode))
7910 mode = GET_MODE_INNER (mode);
7912 /* Integer multiply/fma. */
7913 if (GET_MODE_CLASS (mode) == MODE_INT)
7915 /* The multiply will be canonicalized as a shift; cost it as such. */
7916 if (aarch64_shift_p (GET_CODE (x))
7917 || (CONST_INT_P (op1)
7918 && exact_log2 (INTVAL (op1)) > 0))
7920 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7921 || GET_CODE (op0) == SIGN_EXTEND;
7922 if (speed)
7924 if (compound_p)
7926 /* If the shift is considered cheap,
7927 then don't add any cost. */
7928 if (aarch64_cheap_mult_shift_p (x))
7930 else if (REG_P (op1))
7931 /* ARITH + shift-by-register. */
7932 cost += extra_cost->alu.arith_shift_reg;
7933 else if (is_extend)
7934 /* ARITH + extended register. We don't have a cost field
7935 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7936 cost += extra_cost->alu.extend_arith;
7937 else
7938 /* ARITH + shift-by-immediate. */
7939 cost += extra_cost->alu.arith_shift;
7941 else
7942 /* LSL (immediate). */
7943 cost += extra_cost->alu.shift;
7946 /* Strip extends as we will have costed them in the case above. */
7947 if (is_extend)
7948 op0 = aarch64_strip_extend (op0, true);
7950 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7952 return cost;
7955 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7956 compound and let the below cases handle it. After all, MNEG is a
7957 special-case alias of MSUB. */
7958 if (GET_CODE (op0) == NEG)
7960 op0 = XEXP (op0, 0);
7961 compound_p = true;
7964 /* Integer multiplies or FMAs have zero/sign extending variants. */
7965 if ((GET_CODE (op0) == ZERO_EXTEND
7966 && GET_CODE (op1) == ZERO_EXTEND)
7967 || (GET_CODE (op0) == SIGN_EXTEND
7968 && GET_CODE (op1) == SIGN_EXTEND))
7970 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7971 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7973 if (speed)
7975 if (compound_p)
7976 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7977 cost += extra_cost->mult[0].extend_add;
7978 else
7979 /* MUL/SMULL/UMULL. */
7980 cost += extra_cost->mult[0].extend;
7983 return cost;
7986 /* This is either an integer multiply or a MADD. In both cases
7987 we want to recurse and cost the operands. */
7988 cost += rtx_cost (op0, mode, MULT, 0, speed);
7989 cost += rtx_cost (op1, mode, MULT, 1, speed);
7991 if (speed)
7993 if (compound_p)
7994 /* MADD/MSUB. */
7995 cost += extra_cost->mult[mode == DImode].add;
7996 else
7997 /* MUL. */
7998 cost += extra_cost->mult[mode == DImode].simple;
8001 return cost;
8003 else
8005 if (speed)
8007 /* Floating-point FMA/FMUL can also support negations of the
8008 operands, unless the rounding mode is upward or downward, in
8009 which case FNMUL is different from FMUL with operand negation. */
8010 bool neg0 = GET_CODE (op0) == NEG;
8011 bool neg1 = GET_CODE (op1) == NEG;
8012 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8014 if (neg0)
8015 op0 = XEXP (op0, 0);
8016 if (neg1)
8017 op1 = XEXP (op1, 0);
8020 if (compound_p)
8021 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8022 cost += extra_cost->fp[mode == DFmode].fma;
8023 else
8024 /* FMUL/FNMUL. */
8025 cost += extra_cost->fp[mode == DFmode].mult;
8028 cost += rtx_cost (op0, mode, MULT, 0, speed);
8029 cost += rtx_cost (op1, mode, MULT, 1, speed);
8030 return cost;
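/* For example (illustrative), costing
   (plus:DI (mult:DI (reg:DI) (const_int 4)) (reg:DI)) takes the shift
   branch above with COMPOUND_P set: on a speed run it adds
   extra_cost->alu.arith_shift (or nothing at all when the tuning treats
   such shifts as cheap) rather than a full multiply cost.  */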
8034 static int
8035 aarch64_address_cost (rtx x,
8036 machine_mode mode,
8037 addr_space_t as ATTRIBUTE_UNUSED,
8038 bool speed)
8040 enum rtx_code c = GET_CODE (x);
8041 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8042 struct aarch64_address_info info;
8043 int cost = 0;
8044 info.shift = 0;
8046 if (!aarch64_classify_address (&info, x, mode, false))
8048 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8050 /* This is a CONST or SYMBOL ref which will be split
8051 in a different way depending on the code model in use.
8052 Cost it through the generic infrastructure. */
8053 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8054 /* Divide through by the cost of one instruction to
8055 bring it to the same units as the address costs. */
8056 cost_symbol_ref /= COSTS_N_INSNS (1);
8057 /* The cost is then the cost of preparing the address,
8058 followed by an immediate (possibly 0) offset. */
8059 return cost_symbol_ref + addr_cost->imm_offset;
8061 else
8063 /* This is most likely a jump table from a case
8064 statement. */
8065 return addr_cost->register_offset;
8069 switch (info.type)
8071 case ADDRESS_LO_SUM:
8072 case ADDRESS_SYMBOLIC:
8073 case ADDRESS_REG_IMM:
8074 cost += addr_cost->imm_offset;
8075 break;
8077 case ADDRESS_REG_WB:
8078 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8079 cost += addr_cost->pre_modify;
8080 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8081 cost += addr_cost->post_modify;
8082 else
8083 gcc_unreachable ();
8085 break;
8087 case ADDRESS_REG_REG:
8088 cost += addr_cost->register_offset;
8089 break;
8091 case ADDRESS_REG_SXTW:
8092 cost += addr_cost->register_sextend;
8093 break;
8095 case ADDRESS_REG_UXTW:
8096 cost += addr_cost->register_zextend;
8097 break;
8099 default:
8100 gcc_unreachable ();
8104 if (info.shift > 0)
8106 /* For the sake of calculating the cost of the shifted register
8107 component, we can treat same sized modes in the same way. */
8108 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8109 cost += addr_cost->addr_scale_costs.hi;
8110 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8111 cost += addr_cost->addr_scale_costs.si;
8112 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8113 cost += addr_cost->addr_scale_costs.di;
8114 else
8115 /* We can't tell, or this is a 128-bit vector. */
8116 cost += addr_cost->addr_scale_costs.ti;
8119 return cost;
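/* For illustration, an SImode access through a base plus scaled
   sign-extended index, e.g. [x1, w2, sxtw #2], is classified as
   ADDRESS_REG_SXTW, so it is costed as register_sextend plus the
   addr_scale_costs.si scaling component.  */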
8122 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8123 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8124 to be taken. */
8127 aarch64_branch_cost (bool speed_p, bool predictable_p)
8129 /* When optimizing for speed, use the cost of unpredictable branches. */
8130 const struct cpu_branch_cost *branch_costs =
8131 aarch64_tune_params.branch_costs;
8133 if (!speed_p || predictable_p)
8134 return branch_costs->predictable;
8135 else
8136 return branch_costs->unpredictable;
8139 /* Return true if the RTX X in mode MODE is a zero or sign extract
8140 usable in an ADD or SUB (extended register) instruction. */
8141 static bool
8142 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8144 /* Catch add with a sign extract.
8145 This is add_<optab><mode>_multp2. */
8146 if (GET_CODE (x) == SIGN_EXTRACT
8147 || GET_CODE (x) == ZERO_EXTRACT)
8149 rtx op0 = XEXP (x, 0);
8150 rtx op1 = XEXP (x, 1);
8151 rtx op2 = XEXP (x, 2);
8153 if (GET_CODE (op0) == MULT
8154 && CONST_INT_P (op1)
8155 && op2 == const0_rtx
8156 && CONST_INT_P (XEXP (op0, 1))
8157 && aarch64_is_extend_from_extract (mode,
8158 XEXP (op0, 1),
8159 op1))
8161 return true;
8164 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8165 No shift. */
8166 else if (GET_CODE (x) == SIGN_EXTEND
8167 || GET_CODE (x) == ZERO_EXTEND)
8168 return REG_P (XEXP (x, 0));
8170 return false;
8173 static bool
8174 aarch64_frint_unspec_p (unsigned int u)
8176 switch (u)
8178 case UNSPEC_FRINTZ:
8179 case UNSPEC_FRINTP:
8180 case UNSPEC_FRINTM:
8181 case UNSPEC_FRINTA:
8182 case UNSPEC_FRINTN:
8183 case UNSPEC_FRINTX:
8184 case UNSPEC_FRINTI:
8185 return true;
8187 default:
8188 return false;
8192 /* Return true iff X is an rtx that will match an extr instruction
8193 i.e. as described in the *extr<mode>5_insn family of patterns.
8194 OP0 and OP1 will be set to the operands of the shifts involved
8195 on success and will be NULL_RTX otherwise. */
8197 static bool
8198 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8200 rtx op0, op1;
8201 scalar_int_mode mode;
8202 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8203 return false;
8205 *res_op0 = NULL_RTX;
8206 *res_op1 = NULL_RTX;
8208 if (GET_CODE (x) != IOR)
8209 return false;
8211 op0 = XEXP (x, 0);
8212 op1 = XEXP (x, 1);
8214 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8215 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8217 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8218 if (GET_CODE (op1) == ASHIFT)
8219 std::swap (op0, op1);
8221 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8222 return false;
8224 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8225 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8227 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8228 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8230 *res_op0 = XEXP (op0, 0);
8231 *res_op1 = XEXP (op1, 0);
8232 return true;
8236 return false;
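/* For example, in DImode
     (ior:DI (ashift:DI (reg:DI a) (const_int 10))
	     (lshiftrt:DI (reg:DI b) (const_int 54)))
   matches because 10 + 54 == 64; *RES_OP0 is set to a and *RES_OP1 to b.
   Shift amounts that do not sum to the mode width are rejected.  */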
8239 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8240 storing it in *COST. Result is true if the total cost of the operation
8241 has now been calculated. */
8242 static bool
8243 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8245 rtx inner;
8246 rtx comparator;
8247 enum rtx_code cmpcode;
8249 if (COMPARISON_P (op0))
8251 inner = XEXP (op0, 0);
8252 comparator = XEXP (op0, 1);
8253 cmpcode = GET_CODE (op0);
8255 else
8257 inner = op0;
8258 comparator = const0_rtx;
8259 cmpcode = NE;
8262 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8264 /* Conditional branch. */
8265 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8266 return true;
8267 else
8269 if (cmpcode == NE || cmpcode == EQ)
8271 if (comparator == const0_rtx)
8273 /* TBZ/TBNZ/CBZ/CBNZ. */
8274 if (GET_CODE (inner) == ZERO_EXTRACT)
8275 /* TBZ/TBNZ. */
8276 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8277 ZERO_EXTRACT, 0, speed);
8278 else
8279 /* CBZ/CBNZ. */
8280 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8282 return true;
8285 else if (cmpcode == LT || cmpcode == GE)
8287 /* TBZ/TBNZ. */
8288 if (comparator == const0_rtx)
8289 return true;
8293 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8295 /* CCMP. */
8296 if (GET_CODE (op1) == COMPARE)
8298 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8299 if (XEXP (op1, 1) == const0_rtx)
8300 *cost += 1;
8301 if (speed)
8303 machine_mode mode = GET_MODE (XEXP (op1, 0));
8304 const struct cpu_cost_table *extra_cost
8305 = aarch64_tune_params.insn_extra_cost;
8307 if (GET_MODE_CLASS (mode) == MODE_INT)
8308 *cost += extra_cost->alu.arith;
8309 else
8310 *cost += extra_cost->fp[mode == DFmode].compare;
8312 return true;
8315 /* It's a conditional operation based on the status flags,
8316 so it must be some flavor of CSEL. */
8318 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8319 if (GET_CODE (op1) == NEG
8320 || GET_CODE (op1) == NOT
8321 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8322 op1 = XEXP (op1, 0);
8323 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8325 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8326 op1 = XEXP (op1, 0);
8327 op2 = XEXP (op2, 0);
8330 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8331 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8332 return true;
8335 /* We don't know what this is, cost all operands. */
8336 return false;
8339 /* Check whether X is a bitfield operation of the form shift + extend that
8340 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8341 operand to which the bitfield operation is applied. Otherwise return
8342 NULL_RTX. */
8344 static rtx
8345 aarch64_extend_bitfield_pattern_p (rtx x)
8347 rtx_code outer_code = GET_CODE (x);
8348 machine_mode outer_mode = GET_MODE (x);
8350 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8351 && outer_mode != SImode && outer_mode != DImode)
8352 return NULL_RTX;
8354 rtx inner = XEXP (x, 0);
8355 rtx_code inner_code = GET_CODE (inner);
8356 machine_mode inner_mode = GET_MODE (inner);
8357 rtx op = NULL_RTX;
8359 switch (inner_code)
8361 case ASHIFT:
8362 if (CONST_INT_P (XEXP (inner, 1))
8363 && (inner_mode == QImode || inner_mode == HImode))
8364 op = XEXP (inner, 0);
8365 break;
8366 case LSHIFTRT:
8367 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8368 && (inner_mode == QImode || inner_mode == HImode))
8369 op = XEXP (inner, 0);
8370 break;
8371 case ASHIFTRT:
8372 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8373 && (inner_mode == QImode || inner_mode == HImode))
8374 op = XEXP (inner, 0);
8375 break;
8376 default:
8377 break;
8380 return op;
8383 /* Return true if the mask and a shift amount from an RTX of the form
8384 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8385 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8387 bool
8388 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8389 rtx shft_amnt)
8391 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8392 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8393 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8394 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
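/* Worked example (SImode, illustrative): for (x << 8) & 0x00ffff00 we have
   (0x00ffff00 >> 8) + 1 == 0x10000, a power of two, and the low 8 bits of
   the mask are clear, so the pair is accepted as a UBFIZ; a mask with any
   bit set below the shift amount would be rejected.  */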
8397 /* Calculate the cost of calculating X, storing it in *COST. Result
8398 is true if the total cost of the operation has now been calculated. */
8399 static bool
8400 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8401 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8403 rtx op0, op1, op2;
8404 const struct cpu_cost_table *extra_cost
8405 = aarch64_tune_params.insn_extra_cost;
8406 int code = GET_CODE (x);
8407 scalar_int_mode int_mode;
8409 /* By default, assume that everything has equivalent cost to the
8410 cheapest instruction. Any additional costs are applied as a delta
8411 above this default. */
8412 *cost = COSTS_N_INSNS (1);
8414 switch (code)
8416 case SET:
8417 /* The cost depends entirely on the operands to SET. */
8418 *cost = 0;
8419 op0 = SET_DEST (x);
8420 op1 = SET_SRC (x);
8422 switch (GET_CODE (op0))
8424 case MEM:
8425 if (speed)
8427 rtx address = XEXP (op0, 0);
8428 if (VECTOR_MODE_P (mode))
8429 *cost += extra_cost->ldst.storev;
8430 else if (GET_MODE_CLASS (mode) == MODE_INT)
8431 *cost += extra_cost->ldst.store;
8432 else if (mode == SFmode)
8433 *cost += extra_cost->ldst.storef;
8434 else if (mode == DFmode)
8435 *cost += extra_cost->ldst.stored;
8437 *cost +=
8438 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8439 0, speed));
8442 *cost += rtx_cost (op1, mode, SET, 1, speed);
8443 return true;
8445 case SUBREG:
8446 if (! REG_P (SUBREG_REG (op0)))
8447 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8449 /* Fall through. */
8450 case REG:
8451 /* The cost is one per vector-register copied. */
8452 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8454 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8455 *cost = COSTS_N_INSNS (nregs);
8457 /* const0_rtx is in general free, but we will use an
8458 instruction to set a register to 0. */
8459 else if (REG_P (op1) || op1 == const0_rtx)
8461 /* The cost is 1 per register copied. */
8462 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8463 *cost = COSTS_N_INSNS (nregs);
8465 else
8466 /* Cost is just the cost of the RHS of the set. */
8467 *cost += rtx_cost (op1, mode, SET, 1, speed);
8468 return true;
8470 case ZERO_EXTRACT:
8471 case SIGN_EXTRACT:
8472 /* Bit-field insertion. Strip any redundant widening of
8473 the RHS to meet the width of the target. */
8474 if (GET_CODE (op1) == SUBREG)
8475 op1 = SUBREG_REG (op1);
8476 if ((GET_CODE (op1) == ZERO_EXTEND
8477 || GET_CODE (op1) == SIGN_EXTEND)
8478 && CONST_INT_P (XEXP (op0, 1))
8479 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8480 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8481 op1 = XEXP (op1, 0);
8483 if (CONST_INT_P (op1))
8485 /* MOV immediate is assumed to always be cheap. */
8486 *cost = COSTS_N_INSNS (1);
8488 else
8490 /* BFM. */
8491 if (speed)
8492 *cost += extra_cost->alu.bfi;
8493 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8496 return true;
8498 default:
8499 /* We can't make sense of this, assume default cost. */
8500 *cost = COSTS_N_INSNS (1);
8501 return false;
8503 return false;
8505 case CONST_INT:
8506 /* If an instruction can incorporate a constant within the
8507 instruction, the instruction's expression avoids calling
8508 rtx_cost() on the constant. If rtx_cost() is called on a
8509 constant, then it is usually because the constant must be
8510 moved into a register by one or more instructions.
8512 The exception is constant 0, which can be expressed
8513 as XZR/WZR and is therefore free. The exception to this is
8514 if we have (set (reg) (const0_rtx)) in which case we must cost
8515 the move. However, we can catch that when we cost the SET, so
8516 we don't need to consider that here. */
8517 if (x == const0_rtx)
8518 *cost = 0;
8519 else
8521 /* To an approximation, building any other constant is
8522 proportionally expensive to the number of instructions
8523 required to build that constant. This is true whether we
8524 are compiling for SPEED or otherwise. */
8525 if (!is_a <scalar_int_mode> (mode, &int_mode))
8526 int_mode = word_mode;
8527 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8528 (NULL_RTX, x, false, int_mode));
8530 return true;
8532 case CONST_DOUBLE:
8534 /* First determine number of instructions to do the move
8535 as an integer constant. */
8536 if (!aarch64_float_const_representable_p (x)
8537 && !aarch64_can_const_movi_rtx_p (x, mode)
8538 && aarch64_float_const_rtx_p (x))
8540 unsigned HOST_WIDE_INT ival;
8541 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8542 gcc_assert (succeed);
8544 scalar_int_mode imode = (mode == HFmode
8545 ? SImode
8546 : int_mode_for_mode (mode).require ());
8547 int ncost = aarch64_internal_mov_immediate
8548 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8549 *cost += COSTS_N_INSNS (ncost);
8550 return true;
8553 if (speed)
8555 /* mov[df,sf]_aarch64. */
8556 if (aarch64_float_const_representable_p (x))
8557 /* FMOV (scalar immediate). */
8558 *cost += extra_cost->fp[mode == DFmode].fpconst;
8559 else if (!aarch64_float_const_zero_rtx_p (x))
8561 /* This will be a load from memory. */
8562 if (mode == DFmode)
8563 *cost += extra_cost->ldst.loadd;
8564 else
8565 *cost += extra_cost->ldst.loadf;
8567 else
8568 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8569 or MOV v0.s[0], wzr - neither of which is modeled by the
8570 cost tables. Just use the default cost. */
8575 return true;
8577 case MEM:
8578 if (speed)
8580 /* For loads we want the base cost of a load, plus an
8581 approximation for the additional cost of the addressing
8582 mode. */
8583 rtx address = XEXP (x, 0);
8584 if (VECTOR_MODE_P (mode))
8585 *cost += extra_cost->ldst.loadv;
8586 else if (GET_MODE_CLASS (mode) == MODE_INT)
8587 *cost += extra_cost->ldst.load;
8588 else if (mode == SFmode)
8589 *cost += extra_cost->ldst.loadf;
8590 else if (mode == DFmode)
8591 *cost += extra_cost->ldst.loadd;
8593 *cost +=
8594 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8595 0, speed));
8598 return true;
8600 case NEG:
8601 op0 = XEXP (x, 0);
8603 if (VECTOR_MODE_P (mode))
8605 if (speed)
8607 /* FNEG. */
8608 *cost += extra_cost->vect.alu;
8610 return false;
8613 if (GET_MODE_CLASS (mode) == MODE_INT)
8615 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8616 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8618 /* CSETM. */
8619 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8620 return true;
8623 /* Cost this as SUB wzr, X. */
8624 op0 = CONST0_RTX (mode);
8625 op1 = XEXP (x, 0);
8626 goto cost_minus;
8629 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8631 /* Support (neg(fma...)) as a single instruction only if
8632 sign of zeros is unimportant. This matches the decision
8633 making in aarch64.md. */
8634 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8636 /* FNMADD. */
8637 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8638 return true;
8640 if (GET_CODE (op0) == MULT)
8642 /* FNMUL. */
8643 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8644 return true;
8646 if (speed)
8647 /* FNEG. */
8648 *cost += extra_cost->fp[mode == DFmode].neg;
8649 return false;
8652 return false;
8654 case CLRSB:
8655 case CLZ:
8656 if (speed)
8658 if (VECTOR_MODE_P (mode))
8659 *cost += extra_cost->vect.alu;
8660 else
8661 *cost += extra_cost->alu.clz;
8664 return false;
8666 case COMPARE:
8667 op0 = XEXP (x, 0);
8668 op1 = XEXP (x, 1);
8670 if (op1 == const0_rtx
8671 && GET_CODE (op0) == AND)
8673 x = op0;
8674 mode = GET_MODE (op0);
8675 goto cost_logic;
8678 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8680 /* TODO: A write to the CC flags possibly costs extra, this
8681 needs encoding in the cost tables. */
8683 mode = GET_MODE (op0);
8684 /* ANDS. */
8685 if (GET_CODE (op0) == AND)
8687 x = op0;
8688 goto cost_logic;
8691 if (GET_CODE (op0) == PLUS)
8693 /* ADDS (and CMN alias). */
8694 x = op0;
8695 goto cost_plus;
8698 if (GET_CODE (op0) == MINUS)
8700 /* SUBS. */
8701 x = op0;
8702 goto cost_minus;
8705 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8706 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8707 && CONST_INT_P (XEXP (op0, 2)))
8709 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8710 Handle it here directly rather than going to cost_logic
8711 since we know the immediate generated for the TST is valid
8712 so we can avoid creating an intermediate rtx for it only
8713 for costing purposes. */
8714 if (speed)
8715 *cost += extra_cost->alu.logical;
8717 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8718 ZERO_EXTRACT, 0, speed);
8719 return true;
8722 if (GET_CODE (op1) == NEG)
8724 /* CMN. */
8725 if (speed)
8726 *cost += extra_cost->alu.arith;
8728 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8729 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8730 return true;
8733 /* CMP.
8735 Compare can freely swap the order of operands, and
8736 canonicalization puts the more complex operation first.
8737 But the integer MINUS logic expects the shift/extend
8738 operation in op1. */
8739 if (! (REG_P (op0)
8740 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8742 op0 = XEXP (x, 1);
8743 op1 = XEXP (x, 0);
8745 goto cost_minus;
8748 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8750 /* FCMP. */
8751 if (speed)
8752 *cost += extra_cost->fp[mode == DFmode].compare;
8754 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8756 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8757 /* FCMP supports constant 0.0 for no extra cost. */
8758 return true;
8760 return false;
8763 if (VECTOR_MODE_P (mode))
8765 /* Vector compare. */
8766 if (speed)
8767 *cost += extra_cost->vect.alu;
8769 if (aarch64_float_const_zero_rtx_p (op1))
8771 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8772 cost. */
8773 return true;
8775 return false;
8777 return false;
8779 case MINUS:
8781 op0 = XEXP (x, 0);
8782 op1 = XEXP (x, 1);
8784 cost_minus:
8785 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8787 /* Detect valid immediates. */
8788 if ((GET_MODE_CLASS (mode) == MODE_INT
8789 || (GET_MODE_CLASS (mode) == MODE_CC
8790 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8791 && CONST_INT_P (op1)
8792 && aarch64_uimm12_shift (INTVAL (op1)))
8794 if (speed)
8795 /* SUB(S) (immediate). */
8796 *cost += extra_cost->alu.arith;
8797 return true;
8800 /* Look for SUB (extended register). */
8801 if (is_a <scalar_int_mode> (mode, &int_mode)
8802 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8804 if (speed)
8805 *cost += extra_cost->alu.extend_arith;
8807 op1 = aarch64_strip_extend (op1, true);
8808 *cost += rtx_cost (op1, VOIDmode,
8809 (enum rtx_code) GET_CODE (op1), 0, speed);
8810 return true;
8813 rtx new_op1 = aarch64_strip_extend (op1, false);
8815 /* Cost this as an FMA-alike operation. */
8816 if ((GET_CODE (new_op1) == MULT
8817 || aarch64_shift_p (GET_CODE (new_op1)))
8818 && code != COMPARE)
8820 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8821 (enum rtx_code) code,
8822 speed);
8823 return true;
8826 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8828 if (speed)
8830 if (VECTOR_MODE_P (mode))
8832 /* Vector SUB. */
8833 *cost += extra_cost->vect.alu;
8835 else if (GET_MODE_CLASS (mode) == MODE_INT)
8837 /* SUB(S). */
8838 *cost += extra_cost->alu.arith;
8840 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8842 /* FSUB. */
8843 *cost += extra_cost->fp[mode == DFmode].addsub;
8846 return true;
8849 case PLUS:
8851 rtx new_op0;
8853 op0 = XEXP (x, 0);
8854 op1 = XEXP (x, 1);
8856 cost_plus:
8857 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8858 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8860 /* CSINC. */
8861 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8862 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8863 return true;
8866 if (GET_MODE_CLASS (mode) == MODE_INT
8867 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8868 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8870 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8872 if (speed)
8873 /* ADD (immediate). */
8874 *cost += extra_cost->alu.arith;
8875 return true;
8878 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8880 /* Look for ADD (extended register). */
8881 if (is_a <scalar_int_mode> (mode, &int_mode)
8882 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8884 if (speed)
8885 *cost += extra_cost->alu.extend_arith;
8887 op0 = aarch64_strip_extend (op0, true);
8888 *cost += rtx_cost (op0, VOIDmode,
8889 (enum rtx_code) GET_CODE (op0), 0, speed);
8890 return true;
8893 /* Strip any extend, leave shifts behind as we will
8894 cost them through mult_cost. */
8895 new_op0 = aarch64_strip_extend (op0, false);
8897 if (GET_CODE (new_op0) == MULT
8898 || aarch64_shift_p (GET_CODE (new_op0)))
8900 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8901 speed);
8902 return true;
8905 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8907 if (speed)
8909 if (VECTOR_MODE_P (mode))
8911 /* Vector ADD. */
8912 *cost += extra_cost->vect.alu;
8914 else if (GET_MODE_CLASS (mode) == MODE_INT)
8916 /* ADD. */
8917 *cost += extra_cost->alu.arith;
8919 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8921 /* FADD. */
8922 *cost += extra_cost->fp[mode == DFmode].addsub;
8925 return true;
8928 case BSWAP:
8929 *cost = COSTS_N_INSNS (1);
8931 if (speed)
8933 if (VECTOR_MODE_P (mode))
8934 *cost += extra_cost->vect.alu;
8935 else
8936 *cost += extra_cost->alu.rev;
8938 return false;
8940 case IOR:
8941 if (aarch_rev16_p (x))
8943 *cost = COSTS_N_INSNS (1);
8945 if (speed)
8947 if (VECTOR_MODE_P (mode))
8948 *cost += extra_cost->vect.alu;
8949 else
8950 *cost += extra_cost->alu.rev;
8952 return true;
8955 if (aarch64_extr_rtx_p (x, &op0, &op1))
8957 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8958 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8959 if (speed)
8960 *cost += extra_cost->alu.shift;
8962 return true;
8964 /* Fall through. */
8965 case XOR:
8966 case AND:
8967 cost_logic:
8968 op0 = XEXP (x, 0);
8969 op1 = XEXP (x, 1);
8971 if (VECTOR_MODE_P (mode))
8973 if (speed)
8974 *cost += extra_cost->vect.alu;
8975 return true;
8978 if (code == AND
8979 && GET_CODE (op0) == MULT
8980 && CONST_INT_P (XEXP (op0, 1))
8981 && CONST_INT_P (op1)
8982 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
8983 INTVAL (op1)) != 0)
8985 /* This is a UBFM/SBFM. */
8986 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
8987 if (speed)
8988 *cost += extra_cost->alu.bfx;
8989 return true;
8992 if (is_int_mode (mode, &int_mode))
8994 if (CONST_INT_P (op1))
8996 /* We have a mask + shift version of a UBFIZ
8997 i.e. the *andim_ashift<mode>_bfiz pattern. */
8998 if (GET_CODE (op0) == ASHIFT
8999 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9000 XEXP (op0, 1)))
9002 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9003 (enum rtx_code) code, 0, speed);
9004 if (speed)
9005 *cost += extra_cost->alu.bfx;
9007 return true;
9009 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9011 /* We possibly get the immediate for free; this is not
9012 modelled. */
9013 *cost += rtx_cost (op0, int_mode,
9014 (enum rtx_code) code, 0, speed);
9015 if (speed)
9016 *cost += extra_cost->alu.logical;
9018 return true;
9021 else
9023 rtx new_op0 = op0;
9025 /* Handle ORN, EON, or BIC. */
9026 if (GET_CODE (op0) == NOT)
9027 op0 = XEXP (op0, 0);
9029 new_op0 = aarch64_strip_shift (op0);
9031 /* If we had a shift on op0 then this is a logical-shift-
9032 by-register/immediate operation. Otherwise, this is just
9033 a logical operation. */
9034 if (speed)
9036 if (new_op0 != op0)
9038 /* Shift by immediate. */
9039 if (CONST_INT_P (XEXP (op0, 1)))
9040 *cost += extra_cost->alu.log_shift;
9041 else
9042 *cost += extra_cost->alu.log_shift_reg;
9044 else
9045 *cost += extra_cost->alu.logical;
9048 /* In both cases we want to cost both operands. */
9049 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9050 0, speed);
9051 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9052 1, speed);
9054 return true;
9057 return false;
9059 case NOT:
9060 x = XEXP (x, 0);
9061 op0 = aarch64_strip_shift (x);
9063 if (VECTOR_MODE_P (mode))
9065 /* Vector NOT. */
9066 *cost += extra_cost->vect.alu;
9067 return false;
9070 /* MVN-shifted-reg. */
9071 if (op0 != x)
9073 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9075 if (speed)
9076 *cost += extra_cost->alu.log_shift;
9078 return true;
9080 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9081 Handle the second form here taking care that 'a' in the above can
9082 be a shift. */
9083 else if (GET_CODE (op0) == XOR)
9085 rtx newop0 = XEXP (op0, 0);
9086 rtx newop1 = XEXP (op0, 1);
9087 rtx op0_stripped = aarch64_strip_shift (newop0);
9089 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9090 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9092 if (speed)
9094 if (op0_stripped != newop0)
9095 *cost += extra_cost->alu.log_shift;
9096 else
9097 *cost += extra_cost->alu.logical;
9100 return true;
9102 /* MVN. */
9103 if (speed)
9104 *cost += extra_cost->alu.logical;
9106 return false;
9108 case ZERO_EXTEND:
9110 op0 = XEXP (x, 0);
9111 /* If a value is written in SI mode, then zero extended to DI
9112 mode, the operation will in general be free as a write to
9113 a 'w' register implicitly zeroes the upper bits of an 'x'
9114 register. However, if this is
9116 (set (reg) (zero_extend (reg)))
9118 we must cost the explicit register move. */
9119 if (mode == DImode
9120 && GET_MODE (op0) == SImode
9121 && outer == SET)
9123 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9125 /* If OP_COST is non-zero, then the cost of the zero extend
9126 is effectively the cost of the inner operation. Otherwise
9127 we have a MOV instruction and we take the cost from the MOV
9128 itself. This is true independently of whether we are
9129 optimizing for space or time. */
9130 if (op_cost)
9131 *cost = op_cost;
9133 return true;
9135 else if (MEM_P (op0))
9137 /* All loads can zero extend to any size for free. */
9138 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9139 return true;
9142 op0 = aarch64_extend_bitfield_pattern_p (x);
9143 if (op0)
9145 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9146 if (speed)
9147 *cost += extra_cost->alu.bfx;
9148 return true;
9151 if (speed)
9153 if (VECTOR_MODE_P (mode))
9155 /* UMOV. */
9156 *cost += extra_cost->vect.alu;
9158 else
9160 /* We generate an AND instead of UXTB/UXTH. */
9161 *cost += extra_cost->alu.logical;
9164 return false;
9166 case SIGN_EXTEND:
9167 if (MEM_P (XEXP (x, 0)))
9169 /* LDRSH. */
9170 if (speed)
9172 rtx address = XEXP (XEXP (x, 0), 0);
9173 *cost += extra_cost->ldst.load_sign_extend;
9175 *cost +=
9176 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9177 0, speed));
9179 return true;
9182 op0 = aarch64_extend_bitfield_pattern_p (x);
9183 if (op0)
9185 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9186 if (speed)
9187 *cost += extra_cost->alu.bfx;
9188 return true;
9191 if (speed)
9193 if (VECTOR_MODE_P (mode))
9194 *cost += extra_cost->vect.alu;
9195 else
9196 *cost += extra_cost->alu.extend;
9198 return false;
9200 case ASHIFT:
9201 op0 = XEXP (x, 0);
9202 op1 = XEXP (x, 1);
9204 if (CONST_INT_P (op1))
9206 if (speed)
9208 if (VECTOR_MODE_P (mode))
9210 /* Vector shift (immediate). */
9211 *cost += extra_cost->vect.alu;
9213 else
9215 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9216 aliases. */
9217 *cost += extra_cost->alu.shift;
9221 /* We can incorporate zero/sign extend for free. */
9222 if (GET_CODE (op0) == ZERO_EXTEND
9223 || GET_CODE (op0) == SIGN_EXTEND)
9224 op0 = XEXP (op0, 0);
9226 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9227 return true;
9229 else
9231 if (VECTOR_MODE_P (mode))
9233 if (speed)
9234 /* Vector shift (register). */
9235 *cost += extra_cost->vect.alu;
9237 else
9239 if (speed)
9240 /* LSLV. */
9241 *cost += extra_cost->alu.shift_reg;
9243 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9244 && CONST_INT_P (XEXP (op1, 1))
9245 && known_eq (INTVAL (XEXP (op1, 1)),
9246 GET_MODE_BITSIZE (mode) - 1))
9248 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9249 /* We already demanded XEXP (op1, 0) to be REG_P, so
9250 don't recurse into it. */
9251 return true;
9254 return false; /* All arguments need to be in registers. */
9257 case ROTATE:
9258 case ROTATERT:
9259 case LSHIFTRT:
9260 case ASHIFTRT:
9261 op0 = XEXP (x, 0);
9262 op1 = XEXP (x, 1);
9264 if (CONST_INT_P (op1))
9266 /* ASR (immediate) and friends. */
9267 if (speed)
9269 if (VECTOR_MODE_P (mode))
9270 *cost += extra_cost->vect.alu;
9271 else
9272 *cost += extra_cost->alu.shift;
9275 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9276 return true;
9278 else
9280 if (VECTOR_MODE_P (mode))
9282 if (speed)
9283 /* Vector shift (register). */
9284 *cost += extra_cost->vect.alu;
9286 else
9288 if (speed)
9289 /* ASR (register) and friends. */
9290 *cost += extra_cost->alu.shift_reg;
9292 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9293 && CONST_INT_P (XEXP (op1, 1))
9294 && known_eq (INTVAL (XEXP (op1, 1)),
9295 GET_MODE_BITSIZE (mode) - 1))
9297 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9298 /* We already demanded XEXP (op1, 0) to be REG_P, so
9299 don't recurse into it. */
9300 return true;
9303 return false; /* All arguments need to be in registers. */
9306 case SYMBOL_REF:
9308 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9309 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9311 /* LDR. */
9312 if (speed)
9313 *cost += extra_cost->ldst.load;
9315 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9316 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9318 /* ADRP, followed by ADD. */
9319 *cost += COSTS_N_INSNS (1);
9320 if (speed)
9321 *cost += 2 * extra_cost->alu.arith;
9323 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9324 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9326 /* ADR. */
9327 if (speed)
9328 *cost += extra_cost->alu.arith;
9331 if (flag_pic)
9333 /* One extra load instruction, after accessing the GOT. */
9334 *cost += COSTS_N_INSNS (1);
9335 if (speed)
9336 *cost += extra_cost->ldst.load;
9338 return true;
9340 case HIGH:
9341 case LO_SUM:
9342 /* ADRP/ADD (immediate). */
9343 if (speed)
9344 *cost += extra_cost->alu.arith;
9345 return true;
9347 case ZERO_EXTRACT:
9348 case SIGN_EXTRACT:
9349 /* UBFX/SBFX. */
9350 if (speed)
9352 if (VECTOR_MODE_P (mode))
9353 *cost += extra_cost->vect.alu;
9354 else
9355 *cost += extra_cost->alu.bfx;
9358 /* We can trust that the immediates used will be correct (there
9359 are no by-register forms), so we need only cost op0. */
9360 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9361 return true;
9363 case MULT:
9364 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9365 /* aarch64_rtx_mult_cost always handles recursion to its
9366 operands. */
9367 return true;
9369 case MOD:
9370 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
9371 ANDs and a CSNEG. Assume here that CSNEG costs the same as an
9372 unconditional negate. This case should only ever be reached through
9373 the set_smod_pow2_cheap check in expmed.c. */
9374 if (CONST_INT_P (XEXP (x, 1))
9375 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9376 && (mode == SImode || mode == DImode))
9378 /* We expand to 4 instructions. Reset the baseline. */
9379 *cost = COSTS_N_INSNS (4);
9381 if (speed)
9382 *cost += 2 * extra_cost->alu.logical
9383 + 2 * extra_cost->alu.arith;
9385 return true;
9388 /* Fall-through. */
9389 case UMOD:
9390 if (speed)
9392 /* Slightly prefer UMOD over SMOD. */
9393 if (VECTOR_MODE_P (mode))
9394 *cost += extra_cost->vect.alu;
9395 else if (GET_MODE_CLASS (mode) == MODE_INT)
9396 *cost += (extra_cost->mult[mode == DImode].add
9397 + extra_cost->mult[mode == DImode].idiv
9398 + (code == MOD ? 1 : 0));
9400 return false; /* All arguments need to be in registers. */
9402 case DIV:
9403 case UDIV:
9404 case SQRT:
9405 if (speed)
9407 if (VECTOR_MODE_P (mode))
9408 *cost += extra_cost->vect.alu;
9409 else if (GET_MODE_CLASS (mode) == MODE_INT)
9410 /* There is no integer SQRT, so only DIV and UDIV can get
9411 here. */
9412 *cost += (extra_cost->mult[mode == DImode].idiv
9413 /* Slightly prefer UDIV over SDIV. */
9414 + (code == DIV ? 1 : 0));
9415 else
9416 *cost += extra_cost->fp[mode == DFmode].div;
9418 return false; /* All arguments need to be in registers. */
9420 case IF_THEN_ELSE:
9421 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9422 XEXP (x, 2), cost, speed);
9424 case EQ:
9425 case NE:
9426 case GT:
9427 case GTU:
9428 case LT:
9429 case LTU:
9430 case GE:
9431 case GEU:
9432 case LE:
9433 case LEU:
9435 return false; /* All arguments must be in registers. */
9437 case FMA:
9438 op0 = XEXP (x, 0);
9439 op1 = XEXP (x, 1);
9440 op2 = XEXP (x, 2);
9442 if (speed)
9444 if (VECTOR_MODE_P (mode))
9445 *cost += extra_cost->vect.alu;
9446 else
9447 *cost += extra_cost->fp[mode == DFmode].fma;
9450 /* FMSUB, FNMADD, and FNMSUB are free. */
9451 if (GET_CODE (op0) == NEG)
9452 op0 = XEXP (op0, 0);
9454 if (GET_CODE (op2) == NEG)
9455 op2 = XEXP (op2, 0);
9457 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9458 and the by-element operand as operand 0. */
9459 if (GET_CODE (op1) == NEG)
9460 op1 = XEXP (op1, 0);
9462 /* Catch vector-by-element operations. The by-element operand can
9463 either be (vec_duplicate (vec_select (x))) or just
9464 (vec_select (x)), depending on whether we are multiplying by
9465 a vector or a scalar.
9467 Canonicalization is not very good in these cases: FMA4 will put the
9468 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9469 if (GET_CODE (op0) == VEC_DUPLICATE)
9470 op0 = XEXP (op0, 0);
9471 else if (GET_CODE (op1) == VEC_DUPLICATE)
9472 op1 = XEXP (op1, 0);
9474 if (GET_CODE (op0) == VEC_SELECT)
9475 op0 = XEXP (op0, 0);
9476 else if (GET_CODE (op1) == VEC_SELECT)
9477 op1 = XEXP (op1, 0);
9479 /* If the remaining parameters are not registers,
9480 get the cost to put them into registers. */
9481 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9482 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9483 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9484 return true;
9486 case FLOAT:
9487 case UNSIGNED_FLOAT:
9488 if (speed)
9489 *cost += extra_cost->fp[mode == DFmode].fromint;
9490 return false;
9492 case FLOAT_EXTEND:
9493 if (speed)
9495 if (VECTOR_MODE_P (mode))
9497 /* Vector widening conversion. */
9498 *cost += extra_cost->vect.alu;
9500 else
9501 *cost += extra_cost->fp[mode == DFmode].widen;
9503 return false;
9505 case FLOAT_TRUNCATE:
9506 if (speed)
9508 if (VECTOR_MODE_P (mode))
9510 /* Vector narrowing conversion. */
9511 *cost += extra_cost->vect.alu;
9513 else
9514 *cost += extra_cost->fp[mode == DFmode].narrow;
9516 return false;
9518 case FIX:
9519 case UNSIGNED_FIX:
9520 x = XEXP (x, 0);
9521 /* Strip the rounding part. They will all be implemented
9522 by the fcvt* family of instructions anyway. */
9523 if (GET_CODE (x) == UNSPEC)
9525 unsigned int uns_code = XINT (x, 1);
9527 if (uns_code == UNSPEC_FRINTA
9528 || uns_code == UNSPEC_FRINTM
9529 || uns_code == UNSPEC_FRINTN
9530 || uns_code == UNSPEC_FRINTP
9531 || uns_code == UNSPEC_FRINTZ)
9532 x = XVECEXP (x, 0, 0);
9535 if (speed)
9537 if (VECTOR_MODE_P (mode))
9538 *cost += extra_cost->vect.alu;
9539 else
9540 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9543 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9544 fixed-point fcvt. */
9545 if (GET_CODE (x) == MULT
9546 && ((VECTOR_MODE_P (mode)
9547 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9548 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9550 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9551 0, speed);
9552 return true;
9555 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9556 return true;
9558 case ABS:
9559 if (VECTOR_MODE_P (mode))
9561 /* ABS (vector). */
9562 if (speed)
9563 *cost += extra_cost->vect.alu;
9565 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9567 op0 = XEXP (x, 0);
9569 /* FABD, which is analogous to FADD. */
9570 if (GET_CODE (op0) == MINUS)
9572 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9573 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9574 if (speed)
9575 *cost += extra_cost->fp[mode == DFmode].addsub;
9577 return true;
9579 /* Simple FABS is analogous to FNEG. */
9580 if (speed)
9581 *cost += extra_cost->fp[mode == DFmode].neg;
9583 else
9585 /* Integer ABS will either be split into
9586 two arithmetic instructions, or will be an ABS
9587 (scalar), which we don't model. */
9588 *cost = COSTS_N_INSNS (2);
9589 if (speed)
9590 *cost += 2 * extra_cost->alu.arith;
9592 return false;
9594 case SMAX:
9595 case SMIN:
9596 if (speed)
9598 if (VECTOR_MODE_P (mode))
9599 *cost += extra_cost->vect.alu;
9600 else
9602 /* FMAXNM/FMINNM/FMAX/FMIN.
9603 TODO: This may not be accurate for all implementations, but
9604 we do not model this in the cost tables. */
9605 *cost += extra_cost->fp[mode == DFmode].addsub;
9608 return false;
9610 case UNSPEC:
9611 /* The floating point round to integer frint* instructions. */
9612 if (aarch64_frint_unspec_p (XINT (x, 1)))
9614 if (speed)
9615 *cost += extra_cost->fp[mode == DFmode].roundint;
9617 return false;
9620 if (XINT (x, 1) == UNSPEC_RBIT)
9622 if (speed)
9623 *cost += extra_cost->alu.rev;
9625 return false;
9627 break;
9629 case TRUNCATE:
9631 /* Decompose <su>muldi3_highpart. */
9632 if (/* (truncate:DI */
9633 mode == DImode
9634 /* (lshiftrt:TI */
9635 && GET_MODE (XEXP (x, 0)) == TImode
9636 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9637 /* (mult:TI */
9638 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9639 /* (ANY_EXTEND:TI (reg:DI))
9640 (ANY_EXTEND:TI (reg:DI))) */
9641 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9642 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9643 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9644 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9645 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9646 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9647 /* (const_int 64) */
9648 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9649 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9651 /* UMULH/SMULH. */
9652 if (speed)
9653 *cost += extra_cost->mult[mode == DImode].extend;
9654 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9655 mode, MULT, 0, speed);
9656 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9657 mode, MULT, 1, speed);
9658 return true;
9661 /* Fall through. */
9662 default:
9663 break;
9666 if (dump_file
9667 && flag_aarch64_verbose_cost)
9668 fprintf (dump_file,
9669 "\nFailed to cost RTX. Assuming default cost.\n");
9671 return true;
9674 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
9675 calculated for X. This cost is stored in *COST. Returns true
9676 if the total cost of X was calculated. */
9677 static bool
9678 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9679 int param, int *cost, bool speed)
9681 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9683 if (dump_file
9684 && flag_aarch64_verbose_cost)
9686 print_rtl_single (dump_file, x);
9687 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9688 speed ? "Hot" : "Cold",
9689 *cost, result ? "final" : "partial");
9692 return result;
9695 static int
9696 aarch64_register_move_cost (machine_mode mode,
9697 reg_class_t from_i, reg_class_t to_i)
9699 enum reg_class from = (enum reg_class) from_i;
9700 enum reg_class to = (enum reg_class) to_i;
9701 const struct cpu_regmove_cost *regmove_cost
9702 = aarch64_tune_params.regmove_cost;
9704 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9705 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9706 to = GENERAL_REGS;
9708 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9709 from = GENERAL_REGS;
9711 /* The cost of moving between a GPR and the stack is the same as GP2GP. */
9712 if ((from == GENERAL_REGS && to == STACK_REG)
9713 || (to == GENERAL_REGS && from == STACK_REG))
9714 return regmove_cost->GP2GP;
9716 /* To/From the stack register, we move via the gprs. */
9717 if (to == STACK_REG || from == STACK_REG)
9718 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9719 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9721 if (known_eq (GET_MODE_SIZE (mode), 16))
9723 /* 128-bit operations on general registers require 2 instructions. */
9724 if (from == GENERAL_REGS && to == GENERAL_REGS)
9725 return regmove_cost->GP2GP * 2;
9726 else if (from == GENERAL_REGS)
9727 return regmove_cost->GP2FP * 2;
9728 else if (to == GENERAL_REGS)
9729 return regmove_cost->FP2GP * 2;
9731 /* When AdvSIMD instructions are disabled it is not possible to move
9732 a 128-bit value directly between Q registers. This is handled in
9733 secondary reload. A general register is used as a scratch to move
9734 the upper DI value and the lower DI value is moved directly,
9735 hence the cost is the sum of three moves. */
9736 if (! TARGET_SIMD)
9737 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9739 return regmove_cost->FP2FP;
9742 if (from == GENERAL_REGS && to == GENERAL_REGS)
9743 return regmove_cost->GP2GP;
9744 else if (from == GENERAL_REGS)
9745 return regmove_cost->GP2FP;
9746 else if (to == GENERAL_REGS)
9747 return regmove_cost->FP2GP;
9749 return regmove_cost->FP2FP;
9752 static int
9753 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9754 reg_class_t rclass ATTRIBUTE_UNUSED,
9755 bool in ATTRIBUTE_UNUSED)
9757 return aarch64_tune_params.memmov_cost;
9760 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9761 to optimize 1.0/sqrt. */
9763 static bool
9764 use_rsqrt_p (machine_mode mode)
9766 return (!flag_trapping_math
9767 && flag_unsafe_math_optimizations
9768 && ((aarch64_tune_params.approx_modes->recip_sqrt
9769 & AARCH64_APPROX_MODE (mode))
9770 || flag_mrecip_low_precision_sqrt));
9773 /* Function to decide when to use the approximate reciprocal square root
9774 builtin. */
9776 static tree
9777 aarch64_builtin_reciprocal (tree fndecl)
9779 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9781 if (!use_rsqrt_p (mode))
9782 return NULL_TREE;
9783 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9786 typedef rtx (*rsqrte_type) (rtx, rtx);
9788 /* Select reciprocal square root initial estimate insn depending on machine
9789 mode. */
9791 static rsqrte_type
9792 get_rsqrte_type (machine_mode mode)
9794 switch (mode)
9796 case E_DFmode: return gen_aarch64_rsqrtedf;
9797 case E_SFmode: return gen_aarch64_rsqrtesf;
9798 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9799 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9800 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9801 default: gcc_unreachable ();
9805 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9807 /* Select reciprocal square root series step insn depending on machine mode. */
9809 static rsqrts_type
9810 get_rsqrts_type (machine_mode mode)
9812 switch (mode)
9814 case E_DFmode: return gen_aarch64_rsqrtsdf;
9815 case E_SFmode: return gen_aarch64_rsqrtssf;
9816 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9817 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9818 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9819 default: gcc_unreachable ();
9823 /* Emit instruction sequence to compute either the approximate square root
9824 or its approximate reciprocal, depending on the flag RECP, and return
9825 whether the sequence was emitted or not. */
9827 bool
9828 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9830 machine_mode mode = GET_MODE (dst);
9832 if (GET_MODE_INNER (mode) == HFmode)
9834 gcc_assert (!recp);
9835 return false;
9838 if (!recp)
9840 if (!(flag_mlow_precision_sqrt
9841 || (aarch64_tune_params.approx_modes->sqrt
9842 & AARCH64_APPROX_MODE (mode))))
9843 return false;
9845 if (flag_finite_math_only
9846 || flag_trapping_math
9847 || !flag_unsafe_math_optimizations
9848 || optimize_function_for_size_p (cfun))
9849 return false;
9851 else
9852 /* Caller assumes we cannot fail. */
9853 gcc_assert (use_rsqrt_p (mode));
9855 machine_mode mmsk = mode_for_int_vector (mode).require ();
9856 rtx xmsk = gen_reg_rtx (mmsk);
9857 if (!recp)
9858 /* When calculating the approximate square root, compare the
9859 argument with 0.0 and create a mask. */
9860 emit_insn (gen_rtx_SET (xmsk,
9861 gen_rtx_NEG (mmsk,
9862 gen_rtx_EQ (mmsk, src,
9863 CONST0_RTX (mode)))));
9865 /* Estimate the approximate reciprocal square root. */
9866 rtx xdst = gen_reg_rtx (mode);
9867 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9869 /* Iterate over the series twice for SF and thrice for DF. */
9870 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9872 /* Optionally iterate over the series once less for faster performance,
9873 at the cost of some accuracy. */
9874 if ((recp && flag_mrecip_low_precision_sqrt)
9875 || (!recp && flag_mlow_precision_sqrt))
9876 iterations--;
9878 /* Iterate over the series to calculate the approximate reciprocal square
9879 root. */
9880 rtx x1 = gen_reg_rtx (mode);
9881 while (iterations--)
9883 rtx x2 = gen_reg_rtx (mode);
9884 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9886 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9888 if (iterations > 0)
9889 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9892 if (!recp)
9894 /* Qualify the approximate reciprocal square root when the argument is
9895 0.0 by squashing the intermediate result to 0.0. */
9896 rtx xtmp = gen_reg_rtx (mmsk);
9897 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9898 gen_rtx_SUBREG (mmsk, xdst, 0)));
9899 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9901 /* Calculate the approximate square root. */
9902 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9905 /* Finalize the approximation. */
9906 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9908 return true;
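/* A minimal scalar sketch of the Newton-Raphson recurrence that the
   FRSQRTE/FRSQRTS sequence above implements, written as plain C with no
   GCC internals (the name approx_rsqrt is purely illustrative):

     static double
     approx_rsqrt (double d, double x0, int iterations)
     {
       double x = x0;
       while (iterations-- > 0)
         x = x * (3.0 - d * x * x) / 2.0;
       return x;
     }

   X0 stands for the initial FRSQRTE-style estimate and each loop
   iteration for one FRSQRTS-style step; every step roughly doubles the
   number of correct bits, hence the two iterations for SF and three for
   DF chosen above.  */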
9911 typedef rtx (*recpe_type) (rtx, rtx);
9913 /* Select reciprocal initial estimate insn depending on machine mode. */
9915 static recpe_type
9916 get_recpe_type (machine_mode mode)
9918 switch (mode)
9920 case E_SFmode: return (gen_aarch64_frecpesf);
9921 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9922 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9923 case E_DFmode: return (gen_aarch64_frecpedf);
9924 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9925 default: gcc_unreachable ();
9929 typedef rtx (*recps_type) (rtx, rtx, rtx);
9931 /* Select reciprocal series step insn depending on machine mode. */
9933 static recps_type
9934 get_recps_type (machine_mode mode)
9936 switch (mode)
9938 case E_SFmode: return (gen_aarch64_frecpssf);
9939 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9940 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9941 case E_DFmode: return (gen_aarch64_frecpsdf);
9942 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9943 default: gcc_unreachable ();
9947 /* Emit the instruction sequence to compute the approximation for the division
9948 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9950 bool
9951 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9953 machine_mode mode = GET_MODE (quo);
9955 if (GET_MODE_INNER (mode) == HFmode)
9956 return false;
9958 bool use_approx_division_p = (flag_mlow_precision_div
9959 || (aarch64_tune_params.approx_modes->division
9960 & AARCH64_APPROX_MODE (mode)));
9962 if (!flag_finite_math_only
9963 || flag_trapping_math
9964 || !flag_unsafe_math_optimizations
9965 || optimize_function_for_size_p (cfun)
9966 || !use_approx_division_p)
9967 return false;
9969 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9970 return false;
9972 /* Estimate the approximate reciprocal. */
9973 rtx xrcp = gen_reg_rtx (mode);
9974 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
9976 /* Iterate over the series twice for SF and thrice for DF. */
9977 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9979 /* Optionally iterate over the series once less for faster performance,
9980 at the cost of some accuracy. */
9981 if (flag_mlow_precision_div)
9982 iterations--;
9984 /* Iterate over the series to calculate the approximate reciprocal. */
9985 rtx xtmp = gen_reg_rtx (mode);
9986 while (iterations--)
9988 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
9990 if (iterations > 0)
9991 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9994 if (num != CONST1_RTX (mode))
9996 /* As the approximate reciprocal of DEN is already calculated, only
9997 calculate the approximate division when NUM is not 1.0. */
9998 rtx xnum = force_reg (mode, num);
9999 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10002 /* Finalize the approximation. */
10003 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10004 return true;
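/* Likewise, a minimal scalar sketch of the reciprocal refinement that the
   FRECPE/FRECPS sequence above implements (plain C, names illustrative):

     static double
     approx_div (double num, double den, double x0, int iterations)
     {
       double x = x0;
       while (iterations-- > 0)
         x = x * (2.0 - den * x);
       return num * x;
     }

   X0 stands for the FRECPE-style estimate of 1/DEN and each iteration for
   one FRECPS-style Newton-Raphson step; the final multiplication by NUM
   mirrors the code above, which skips it when NUM is exactly 1.0.  */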
10007 /* Return the number of instructions that can be issued per cycle. */
10008 static int
10009 aarch64_sched_issue_rate (void)
10011 return aarch64_tune_params.issue_rate;
10014 static int
10015 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10017 int issue_rate = aarch64_sched_issue_rate ();
10019 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10023 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10024 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10025 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10027 static int
10028 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10029 int ready_index)
10031 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10035 /* Vectorizer cost model target hooks. */
10037 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10038 static int
10039 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10040 tree vectype,
10041 int misalign ATTRIBUTE_UNUSED)
10043 unsigned elements;
10044 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10045 bool fp = false;
10047 if (vectype != NULL)
10048 fp = FLOAT_TYPE_P (vectype);
10050 switch (type_of_cost)
10052 case scalar_stmt:
10053 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10055 case scalar_load:
10056 return costs->scalar_load_cost;
10058 case scalar_store:
10059 return costs->scalar_store_cost;
10061 case vector_stmt:
10062 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10064 case vector_load:
10065 return costs->vec_align_load_cost;
10067 case vector_store:
10068 return costs->vec_store_cost;
10070 case vec_to_scalar:
10071 return costs->vec_to_scalar_cost;
10073 case scalar_to_vec:
10074 return costs->scalar_to_vec_cost;
10076 case unaligned_load:
10077 case vector_gather_load:
10078 return costs->vec_unalign_load_cost;
10080 case unaligned_store:
10081 case vector_scatter_store:
10082 return costs->vec_unalign_store_cost;
10084 case cond_branch_taken:
10085 return costs->cond_taken_branch_cost;
10087 case cond_branch_not_taken:
10088 return costs->cond_not_taken_branch_cost;
10090 case vec_perm:
10091 return costs->vec_permute_cost;
10093 case vec_promote_demote:
10094 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10096 case vec_construct:
10097 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10098 return elements / 2 + 1;
10100 default:
10101 gcc_unreachable ();
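/* For example, a vec_construct of a V4SF vector is costed as
   4 / 2 + 1 = 3 regardless of the tuning target, and for variable-length
   SVE vectors the element count comes from estimated_poly_value; all of
   the other cases simply return the per-CPU numbers held in
   aarch64_tune_params.vec_costs.  */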
10105 /* Implement targetm.vectorize.add_stmt_cost. */
10106 static unsigned
10107 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10108 struct _stmt_vec_info *stmt_info, int misalign,
10109 enum vect_cost_model_location where)
10111 unsigned *cost = (unsigned *) data;
10112 unsigned retval = 0;
10114 if (flag_vect_cost_model)
10116 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10117 int stmt_cost =
10118 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10120 /* Statements in an inner loop relative to the loop being
10121 vectorized are weighted more heavily. The value here is
10122 arbitrary and could potentially be improved with analysis. */
10123 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10124 count *= 50; /* FIXME */
10126 retval = (unsigned) (count * stmt_cost);
10127 cost[where] += retval;
10130 return retval;
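/* As a worked example of the weighting above: a statement added with
   COUNT == 2 and a per-statement cost of 1 (a hypothetical tuning value)
   contributes 2 * 50 * 1 = 100 to cost[vect_body] if it lies in a loop
   nested inside the loop being vectorized, but only 2 if it lies directly
   in the vectorized loop body.  */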
10133 static void initialize_aarch64_code_model (struct gcc_options *);
10135 /* Parse the TO_PARSE string and put the architecture struct that it
10136 selects into RES and the architectural features into ISA_FLAGS.
10137 Return an aarch64_parse_opt_result describing the parse result.
10138 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10140 static enum aarch64_parse_opt_result
10141 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10142 unsigned long *isa_flags)
10144 char *ext;
10145 const struct processor *arch;
10146 char *str = (char *) alloca (strlen (to_parse) + 1);
10147 size_t len;
10149 strcpy (str, to_parse);
10151 ext = strchr (str, '+');
10153 if (ext != NULL)
10154 len = ext - str;
10155 else
10156 len = strlen (str);
10158 if (len == 0)
10159 return AARCH64_PARSE_MISSING_ARG;
10162 /* Loop through the list of supported ARCHes to find a match. */
10163 for (arch = all_architectures; arch->name != NULL; arch++)
10165 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10167 unsigned long isa_temp = arch->flags;
10169 if (ext != NULL)
10171 /* TO_PARSE string contains at least one extension. */
10172 enum aarch64_parse_opt_result ext_res
10173 = aarch64_parse_extension (ext, &isa_temp);
10175 if (ext_res != AARCH64_PARSE_OK)
10176 return ext_res;
10178 /* Extension parsing was successful. Confirm the result
10179 arch and ISA flags. */
10180 *res = arch;
10181 *isa_flags = isa_temp;
10182 return AARCH64_PARSE_OK;
10186 /* ARCH name not found in list. */
10187 return AARCH64_PARSE_INVALID_ARG;
10190 /* Parse the TO_PARSE string and put the result tuning in RES and the
10191 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10192 describing the parse result. If there is an error parsing, RES and
10193 ISA_FLAGS are left unchanged. */
10195 static enum aarch64_parse_opt_result
10196 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10197 unsigned long *isa_flags)
10199 char *ext;
10200 const struct processor *cpu;
10201 char *str = (char *) alloca (strlen (to_parse) + 1);
10202 size_t len;
10204 strcpy (str, to_parse);
10206 ext = strchr (str, '+');
10208 if (ext != NULL)
10209 len = ext - str;
10210 else
10211 len = strlen (str);
10213 if (len == 0)
10214 return AARCH64_PARSE_MISSING_ARG;
10217 /* Loop through the list of supported CPUs to find a match. */
10218 for (cpu = all_cores; cpu->name != NULL; cpu++)
10220 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10222 unsigned long isa_temp = cpu->flags;
10225 if (ext != NULL)
10227 /* TO_PARSE string contains at least one extension. */
10228 enum aarch64_parse_opt_result ext_res
10229 = aarch64_parse_extension (ext, &isa_temp);
10231 if (ext_res != AARCH64_PARSE_OK)
10232 return ext_res;
10234 /* Extension parsing was successful. Confirm the result
10235 cpu and ISA flags. */
10236 *res = cpu;
10237 *isa_flags = isa_temp;
10238 return AARCH64_PARSE_OK;
10242 /* CPU name not found in list. */
10243 return AARCH64_PARSE_INVALID_ARG;
10246 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10247 Return an aarch64_parse_opt_result describing the parse result.
10248 If the parsing fails, RES does not change. */
10250 static enum aarch64_parse_opt_result
10251 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10253 const struct processor *cpu;
10254 char *str = (char *) alloca (strlen (to_parse) + 1);
10256 strcpy (str, to_parse);
10258 /* Loop through the list of supported CPUs to find a match. */
10259 for (cpu = all_cores; cpu->name != NULL; cpu++)
10261 if (strcmp (cpu->name, str) == 0)
10263 *res = cpu;
10264 return AARCH64_PARSE_OK;
10268 /* CPU name not found in list. */
10269 return AARCH64_PARSE_INVALID_ARG;
10272 /* Parse TOKEN, which has length LENGTH to see if it is an option
10273 described in FLAG. If it is, return the index bit for that fusion type.
10274 If not, error (printing OPTION_NAME) and return zero. */
10276 static unsigned int
10277 aarch64_parse_one_option_token (const char *token,
10278 size_t length,
10279 const struct aarch64_flag_desc *flag,
10280 const char *option_name)
10282 for (; flag->name != NULL; flag++)
10284 if (length == strlen (flag->name)
10285 && !strncmp (flag->name, token, length))
10286 return flag->flag;
10289 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10290 return 0;
10293 /* Parse OPTION which is a comma-separated list of flags to enable.
10294 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10295 default state we inherit from the CPU tuning structures. OPTION_NAME
10296 gives the top-level option we are parsing in the -moverride string,
10297 for use in error messages. */
10299 static unsigned int
10300 aarch64_parse_boolean_options (const char *option,
10301 const struct aarch64_flag_desc *flags,
10302 unsigned int initial_state,
10303 const char *option_name)
10305 const char separator = '.';
10306 const char* specs = option;
10307 const char* ntoken = option;
10308 unsigned int found_flags = initial_state;
10310 while ((ntoken = strchr (specs, separator)))
10312 size_t token_length = ntoken - specs;
10313 unsigned token_ops = aarch64_parse_one_option_token (specs,
10314 token_length,
10315 flags,
10316 option_name);
10317 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10318 in the token stream, reset the supported operations. So:
10320 adrp+add.cmp+branch.none.adrp+add
10322 would have the result of turning on only adrp+add fusion. */
10323 if (!token_ops)
10324 found_flags = 0;
10326 found_flags |= token_ops;
10327 specs = ++ntoken;
10330 /* The string ended with the separator; diagnose the ill-formed option. */
10331 if (!(*specs))
10333 error ("%s string ill-formed\n", option_name);
10334 return 0;
10337 /* We still have one more token to parse. */
10338 size_t token_length = strlen (specs);
10339 unsigned token_ops = aarch64_parse_one_option_token (specs,
10340 token_length,
10341 flags,
10342 option_name);
10343 if (!token_ops)
10344 found_flags = 0;
10346 found_flags |= token_ops;
10347 return found_flags;
10350 /* Support for overriding instruction fusion. */
10352 static void
10353 aarch64_parse_fuse_string (const char *fuse_string,
10354 struct tune_params *tune)
10356 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10357 aarch64_fusible_pairs,
10358 tune->fusible_ops,
10359 "fuse=");
10362 /* Support for overriding other tuning flags. */
10364 static void
10365 aarch64_parse_tune_string (const char *tune_string,
10366 struct tune_params *tune)
10368 tune->extra_tuning_flags
10369 = aarch64_parse_boolean_options (tune_string,
10370 aarch64_tuning_flags,
10371 tune->extra_tuning_flags,
10372 "tune=");
10375 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10376 we understand. If it is, extract the option string and hand it off to
10377 the appropriate function. */
10379 void
10380 aarch64_parse_one_override_token (const char* token,
10381 size_t length,
10382 struct tune_params *tune)
10384 const struct aarch64_tuning_override_function *fn
10385 = aarch64_tuning_override_functions;
10387 const char *option_part = strchr (token, '=');
10388 if (!option_part)
10390 error ("tuning string missing in option (%s)", token);
10391 return;
10394 /* Get the length of the option name. */
10395 length = option_part - token;
10396 /* Skip the '=' to get to the option string. */
10397 option_part++;
10399 for (; fn->name != NULL; fn++)
10401 if (!strncmp (fn->name, token, length))
10403 fn->parse_override (option_part, tune);
10404 return;
10408 error ("unknown tuning option (%s)",token);
10409 return;
10412 /* Validate and clamp aarch64_tls_size according to the code model in OPTS. */
10414 static void
10415 initialize_aarch64_tls_size (struct gcc_options *opts)
10417 if (aarch64_tls_size == 0)
10418 aarch64_tls_size = 24;
10420 switch (opts->x_aarch64_cmodel_var)
10422 case AARCH64_CMODEL_TINY:
10423 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10424 needs two instructions to address, so we clamp the size to 24. */
10425 if (aarch64_tls_size > 24)
10426 aarch64_tls_size = 24;
10427 break;
10428 case AARCH64_CMODEL_SMALL:
10429 /* The maximum TLS size allowed under small is 4G. */
10430 if (aarch64_tls_size > 32)
10431 aarch64_tls_size = 32;
10432 break;
10433 case AARCH64_CMODEL_LARGE:
10434 /* The maximum TLS size allowed under large is 16E.
10435 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
10436 if (aarch64_tls_size > 48)
10437 aarch64_tls_size = 48;
10438 break;
10439 default:
10440 gcc_unreachable ();
10443 return;
10446 /* Parse STRING looking for options in the format:
10447 string :: option:string
10448 option :: name=substring
10449 name :: {a-z}
10450 substring :: defined by option. */
10452 static void
10453 aarch64_parse_override_string (const char* input_string,
10454 struct tune_params* tune)
10456 const char separator = ':';
10457 size_t string_length = strlen (input_string) + 1;
10458 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10459 char *string = string_root;
10460 strncpy (string, input_string, string_length);
10461 string[string_length - 1] = '\0';
10463 char* ntoken = string;
10465 while ((ntoken = strchr (string, separator)))
10467 size_t token_length = ntoken - string;
10468 /* NUL-terminate this substring so it can be parsed as a string on its own. */
10469 *ntoken = '\0';
10470 aarch64_parse_one_override_token (string, token_length, tune);
10471 string = ++ntoken;
10474 /* One last option to parse. */
10475 aarch64_parse_one_override_token (string, strlen (string), tune);
10476 free (string_root);
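/* As a concrete example of the grammar above, an option such as

     -moverride=fuse=adrp+add.cmp+branch

   is first split on ':' (here there is a single token), the token is then
   split at '=' to select the handler (aarch64_parse_fuse_string in this
   case), and the value is finally split on '.' by
   aarch64_parse_boolean_options, with a "none" entry resetting the flags
   accumulated so far, as described earlier.  */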
10480 static void
10481 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10483 /* PR 70044: We have to be careful about being called multiple times for the
10484 same function. This means all changes should be repeatable. */
10486 /* If the frame pointer is enabled, set it to a special value that behaves
10487 similarly to frame pointer omission. If we don't do this, all leaf functions
10488 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10489 If flag_omit_frame_pointer has this special value, we must force the
10490 frame pointer if not in a leaf function. We also need to force it in a
10491 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10492 if (opts->x_flag_omit_frame_pointer == 0)
10493 opts->x_flag_omit_frame_pointer = 2;
10495 /* If not optimizing for size, set the default
10496 alignment to what the target wants. */
10497 if (!opts->x_optimize_size)
10499 if (opts->x_align_loops <= 0)
10500 opts->x_align_loops = aarch64_tune_params.loop_align;
10501 if (opts->x_align_jumps <= 0)
10502 opts->x_align_jumps = aarch64_tune_params.jump_align;
10503 if (opts->x_align_functions <= 0)
10504 opts->x_align_functions = aarch64_tune_params.function_align;
10507 /* We default to no pc-relative literal loads. */
10509 aarch64_pcrelative_literal_loads = false;
10511 /* If -mpc-relative-literal-loads is set on the command line, this
10512 implies that the user asked for PC relative literal loads. */
10513 if (opts->x_pcrelative_literal_loads == 1)
10514 aarch64_pcrelative_literal_loads = true;
10516 /* In the tiny memory model it makes no sense to disallow PC relative
10517 literal pool loads. */
10518 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10519 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10520 aarch64_pcrelative_literal_loads = true;
10522 /* When enabling the lower precision Newton series for the square root, also
10523 enable it for the reciprocal square root, since the latter is an
10524 intermediate step for the former. */
10525 if (flag_mlow_precision_sqrt)
10526 flag_mrecip_low_precision_sqrt = true;
10529 /* 'Unpack' the internal tuning structs and update the options
10530 in OPTS. The caller must have set up selected_tune and selected_arch
10531 as all the other target-specific codegen decisions are
10532 derived from them. */
10534 void
10535 aarch64_override_options_internal (struct gcc_options *opts)
10537 aarch64_tune_flags = selected_tune->flags;
10538 aarch64_tune = selected_tune->sched_core;
10539 /* Make a copy of the tuning parameters attached to the core, which
10540 we may later overwrite. */
10541 aarch64_tune_params = *(selected_tune->tune);
10542 aarch64_architecture_version = selected_arch->architecture_version;
10544 if (opts->x_aarch64_override_tune_string)
10545 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10546 &aarch64_tune_params);
10548 /* This target defaults to strict volatile bitfields. */
10549 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10550 opts->x_flag_strict_volatile_bitfields = 1;
10552 initialize_aarch64_code_model (opts);
10553 initialize_aarch64_tls_size (opts);
10555 int queue_depth = 0;
10556 switch (aarch64_tune_params.autoprefetcher_model)
10558 case tune_params::AUTOPREFETCHER_OFF:
10559 queue_depth = -1;
10560 break;
10561 case tune_params::AUTOPREFETCHER_WEAK:
10562 queue_depth = 0;
10563 break;
10564 case tune_params::AUTOPREFETCHER_STRONG:
10565 queue_depth = max_insn_queue_index + 1;
10566 break;
10567 default:
10568 gcc_unreachable ();
10571 /* We don't mind passing in global_options_set here as we don't use
10572 the *options_set structs anyway. */
10573 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10574 queue_depth,
10575 opts->x_param_values,
10576 global_options_set.x_param_values);
10578 /* Set up parameters to be used in prefetching algorithm. Do not
10579 override the defaults unless we are tuning for a core we have
10580 researched values for. */
10581 if (aarch64_tune_params.prefetch->num_slots > 0)
10582 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10583 aarch64_tune_params.prefetch->num_slots,
10584 opts->x_param_values,
10585 global_options_set.x_param_values);
10586 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10587 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10588 aarch64_tune_params.prefetch->l1_cache_size,
10589 opts->x_param_values,
10590 global_options_set.x_param_values);
10591 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10592 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10593 aarch64_tune_params.prefetch->l1_cache_line_size,
10594 opts->x_param_values,
10595 global_options_set.x_param_values);
10596 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10597 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10598 aarch64_tune_params.prefetch->l2_cache_size,
10599 opts->x_param_values,
10600 global_options_set.x_param_values);
10602 /* Use the alternative scheduling-pressure algorithm by default. */
10603 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10604 opts->x_param_values,
10605 global_options_set.x_param_values);
10607 /* Enable software prefetching at the specified optimization level for
10608 CPUs that have prefetch. Lower the optimization level threshold by 1
10609 when profiling is enabled. */
10610 if (opts->x_flag_prefetch_loop_arrays < 0
10611 && !opts->x_optimize_size
10612 && aarch64_tune_params.prefetch->default_opt_level >= 0
10613 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10614 opts->x_flag_prefetch_loop_arrays = 1;
10616 aarch64_override_options_after_change_1 (opts);
10619 /* Print a hint with a suggestion for a core or architecture name that
10620 most closely resembles what the user passed in STR. ARCH is true if
10621 the user is asking for an architecture name. ARCH is false if the user
10622 is asking for a core name. */
10624 static void
10625 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10627 auto_vec<const char *> candidates;
10628 const struct processor *entry = arch ? all_architectures : all_cores;
10629 for (; entry->name != NULL; entry++)
10630 candidates.safe_push (entry->name);
10632 #ifdef HAVE_LOCAL_CPU_DETECT
10633 /* Also add "native" as a possible value. */
10634 if (arch)
10635 candidates.safe_push ("native");
10636 #endif
10638 char *s;
10639 const char *hint = candidates_list_and_hint (str, s, candidates);
10640 if (hint)
10641 inform (input_location, "valid arguments are: %s;"
10642 " did you mean %qs?", s, hint);
10643 else
10644 inform (input_location, "valid arguments are: %s", s);
10646 XDELETEVEC (s);
10649 /* Print a hint with a suggestion for a core name that most closely resembles
10650 what the user passed in STR. */
10652 inline static void
10653 aarch64_print_hint_for_core (const char *str)
10655 aarch64_print_hint_for_core_or_arch (str, false);
10658 /* Print a hint with a suggestion for an architecture name that most closely
10659 resembles what the user passed in STR. */
10661 inline static void
10662 aarch64_print_hint_for_arch (const char *str)
10664 aarch64_print_hint_for_core_or_arch (str, true);
10667 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10668 specified in STR and throw errors if appropriate. Put the results if
10669 they are valid in RES and ISA_FLAGS. Return whether the option is
10670 valid. */
10672 static bool
10673 aarch64_validate_mcpu (const char *str, const struct processor **res,
10674 unsigned long *isa_flags)
10676 enum aarch64_parse_opt_result parse_res
10677 = aarch64_parse_cpu (str, res, isa_flags);
10679 if (parse_res == AARCH64_PARSE_OK)
10680 return true;
10682 switch (parse_res)
10684 case AARCH64_PARSE_MISSING_ARG:
10685 error ("missing cpu name in %<-mcpu=%s%>", str);
10686 break;
10687 case AARCH64_PARSE_INVALID_ARG:
10688 error ("unknown value %qs for -mcpu", str);
10689 aarch64_print_hint_for_core (str);
10690 break;
10691 case AARCH64_PARSE_INVALID_FEATURE:
10692 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10693 break;
10694 default:
10695 gcc_unreachable ();
10698 return false;
10701 /* Validate a command-line -march option. Parse the arch and extensions
10702 (if any) specified in STR and throw errors if appropriate. Put the
10703 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10704 option is valid. */
10706 static bool
10707 aarch64_validate_march (const char *str, const struct processor **res,
10708 unsigned long *isa_flags)
10710 enum aarch64_parse_opt_result parse_res
10711 = aarch64_parse_arch (str, res, isa_flags);
10713 if (parse_res == AARCH64_PARSE_OK)
10714 return true;
10716 switch (parse_res)
10718 case AARCH64_PARSE_MISSING_ARG:
10719 error ("missing arch name in %<-march=%s%>", str);
10720 break;
10721 case AARCH64_PARSE_INVALID_ARG:
10722 error ("unknown value %qs for -march", str);
10723 aarch64_print_hint_for_arch (str);
10724 break;
10725 case AARCH64_PARSE_INVALID_FEATURE:
10726 error ("invalid feature modifier in %<-march=%s%>", str);
10727 break;
10728 default:
10729 gcc_unreachable ();
10732 return false;
10735 /* Validate a command-line -mtune option. Parse the cpu
10736 specified in STR and throw errors if appropriate. Put the
10737 result, if it is valid, in RES. Return whether the option is
10738 valid. */
10740 static bool
10741 aarch64_validate_mtune (const char *str, const struct processor **res)
10743 enum aarch64_parse_opt_result parse_res
10744 = aarch64_parse_tune (str, res);
10746 if (parse_res == AARCH64_PARSE_OK)
10747 return true;
10749 switch (parse_res)
10751 case AARCH64_PARSE_MISSING_ARG:
10752 error ("missing cpu name in %<-mtune=%s%>", str);
10753 break;
10754 case AARCH64_PARSE_INVALID_ARG:
10755 error ("unknown value %qs for -mtune", str);
10756 aarch64_print_hint_for_core (str);
10757 break;
10758 default:
10759 gcc_unreachable ();
10761 return false;
10764 /* Return the CPU corresponding to the enum CPU.
10765 If it doesn't specify a cpu, return the default. */
10767 static const struct processor *
10768 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10770 if (cpu != aarch64_none)
10771 return &all_cores[cpu];
10773 /* The & 0x3f is to extract the bottom 6 bits that encode the
10774 default cpu as selected by the --with-cpu GCC configure option
10775 in config.gcc.
10776 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10777 flags mechanism should be reworked to make it more sane. */
10778 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10781 /* Return the architecture corresponding to the enum ARCH.
10782 If it doesn't specify a valid architecture, return the default. */
10784 static const struct processor *
10785 aarch64_get_arch (enum aarch64_arch arch)
10787 if (arch != aarch64_no_arch)
10788 return &all_architectures[arch];
10790 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10792 return &all_architectures[cpu->arch];
10795 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10797 static poly_uint16
10798 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10800 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10801 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10802 deciding which .md file patterns to use and when deciding whether
10803 something is a legitimate address or constant. */
10804 if (value == SVE_SCALABLE || value == SVE_128)
10805 return poly_uint16 (2, 2);
10806 else
10807 return (int) value / 64;
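/* For example, -msve-vector-bits=256 gives a VG of 256 / 64 = 4 (four
   64-bit granules per vector) and -msve-vector-bits=512 gives 8, while
   both "scalable" and 128 map to poly_uint16 (2, 2), i.e. a VG of
   2 + 2 * X for some non-negative runtime value X, so at least two
   granules (128 bits).  */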
10810 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10811 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10812 tuning structs. In particular it must set selected_tune and
10813 aarch64_isa_flags that define the available ISA features and tuning
10814 decisions. It must also set selected_arch as this will be used to
10815 output the .arch asm tags for each function. */
10817 static void
10818 aarch64_override_options (void)
10820 unsigned long cpu_isa = 0;
10821 unsigned long arch_isa = 0;
10822 aarch64_isa_flags = 0;
10824 bool valid_cpu = true;
10825 bool valid_tune = true;
10826 bool valid_arch = true;
10828 selected_cpu = NULL;
10829 selected_arch = NULL;
10830 selected_tune = NULL;
10832 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10833 If either of -march or -mtune is given, they override their
10834 respective component of -mcpu. */
10835 if (aarch64_cpu_string)
10836 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10837 &cpu_isa);
10839 if (aarch64_arch_string)
10840 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10841 &arch_isa);
10843 if (aarch64_tune_string)
10844 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10846 /* If the user did not specify a processor, choose the default
10847 one for them. This will be the CPU set during configuration using
10848 --with-cpu, otherwise it is "generic". */
10849 if (!selected_cpu)
10851 if (selected_arch)
10853 selected_cpu = &all_cores[selected_arch->ident];
10854 aarch64_isa_flags = arch_isa;
10855 explicit_arch = selected_arch->arch;
10857 else
10859 /* Get default configure-time CPU. */
10860 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10861 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10864 if (selected_tune)
10865 explicit_tune_core = selected_tune->ident;
10867 /* If both -mcpu and -march are specified check that they are architecturally
10868 compatible, warn if they're not and prefer the -march ISA flags. */
10869 else if (selected_arch)
10871 if (selected_arch->arch != selected_cpu->arch)
10873 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10874 all_architectures[selected_cpu->arch].name,
10875 selected_arch->name);
10877 aarch64_isa_flags = arch_isa;
10878 explicit_arch = selected_arch->arch;
10879 explicit_tune_core = selected_tune ? selected_tune->ident
10880 : selected_cpu->ident;
10882 else
10884 /* -mcpu but no -march. */
10885 aarch64_isa_flags = cpu_isa;
10886 explicit_tune_core = selected_tune ? selected_tune->ident
10887 : selected_cpu->ident;
10888 gcc_assert (selected_cpu);
10889 selected_arch = &all_architectures[selected_cpu->arch];
10890 explicit_arch = selected_arch->arch;
10893 /* Set the arch as well, as we will need it when outputting
10894 the .arch directive in assembly. */
10895 if (!selected_arch)
10897 gcc_assert (selected_cpu);
10898 selected_arch = &all_architectures[selected_cpu->arch];
10901 if (!selected_tune)
10902 selected_tune = selected_cpu;
10904 #ifndef HAVE_AS_MABI_OPTION
10905 /* The compiler may have been configured with 2.23.* binutils, which does
10906 not have support for ILP32. */
10907 if (TARGET_ILP32)
10908 error ("assembler does not support -mabi=ilp32");
10909 #endif
10911 /* Convert -msve-vector-bits to a VG count. */
10912 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10914 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10915 sorry ("return address signing is only supported for -mabi=lp64");
10917 /* Make sure we properly set up the explicit options. */
10918 if ((aarch64_cpu_string && valid_cpu)
10919 || (aarch64_tune_string && valid_tune))
10920 gcc_assert (explicit_tune_core != aarch64_none);
10922 if ((aarch64_cpu_string && valid_cpu)
10923 || (aarch64_arch_string && valid_arch))
10924 gcc_assert (explicit_arch != aarch64_no_arch);
10926 aarch64_override_options_internal (&global_options);
10928 /* Save these options as the default ones in case we push and pop them later
10929 while processing functions with potential target attributes. */
10930 target_option_default_node = target_option_current_node
10931 = build_target_option_node (&global_options);
10934 /* Implement targetm.override_options_after_change. */
10936 static void
10937 aarch64_override_options_after_change (void)
10939 aarch64_override_options_after_change_1 (&global_options);
10942 static struct machine_function *
10943 aarch64_init_machine_status (void)
10945 struct machine_function *machine;
10946 machine = ggc_cleared_alloc<machine_function> ();
10947 return machine;
10950 void
10951 aarch64_init_expanders (void)
10953 init_machine_status = aarch64_init_machine_status;
10956 /* Select the effective code model, taking the PIC flags in OPTS into account. */
10957 static void
10958 initialize_aarch64_code_model (struct gcc_options *opts)
10960 if (opts->x_flag_pic)
10962 switch (opts->x_aarch64_cmodel_var)
10964 case AARCH64_CMODEL_TINY:
10965 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10966 break;
10967 case AARCH64_CMODEL_SMALL:
10968 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10969 aarch64_cmodel = (flag_pic == 2
10970 ? AARCH64_CMODEL_SMALL_PIC
10971 : AARCH64_CMODEL_SMALL_SPIC);
10972 #else
10973 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10974 #endif
10975 break;
10976 case AARCH64_CMODEL_LARGE:
10977 sorry ("code model %qs with -f%s", "large",
10978 opts->x_flag_pic > 1 ? "PIC" : "pic");
10979 break;
10980 default:
10981 gcc_unreachable ();
10984 else
10985 aarch64_cmodel = opts->x_aarch64_cmodel_var;
10988 /* Implement TARGET_OPTION_SAVE. */
10990 static void
10991 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10993 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10996 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10997 using the information saved in PTR. */
10999 static void
11000 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11002 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11003 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11004 opts->x_explicit_arch = ptr->x_explicit_arch;
11005 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11006 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11008 aarch64_override_options_internal (opts);
11011 /* Implement TARGET_OPTION_PRINT. */
11013 static void
11014 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11016 const struct processor *cpu
11017 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11018 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11019 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11020 std::string extension
11021 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11023 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11024 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11025 arch->name, extension.c_str ());
11028 static GTY(()) tree aarch64_previous_fndecl;
11030 void
11031 aarch64_reset_previous_fndecl (void)
11033 aarch64_previous_fndecl = NULL;
11036 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11037 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11038 make sure optab availability predicates are recomputed when necessary. */
11040 void
11041 aarch64_save_restore_target_globals (tree new_tree)
11043 if (TREE_TARGET_GLOBALS (new_tree))
11044 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11045 else if (new_tree == target_option_default_node)
11046 restore_target_globals (&default_target_globals);
11047 else
11048 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11051 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11052 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11053 of the function, if such exists. This function may be called multiple
11054 times on a single function so use aarch64_previous_fndecl to avoid
11055 setting up identical state. */
11057 static void
11058 aarch64_set_current_function (tree fndecl)
11060 if (!fndecl || fndecl == aarch64_previous_fndecl)
11061 return;
11063 tree old_tree = (aarch64_previous_fndecl
11064 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11065 : NULL_TREE);
11067 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11069 /* If current function has no attributes but the previous one did,
11070 use the default node. */
11071 if (!new_tree && old_tree)
11072 new_tree = target_option_default_node;
11074 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11075 the default have been handled by aarch64_save_restore_target_globals from
11076 aarch64_pragma_target_parse. */
11077 if (old_tree == new_tree)
11078 return;
11080 aarch64_previous_fndecl = fndecl;
11082 /* First set the target options. */
11083 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11085 aarch64_save_restore_target_globals (new_tree);
11088 /* Enum describing the various ways we can handle attributes.
11089 In many cases we can reuse the generic option handling machinery. */
11091 enum aarch64_attr_opt_type
11093 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11094 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11095 aarch64_attr_enum, /* Attribute sets an enum variable. */
11096 aarch64_attr_custom /* Attribute requires a custom handling function. */
11099 /* All the information needed to handle a target attribute.
11100 NAME is the name of the attribute.
11101 ATTR_TYPE specifies the type of behavior of the attribute as described
11102 in the definition of enum aarch64_attr_opt_type.
11103 ALLOW_NEG is true if the attribute supports a "no-" form.
11104 HANDLER is the function that takes the attribute string as an argument.
11105 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11106 OPT_NUM is the enum specifying the option that the attribute modifies.
11107 This is needed for attributes that mirror the behavior of a command-line
11108 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11109 aarch64_attr_enum. */
11111 struct aarch64_attribute_info
11113 const char *name;
11114 enum aarch64_attr_opt_type attr_type;
11115 bool allow_neg;
11116 bool (*handler) (const char *);
11117 enum opt_code opt_num;
11120 /* Handle the ARCH_STR argument to the arch= target attribute. */
11122 static bool
11123 aarch64_handle_attr_arch (const char *str)
11125 const struct processor *tmp_arch = NULL;
11126 enum aarch64_parse_opt_result parse_res
11127 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11129 if (parse_res == AARCH64_PARSE_OK)
11131 gcc_assert (tmp_arch);
11132 selected_arch = tmp_arch;
11133 explicit_arch = selected_arch->arch;
11134 return true;
11137 switch (parse_res)
11139 case AARCH64_PARSE_MISSING_ARG:
11140 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11141 break;
11142 case AARCH64_PARSE_INVALID_ARG:
11143 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11144 aarch64_print_hint_for_arch (str);
11145 break;
11146 case AARCH64_PARSE_INVALID_FEATURE:
11147 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11148 break;
11149 default:
11150 gcc_unreachable ();
11153 return false;
11156 /* Handle the argument CPU_STR to the cpu= target attribute. */
11158 static bool
11159 aarch64_handle_attr_cpu (const char *str)
11161 const struct processor *tmp_cpu = NULL;
11162 enum aarch64_parse_opt_result parse_res
11163 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11165 if (parse_res == AARCH64_PARSE_OK)
11167 gcc_assert (tmp_cpu);
11168 selected_tune = tmp_cpu;
11169 explicit_tune_core = selected_tune->ident;
11171 selected_arch = &all_architectures[tmp_cpu->arch];
11172 explicit_arch = selected_arch->arch;
11173 return true;
11176 switch (parse_res)
11178 case AARCH64_PARSE_MISSING_ARG:
11179 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11180 break;
11181 case AARCH64_PARSE_INVALID_ARG:
11182 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11183 aarch64_print_hint_for_core (str);
11184 break;
11185 case AARCH64_PARSE_INVALID_FEATURE:
11186 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11187 break;
11188 default:
11189 gcc_unreachable ();
11192 return false;
11195 /* Handle the argument STR to the tune= target attribute. */
11197 static bool
11198 aarch64_handle_attr_tune (const char *str)
11200 const struct processor *tmp_tune = NULL;
11201 enum aarch64_parse_opt_result parse_res
11202 = aarch64_parse_tune (str, &tmp_tune);
11204 if (parse_res == AARCH64_PARSE_OK)
11206 gcc_assert (tmp_tune);
11207 selected_tune = tmp_tune;
11208 explicit_tune_core = selected_tune->ident;
11209 return true;
11212 switch (parse_res)
11214 case AARCH64_PARSE_INVALID_ARG:
11215 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11216 aarch64_print_hint_for_core (str);
11217 break;
11218 default:
11219 gcc_unreachable ();
11222 return false;
11225 /* Parse an architecture extensions target attribute string specified in STR.
11226 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11227 if successful. Update aarch64_isa_flags to reflect the ISA features
11228 modified. */
11230 static bool
11231 aarch64_handle_attr_isa_flags (char *str)
11233 enum aarch64_parse_opt_result parse_res;
11234 unsigned long isa_flags = aarch64_isa_flags;
11236 /* We allow "+nothing" at the beginning to clear out all architectural
11237 features if the user wants to handpick specific features. */
11238 if (strncmp ("+nothing", str, 8) == 0)
11240 isa_flags = 0;
11241 str += 8;
11244 parse_res = aarch64_parse_extension (str, &isa_flags);
11246 if (parse_res == AARCH64_PARSE_OK)
11248 aarch64_isa_flags = isa_flags;
11249 return true;
11252 switch (parse_res)
11254 case AARCH64_PARSE_MISSING_ARG:
11255 error ("missing value in %<target()%> pragma or attribute");
11256 break;
11258 case AARCH64_PARSE_INVALID_FEATURE:
11259 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11260 break;
11262 default:
11263 gcc_unreachable ();
11266 return false;
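/* For example, target ("+nothing+fp") first clears every architectural
   feature bit and then turns the floating-point extension (and anything
   it implies) back on, whereas target ("+fp+nosimd") starts from the
   current aarch64_isa_flags and toggles individual features on top of
   it.  */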
11269 /* The target attributes that we support. On top of these we also support just
11270 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11271 handled explicitly in aarch64_process_one_target_attr. */
11273 static const struct aarch64_attribute_info aarch64_attributes[] =
11275 { "general-regs-only", aarch64_attr_mask, false, NULL,
11276 OPT_mgeneral_regs_only },
11277 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11278 OPT_mfix_cortex_a53_835769 },
11279 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11280 OPT_mfix_cortex_a53_843419 },
11281 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11282 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11283 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11284 OPT_momit_leaf_frame_pointer },
11285 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11286 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11287 OPT_march_ },
11288 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11289 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11290 OPT_mtune_ },
11291 { "sign-return-address", aarch64_attr_enum, false, NULL,
11292 OPT_msign_return_address_ },
11293 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11296 /* Parse ARG_STR which contains the definition of one target attribute.
11297 Show appropriate errors if any or return true if the attribute is valid. */
11299 static bool
11300 aarch64_process_one_target_attr (char *arg_str)
11302 bool invert = false;
11304 size_t len = strlen (arg_str);
11306 if (len == 0)
11308 error ("malformed %<target()%> pragma or attribute");
11309 return false;
11312 char *str_to_check = (char *) alloca (len + 1);
11313 strcpy (str_to_check, arg_str);
11315 /* Skip leading whitespace. */
11316 while (*str_to_check == ' ' || *str_to_check == '\t')
11317 str_to_check++;
11319 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11320 It is easier to detect and handle it explicitly here rather than going
11321 through the machinery for the rest of the target attributes in this
11322 function. */
11323 if (*str_to_check == '+')
11324 return aarch64_handle_attr_isa_flags (str_to_check);
11326 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11328 invert = true;
11329 str_to_check += 3;
11331 char *arg = strchr (str_to_check, '=');
11333 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11334 and point ARG to "foo". */
11335 if (arg)
11337 *arg = '\0';
11338 arg++;
11340 const struct aarch64_attribute_info *p_attr;
11341 bool found = false;
11342 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11344 /* If the names don't match up, or the user has given an argument
11345 to an attribute that doesn't accept one, or didn't give an argument
11346 to an attribute that expects one, fail to match. */
11347 if (strcmp (str_to_check, p_attr->name) != 0)
11348 continue;
11350 found = true;
11351 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11352 || p_attr->attr_type == aarch64_attr_enum;
11354 if (attr_need_arg_p ^ (arg != NULL))
11356 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11357 return false;
11360 /* If the name matches but the attribute does not allow "no-" versions
11361 then we can't match. */
11362 if (invert && !p_attr->allow_neg)
11364 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11365 return false;
11368 switch (p_attr->attr_type)
11370 /* Has a custom handler registered.
11371 For example, cpu=, arch=, tune=. */
11372 case aarch64_attr_custom:
11373 gcc_assert (p_attr->handler);
11374 if (!p_attr->handler (arg))
11375 return false;
11376 break;
11378 /* Either set or unset a boolean option. */
11379 case aarch64_attr_bool:
11381 struct cl_decoded_option decoded;
11383 generate_option (p_attr->opt_num, NULL, !invert,
11384 CL_TARGET, &decoded);
11385 aarch64_handle_option (&global_options, &global_options_set,
11386 &decoded, input_location);
11387 break;
11389 /* Set or unset a bit in the target_flags. aarch64_handle_option
11390 should know what mask to apply given the option number. */
11391 case aarch64_attr_mask:
11393 struct cl_decoded_option decoded;
11394 /* We only need to specify the option number.
11395 aarch64_handle_option will know which mask to apply. */
11396 decoded.opt_index = p_attr->opt_num;
11397 decoded.value = !invert;
11398 aarch64_handle_option (&global_options, &global_options_set,
11399 &decoded, input_location);
11400 break;
11402 /* Use the option setting machinery to set an option to an enum. */
11403 case aarch64_attr_enum:
11405 gcc_assert (arg);
11406 bool valid;
11407 int value;
11408 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11409 &value, CL_TARGET);
11410 if (valid)
11412 set_option (&global_options, NULL, p_attr->opt_num, value,
11413 NULL, DK_UNSPECIFIED, input_location,
11414 global_dc);
11416 else
11418 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11420 break;
11422 default:
11423 gcc_unreachable ();
11427 /* If we reached this point, we either found and validated an attribute,
11428 or didn't match any. If we matched an attribute but its arguments
11429 were malformed, we will have returned false already. */
11430 return found;
11433 /* Count how many times the character C appears in
11434 NULL-terminated string STR. */
11436 static unsigned int
11437 num_occurences_in_str (char c, char *str)
11439 unsigned int res = 0;
11440 while (*str != '\0')
11442 if (*str == c)
11443 res++;
11445 str++;
11448 return res;
11451 /* Parse the tree in ARGS that contains the target attribute information
11452 and update the global target options space. */
11454 bool
11455 aarch64_process_target_attr (tree args)
11457 if (TREE_CODE (args) == TREE_LIST)
11461 tree head = TREE_VALUE (args);
11462 if (head)
11464 if (!aarch64_process_target_attr (head))
11465 return false;
11467 args = TREE_CHAIN (args);
11468 } while (args);
11470 return true;
11473 if (TREE_CODE (args) != STRING_CST)
11475 error ("attribute %<target%> argument not a string");
11476 return false;
11479 size_t len = strlen (TREE_STRING_POINTER (args));
11480 char *str_to_check = (char *) alloca (len + 1);
11481 strcpy (str_to_check, TREE_STRING_POINTER (args));
11483 if (len == 0)
11485 error ("malformed %<target()%> pragma or attribute");
11486 return false;
11489 /* Used to catch empty entries between commas, e.g.
11490 attribute ((target ("attr1,,attr2"))). */
11491 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11493 /* Handle multiple target attributes separated by ','. */
11494 char *token = strtok (str_to_check, ",");
11496 unsigned int num_attrs = 0;
11497 while (token)
11499 num_attrs++;
11500 if (!aarch64_process_one_target_attr (token))
11502 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11503 return false;
11506 token = strtok (NULL, ",");
11509 if (num_attrs != num_commas + 1)
11511 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11512 return false;
11515 return true;
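/* Putting the pieces together, an attribute such as

     __attribute__ ((target ("arch=armv8-a+crc,no-omit-leaf-frame-pointer")))

   is split on ',' into two entries: "arch=armv8-a+crc" is dispatched to
   aarch64_handle_attr_arch (which in turn parses the "+crc" extension),
   while "no-omit-leaf-frame-pointer" is the negated form of a boolean
   attribute from the aarch64_attributes table.  The comma counting above
   rejects strings with empty entries such as "attr1,,attr2".  */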
11518 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11519 process attribute ((target ("..."))). */
11521 static bool
11522 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11524 struct cl_target_option cur_target;
11525 bool ret;
11526 tree old_optimize;
11527 tree new_target, new_optimize;
11528 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11530 /* If what we're processing is the current pragma string then the
11531 target option node is already stored in target_option_current_node
11532 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11533 having to re-parse the string. This is especially useful to keep
11534 arm_neon.h compile times down since that header contains a lot
11535 of intrinsics enclosed in pragmas. */
11536 if (!existing_target && args == current_target_pragma)
11538 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11539 return true;
11541 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11543 old_optimize = build_optimization_node (&global_options);
11544 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11546 /* If the function changed the optimization levels as well as setting
11547 target options, start with the optimizations specified. */
11548 if (func_optimize && func_optimize != old_optimize)
11549 cl_optimization_restore (&global_options,
11550 TREE_OPTIMIZATION (func_optimize));
11552 /* Save the current target options to restore at the end. */
11553 cl_target_option_save (&cur_target, &global_options);
11555 /* If fndecl already has some target attributes applied to it, unpack
11556 them so that we add this attribute on top of them, rather than
11557 overwriting them. */
11558 if (existing_target)
11560 struct cl_target_option *existing_options
11561 = TREE_TARGET_OPTION (existing_target);
11563 if (existing_options)
11564 cl_target_option_restore (&global_options, existing_options);
11566 else
11567 cl_target_option_restore (&global_options,
11568 TREE_TARGET_OPTION (target_option_current_node));
11570 ret = aarch64_process_target_attr (args);
11572 /* Set up any additional state. */
11573 if (ret)
11575 aarch64_override_options_internal (&global_options);
11576 /* Initialize SIMD builtins if we haven't already.
11577 Set current_target_pragma to NULL for the duration so that
11578 the builtin initialization code doesn't try to tag the functions
11579 being built with the attributes specified by any current pragma, thus
11580 going into an infinite recursion. */
11581 if (TARGET_SIMD)
11583 tree saved_current_target_pragma = current_target_pragma;
11584 current_target_pragma = NULL;
11585 aarch64_init_simd_builtins ();
11586 current_target_pragma = saved_current_target_pragma;
11588 new_target = build_target_option_node (&global_options);
11590 else
11591 new_target = NULL;
11593 new_optimize = build_optimization_node (&global_options);
11595 if (fndecl && ret)
11597 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11599 if (old_optimize != new_optimize)
11600 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11603 cl_target_option_restore (&global_options, &cur_target);
11605 if (old_optimize != new_optimize)
11606 cl_optimization_restore (&global_options,
11607 TREE_OPTIMIZATION (old_optimize));
11608 return ret;
11611 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11612 tri-bool options (yes, no, don't care) and the default value is
11613 DEF, determine whether to reject inlining. */
11615 static bool
11616 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11617 int dont_care, int def)
11619 /* If the callee doesn't care, always allow inlining. */
11620 if (callee == dont_care)
11621 return true;
11623 /* If the caller doesn't care, always allow inlining. */
11624 if (caller == dont_care)
11625 return true;
11627 /* Otherwise, allow inlining if either the callee and caller values
11628 agree, or if the callee is using the default value. */
11629 return (callee == caller || callee == def);
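/* Illustrative behaviour of the tri-bool check above, assuming the
   usual encoding of 2 for "don't care" and a default (DEF) of 0:

       caller  callee  inline?
         2       2      yes (callee doesn't care)
         1       2      yes (callee doesn't care)
         1       1      yes (values agree)
         0       1      no  (callee explicitly enables, caller doesn't)
         1       0      yes (callee uses the default value)

   This table is a sketch of the rules, not an exhaustive enumeration.  */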
11632 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11633 to inline CALLEE into CALLER based on target-specific info.
11634 Make sure that the caller and callee have compatible architectural
11635 features. Then go through the other possible target attributes
11636 and see if they can block inlining. Try not to reject always_inline
11637 callees unless they are incompatible architecturally. */
11639 static bool
11640 aarch64_can_inline_p (tree caller, tree callee)
11642 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11643 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11645 /* If callee has no option attributes, then it is ok to inline. */
11646 if (!callee_tree)
11647 return true;
11649 struct cl_target_option *caller_opts
11650 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11651 : target_option_default_node);
11653 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11656 /* Callee's ISA flags should be a subset of the caller's. */
11657 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11658 != callee_opts->x_aarch64_isa_flags)
11659 return false;
11661 /* Allow non-strict-aligned functions to be inlined into
11662 strict-aligned ones. */
11663 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11664 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11665 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11666 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11667 return false;
11669 bool always_inline = lookup_attribute ("always_inline",
11670 DECL_ATTRIBUTES (callee));
11672 /* If the architectural features match up and the callee is always_inline
11673 then the other attributes don't matter. */
11674 if (always_inline)
11675 return true;
11677 if (caller_opts->x_aarch64_cmodel_var
11678 != callee_opts->x_aarch64_cmodel_var)
11679 return false;
11681 if (caller_opts->x_aarch64_tls_dialect
11682 != callee_opts->x_aarch64_tls_dialect)
11683 return false;
11685 /* Honour explicit requests to workaround errata. */
11686 if (!aarch64_tribools_ok_for_inlining_p (
11687 caller_opts->x_aarch64_fix_a53_err835769,
11688 callee_opts->x_aarch64_fix_a53_err835769,
11689 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11690 return false;
11692 if (!aarch64_tribools_ok_for_inlining_p (
11693 caller_opts->x_aarch64_fix_a53_err843419,
11694 callee_opts->x_aarch64_fix_a53_err843419,
11695 2, TARGET_FIX_ERR_A53_843419))
11696 return false;
11698 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11699 caller and callee and they don't match up, reject inlining. */
11700 if (!aarch64_tribools_ok_for_inlining_p (
11701 caller_opts->x_flag_omit_leaf_frame_pointer,
11702 callee_opts->x_flag_omit_leaf_frame_pointer,
11703 2, 1))
11704 return false;
11706 /* If the callee has specific tuning overrides, respect them. */
11707 if (callee_opts->x_aarch64_override_tune_string != NULL
11708 && caller_opts->x_aarch64_override_tune_string == NULL)
11709 return false;
11711 /* If the user specified tuning override strings for the
11712 caller and callee and they don't match up, reject inlining.
11713 We just do a string compare here, we don't analyze the meaning
11714 of the string, as it would be too costly for little gain. */
11715 if (callee_opts->x_aarch64_override_tune_string
11716 && caller_opts->x_aarch64_override_tune_string
11717 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11718 caller_opts->x_aarch64_override_tune_string) != 0))
11719 return false;
11721 return true;
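/* A rough user-level illustration (hypothetical code, not taken from
   the testsuite): with the ISA-subset rule above,

     __attribute__ ((target ("+crc")))
     static inline unsigned callee (unsigned x) { return x * 3; }

     unsigned caller (unsigned x) { return callee (x); }

   is rejected for inlining when the translation unit is compiled
   without +crc, because the callee's ISA flags are not a subset of the
   caller's, whereas compiling the whole unit with -march=armv8-a+crc
   makes the flags a subset and allows the inline.  */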
11724 /* Return true if SYMBOL_REF X binds locally. */
11726 static bool
11727 aarch64_symbol_binds_local_p (const_rtx x)
11729 return (SYMBOL_REF_DECL (x)
11730 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11731 : SYMBOL_REF_LOCAL_P (x));
11734 /* Return true if SYMBOL_REF X is thread local */
11735 static bool
11736 aarch64_tls_symbol_p (rtx x)
11738 if (! TARGET_HAVE_TLS)
11739 return false;
11741 if (GET_CODE (x) != SYMBOL_REF)
11742 return false;
11744 return SYMBOL_REF_TLS_MODEL (x) != 0;
11747 /* Classify a TLS symbol into one of the TLS kinds. */
11748 enum aarch64_symbol_type
11749 aarch64_classify_tls_symbol (rtx x)
11751 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11753 switch (tls_kind)
11755 case TLS_MODEL_GLOBAL_DYNAMIC:
11756 case TLS_MODEL_LOCAL_DYNAMIC:
11757 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11759 case TLS_MODEL_INITIAL_EXEC:
11760 switch (aarch64_cmodel)
11762 case AARCH64_CMODEL_TINY:
11763 case AARCH64_CMODEL_TINY_PIC:
11764 return SYMBOL_TINY_TLSIE;
11765 default:
11766 return SYMBOL_SMALL_TLSIE;
11769 case TLS_MODEL_LOCAL_EXEC:
11770 if (aarch64_tls_size == 12)
11771 return SYMBOL_TLSLE12;
11772 else if (aarch64_tls_size == 24)
11773 return SYMBOL_TLSLE24;
11774 else if (aarch64_tls_size == 32)
11775 return SYMBOL_TLSLE32;
11776 else if (aarch64_tls_size == 48)
11777 return SYMBOL_TLSLE48;
11778 else
11779 gcc_unreachable ();
11781 case TLS_MODEL_EMULATED:
11782 case TLS_MODEL_NONE:
11783 return SYMBOL_FORCE_TO_MEM;
11785 default:
11786 gcc_unreachable ();
11790 /* Return the correct method for accessing X + OFFSET, where X is either
11791 a SYMBOL_REF or LABEL_REF. */
11793 enum aarch64_symbol_type
11794 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11796 if (GET_CODE (x) == LABEL_REF)
11798 switch (aarch64_cmodel)
11800 case AARCH64_CMODEL_LARGE:
11801 return SYMBOL_FORCE_TO_MEM;
11803 case AARCH64_CMODEL_TINY_PIC:
11804 case AARCH64_CMODEL_TINY:
11805 return SYMBOL_TINY_ABSOLUTE;
11807 case AARCH64_CMODEL_SMALL_SPIC:
11808 case AARCH64_CMODEL_SMALL_PIC:
11809 case AARCH64_CMODEL_SMALL:
11810 return SYMBOL_SMALL_ABSOLUTE;
11812 default:
11813 gcc_unreachable ();
11817 if (GET_CODE (x) == SYMBOL_REF)
11819 if (aarch64_tls_symbol_p (x))
11820 return aarch64_classify_tls_symbol (x);
11822 switch (aarch64_cmodel)
11824 case AARCH64_CMODEL_TINY:
11825 /* When we retrieve a symbol + offset address, we have to make sure
11826 the offset does not cause overflow of the final address. But
11827 we have no way of knowing the address of the symbol at compile
11828 time, so we can't accurately say whether the distance between the
11829 PC and symbol + offset is outside the addressable range of +/-1M
11830 in the TINY code model. So we rely on images not being greater
11831 than 1M, cap the offset at 1M, and require anything beyond 1M to
11832 be loaded using an alternative mechanism. Furthermore, if the
11833 symbol is a weak reference to something that isn't known to
11834 resolve to a symbol in this module, then force it to memory. */
11835 if ((SYMBOL_REF_WEAK (x)
11836 && !aarch64_symbol_binds_local_p (x))
11837 || !IN_RANGE (offset, -1048575, 1048575))
11838 return SYMBOL_FORCE_TO_MEM;
11839 return SYMBOL_TINY_ABSOLUTE;
11841 case AARCH64_CMODEL_SMALL:
11842 /* Same reasoning as the tiny code model, but the offset cap here is
11843 4G. */
11844 if ((SYMBOL_REF_WEAK (x)
11845 && !aarch64_symbol_binds_local_p (x))
11846 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11847 HOST_WIDE_INT_C (4294967264)))
11848 return SYMBOL_FORCE_TO_MEM;
11849 return SYMBOL_SMALL_ABSOLUTE;
11851 case AARCH64_CMODEL_TINY_PIC:
11852 if (!aarch64_symbol_binds_local_p (x))
11853 return SYMBOL_TINY_GOT;
11854 return SYMBOL_TINY_ABSOLUTE;
11856 case AARCH64_CMODEL_SMALL_SPIC:
11857 case AARCH64_CMODEL_SMALL_PIC:
11858 if (!aarch64_symbol_binds_local_p (x))
11859 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11860 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11861 return SYMBOL_SMALL_ABSOLUTE;
11863 case AARCH64_CMODEL_LARGE:
11864 /* This is alright even in PIC code as the constant
11865 pool reference is always PC relative and within
11866 the same translation unit. */
11867 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11868 return SYMBOL_SMALL_ABSOLUTE;
11869 else
11870 return SYMBOL_FORCE_TO_MEM;
11872 default:
11873 gcc_unreachable ();
11877 /* By default push everything into the constant pool. */
11878 return SYMBOL_FORCE_TO_MEM;
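/* Worked example (illustrative only): with -mcmodel=tiny, an access to
   "extern int arr[]; ... arr[100]" lowers to symbol + 400.  The offset
   fits the +/-1M cap checked above, so the address is classified as
   SYMBOL_TINY_ABSOLUTE and can be materialised with a single ADR.  A
   weak symbol that may resolve outside the module, or an offset beyond
   the cap, falls back to SYMBOL_FORCE_TO_MEM and is loaded from the
   literal pool instead.  The small model follows the same pattern with
   ADRP/ADD and the wider +/-4G cap.  */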
11881 bool
11882 aarch64_constant_address_p (rtx x)
11884 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11887 bool
11888 aarch64_legitimate_pic_operand_p (rtx x)
11890 if (GET_CODE (x) == SYMBOL_REF
11891 || (GET_CODE (x) == CONST
11892 && GET_CODE (XEXP (x, 0)) == PLUS
11893 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11894 return false;
11896 return true;
11899 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11900 that should be rematerialized rather than spilled. */
11902 static bool
11903 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11905 /* Support CSE and rematerialization of common constants. */
11906 if (CONST_INT_P (x)
11907 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11908 || GET_CODE (x) == CONST_VECTOR)
11909 return true;
11911 /* Do not allow vector struct mode constants for Advanced SIMD.
11912 We could support 0 and -1 easily, but they need support in
11913 aarch64-simd.md. */
11914 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11915 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11916 return false;
11918 /* Only accept variable-length vector constants if they can be
11919 handled directly.
11921 ??? It would be possible to handle rematerialization of other
11922 constants via secondary reloads. */
11923 if (vec_flags & VEC_ANY_SVE)
11924 return aarch64_simd_valid_immediate (x, NULL);
11926 if (GET_CODE (x) == HIGH)
11927 x = XEXP (x, 0);
11929 /* Accept polynomial constants that can be calculated by using the
11930 destination of a move as the sole temporary. Constants that
11931 require a second temporary cannot be rematerialized (they can't be
11932 forced to memory and also aren't legitimate constants). */
11933 poly_int64 offset;
11934 if (poly_int_rtx_p (x, &offset))
11935 return aarch64_offset_temporaries (false, offset) <= 1;
11937 /* If an offset is being added to something else, we need to allow the
11938 base to be moved into the destination register, meaning that there
11939 are no free temporaries for the offset. */
11940 x = strip_offset (x, &offset);
11941 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11942 return false;
11944 /* Do not allow const (plus (anchor_symbol, const_int)). */
11945 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11946 return false;
11948 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11949 so spilling them is better than rematerialization. */
11950 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11951 return true;
11953 /* Label references are always constant. */
11954 if (GET_CODE (x) == LABEL_REF)
11955 return true;
11957 return false;
11961 aarch64_load_tp (rtx target)
11963 if (!target
11964 || GET_MODE (target) != Pmode
11965 || !register_operand (target, Pmode))
11966 target = gen_reg_rtx (Pmode);
11968 /* Can return in any reg. */
11969 emit_insn (gen_aarch64_load_tp_hard (target));
11970 return target;
11973 /* On AAPCS systems, this is the "struct __va_list". */
11974 static GTY(()) tree va_list_type;
11976 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11977 Return the type to use as __builtin_va_list.
11979 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11981 struct __va_list
11983 void *__stack;
11984 void *__gr_top;
11985 void *__vr_top;
11986 int __gr_offs;
11987 int __vr_offs;
11988 }; */
11990 static tree
11991 aarch64_build_builtin_va_list (void)
11993 tree va_list_name;
11994 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11996 /* Create the type. */
11997 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11998 /* Give it the required name. */
11999 va_list_name = build_decl (BUILTINS_LOCATION,
12000 TYPE_DECL,
12001 get_identifier ("__va_list"),
12002 va_list_type);
12003 DECL_ARTIFICIAL (va_list_name) = 1;
12004 TYPE_NAME (va_list_type) = va_list_name;
12005 TYPE_STUB_DECL (va_list_type) = va_list_name;
12007 /* Create the fields. */
12008 f_stack = build_decl (BUILTINS_LOCATION,
12009 FIELD_DECL, get_identifier ("__stack"),
12010 ptr_type_node);
12011 f_grtop = build_decl (BUILTINS_LOCATION,
12012 FIELD_DECL, get_identifier ("__gr_top"),
12013 ptr_type_node);
12014 f_vrtop = build_decl (BUILTINS_LOCATION,
12015 FIELD_DECL, get_identifier ("__vr_top"),
12016 ptr_type_node);
12017 f_groff = build_decl (BUILTINS_LOCATION,
12018 FIELD_DECL, get_identifier ("__gr_offs"),
12019 integer_type_node);
12020 f_vroff = build_decl (BUILTINS_LOCATION,
12021 FIELD_DECL, get_identifier ("__vr_offs"),
12022 integer_type_node);
12024 /* Tell tree-stdarg pass about our internal offset fields.
12025 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12026 purposes, to identify whether the code is updating the va_list internal
12027 offset fields in an irregular way. */
12028 va_list_gpr_counter_field = f_groff;
12029 va_list_fpr_counter_field = f_vroff;
12031 DECL_ARTIFICIAL (f_stack) = 1;
12032 DECL_ARTIFICIAL (f_grtop) = 1;
12033 DECL_ARTIFICIAL (f_vrtop) = 1;
12034 DECL_ARTIFICIAL (f_groff) = 1;
12035 DECL_ARTIFICIAL (f_vroff) = 1;
12037 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12038 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12039 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12040 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12041 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12043 TYPE_FIELDS (va_list_type) = f_stack;
12044 DECL_CHAIN (f_stack) = f_grtop;
12045 DECL_CHAIN (f_grtop) = f_vrtop;
12046 DECL_CHAIN (f_vrtop) = f_groff;
12047 DECL_CHAIN (f_groff) = f_vroff;
12049 /* Compute its layout. */
12050 layout_type (va_list_type);
12052 return va_list_type;
12055 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12056 static void
12057 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12059 const CUMULATIVE_ARGS *cum;
12060 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12061 tree stack, grtop, vrtop, groff, vroff;
12062 tree t;
12063 int gr_save_area_size = cfun->va_list_gpr_size;
12064 int vr_save_area_size = cfun->va_list_fpr_size;
12065 int vr_offset;
12067 cum = &crtl->args.info;
12068 if (cfun->va_list_gpr_size)
12069 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12070 cfun->va_list_gpr_size);
12071 if (cfun->va_list_fpr_size)
12072 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12073 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12075 if (!TARGET_FLOAT)
12077 gcc_assert (cum->aapcs_nvrn == 0);
12078 vr_save_area_size = 0;
12081 f_stack = TYPE_FIELDS (va_list_type_node);
12082 f_grtop = DECL_CHAIN (f_stack);
12083 f_vrtop = DECL_CHAIN (f_grtop);
12084 f_groff = DECL_CHAIN (f_vrtop);
12085 f_vroff = DECL_CHAIN (f_groff);
12087 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12088 NULL_TREE);
12089 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12090 NULL_TREE);
12091 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12092 NULL_TREE);
12093 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12094 NULL_TREE);
12095 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12096 NULL_TREE);
12098 /* Emit code to initialize STACK, which points to the next varargs stack
12099 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12100 by named arguments. STACK is 8-byte aligned. */
12101 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12102 if (cum->aapcs_stack_size > 0)
12103 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12104 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12105 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12107 /* Emit code to initialize GRTOP, the top of the GR save area.
12108 virtual_incoming_args_rtx should have been 16 byte aligned. */
12109 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12110 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12111 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12113 /* Emit code to initialize VRTOP, the top of the VR save area.
12114 This address is gr_save_area_bytes below GRTOP, rounded
12115 down to the next 16-byte boundary. */
12116 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12117 vr_offset = ROUND_UP (gr_save_area_size,
12118 STACK_BOUNDARY / BITS_PER_UNIT);
12120 if (vr_offset)
12121 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12122 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12123 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12125 /* Emit code to initialize GROFF, the offset from GRTOP of the
12126 next GPR argument. */
12127 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12128 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12129 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12131 /* Likewise emit code to initialize VROFF, the offset from FTOP
12132 of the next VR argument. */
12133 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12134 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12135 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
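/* Rough worked example (hypothetical prototype, sizes taken from the
   logic above): for

     void f (int n, ...);

   only w0 is named, so gr_save_area_size is (8 - 1) * 8 = 56 bytes and
   vr_save_area_size is 8 * 16 = 128 bytes (assuming the tree-stdarg
   limits do not shrink them).  va_start then records

     __stack   = virtual incoming args pointer (no named stack args)
     __gr_top  = virtual incoming args pointer
     __vr_top  = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128  */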
12138 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12140 static tree
12141 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12142 gimple_seq *post_p ATTRIBUTE_UNUSED)
12144 tree addr;
12145 bool indirect_p;
12146 bool is_ha; /* is HFA or HVA. */
12147 bool dw_align; /* double-word align. */
12148 machine_mode ag_mode = VOIDmode;
12149 int nregs;
12150 machine_mode mode;
12152 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12153 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12154 HOST_WIDE_INT size, rsize, adjust, align;
12155 tree t, u, cond1, cond2;
12157 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12158 if (indirect_p)
12159 type = build_pointer_type (type);
12161 mode = TYPE_MODE (type);
12163 f_stack = TYPE_FIELDS (va_list_type_node);
12164 f_grtop = DECL_CHAIN (f_stack);
12165 f_vrtop = DECL_CHAIN (f_grtop);
12166 f_groff = DECL_CHAIN (f_vrtop);
12167 f_vroff = DECL_CHAIN (f_groff);
12169 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12170 f_stack, NULL_TREE);
12171 size = int_size_in_bytes (type);
12172 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12174 dw_align = false;
12175 adjust = 0;
12176 if (aarch64_vfp_is_call_or_return_candidate (mode,
12177 type,
12178 &ag_mode,
12179 &nregs,
12180 &is_ha))
12182 /* No frontends can create types with variable-sized modes, so we
12183 shouldn't be asked to pass or return them. */
12184 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12186 /* TYPE passed in fp/simd registers. */
12187 if (!TARGET_FLOAT)
12188 aarch64_err_no_fpadvsimd (mode, "varargs");
12190 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12191 unshare_expr (valist), f_vrtop, NULL_TREE);
12192 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12193 unshare_expr (valist), f_vroff, NULL_TREE);
12195 rsize = nregs * UNITS_PER_VREG;
12197 if (is_ha)
12199 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12200 adjust = UNITS_PER_VREG - ag_size;
12202 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12203 && size < UNITS_PER_VREG)
12205 adjust = UNITS_PER_VREG - size;
12208 else
12210 /* TYPE passed in general registers. */
12211 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12212 unshare_expr (valist), f_grtop, NULL_TREE);
12213 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12214 unshare_expr (valist), f_groff, NULL_TREE);
12215 rsize = ROUND_UP (size, UNITS_PER_WORD);
12216 nregs = rsize / UNITS_PER_WORD;
12218 if (align > 8)
12219 dw_align = true;
12221 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12222 && size < UNITS_PER_WORD)
12224 adjust = UNITS_PER_WORD - size;
12228 /* Get a local temporary for the field value. */
12229 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12231 /* Emit code to branch if off >= 0. */
12232 t = build2 (GE_EXPR, boolean_type_node, off,
12233 build_int_cst (TREE_TYPE (off), 0));
12234 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12236 if (dw_align)
12238 /* Emit: offs = (offs + 15) & -16. */
12239 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12240 build_int_cst (TREE_TYPE (off), 15));
12241 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12242 build_int_cst (TREE_TYPE (off), -16));
12243 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12245 else
12246 roundup = NULL;
12248 /* Update ap.__[g|v]r_offs */
12249 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12250 build_int_cst (TREE_TYPE (off), rsize));
12251 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12253 /* String up. */
12254 if (roundup)
12255 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12257 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12258 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12259 build_int_cst (TREE_TYPE (f_off), 0));
12260 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12262 /* String up: make sure the assignment happens before the use. */
12263 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12264 COND_EXPR_ELSE (cond1) = t;
12266 /* Prepare the trees handling the argument that is passed on the stack;
12267 the top level node will store in ON_STACK. */
12268 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12269 if (align > 8)
12271 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12272 t = fold_convert (intDI_type_node, arg);
12273 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12274 build_int_cst (TREE_TYPE (t), 15));
12275 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12276 build_int_cst (TREE_TYPE (t), -16));
12277 t = fold_convert (TREE_TYPE (arg), t);
12278 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12280 else
12281 roundup = NULL;
12282 /* Advance ap.__stack */
12283 t = fold_convert (intDI_type_node, arg);
12284 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12285 build_int_cst (TREE_TYPE (t), size + 7));
12286 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12287 build_int_cst (TREE_TYPE (t), -8));
12288 t = fold_convert (TREE_TYPE (arg), t);
12289 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12290 /* String up roundup and advance. */
12291 if (roundup)
12292 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12293 /* String up with arg */
12294 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12295 /* Big-endianness related address adjustment. */
12296 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12297 && size < UNITS_PER_WORD)
12299 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12300 size_int (UNITS_PER_WORD - size));
12301 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12304 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12305 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12307 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12308 t = off;
12309 if (adjust)
12310 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12311 build_int_cst (TREE_TYPE (off), adjust));
12313 t = fold_convert (sizetype, t);
12314 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12316 if (is_ha)
12318 /* type ha; // treat as "struct {ftype field[n];}"
12319 ... [computing offs]
12320 for (i = 0; i <nregs; ++i, offs += 16)
12321 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12322 return ha; */
12323 int i;
12324 tree tmp_ha, field_t, field_ptr_t;
12326 /* Declare a local variable. */
12327 tmp_ha = create_tmp_var_raw (type, "ha");
12328 gimple_add_tmp_var (tmp_ha);
12330 /* Establish the base type. */
12331 switch (ag_mode)
12333 case E_SFmode:
12334 field_t = float_type_node;
12335 field_ptr_t = float_ptr_type_node;
12336 break;
12337 case E_DFmode:
12338 field_t = double_type_node;
12339 field_ptr_t = double_ptr_type_node;
12340 break;
12341 case E_TFmode:
12342 field_t = long_double_type_node;
12343 field_ptr_t = long_double_ptr_type_node;
12344 break;
12345 case E_HFmode:
12346 field_t = aarch64_fp16_type_node;
12347 field_ptr_t = aarch64_fp16_ptr_type_node;
12348 break;
12349 case E_V2SImode:
12350 case E_V4SImode:
12352 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12353 field_t = build_vector_type_for_mode (innertype, ag_mode);
12354 field_ptr_t = build_pointer_type (field_t);
12356 break;
12357 default:
12358 gcc_assert (0);
12361 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12362 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12363 addr = t;
12364 t = fold_convert (field_ptr_t, addr);
12365 t = build2 (MODIFY_EXPR, field_t,
12366 build1 (INDIRECT_REF, field_t, tmp_ha),
12367 build1 (INDIRECT_REF, field_t, t));
12369 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12370 for (i = 1; i < nregs; ++i)
12372 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12373 u = fold_convert (field_ptr_t, addr);
12374 u = build2 (MODIFY_EXPR, field_t,
12375 build2 (MEM_REF, field_t, tmp_ha,
12376 build_int_cst (field_ptr_t,
12377 (i *
12378 int_size_in_bytes (field_t)))),
12379 build1 (INDIRECT_REF, field_t, u));
12380 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12383 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12384 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12387 COND_EXPR_ELSE (cond2) = t;
12388 addr = fold_convert (build_pointer_type (type), cond1);
12389 addr = build_va_arg_indirect_ref (addr);
12391 if (indirect_p)
12392 addr = build_va_arg_indirect_ref (addr);
12394 return addr;
12397 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12399 static void
12400 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12401 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12402 int no_rtl)
12404 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12405 CUMULATIVE_ARGS local_cum;
12406 int gr_saved = cfun->va_list_gpr_size;
12407 int vr_saved = cfun->va_list_fpr_size;
12409 /* The caller has advanced CUM up to, but not beyond, the last named
12410 argument. Advance a local copy of CUM past the last "real" named
12411 argument, to find out how many registers are left over. */
12412 local_cum = *cum;
12413 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12415 /* Find out how many registers we need to save.
12416 Honor tree-stdarg analysis results. */
12417 if (cfun->va_list_gpr_size)
12418 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12419 cfun->va_list_gpr_size / UNITS_PER_WORD);
12420 if (cfun->va_list_fpr_size)
12421 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12422 cfun->va_list_fpr_size / UNITS_PER_VREG);
12424 if (!TARGET_FLOAT)
12426 gcc_assert (local_cum.aapcs_nvrn == 0);
12427 vr_saved = 0;
12430 if (!no_rtl)
12432 if (gr_saved > 0)
12434 rtx ptr, mem;
12436 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12437 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12438 - gr_saved * UNITS_PER_WORD);
12439 mem = gen_frame_mem (BLKmode, ptr);
12440 set_mem_alias_set (mem, get_varargs_alias_set ());
12442 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12443 mem, gr_saved);
12445 if (vr_saved > 0)
12447 /* We can't use move_block_from_reg, because it will use
12448 the wrong mode, storing D regs only. */
12449 machine_mode mode = TImode;
12450 int off, i, vr_start;
12452 /* Set OFF to the offset from virtual_incoming_args_rtx of
12453 the first vector register. The VR save area lies below
12454 the GR one, and is aligned to 16 bytes. */
12455 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12456 STACK_BOUNDARY / BITS_PER_UNIT);
12457 off -= vr_saved * UNITS_PER_VREG;
12459 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12460 for (i = 0; i < vr_saved; ++i)
12462 rtx ptr, mem;
12464 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12465 mem = gen_frame_mem (mode, ptr);
12466 set_mem_alias_set (mem, get_varargs_alias_set ());
12467 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12468 off += UNITS_PER_VREG;
12473 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12474 any complication of having crtl->args.pretend_args_size changed. */
12475 cfun->machine->frame.saved_varargs_size
12476 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12477 STACK_BOUNDARY / BITS_PER_UNIT)
12478 + vr_saved * UNITS_PER_VREG);
12481 static void
12482 aarch64_conditional_register_usage (void)
12484 int i;
12485 if (!TARGET_FLOAT)
12487 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12489 fixed_regs[i] = 1;
12490 call_used_regs[i] = 1;
12493 if (!TARGET_SVE)
12494 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12496 fixed_regs[i] = 1;
12497 call_used_regs[i] = 1;
12501 /* Walk down the type tree of TYPE counting consecutive base elements.
12502 If *MODEP is VOIDmode, then set it to the first valid floating point
12503 type. If a non-floating point type is found, or if a floating point
12504 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12505 otherwise return the count in the sub-tree. */
12506 static int
12507 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12509 machine_mode mode;
12510 HOST_WIDE_INT size;
12512 switch (TREE_CODE (type))
12514 case REAL_TYPE:
12515 mode = TYPE_MODE (type);
12516 if (mode != DFmode && mode != SFmode
12517 && mode != TFmode && mode != HFmode)
12518 return -1;
12520 if (*modep == VOIDmode)
12521 *modep = mode;
12523 if (*modep == mode)
12524 return 1;
12526 break;
12528 case COMPLEX_TYPE:
12529 mode = TYPE_MODE (TREE_TYPE (type));
12530 if (mode != DFmode && mode != SFmode
12531 && mode != TFmode && mode != HFmode)
12532 return -1;
12534 if (*modep == VOIDmode)
12535 *modep = mode;
12537 if (*modep == mode)
12538 return 2;
12540 break;
12542 case VECTOR_TYPE:
12543 /* Use V2SImode and V4SImode as representatives of all 64-bit
12544 and 128-bit vector types. */
12545 size = int_size_in_bytes (type);
12546 switch (size)
12548 case 8:
12549 mode = V2SImode;
12550 break;
12551 case 16:
12552 mode = V4SImode;
12553 break;
12554 default:
12555 return -1;
12558 if (*modep == VOIDmode)
12559 *modep = mode;
12561 /* Vector modes are considered to be opaque: two vectors are
12562 equivalent for the purposes of being homogeneous aggregates
12563 if they are the same size. */
12564 if (*modep == mode)
12565 return 1;
12567 break;
12569 case ARRAY_TYPE:
12571 int count;
12572 tree index = TYPE_DOMAIN (type);
12574 /* Can't handle incomplete types nor sizes that are not
12575 fixed. */
12576 if (!COMPLETE_TYPE_P (type)
12577 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12578 return -1;
12580 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12581 if (count == -1
12582 || !index
12583 || !TYPE_MAX_VALUE (index)
12584 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12585 || !TYPE_MIN_VALUE (index)
12586 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12587 || count < 0)
12588 return -1;
12590 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12591 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12593 /* There must be no padding. */
12594 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12595 count * GET_MODE_BITSIZE (*modep)))
12596 return -1;
12598 return count;
12601 case RECORD_TYPE:
12603 int count = 0;
12604 int sub_count;
12605 tree field;
12607 /* Can't handle incomplete types nor sizes that are not
12608 fixed. */
12609 if (!COMPLETE_TYPE_P (type)
12610 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12611 return -1;
12613 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12615 if (TREE_CODE (field) != FIELD_DECL)
12616 continue;
12618 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12619 if (sub_count < 0)
12620 return -1;
12621 count += sub_count;
12624 /* There must be no padding. */
12625 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12626 count * GET_MODE_BITSIZE (*modep)))
12627 return -1;
12629 return count;
12632 case UNION_TYPE:
12633 case QUAL_UNION_TYPE:
12635 /* These aren't very interesting except in a degenerate case. */
12636 int count = 0;
12637 int sub_count;
12638 tree field;
12640 /* Can't handle incomplete types nor sizes that are not
12641 fixed. */
12642 if (!COMPLETE_TYPE_P (type)
12643 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12644 return -1;
12646 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12648 if (TREE_CODE (field) != FIELD_DECL)
12649 continue;
12651 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12652 if (sub_count < 0)
12653 return -1;
12654 count = count > sub_count ? count : sub_count;
12657 /* There must be no padding. */
12658 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12659 count * GET_MODE_BITSIZE (*modep)))
12660 return -1;
12662 return count;
12665 default:
12666 break;
12669 return -1;
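/* Worked examples (illustrative only):

     struct { float x, y, z; }        -> 3, *modep == SFmode
     struct { double re, im; } c[2]   -> 4 for the array type,
                                         *modep == DFmode
     struct { float f; double d; }    -> -1 (mixed base types)
     struct { float f; int i; }       -> -1 (non-FP member)  */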
12672 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12673 type as described in AAPCS64 \S 4.1.2.
12675 See the comment above aarch64_composite_type_p for the notes on MODE. */
12677 static bool
12678 aarch64_short_vector_p (const_tree type,
12679 machine_mode mode)
12681 poly_int64 size = -1;
12683 if (type && TREE_CODE (type) == VECTOR_TYPE)
12684 size = int_size_in_bytes (type);
12685 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12686 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12687 size = GET_MODE_SIZE (mode);
12689 return known_eq (size, 8) || known_eq (size, 16);
12692 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12693 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12694 array types. The C99 floating-point complex types are also considered
12695 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12696 types, which are GCC extensions and out of the scope of AAPCS64, are
12697 treated as composite types here as well.
12699 Note that MODE itself is not sufficient in determining whether a type
12700 is such a composite type or not. This is because
12701 stor-layout.c:compute_record_mode may have already changed the MODE
12702 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12703 structure with only one field may have its MODE set to the mode of the
12704 field. Also an integer mode whose size matches the size of the
12705 RECORD_TYPE type may be used to substitute the original mode
12706 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12707 solely relied on. */
12709 static bool
12710 aarch64_composite_type_p (const_tree type,
12711 machine_mode mode)
12713 if (aarch64_short_vector_p (type, mode))
12714 return false;
12716 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12717 return true;
12719 if (mode == BLKmode
12720 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12721 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12722 return true;
12724 return false;
12727 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12728 shall be passed or returned in simd/fp register(s) (providing these
12729 parameter passing registers are available).
12731 Upon successful return, *COUNT returns the number of needed registers,
12732 *BASE_MODE returns the mode of the individual register and, when IS_HA
12733 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12734 floating-point aggregate or a homogeneous short-vector aggregate. */
12736 static bool
12737 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12738 const_tree type,
12739 machine_mode *base_mode,
12740 int *count,
12741 bool *is_ha)
12743 machine_mode new_mode = VOIDmode;
12744 bool composite_p = aarch64_composite_type_p (type, mode);
12746 if (is_ha != NULL) *is_ha = false;
12748 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12749 || aarch64_short_vector_p (type, mode))
12751 *count = 1;
12752 new_mode = mode;
12754 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12756 if (is_ha != NULL) *is_ha = true;
12757 *count = 2;
12758 new_mode = GET_MODE_INNER (mode);
12760 else if (type && composite_p)
12762 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12764 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12766 if (is_ha != NULL) *is_ha = true;
12767 *count = ag_count;
12769 else
12770 return false;
12772 else
12773 return false;
12775 *base_mode = new_mode;
12776 return true;
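/* For instance (illustrative only): a _Complex double argument gives
   *count == 2, *base_mode == DFmode and *is_ha set, while

     struct hfa { float a, b, c, d; };

   gives *count == 4 with *base_mode == SFmode.  A structure of five
   floats exceeds HA_MAX_NUM_FLDS and is therefore not a candidate.  */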
12779 /* Implement TARGET_STRUCT_VALUE_RTX. */
12781 static rtx
12782 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12783 int incoming ATTRIBUTE_UNUSED)
12785 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12788 /* Implements target hook vector_mode_supported_p. */
12789 static bool
12790 aarch64_vector_mode_supported_p (machine_mode mode)
12792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12793 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12796 /* Return appropriate SIMD container
12797 for MODE within a vector of WIDTH bits. */
12798 static machine_mode
12799 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12801 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12802 switch (mode)
12804 case E_DFmode:
12805 return VNx2DFmode;
12806 case E_SFmode:
12807 return VNx4SFmode;
12808 case E_HFmode:
12809 return VNx8HFmode;
12810 case E_DImode:
12811 return VNx2DImode;
12812 case E_SImode:
12813 return VNx4SImode;
12814 case E_HImode:
12815 return VNx8HImode;
12816 case E_QImode:
12817 return VNx16QImode;
12818 default:
12819 return word_mode;
12822 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12823 if (TARGET_SIMD)
12825 if (known_eq (width, 128))
12826 switch (mode)
12828 case E_DFmode:
12829 return V2DFmode;
12830 case E_SFmode:
12831 return V4SFmode;
12832 case E_HFmode:
12833 return V8HFmode;
12834 case E_SImode:
12835 return V4SImode;
12836 case E_HImode:
12837 return V8HImode;
12838 case E_QImode:
12839 return V16QImode;
12840 case E_DImode:
12841 return V2DImode;
12842 default:
12843 break;
12845 else
12846 switch (mode)
12848 case E_SFmode:
12849 return V2SFmode;
12850 case E_HFmode:
12851 return V4HFmode;
12852 case E_SImode:
12853 return V2SImode;
12854 case E_HImode:
12855 return V4HImode;
12856 case E_QImode:
12857 return V8QImode;
12858 default:
12859 break;
12862 return word_mode;
12865 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12866 static machine_mode
12867 aarch64_preferred_simd_mode (scalar_mode mode)
12869 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12870 return aarch64_simd_container_mode (mode, bits);
12873 /* Return a list of possible vector sizes for the vectorizer
12874 to iterate over. */
12875 static void
12876 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12878 if (TARGET_SVE)
12879 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12880 sizes->safe_push (16);
12881 sizes->safe_push (8);
12884 /* Implement TARGET_MANGLE_TYPE. */
12886 static const char *
12887 aarch64_mangle_type (const_tree type)
12889 /* The AArch64 ABI documents say that "__va_list" has to be
12890 mangled as if it is in the "std" namespace. */
12891 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12892 return "St9__va_list";
12894 /* Half-precision float. */
12895 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12896 return "Dh";
12898 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12899 builtin types. */
12900 if (TYPE_NAME (type) != NULL)
12901 return aarch64_mangle_builtin_type (type);
12903 /* Use the default mangling. */
12904 return NULL;
12907 /* Find the first rtx_insn before insn that will generate an assembly
12908 instruction. */
12910 static rtx_insn *
12911 aarch64_prev_real_insn (rtx_insn *insn)
12913 if (!insn)
12914 return NULL;
12918 insn = prev_real_insn (insn);
12920 while (insn && recog_memoized (insn) < 0);
12922 return insn;
12925 static bool
12926 is_madd_op (enum attr_type t1)
12928 unsigned int i;
12929 /* A number of these may be AArch32 only. */
12930 enum attr_type mlatypes[] = {
12931 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12932 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12933 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12936 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12938 if (t1 == mlatypes[i])
12939 return true;
12942 return false;
12945 /* Check if there is a register dependency between a load and the insn
12946 for which we hold recog_data. */
12948 static bool
12949 dep_between_memop_and_curr (rtx memop)
12951 rtx load_reg;
12952 int opno;
12954 gcc_assert (GET_CODE (memop) == SET);
12956 if (!REG_P (SET_DEST (memop)))
12957 return false;
12959 load_reg = SET_DEST (memop);
12960 for (opno = 1; opno < recog_data.n_operands; opno++)
12962 rtx operand = recog_data.operand[opno];
12963 if (REG_P (operand)
12964 && reg_overlap_mentioned_p (load_reg, operand))
12965 return true;
12968 return false;
12972 /* When working around the Cortex-A53 erratum 835769,
12973 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12974 instruction and has a preceding memory instruction such that a NOP
12975 should be inserted between them. */
12977 bool
12978 aarch64_madd_needs_nop (rtx_insn* insn)
12980 enum attr_type attr_type;
12981 rtx_insn *prev;
12982 rtx body;
12984 if (!TARGET_FIX_ERR_A53_835769)
12985 return false;
12987 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12988 return false;
12990 attr_type = get_attr_type (insn);
12991 if (!is_madd_op (attr_type))
12992 return false;
12994 prev = aarch64_prev_real_insn (insn);
12995 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12996 Restore recog state to INSN to avoid state corruption. */
12997 extract_constrain_insn_cached (insn);
12999 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13000 return false;
13002 body = single_set (prev);
13004 /* If the previous insn is a memory op and there is no dependency between
13005 it and the DImode madd, emit a NOP between them. If body is NULL then we
13006 have a complex memory operation, probably a load/store pair.
13007 Be conservative for now and emit a NOP. */
13008 if (GET_MODE (recog_data.operand[0]) == DImode
13009 && (!body || !dep_between_memop_and_curr (body)))
13010 return true;
13012 return false;
13017 /* Implement FINAL_PRESCAN_INSN. */
13019 void
13020 aarch64_final_prescan_insn (rtx_insn *insn)
13022 if (aarch64_madd_needs_nop (insn))
13023 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13027 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13028 instruction. */
13030 bool
13031 aarch64_sve_index_immediate_p (rtx base_or_step)
13033 return (CONST_INT_P (base_or_step)
13034 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13037 /* Return true if X is a valid immediate for the SVE ADD and SUB
13038 instructions. Negate X first if NEGATE_P is true. */
13040 bool
13041 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13043 rtx elt;
13045 if (!const_vec_duplicate_p (x, &elt)
13046 || !CONST_INT_P (elt))
13047 return false;
13049 HOST_WIDE_INT val = INTVAL (elt);
13050 if (negate_p)
13051 val = -val;
13052 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13054 if (val & 0xff)
13055 return IN_RANGE (val, 0, 0xff);
13056 return IN_RANGE (val, 0, 0xff00);
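/* Worked examples (illustrative, for a 32-bit element size): a vector
   duplicating 0x7f is accepted as-is, one duplicating 0x3500 is
   accepted as 0x35 shifted left by 8, while 0x101 is rejected because
   its low byte is non-zero and the value does not fit in 8 bits.  A
   SUB-style caller can pass NEGATE_P so that a duplicated -0x1200 is
   accepted as 0x12, LSL #8.  */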
13059 /* Return true if X is a valid immediate operand for an SVE logical
13060 instruction such as AND. */
13062 bool
13063 aarch64_sve_bitmask_immediate_p (rtx x)
13065 rtx elt;
13067 return (const_vec_duplicate_p (x, &elt)
13068 && CONST_INT_P (elt)
13069 && aarch64_bitmask_imm (INTVAL (elt),
13070 GET_MODE_INNER (GET_MODE (x))));
13073 /* Return true if X is a valid immediate for the SVE DUP and CPY
13074 instructions. */
13076 bool
13077 aarch64_sve_dup_immediate_p (rtx x)
13079 rtx elt;
13081 if (!const_vec_duplicate_p (x, &elt)
13082 || !CONST_INT_P (elt))
13083 return false;
13085 HOST_WIDE_INT val = INTVAL (elt);
13086 if (val & 0xff)
13087 return IN_RANGE (val, -0x80, 0x7f);
13088 return IN_RANGE (val, -0x8000, 0x7f00);
13091 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13092 SIGNED_P says whether the operand is signed rather than unsigned. */
13094 bool
13095 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13097 rtx elt;
13099 return (const_vec_duplicate_p (x, &elt)
13100 && CONST_INT_P (elt)
13101 && (signed_p
13102 ? IN_RANGE (INTVAL (elt), -16, 15)
13103 : IN_RANGE (INTVAL (elt), 0, 127)));
13106 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13107 instruction. Negate X first if NEGATE_P is true. */
13109 bool
13110 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13112 rtx elt;
13113 REAL_VALUE_TYPE r;
13115 if (!const_vec_duplicate_p (x, &elt)
13116 || GET_CODE (elt) != CONST_DOUBLE)
13117 return false;
13119 r = *CONST_DOUBLE_REAL_VALUE (elt);
13121 if (negate_p)
13122 r = real_value_negate (&r);
13124 if (real_equal (&r, &dconst1))
13125 return true;
13126 if (real_equal (&r, &dconsthalf))
13127 return true;
13128 return false;
13131 /* Return true if X is a valid immediate operand for an SVE FMUL
13132 instruction. */
13134 bool
13135 aarch64_sve_float_mul_immediate_p (rtx x)
13137 rtx elt;
13139 /* GCC will never generate a multiply with an immediate of 2, so there is no
13140 point testing for it (even though it is a valid constant). */
13141 return (const_vec_duplicate_p (x, &elt)
13142 && GET_CODE (elt) == CONST_DOUBLE
13143 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13146 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13147 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13148 is nonnull, use it to describe valid immediates. */
13149 static bool
13150 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13151 simd_immediate_info *info,
13152 enum simd_immediate_check which,
13153 simd_immediate_info::insn_type insn)
13155 /* Try a 4-byte immediate with LSL. */
13156 for (unsigned int shift = 0; shift < 32; shift += 8)
13157 if ((val32 & (0xff << shift)) == val32)
13159 if (info)
13160 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13161 simd_immediate_info::LSL, shift);
13162 return true;
13165 /* Try a 2-byte immediate with LSL. */
13166 unsigned int imm16 = val32 & 0xffff;
13167 if (imm16 == (val32 >> 16))
13168 for (unsigned int shift = 0; shift < 16; shift += 8)
13169 if ((imm16 & (0xff << shift)) == imm16)
13171 if (info)
13172 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13173 simd_immediate_info::LSL, shift);
13174 return true;
13177 /* Try a 4-byte immediate with MSL, except for cases that MVN
13178 can handle. */
13179 if (which == AARCH64_CHECK_MOV)
13180 for (unsigned int shift = 8; shift < 24; shift += 8)
13182 unsigned int low = (1 << shift) - 1;
13183 if (((val32 & (0xff << shift)) | low) == val32)
13185 if (info)
13186 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13187 simd_immediate_info::MSL, shift);
13188 return true;
13192 return false;
13195 /* Return true if replicating VAL64 is a valid immediate for the
13196 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13197 use it to describe valid immediates. */
13198 static bool
13199 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13200 simd_immediate_info *info,
13201 enum simd_immediate_check which)
13203 unsigned int val32 = val64 & 0xffffffff;
13204 unsigned int val16 = val64 & 0xffff;
13205 unsigned int val8 = val64 & 0xff;
13207 if (val32 == (val64 >> 32))
13209 if ((which & AARCH64_CHECK_ORR) != 0
13210 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13211 simd_immediate_info::MOV))
13212 return true;
13214 if ((which & AARCH64_CHECK_BIC) != 0
13215 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13216 simd_immediate_info::MVN))
13217 return true;
13219 /* Try using a replicated byte. */
13220 if (which == AARCH64_CHECK_MOV
13221 && val16 == (val32 >> 16)
13222 && val8 == (val16 >> 8))
13224 if (info)
13225 *info = simd_immediate_info (QImode, val8);
13226 return true;
13230 /* Try using a bit-to-bytemask. */
13231 if (which == AARCH64_CHECK_MOV)
13233 unsigned int i;
13234 for (i = 0; i < 64; i += 8)
13236 unsigned char byte = (val64 >> i) & 0xff;
13237 if (byte != 0 && byte != 0xff)
13238 break;
13240 if (i == 64)
13242 if (info)
13243 *info = simd_immediate_info (DImode, val64);
13244 return true;
13247 return false;
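/* Worked examples for a MOV-type check (illustrative only): replicating
   0x45 in each 32-bit chunk is accepted as an SImode MOVI with no shift;
   replicating 0xffffffb8 is accepted through the MVN path as ~0x47;
   0x4141414141414141 is a replicated byte (QImode 0x41); and
   0x00ff0000ffff00ff passes the bit-to-bytemask test because every
   byte is either 0x00 or 0xff.  */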
13250 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13251 instruction. If INFO is nonnull, use it to describe valid immediates. */
13253 static bool
13254 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13255 simd_immediate_info *info)
13257 scalar_int_mode mode = DImode;
13258 unsigned int val32 = val64 & 0xffffffff;
13259 if (val32 == (val64 >> 32))
13261 mode = SImode;
13262 unsigned int val16 = val32 & 0xffff;
13263 if (val16 == (val32 >> 16))
13265 mode = HImode;
13266 unsigned int val8 = val16 & 0xff;
13267 if (val8 == (val16 >> 8))
13268 mode = QImode;
13271 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13272 if (IN_RANGE (val, -0x80, 0x7f))
13274 /* DUP with no shift. */
13275 if (info)
13276 *info = simd_immediate_info (mode, val);
13277 return true;
13279 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13281 /* DUP with LSL #8. */
13282 if (info)
13283 *info = simd_immediate_info (mode, val);
13284 return true;
13286 if (aarch64_bitmask_imm (val64, mode))
13288 /* DUPM. */
13289 if (info)
13290 *info = simd_immediate_info (mode, val);
13291 return true;
13293 return false;
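/* Worked examples (illustrative only): replicating 42 in each 32-bit
   chunk is a plain SImode DUP; replicating 0x1200 in each 32-bit chunk
   is a DUP of 0x12 with LSL #8; and replicating 0x00ff in each 16-bit
   chunk is out of DUP range (255) but is a valid bitmask immediate,
   so it is handled as DUPM.  */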
13296 /* Return true if OP is a valid SIMD immediate for the operation
13297 described by WHICH. If INFO is nonnull, use it to describe valid
13298 immediates. */
13299 bool
13300 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13301 enum simd_immediate_check which)
13303 machine_mode mode = GET_MODE (op);
13304 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13305 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13306 return false;
13308 scalar_mode elt_mode = GET_MODE_INNER (mode);
13309 rtx base, step;
13310 unsigned int n_elts;
13311 if (GET_CODE (op) == CONST_VECTOR
13312 && CONST_VECTOR_DUPLICATE_P (op))
13313 n_elts = CONST_VECTOR_NPATTERNS (op);
13314 else if ((vec_flags & VEC_SVE_DATA)
13315 && const_vec_series_p (op, &base, &step))
13317 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13318 if (!aarch64_sve_index_immediate_p (base)
13319 || !aarch64_sve_index_immediate_p (step))
13320 return false;
13322 if (info)
13323 *info = simd_immediate_info (elt_mode, base, step);
13324 return true;
13326 else if (GET_CODE (op) == CONST_VECTOR
13327 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13328 /* N_ELTS set above. */;
13329 else
13330 return false;
13332 /* Handle PFALSE and PTRUE. */
13333 if (vec_flags & VEC_SVE_PRED)
13334 return (op == CONST0_RTX (mode)
13335 || op == CONSTM1_RTX (mode));
13337 scalar_float_mode elt_float_mode;
13338 if (n_elts == 1
13339 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13341 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13342 if (aarch64_float_const_zero_rtx_p (elt)
13343 || aarch64_float_const_representable_p (elt))
13345 if (info)
13346 *info = simd_immediate_info (elt_float_mode, elt);
13347 return true;
13351 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13352 if (elt_size > 8)
13353 return false;
13355 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13357 /* Expand the vector constant out into a byte vector, with the least
13358 significant byte of the register first. */
13359 auto_vec<unsigned char, 16> bytes;
13360 bytes.reserve (n_elts * elt_size);
13361 for (unsigned int i = 0; i < n_elts; i++)
13363 /* The vector is provided in GCC's endian-neutral fashion.
13364 For aarch64_be Advanced SIMD, it must be laid out in the vector
13365 register in reverse order. */
13366 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13367 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13369 if (elt_mode != elt_int_mode)
13370 elt = gen_lowpart (elt_int_mode, elt);
13372 if (!CONST_INT_P (elt))
13373 return false;
13375 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13376 for (unsigned int byte = 0; byte < elt_size; byte++)
13378 bytes.quick_push (elt_val & 0xff);
13379 elt_val >>= BITS_PER_UNIT;
13383 /* The immediate must repeat every eight bytes. */
13384 unsigned int nbytes = bytes.length ();
13385 for (unsigned i = 8; i < nbytes; ++i)
13386 if (bytes[i] != bytes[i - 8])
13387 return false;
13389 /* Get the repeating 8-byte value as an integer. No endian correction
13390 is needed here because bytes is already in lsb-first order. */
13391 unsigned HOST_WIDE_INT val64 = 0;
13392 for (unsigned int i = 0; i < 8; i++)
13393 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13394 << (i * BITS_PER_UNIT));
13396 if (vec_flags & VEC_SVE_DATA)
13397 return aarch64_sve_valid_immediate (val64, info);
13398 else
13399 return aarch64_advsimd_valid_immediate (val64, info, which);
13402 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13403 has a step that is in range for an SVE INDEX instruction. Return the
13404 index expression if so, otherwise return null. */
13406 aarch64_check_zero_based_sve_index_immediate (rtx x)
13408 rtx base, step;
13409 if (const_vec_series_p (x, &base, &step)
13410 && base == const0_rtx
13411 && aarch64_sve_index_immediate_p (step))
13412 return step;
13413 return NULL_RTX;
13416 /* Check whether immediate shift constants are within range. */
13417 bool
13418 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13420 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13421 if (left)
13422 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13423 else
13424 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
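/* For example (illustrative only): for V4SImode the element width is
   32 bits, so a vector immediate left shift must duplicate a value in
   [0, 31] while an immediate right shift must duplicate a value in
   [1, 32], matching the SHL and SSHR/USHR encodings.  */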
13427 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13428 operation of width WIDTH at bit position POS. */
13431 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13433 gcc_assert (CONST_INT_P (width));
13434 gcc_assert (CONST_INT_P (pos));
13436 unsigned HOST_WIDE_INT mask
13437 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13438 return GEN_INT (mask << UINTVAL (pos));
13441 bool
13442 aarch64_mov_operand_p (rtx x, machine_mode mode)
13444 if (GET_CODE (x) == HIGH
13445 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13446 return true;
13448 if (CONST_INT_P (x))
13449 return true;
13451 if (VECTOR_MODE_P (GET_MODE (x)))
13452 return aarch64_simd_valid_immediate (x, NULL);
13454 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13455 return true;
13457 if (aarch64_sve_cnt_immediate_p (x))
13458 return true;
13460 return aarch64_classify_symbolic_expression (x)
13461 == SYMBOL_TINY_ABSOLUTE;
13464 /* Return a const_int vector of VAL. */
13466 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13468 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13469 return gen_const_vec_duplicate (mode, c);
13472 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13474 bool
13475 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13477 machine_mode vmode;
13479 vmode = aarch64_simd_container_mode (mode, 64);
13480 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13481 return aarch64_simd_valid_immediate (op_v, NULL);
13484 /* Construct and return a PARALLEL RTX vector with elements numbering the
13485 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13486 the vector - from the perspective of the architecture. This does not
13487 line up with GCC's perspective on lane numbers, so we end up with
13488 different masks depending on our target endian-ness. The diagram
13489 below may help. We must draw the distinction when building masks
13490 which select one half of the vector. An instruction selecting
13491 architectural low-lanes for a big-endian target must be described using
13492 a mask selecting GCC high-lanes.
13494 Big-Endian Little-Endian
13496 GCC 0 1 2 3 3 2 1 0
13497 | x | x | x | x | | x | x | x | x |
13498 Architecture 3 2 1 0 3 2 1 0
13500 Low Mask: { 2, 3 } { 0, 1 }
13501 High Mask: { 0, 1 } { 2, 3 }
13503 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13506 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13508 rtvec v = rtvec_alloc (nunits / 2);
13509 int high_base = nunits / 2;
13510 int low_base = 0;
13511 int base;
13512 rtx t1;
13513 int i;
13515 if (BYTES_BIG_ENDIAN)
13516 base = high ? low_base : high_base;
13517 else
13518 base = high ? high_base : low_base;
13520 for (i = 0; i < nunits / 2; i++)
13521 RTVEC_ELT (v, i) = GEN_INT (base + i);
13523 t1 = gen_rtx_PARALLEL (mode, v);
13524 return t1;
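/* As a concrete example of the diagram above, for a four-element vector
   (NUNITS == 4) this returns:
     little-endian: HIGH -> (parallel [2 3]),  !HIGH -> (parallel [0 1])
     big-endian:    HIGH -> (parallel [0 1]),  !HIGH -> (parallel [2 3]) */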
13527 /* Check OP for validity as a PARALLEL RTX vector with elements
13528 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13529 from the perspective of the architecture. See the diagram above
13530 aarch64_simd_vect_par_cnst_half for more details. */
13532 bool
13533 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13534 bool high)
13536 int nelts;
13537 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13538 return false;
13540 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13541 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13542 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13543 int i = 0;
13545 if (count_op != count_ideal)
13546 return false;
13548 for (i = 0; i < count_ideal; i++)
13550 rtx elt_op = XVECEXP (op, 0, i);
13551 rtx elt_ideal = XVECEXP (ideal, 0, i);
13553 if (!CONST_INT_P (elt_op)
13554 || INTVAL (elt_ideal) != INTVAL (elt_op))
13555 return false;
13557 return true;
13560 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13561 HIGH (exclusive). */
13562 void
13563 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13564 const_tree exp)
13566 HOST_WIDE_INT lane;
13567 gcc_assert (CONST_INT_P (operand));
13568 lane = INTVAL (operand);
13570 if (lane < low || lane >= high)
13572 if (exp)
13573 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13574 else
13575 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13579 /* Perform endian correction on lane number N, which indexes a vector
13580 of mode MODE, and return the result as an SImode rtx. */
13583 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13585 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13588 /* Return TRUE if OP is a valid vector addressing mode. */
13590 bool
13591 aarch64_simd_mem_operand_p (rtx op)
13593 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13594 || REG_P (XEXP (op, 0)));
13597 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13599 bool
13600 aarch64_sve_ld1r_operand_p (rtx op)
13602 struct aarch64_address_info addr;
13603 scalar_mode mode;
13605 return (MEM_P (op)
13606 && is_a <scalar_mode> (GET_MODE (op), &mode)
13607 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13608 && addr.type == ADDRESS_REG_IMM
13609 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
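/* For instance, assuming offset_6bit_unsigned_scaled_p requires the offset
   to be a non-negative multiple of the element size with a quotient of at
   most 63, an SImode LD1R would accept offsets 0, 4, 8, ..., 252 and reject
   offsets such as 253 or -4. */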
13612 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13613 The conditions for STR are the same. */
13614 bool
13615 aarch64_sve_ldr_operand_p (rtx op)
13617 struct aarch64_address_info addr;
13619 return (MEM_P (op)
13620 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13621 false, ADDR_QUERY_ANY)
13622 && addr.type == ADDRESS_REG_IMM);
13625 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13626 We need to be able to access the individual pieces, so the range
13627 is different from LD[234] and ST[234]. */
13628 bool
13629 aarch64_sve_struct_memory_operand_p (rtx op)
13631 if (!MEM_P (op))
13632 return false;
13634 machine_mode mode = GET_MODE (op);
13635 struct aarch64_address_info addr;
13636 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13637 ADDR_QUERY_ANY)
13638 || addr.type != ADDRESS_REG_IMM)
13639 return false;
13641 poly_int64 first = addr.const_offset;
13642 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13643 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13644 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
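/* Rough sketch of the effect, assuming offset_4bit_signed_scaled_p accepts
   offsets of -8 to +7 vectors: for a two-vector tuple both the first and the
   last vector must be addressable, so the start offset is limited to the
   range of -8 to +6 vectors from the base register. */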
13647 /* Emit a register copy from operand to operand, taking care not to
13648 early-clobber source registers in the process.
13650 COUNT is the number of components into which the copy needs to be
13651 decomposed. */
13652 void
13653 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13654 unsigned int count)
13656 unsigned int i;
13657 int rdest = REGNO (operands[0]);
13658 int rsrc = REGNO (operands[1]);
13660 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13661 || rdest < rsrc)
13662 for (i = 0; i < count; i++)
13663 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13664 gen_rtx_REG (mode, rsrc + i));
13665 else
13666 for (i = 0; i < count; i++)
13667 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13668 gen_rtx_REG (mode, rsrc + count - i - 1));
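/* For example, copying a two-register value from {v1, v2} to {v0, v1} can be
   done in ascending order (v0 <- v1 first), whereas copying from {v0, v1} to
   {v1, v2} must be done in descending order (v2 <- v1 first) so that v1 is
   read before it is overwritten. */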
13671 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13672 one of VSTRUCT modes: OI, CI, or XI. */
13674 aarch64_simd_attr_length_rglist (machine_mode mode)
13676 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13677 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13680 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13681 alignment of a vector to 128 bits. SVE predicates have an alignment of
13682 16 bits. */
13683 static HOST_WIDE_INT
13684 aarch64_simd_vector_alignment (const_tree type)
13686 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13687 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13688 be set for non-predicate vectors of booleans. Modes are the most
13689 direct way we have of identifying real SVE predicate types. */
13690 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13691 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13692 return MIN (align, 128);
13695 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13696 static HOST_WIDE_INT
13697 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13699 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13701 /* If the length of the vector is fixed, try to align to that length,
13702 otherwise don't try to align at all. */
13703 HOST_WIDE_INT result;
13704 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13705 result = TYPE_ALIGN (TREE_TYPE (type));
13706 return result;
13708 return TYPE_ALIGN (type);
13711 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13712 static bool
13713 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13715 if (is_packed)
13716 return false;
13718 /* For fixed-length vectors, check that the vectorizer will aim for
13719 full-vector alignment. This isn't true for generic GCC vectors
13720 that are wider than the ABI maximum of 128 bits. */
13721 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13722 && (wi::to_widest (TYPE_SIZE (type))
13723 != aarch64_vectorize_preferred_vector_alignment (type)))
13724 return false;
13726 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13727 return true;
13730 /* Return true if the vector misalignment factor is supported by the
13731 target. */
13732 static bool
13733 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13734 const_tree type, int misalignment,
13735 bool is_packed)
13737 if (TARGET_SIMD && STRICT_ALIGNMENT)
13740 /* Return false if the movmisalign pattern is not supported for this mode. */
13740 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13741 return false;
13743 /* Misalignment factor is unknown at compile time. */
13744 if (misalignment == -1)
13745 return false;
13747 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13748 is_packed);
13751 /* If VALS is a vector constant that can be loaded into a register
13752 using DUP, generate instructions to do so and return an RTX to
13753 assign to the register. Otherwise return NULL_RTX. */
13754 static rtx
13755 aarch64_simd_dup_constant (rtx vals)
13757 machine_mode mode = GET_MODE (vals);
13758 machine_mode inner_mode = GET_MODE_INNER (mode);
13759 rtx x;
13761 if (!const_vec_duplicate_p (vals, &x))
13762 return NULL_RTX;
13764 /* We can load this constant by using DUP and a constant in a
13765 single ARM register. This will be cheaper than a vector
13766 load. */
13767 x = copy_to_mode_reg (inner_mode, x);
13768 return gen_vec_duplicate (mode, x);
13772 /* Generate code to load VALS, which is a PARALLEL containing only
13773 constants (for vec_init) or CONST_VECTOR, efficiently into a
13774 register. Returns an RTX to copy into the register, or NULL_RTX
13775 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13776 static rtx
13777 aarch64_simd_make_constant (rtx vals)
13779 machine_mode mode = GET_MODE (vals);
13780 rtx const_dup;
13781 rtx const_vec = NULL_RTX;
13782 int n_const = 0;
13783 int i;
13785 if (GET_CODE (vals) == CONST_VECTOR)
13786 const_vec = vals;
13787 else if (GET_CODE (vals) == PARALLEL)
13789 /* A CONST_VECTOR must contain only CONST_INTs and
13790 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13791 Only store valid constants in a CONST_VECTOR. */
13792 int n_elts = XVECLEN (vals, 0);
13793 for (i = 0; i < n_elts; ++i)
13795 rtx x = XVECEXP (vals, 0, i);
13796 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13797 n_const++;
13799 if (n_const == n_elts)
13800 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13802 else
13803 gcc_unreachable ();
13805 if (const_vec != NULL_RTX
13806 && aarch64_simd_valid_immediate (const_vec, NULL))
13807 /* Load using MOVI/MVNI. */
13808 return const_vec;
13809 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13810 /* Loaded using DUP. */
13811 return const_dup;
13812 else if (const_vec != NULL_RTX)
13813 /* Load from constant pool. We cannot take advantage of single-cycle
13814 LD1 because we need a PC-relative addressing mode. */
13815 return const_vec;
13816 else
13817 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13818 We cannot construct an initializer. */
13819 return NULL_RTX;
13822 /* Expand a vector initialisation sequence, such that TARGET is
13823 initialised to contain VALS. */
13825 void
13826 aarch64_expand_vector_init (rtx target, rtx vals)
13828 machine_mode mode = GET_MODE (target);
13829 scalar_mode inner_mode = GET_MODE_INNER (mode);
13830 /* The number of vector elements. */
13831 int n_elts = XVECLEN (vals, 0);
13832 /* The number of vector elements which are not constant. */
13833 int n_var = 0;
13834 rtx any_const = NULL_RTX;
13835 /* The first element of vals. */
13836 rtx v0 = XVECEXP (vals, 0, 0);
13837 bool all_same = true;
13839 /* Count the number of variable elements to initialise. */
13840 for (int i = 0; i < n_elts; ++i)
13842 rtx x = XVECEXP (vals, 0, i);
13843 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13844 ++n_var;
13845 else
13846 any_const = x;
13848 all_same &= rtx_equal_p (x, v0);
13851 /* No variable elements; hand off to aarch64_simd_make_constant, which knows
13852 how best to handle this. */
13853 if (n_var == 0)
13855 rtx constant = aarch64_simd_make_constant (vals);
13856 if (constant != NULL_RTX)
13858 emit_move_insn (target, constant);
13859 return;
13863 /* Splat a single non-constant element if we can. */
13864 if (all_same)
13866 rtx x = copy_to_mode_reg (inner_mode, v0);
13867 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13868 return;
13871 enum insn_code icode = optab_handler (vec_set_optab, mode);
13872 gcc_assert (icode != CODE_FOR_nothing);
13874 /* If there are only variable elements, try to optimize
13875 the insertion using dup for the most common element
13876 followed by insertions. */
13878 /* The algorithm will fill matches[*][0] with the earliest matching element,
13879 and matches[X][1] with the count of duplicate elements (if X is the
13880 earliest element which has duplicates). */
13882 if (n_var == n_elts && n_elts <= 16)
13884 int matches[16][2] = {0};
13885 for (int i = 0; i < n_elts; i++)
13887 for (int j = 0; j <= i; j++)
13889 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13891 matches[i][0] = j;
13892 matches[j][1]++;
13893 break;
13897 int maxelement = 0;
13898 int maxv = 0;
13899 for (int i = 0; i < n_elts; i++)
13900 if (matches[i][1] > maxv)
13902 maxelement = i;
13903 maxv = matches[i][1];
13906 /* Create a duplicate of the most common element. */
13907 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13908 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13910 /* Insert the rest. */
13911 for (int i = 0; i < n_elts; i++)
13913 rtx x = XVECEXP (vals, 0, i);
13914 if (matches[i][0] == maxelement)
13915 continue;
13916 x = copy_to_mode_reg (inner_mode, x);
13917 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13919 return;
13922 /* Initialise a vector which is part-variable. We want to first try
13923 to build those lanes which are constant in the most efficient way we
13924 can. */
13925 if (n_var != n_elts)
13927 rtx copy = copy_rtx (vals);
13929 /* Load constant part of vector. We really don't care what goes into the
13930 parts we will overwrite, but we're more likely to be able to load the
13931 constant efficiently if it has fewer, larger, repeating parts
13932 (see aarch64_simd_valid_immediate). */
13933 for (int i = 0; i < n_elts; i++)
13935 rtx x = XVECEXP (vals, 0, i);
13936 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13937 continue;
13938 rtx subst = any_const;
13939 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13941 /* Look in the copied vector, as more elements are const. */
13942 rtx test = XVECEXP (copy, 0, i ^ bit);
13943 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13945 subst = test;
13946 break;
13949 XVECEXP (copy, 0, i) = subst;
13951 aarch64_expand_vector_init (target, copy);
13954 /* Insert the variable lanes directly. */
13955 for (int i = 0; i < n_elts; i++)
13957 rtx x = XVECEXP (vals, 0, i);
13958 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13959 continue;
13960 x = copy_to_mode_reg (inner_mode, x);
13961 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13965 static unsigned HOST_WIDE_INT
13966 aarch64_shift_truncation_mask (machine_mode mode)
13968 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
13969 return 0;
13970 return GET_MODE_UNIT_BITSIZE (mode) - 1;
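/* For example, when SHIFT_COUNT_TRUNCATED holds this yields a mask of 31 for
   SImode shifts and 63 for DImode shifts, while vector modes (and targets
   without shift-count truncation) get 0, meaning no truncation is assumed. */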
13973 /* Select a format to encode pointers in exception handling data. */
13975 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
13977 int type;
13978 switch (aarch64_cmodel)
13980 case AARCH64_CMODEL_TINY:
13981 case AARCH64_CMODEL_TINY_PIC:
13982 case AARCH64_CMODEL_SMALL:
13983 case AARCH64_CMODEL_SMALL_PIC:
13984 case AARCH64_CMODEL_SMALL_SPIC:
13985 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13986 for everything. */
13987 type = DW_EH_PE_sdata4;
13988 break;
13989 default:
13990 /* No assumptions here. 8-byte relocs required. */
13991 type = DW_EH_PE_sdata8;
13992 break;
13994 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
13997 /* The last .arch and .tune assembly strings that we printed. */
13998 static std::string aarch64_last_printed_arch_string;
13999 static std::string aarch64_last_printed_tune_string;
14001 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14002 by the function fndecl. */
14004 void
14005 aarch64_declare_function_name (FILE *stream, const char* name,
14006 tree fndecl)
14008 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14010 struct cl_target_option *targ_options;
14011 if (target_parts)
14012 targ_options = TREE_TARGET_OPTION (target_parts);
14013 else
14014 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14015 gcc_assert (targ_options);
14017 const struct processor *this_arch
14018 = aarch64_get_arch (targ_options->x_explicit_arch);
14020 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14021 std::string extension
14022 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14023 this_arch->flags);
14024 /* Only update the assembler .arch string if it is distinct from the last
14025 such string we printed. */
14026 std::string to_print = this_arch->name + extension;
14027 if (to_print != aarch64_last_printed_arch_string)
14029 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14030 aarch64_last_printed_arch_string = to_print;
14033 /* Print the cpu name we're tuning for in the comments; it might be
14034 useful to readers of the generated asm. Do it only when it changes
14035 from function to function and verbose assembly is requested. */
14036 const struct processor *this_tune
14037 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14039 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14041 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14042 this_tune->name);
14043 aarch64_last_printed_tune_string = this_tune->name;
14046 /* Don't forget the type directive for ELF. */
14047 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14048 ASM_OUTPUT_LABEL (stream, name);
14051 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14053 static void
14054 aarch64_start_file (void)
14056 struct cl_target_option *default_options
14057 = TREE_TARGET_OPTION (target_option_default_node);
14059 const struct processor *default_arch
14060 = aarch64_get_arch (default_options->x_explicit_arch);
14061 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14062 std::string extension
14063 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14064 default_arch->flags);
14066 aarch64_last_printed_arch_string = default_arch->name + extension;
14067 aarch64_last_printed_tune_string = "";
14068 asm_fprintf (asm_out_file, "\t.arch %s\n",
14069 aarch64_last_printed_arch_string.c_str ());
14071 default_file_start ();
14074 /* Emit load exclusive. */
14076 static void
14077 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14078 rtx mem, rtx model_rtx)
14080 rtx (*gen) (rtx, rtx, rtx);
14082 switch (mode)
14084 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14085 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14086 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14087 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14088 default:
14089 gcc_unreachable ();
14092 emit_insn (gen (rval, mem, model_rtx));
14095 /* Emit store exclusive. */
14097 static void
14098 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14099 rtx rval, rtx mem, rtx model_rtx)
14101 rtx (*gen) (rtx, rtx, rtx, rtx);
14103 switch (mode)
14105 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14106 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14107 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14108 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14109 default:
14110 gcc_unreachable ();
14113 emit_insn (gen (bval, rval, mem, model_rtx));
14116 /* Emit jump instruction INSN and mark it as unlikely to be taken. */
14118 static void
14119 aarch64_emit_unlikely_jump (rtx insn)
14121 rtx_insn *jump = emit_jump_insn (insn);
14122 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14125 /* Expand a compare and swap pattern. */
14127 void
14128 aarch64_expand_compare_and_swap (rtx operands[])
14130 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14131 machine_mode mode, cmp_mode;
14132 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14133 int idx;
14134 gen_cas_fn gen;
14135 const gen_cas_fn split_cas[] =
14137 gen_aarch64_compare_and_swapqi,
14138 gen_aarch64_compare_and_swaphi,
14139 gen_aarch64_compare_and_swapsi,
14140 gen_aarch64_compare_and_swapdi
14142 const gen_cas_fn atomic_cas[] =
14144 gen_aarch64_compare_and_swapqi_lse,
14145 gen_aarch64_compare_and_swaphi_lse,
14146 gen_aarch64_compare_and_swapsi_lse,
14147 gen_aarch64_compare_and_swapdi_lse
14150 bval = operands[0];
14151 rval = operands[1];
14152 mem = operands[2];
14153 oldval = operands[3];
14154 newval = operands[4];
14155 is_weak = operands[5];
14156 mod_s = operands[6];
14157 mod_f = operands[7];
14158 mode = GET_MODE (mem);
14159 cmp_mode = mode;
14161 /* Normally the succ memory model must be stronger than fail, but in the
14162 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14163 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14165 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14166 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14167 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14169 switch (mode)
14171 case E_QImode:
14172 case E_HImode:
14173 /* For short modes, we're going to perform the comparison in SImode,
14174 so do the zero-extension now. */
14175 cmp_mode = SImode;
14176 rval = gen_reg_rtx (SImode);
14177 oldval = convert_modes (SImode, mode, oldval, true);
14178 /* Fall through. */
14180 case E_SImode:
14181 case E_DImode:
14182 /* Force the value into a register if needed. */
14183 if (!aarch64_plus_operand (oldval, mode))
14184 oldval = force_reg (cmp_mode, oldval);
14185 break;
14187 default:
14188 gcc_unreachable ();
14191 switch (mode)
14193 case E_QImode: idx = 0; break;
14194 case E_HImode: idx = 1; break;
14195 case E_SImode: idx = 2; break;
14196 case E_DImode: idx = 3; break;
14197 default:
14198 gcc_unreachable ();
14200 if (TARGET_LSE)
14201 gen = atomic_cas[idx];
14202 else
14203 gen = split_cas[idx];
14205 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14207 if (mode == QImode || mode == HImode)
14208 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14210 x = gen_rtx_REG (CCmode, CC_REGNUM);
14211 x = gen_rtx_EQ (SImode, x, const0_rtx);
14212 emit_insn (gen_rtx_SET (bval, x));
14215 /* Test whether the target supports using an atomic load-operate instruction
14216 to implement operation CODE. Returns FALSE if the operation isn't supported
14217 by the architecture. */
14221 bool
14222 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14224 if (!TARGET_LSE)
14225 return false;
14227 switch (code)
14229 case SET:
14230 case AND:
14231 case IOR:
14232 case XOR:
14233 case MINUS:
14234 case PLUS:
14235 return true;
14236 default:
14237 return false;
14241 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14242 sequence implementing an atomic operation. */
14244 static void
14245 aarch64_emit_post_barrier (enum memmodel model)
14247 const enum memmodel base_model = memmodel_base (model);
14249 if (is_mm_sync (model)
14250 && (base_model == MEMMODEL_ACQUIRE
14251 || base_model == MEMMODEL_ACQ_REL
14252 || base_model == MEMMODEL_SEQ_CST))
14254 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14258 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14259 for the data in memory. EXPECTED is the value expected to be in memory.
14260 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14261 is the memory ordering to use. */
14263 void
14264 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14265 rtx expected, rtx desired,
14266 rtx model)
14268 rtx (*gen) (rtx, rtx, rtx, rtx);
14269 machine_mode mode;
14271 mode = GET_MODE (mem);
14273 switch (mode)
14275 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14276 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14277 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14278 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14279 default:
14280 gcc_unreachable ();
14283 /* Move the expected value into the CAS destination register. */
14284 emit_insn (gen_rtx_SET (rval, expected));
14286 /* Emit the CAS. */
14287 emit_insn (gen (rval, mem, desired, model));
14289 /* Compare the expected value with the value loaded by the CAS, to establish
14290 whether the swap was made. */
14291 aarch64_gen_compare_reg (EQ, rval, expected);
14294 /* Split a compare and swap pattern. */
14296 void
14297 aarch64_split_compare_and_swap (rtx operands[])
14299 rtx rval, mem, oldval, newval, scratch;
14300 machine_mode mode;
14301 bool is_weak;
14302 rtx_code_label *label1, *label2;
14303 rtx x, cond;
14304 enum memmodel model;
14305 rtx model_rtx;
14307 rval = operands[0];
14308 mem = operands[1];
14309 oldval = operands[2];
14310 newval = operands[3];
14311 is_weak = (operands[4] != const0_rtx);
14312 model_rtx = operands[5];
14313 scratch = operands[7];
14314 mode = GET_MODE (mem);
14315 model = memmodel_from_int (INTVAL (model_rtx));
14317 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14318 loop:
14319 .label1:
14320 LD[A]XR rval, [mem]
14321 CBNZ rval, .label2
14322 ST[L]XR scratch, newval, [mem]
14323 CBNZ scratch, .label1
14324 .label2:
14325 CMP rval, 0. */
14326 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14328 label1 = NULL;
14329 if (!is_weak)
14331 label1 = gen_label_rtx ();
14332 emit_label (label1);
14334 label2 = gen_label_rtx ();
14336 /* The initial load can be relaxed for a __sync operation since a final
14337 barrier will be emitted to stop code hoisting. */
14338 if (is_mm_sync (model))
14339 aarch64_emit_load_exclusive (mode, rval, mem,
14340 GEN_INT (MEMMODEL_RELAXED));
14341 else
14342 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14344 if (strong_zero_p)
14346 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14347 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14348 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14349 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14351 else
14353 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14354 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14355 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14356 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14357 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14360 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14362 if (!is_weak)
14364 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14365 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14366 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14367 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14369 else
14371 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14372 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14373 emit_insn (gen_rtx_SET (cond, x));
14376 emit_label (label2);
14377 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
14378 to set the condition flags. If this is not used it will be removed by
14379 later passes. */
14380 if (strong_zero_p)
14382 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14383 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14384 emit_insn (gen_rtx_SET (cond, x));
14386 /* Emit any final barrier needed for a __sync operation. */
14387 if (is_mm_sync (model))
14388 aarch64_emit_post_barrier (model);
14391 /* Emit a BIC instruction. */
14393 static void
14394 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14396 rtx shift_rtx = GEN_INT (shift);
14397 rtx (*gen) (rtx, rtx, rtx, rtx);
14399 switch (mode)
14401 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14402 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14403 default:
14404 gcc_unreachable ();
14407 emit_insn (gen (dst, s2, shift_rtx, s1));
14410 /* Emit an atomic swap. */
14412 static void
14413 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14414 rtx mem, rtx model)
14416 rtx (*gen) (rtx, rtx, rtx, rtx);
14418 switch (mode)
14420 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14421 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14422 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14423 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14424 default:
14425 gcc_unreachable ();
14428 emit_insn (gen (dst, mem, value, model));
14431 /* Operations supported by aarch64_emit_atomic_load_op. */
14433 enum aarch64_atomic_load_op_code
14435 AARCH64_LDOP_PLUS, /* A + B */
14436 AARCH64_LDOP_XOR, /* A ^ B */
14437 AARCH64_LDOP_OR, /* A | B */
14438 AARCH64_LDOP_BIC /* A & ~B */
14441 /* Emit an atomic load-operate. */
14443 static void
14444 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14445 machine_mode mode, rtx dst, rtx src,
14446 rtx mem, rtx model)
14448 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14449 const aarch64_atomic_load_op_fn plus[] =
14451 gen_aarch64_atomic_loadaddqi,
14452 gen_aarch64_atomic_loadaddhi,
14453 gen_aarch64_atomic_loadaddsi,
14454 gen_aarch64_atomic_loadadddi
14456 const aarch64_atomic_load_op_fn eor[] =
14458 gen_aarch64_atomic_loadeorqi,
14459 gen_aarch64_atomic_loadeorhi,
14460 gen_aarch64_atomic_loadeorsi,
14461 gen_aarch64_atomic_loadeordi
14463 const aarch64_atomic_load_op_fn ior[] =
14465 gen_aarch64_atomic_loadsetqi,
14466 gen_aarch64_atomic_loadsethi,
14467 gen_aarch64_atomic_loadsetsi,
14468 gen_aarch64_atomic_loadsetdi
14470 const aarch64_atomic_load_op_fn bic[] =
14472 gen_aarch64_atomic_loadclrqi,
14473 gen_aarch64_atomic_loadclrhi,
14474 gen_aarch64_atomic_loadclrsi,
14475 gen_aarch64_atomic_loadclrdi
14477 aarch64_atomic_load_op_fn gen;
14478 int idx = 0;
14480 switch (mode)
14482 case E_QImode: idx = 0; break;
14483 case E_HImode: idx = 1; break;
14484 case E_SImode: idx = 2; break;
14485 case E_DImode: idx = 3; break;
14486 default:
14487 gcc_unreachable ();
14490 switch (code)
14492 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14493 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14494 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14495 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14496 default:
14497 gcc_unreachable ();
14500 emit_insn (gen (dst, mem, src, model));
14503 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14504 location to store the data read from memory. OUT_RESULT is the location to
14505 store the result of the operation. MEM is the memory location to read and
14506 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14507 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14508 be NULL. */
14510 void
14511 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14512 rtx mem, rtx value, rtx model_rtx)
14514 machine_mode mode = GET_MODE (mem);
14515 machine_mode wmode = (mode == DImode ? DImode : SImode);
14516 const bool short_mode = (mode < SImode);
14517 aarch64_atomic_load_op_code ldop_code;
14518 rtx src;
14519 rtx x;
14521 if (out_data)
14522 out_data = gen_lowpart (mode, out_data);
14524 if (out_result)
14525 out_result = gen_lowpart (mode, out_result);
14527 /* Make sure the value is in a register, putting it into a destination
14528 register if it needs to be manipulated. */
14529 if (!register_operand (value, mode)
14530 || code == AND || code == MINUS)
14532 src = out_result ? out_result : out_data;
14533 emit_move_insn (src, gen_lowpart (mode, value));
14535 else
14536 src = value;
14537 gcc_assert (register_operand (src, mode));
14539 /* Preprocess the data for the operation as necessary. If the operation is
14540 a SET then emit a swap instruction and finish. */
14541 switch (code)
14543 case SET:
14544 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14545 return;
14547 case MINUS:
14548 /* Negate the value and treat it as a PLUS. */
14550 rtx neg_src;
14552 /* Resize the value if necessary. */
14553 if (short_mode)
14554 src = gen_lowpart (wmode, src);
14556 neg_src = gen_rtx_NEG (wmode, src);
14557 emit_insn (gen_rtx_SET (src, neg_src));
14559 if (short_mode)
14560 src = gen_lowpart (mode, src);
14562 /* Fall-through. */
14563 case PLUS:
14564 ldop_code = AARCH64_LDOP_PLUS;
14565 break;
14567 case IOR:
14568 ldop_code = AARCH64_LDOP_OR;
14569 break;
14571 case XOR:
14572 ldop_code = AARCH64_LDOP_XOR;
14573 break;
14575 case AND:
14577 rtx not_src;
14579 /* Resize the value if necessary. */
14580 if (short_mode)
14581 src = gen_lowpart (wmode, src);
14583 not_src = gen_rtx_NOT (wmode, src);
14584 emit_insn (gen_rtx_SET (src, not_src));
14586 if (short_mode)
14587 src = gen_lowpart (mode, src);
14589 ldop_code = AARCH64_LDOP_BIC;
14590 break;
14592 default:
14593 /* The operation can't be done with atomic instructions. */
14594 gcc_unreachable ();
14597 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14599 /* If necessary, calculate the data in memory after the update by redoing the
14600 operation from values in registers. */
14601 if (!out_result)
14602 return;
14604 if (short_mode)
14606 src = gen_lowpart (wmode, src);
14607 out_data = gen_lowpart (wmode, out_data);
14608 out_result = gen_lowpart (wmode, out_result);
14611 x = NULL_RTX;
14613 switch (code)
14615 case MINUS:
14616 case PLUS:
14617 x = gen_rtx_PLUS (wmode, out_data, src);
14618 break;
14619 case IOR:
14620 x = gen_rtx_IOR (wmode, out_data, src);
14621 break;
14622 case XOR:
14623 x = gen_rtx_XOR (wmode, out_data, src);
14624 break;
14625 case AND:
14626 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14627 return;
14628 default:
14629 gcc_unreachable ();
14632 emit_set_insn (out_result, x);
14634 return;
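/* Sketch of the resulting mapping, assuming the standard LSE mnemonics:
     PLUS  -> LDADD value
     MINUS -> LDADD (-value)
     IOR   -> LDSET value
     XOR   -> LDEOR value
     AND   -> LDCLR (~value)
   with OUT_RESULT, when requested, recomputed afterwards from OUT_DATA and the
   (possibly negated or inverted) source register. */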
14637 /* Split an atomic operation. */
14639 void
14640 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14641 rtx value, rtx model_rtx, rtx cond)
14643 machine_mode mode = GET_MODE (mem);
14644 machine_mode wmode = (mode == DImode ? DImode : SImode);
14645 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14646 const bool is_sync = is_mm_sync (model);
14647 rtx_code_label *label;
14648 rtx x;
14650 /* Split the atomic operation into a sequence. */
14651 label = gen_label_rtx ();
14652 emit_label (label);
14654 if (new_out)
14655 new_out = gen_lowpart (wmode, new_out);
14656 if (old_out)
14657 old_out = gen_lowpart (wmode, old_out);
14658 else
14659 old_out = new_out;
14660 value = simplify_gen_subreg (wmode, value, mode, 0);
14662 /* The initial load can be relaxed for a __sync operation since a final
14663 barrier will be emitted to stop code hoisting. */
14664 if (is_sync)
14665 aarch64_emit_load_exclusive (mode, old_out, mem,
14666 GEN_INT (MEMMODEL_RELAXED));
14667 else
14668 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14670 switch (code)
14672 case SET:
14673 new_out = value;
14674 break;
14676 case NOT:
14677 x = gen_rtx_AND (wmode, old_out, value);
14678 emit_insn (gen_rtx_SET (new_out, x));
14679 x = gen_rtx_NOT (wmode, new_out);
14680 emit_insn (gen_rtx_SET (new_out, x));
14681 break;
14683 case MINUS:
14684 if (CONST_INT_P (value))
14686 value = GEN_INT (-INTVAL (value));
14687 code = PLUS;
14689 /* Fall through. */
14691 default:
14692 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14693 emit_insn (gen_rtx_SET (new_out, x));
14694 break;
14697 aarch64_emit_store_exclusive (mode, cond, mem,
14698 gen_lowpart (mode, new_out), model_rtx);
14700 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14701 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14702 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14703 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14705 /* Emit any final barrier needed for a __sync operation. */
14706 if (is_sync)
14707 aarch64_emit_post_barrier (model);
14710 static void
14711 aarch64_init_libfuncs (void)
14713 /* Half-precision float operations. The compiler handles all operations
14714 with NULL libfuncs by converting to SFmode. */
14716 /* Conversions. */
14717 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14718 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14720 /* Arithmetic. */
14721 set_optab_libfunc (add_optab, HFmode, NULL);
14722 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14723 set_optab_libfunc (smul_optab, HFmode, NULL);
14724 set_optab_libfunc (neg_optab, HFmode, NULL);
14725 set_optab_libfunc (sub_optab, HFmode, NULL);
14727 /* Comparisons. */
14728 set_optab_libfunc (eq_optab, HFmode, NULL);
14729 set_optab_libfunc (ne_optab, HFmode, NULL);
14730 set_optab_libfunc (lt_optab, HFmode, NULL);
14731 set_optab_libfunc (le_optab, HFmode, NULL);
14732 set_optab_libfunc (ge_optab, HFmode, NULL);
14733 set_optab_libfunc (gt_optab, HFmode, NULL);
14734 set_optab_libfunc (unord_optab, HFmode, NULL);
14737 /* Target hook for c_mode_for_suffix. */
14738 static machine_mode
14739 aarch64_c_mode_for_suffix (char suffix)
14741 if (suffix == 'q')
14742 return TFmode;
14744 return VOIDmode;
14747 /* We can only represent floating point constants which will fit in
14748 "quarter-precision" values. These values are characterised by
14749 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14752 (-1)^s * (n/16) * 2^r
14754 Where:
14755 's' is the sign bit.
14756 'n' is an integer in the range 16 <= n <= 31.
14757 'r' is an integer in the range -3 <= r <= 4. */
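/* For example, 1.5 is representable as (-1)^0 * (24/16) * 2^0 and 0.5 as
   (-1)^0 * (16/16) * 2^(-1), whereas 32.0 would need r = 5 and is therefore
   not representable, and 0.0 is excluded altogether (see below). */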
14759 /* Return true iff X can be represented by a quarter-precision
14760 floating point immediate operand. Note, we cannot represent 0.0. */
14761 bool
14762 aarch64_float_const_representable_p (rtx x)
14764 /* This represents our current view of how many bits
14765 make up the mantissa. */
14766 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14767 int exponent;
14768 unsigned HOST_WIDE_INT mantissa, mask;
14769 REAL_VALUE_TYPE r, m;
14770 bool fail;
14772 if (!CONST_DOUBLE_P (x))
14773 return false;
14775 /* We don't support HFmode constants yet. */
14776 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14777 return false;
14779 r = *CONST_DOUBLE_REAL_VALUE (x);
14781 /* We cannot represent infinities, NaNs or +/-zero. We won't
14782 know if we have +zero until we analyse the mantissa, but we
14783 can reject the other invalid values. */
14784 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14785 || REAL_VALUE_MINUS_ZERO (r))
14786 return false;
14788 /* Extract exponent. */
14789 r = real_value_abs (&r);
14790 exponent = REAL_EXP (&r);
14792 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14793 highest (sign) bit, with a fixed binary point at bit point_pos.
14794 m1 holds the low part of the mantissa, m2 the high part.
14795 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14796 bits for the mantissa, this can fail (low bits will be lost). */
14797 real_ldexp (&m, &r, point_pos - exponent);
14798 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14800 /* If the low part of the mantissa has bits set we cannot represent
14801 the value. */
14802 if (w.ulow () != 0)
14803 return false;
14804 /* We have rejected the lower HOST_WIDE_INT, so update our
14805 understanding of how many bits lie in the mantissa and
14806 look only at the high HOST_WIDE_INT. */
14807 mantissa = w.elt (1);
14808 point_pos -= HOST_BITS_PER_WIDE_INT;
14810 /* We can only represent values with a mantissa of the form 1.xxxx. */
14811 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14812 if ((mantissa & mask) != 0)
14813 return false;
14815 /* Having filtered unrepresentable values, we may now remove all
14816 but the highest 5 bits. */
14817 mantissa >>= point_pos - 5;
14819 /* We cannot represent the value 0.0, so reject it. This is handled
14820 elsewhere. */
14821 if (mantissa == 0)
14822 return false;
14824 /* Then, as bit 4 is always set, we can mask it off, leaving
14825 the mantissa in the range [0, 15]. */
14826 mantissa &= ~(1 << 4);
14827 gcc_assert (mantissa <= 15);
14829 /* GCC internally does not use IEEE754-like encoding (where normalized
14830 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14831 Our mantissa values are shifted 4 places to the left relative to
14832 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14833 by 5 places to correct for GCC's representation. */
14834 exponent = 5 - exponent;
14836 return (exponent >= 0 && exponent <= 7);
14839 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14840 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14841 output MOVI/MVNI, ORR or BIC immediate. */
14842 char*
14843 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14844 enum simd_immediate_check which)
14846 bool is_valid;
14847 static char templ[40];
14848 const char *mnemonic;
14849 const char *shift_op;
14850 unsigned int lane_count = 0;
14851 char element_char;
14853 struct simd_immediate_info info;
14855 /* This will return true to show const_vector is legal for use as either
14856 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14857 It will also update INFO to show how the immediate should be generated.
14858 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14859 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14860 gcc_assert (is_valid);
14862 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14863 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14865 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14867 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14868 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14869 move immediate path. */
14870 if (aarch64_float_const_zero_rtx_p (info.value))
14871 info.value = GEN_INT (0);
14872 else
14874 const unsigned int buf_size = 20;
14875 char float_buf[buf_size] = {'\0'};
14876 real_to_decimal_for_mode (float_buf,
14877 CONST_DOUBLE_REAL_VALUE (info.value),
14878 buf_size, buf_size, 1, info.elt_mode);
14880 if (lane_count == 1)
14881 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14882 else
14883 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14884 lane_count, element_char, float_buf);
14885 return templ;
14889 gcc_assert (CONST_INT_P (info.value));
14891 if (which == AARCH64_CHECK_MOV)
14893 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14894 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14895 if (lane_count == 1)
14896 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14897 mnemonic, UINTVAL (info.value));
14898 else if (info.shift)
14899 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14900 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14901 element_char, UINTVAL (info.value), shift_op, info.shift);
14902 else
14903 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14904 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14905 element_char, UINTVAL (info.value));
14907 else
14909 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14910 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14911 if (info.shift)
14912 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14913 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14914 element_char, UINTVAL (info.value), "lsl", info.shift);
14915 else
14916 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14917 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14918 element_char, UINTVAL (info.value));
14920 return templ;
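/* As an illustration (assuming the usual AdvSIMD spellings), a V4SImode
   vector with every element equal to 0x20000 would be emitted via the MOV
   path above as something like "movi\t%0.4s, 0x2, lsl 16", while the ORR
   path for the same value would use "orr\t%0.4s, #2, lsl #16". */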
14923 char*
14924 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14927 /* If a floating point number was passed and we desire to use it in an
14928 integer mode do the conversion to integer. */
14929 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14931 unsigned HOST_WIDE_INT ival;
14932 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14933 gcc_unreachable ();
14934 immediate = gen_int_mode (ival, mode);
14937 machine_mode vmode;
14938 /* Use a 64-bit container mode for everything except DImode/DFmode, where we
14939 use a 128-bit vector mode. */
14940 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14942 vmode = aarch64_simd_container_mode (mode, width);
14943 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14944 return aarch64_output_simd_mov_immediate (v_op, width);
14947 /* Return the output string to use for moving immediate CONST_VECTOR
14948 into an SVE register. */
14950 char *
14951 aarch64_output_sve_mov_immediate (rtx const_vector)
14953 static char templ[40];
14954 struct simd_immediate_info info;
14955 char element_char;
14957 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14958 gcc_assert (is_valid);
14960 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14962 if (info.step)
14964 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14965 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14966 element_char, INTVAL (info.value), INTVAL (info.step));
14967 return templ;
14970 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14972 if (aarch64_float_const_zero_rtx_p (info.value))
14973 info.value = GEN_INT (0);
14974 else
14976 const int buf_size = 20;
14977 char float_buf[buf_size] = {};
14978 real_to_decimal_for_mode (float_buf,
14979 CONST_DOUBLE_REAL_VALUE (info.value),
14980 buf_size, buf_size, 1, info.elt_mode);
14982 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14983 element_char, float_buf);
14984 return templ;
14988 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14989 element_char, INTVAL (info.value));
14990 return templ;
14993 /* Return the asm format for a PTRUE instruction whose destination has
14994 mode MODE. SUFFIX is the element size suffix. */
14996 char *
14997 aarch64_output_ptrue (machine_mode mode, char suffix)
14999 unsigned int nunits;
15000 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15001 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15002 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15003 else
15004 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15005 return buf;
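/* For example, with a fixed 256-bit SVE vector length and SUFFIX 'd'
   (four 64-bit elements) this produces "ptrue\t%0.d, vl4"; when the vector
   length is not a compile-time constant it falls back to "ptrue\t%0.d, all". */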
15008 /* Split operands into moves from op[1] + op[2] into op[0]. */
15010 void
15011 aarch64_split_combinev16qi (rtx operands[3])
15013 unsigned int dest = REGNO (operands[0]);
15014 unsigned int src1 = REGNO (operands[1]);
15015 unsigned int src2 = REGNO (operands[2]);
15016 machine_mode halfmode = GET_MODE (operands[1]);
15017 unsigned int halfregs = REG_NREGS (operands[1]);
15018 rtx destlo, desthi;
15020 gcc_assert (halfmode == V16QImode);
15022 if (src1 == dest && src2 == dest + halfregs)
15024 /* No-op move. Can't split to nothing; emit something. */
15025 emit_note (NOTE_INSN_DELETED);
15026 return;
15029 /* Preserve register attributes for variable tracking. */
15030 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15031 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15032 GET_MODE_SIZE (halfmode));
15034 /* Special case of reversed high/low parts. */
15035 if (reg_overlap_mentioned_p (operands[2], destlo)
15036 && reg_overlap_mentioned_p (operands[1], desthi))
15038 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15039 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15040 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15042 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15044 /* Try to avoid unnecessary moves if part of the result
15045 is in the right place already. */
15046 if (src1 != dest)
15047 emit_move_insn (destlo, operands[1]);
15048 if (src2 != dest + halfregs)
15049 emit_move_insn (desthi, operands[2]);
15051 else
15053 if (src2 != dest + halfregs)
15054 emit_move_insn (desthi, operands[2]);
15055 if (src1 != dest)
15056 emit_move_insn (destlo, operands[1]);
15060 /* vec_perm support. */
15062 struct expand_vec_perm_d
15064 rtx target, op0, op1;
15065 vec_perm_indices perm;
15066 machine_mode vmode;
15067 unsigned int vec_flags;
15068 bool one_vector_p;
15069 bool testing_p;
15072 /* Generate a variable permutation. */
15074 static void
15075 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15077 machine_mode vmode = GET_MODE (target);
15078 bool one_vector_p = rtx_equal_p (op0, op1);
15080 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15081 gcc_checking_assert (GET_MODE (op0) == vmode);
15082 gcc_checking_assert (GET_MODE (op1) == vmode);
15083 gcc_checking_assert (GET_MODE (sel) == vmode);
15084 gcc_checking_assert (TARGET_SIMD);
15086 if (one_vector_p)
15088 if (vmode == V8QImode)
15090 /* Expand the argument to a V16QI mode by duplicating it. */
15091 rtx pair = gen_reg_rtx (V16QImode);
15092 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15093 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15095 else
15097 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15100 else
15102 rtx pair;
15104 if (vmode == V8QImode)
15106 pair = gen_reg_rtx (V16QImode);
15107 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15108 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15110 else
15112 pair = gen_reg_rtx (OImode);
15113 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15114 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15119 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15120 NELT is the number of elements in the vector. */
15122 void
15123 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15124 unsigned int nelt)
15126 machine_mode vmode = GET_MODE (target);
15127 bool one_vector_p = rtx_equal_p (op0, op1);
15128 rtx mask;
15130 /* The TBL instruction does not use a modulo index, so we must take care
15131 of that ourselves. */
15132 mask = aarch64_simd_gen_const_vector_dup (vmode,
15133 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15134 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15136 /* For big-endian, we also need to reverse the index within the vector
15137 (but not which vector). */
15138 if (BYTES_BIG_ENDIAN)
15140 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15141 if (!one_vector_p)
15142 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15143 sel = expand_simple_binop (vmode, XOR, sel, mask,
15144 NULL, 0, OPTAB_LIB_WIDEN);
15146 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15149 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15151 static void
15152 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15154 emit_insn (gen_rtx_SET (target,
15155 gen_rtx_UNSPEC (GET_MODE (target),
15156 gen_rtvec (2, op0, op1), code)));
15159 /* Expand an SVE vec_perm with the given operands. */
15161 void
15162 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15164 machine_mode data_mode = GET_MODE (target);
15165 machine_mode sel_mode = GET_MODE (sel);
15166 /* Enforced by the pattern condition. */
15167 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15169 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15170 size of the two value vectors, i.e. the upper bits of the indices
15171 are effectively ignored. SVE TBL instead produces 0 for any
15172 out-of-range indices, so we need to modulo all the vec_perm indices
15173 to ensure they are all in range. */
15174 rtx sel_reg = force_reg (sel_mode, sel);
15176 /* Check if the sel only references the first values vector. */
15177 if (GET_CODE (sel) == CONST_VECTOR
15178 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15180 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15181 return;
15184 /* Check if the two values vectors are the same. */
15185 if (rtx_equal_p (op0, op1))
15187 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15188 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15189 NULL, 0, OPTAB_DIRECT);
15190 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15191 return;
15194 /* Run TBL on each value vector and combine the results. */
15196 rtx res0 = gen_reg_rtx (data_mode);
15197 rtx res1 = gen_reg_rtx (data_mode);
15198 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15199 if (GET_CODE (sel) != CONST_VECTOR
15200 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15202 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15203 2 * nunits - 1);
15204 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15205 NULL, 0, OPTAB_DIRECT);
15207 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15208 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15209 NULL, 0, OPTAB_DIRECT);
15210 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15211 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15212 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15213 else
15214 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15217 /* Recognize patterns suitable for the TRN instructions. */
15218 static bool
15219 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15221 HOST_WIDE_INT odd;
15222 poly_uint64 nelt = d->perm.length ();
15223 rtx out, in0, in1, x;
15224 machine_mode vmode = d->vmode;
15226 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15227 return false;
15229 /* Note that these are little-endian tests.
15230 We correct for big-endian later. */
15231 if (!d->perm[0].is_constant (&odd)
15232 || (odd != 0 && odd != 1)
15233 || !d->perm.series_p (0, 2, odd, 2)
15234 || !d->perm.series_p (1, 2, nelt + odd, 2))
15235 return false;
15237 /* Success! */
15238 if (d->testing_p)
15239 return true;
15241 in0 = d->op0;
15242 in1 = d->op1;
15243 /* We don't need a big-endian lane correction for SVE; see the comment
15244 at the head of aarch64-sve.md for details. */
15245 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15247 x = in0, in0 = in1, in1 = x;
15248 odd = !odd;
15250 out = d->target;
15252 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15253 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15254 return true;
15257 /* Recognize patterns suitable for the UZP instructions. */
15258 static bool
15259 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15261 HOST_WIDE_INT odd;
15262 rtx out, in0, in1, x;
15263 machine_mode vmode = d->vmode;
15265 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15266 return false;
15268 /* Note that these are little-endian tests.
15269 We correct for big-endian later. */
15270 if (!d->perm[0].is_constant (&odd)
15271 || (odd != 0 && odd != 1)
15272 || !d->perm.series_p (0, 1, odd, 2))
15273 return false;
15275 /* Success! */
15276 if (d->testing_p)
15277 return true;
15279 in0 = d->op0;
15280 in1 = d->op1;
15281 /* We don't need a big-endian lane correction for SVE; see the comment
15282 at the head of aarch64-sve.md for details. */
15283 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15285 x = in0, in0 = in1, in1 = x;
15286 odd = !odd;
15288 out = d->target;
15290 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15291 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15292 return true;
15295 /* Recognize patterns suitable for the ZIP instructions. */
15296 static bool
15297 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15299 unsigned int high;
15300 poly_uint64 nelt = d->perm.length ();
15301 rtx out, in0, in1, x;
15302 machine_mode vmode = d->vmode;
15304 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15305 return false;
15307 /* Note that these are little-endian tests.
15308 We correct for big-endian later. */
15309 poly_uint64 first = d->perm[0];
15310 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15311 || !d->perm.series_p (0, 2, first, 1)
15312 || !d->perm.series_p (1, 2, first + nelt, 1))
15313 return false;
15314 high = maybe_ne (first, 0U);
15316 /* Success! */
15317 if (d->testing_p)
15318 return true;
15320 in0 = d->op0;
15321 in1 = d->op1;
15322 /* We don't need a big-endian lane correction for SVE; see the comment
15323 at the head of aarch64-sve.md for details. */
15324 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15326 x = in0, in0 = in1, in1 = x;
15327 high = !high;
15329 out = d->target;
15331 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15332 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15333 return true;
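/* Taking V4SImode (four elements per vector) as an example, the little-endian
   permute index vectors recognized by the three routines above (before the
   big-endian correction) are:
     TRN1 { 0, 4, 2, 6 }   TRN2 { 1, 5, 3, 7 }
     UZP1 { 0, 2, 4, 6 }   UZP2 { 1, 3, 5, 7 }
     ZIP1 { 0, 4, 1, 5 }   ZIP2 { 2, 6, 3, 7 }
   where indices 4..7 refer to lanes of the second input vector. */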
15336 /* Recognize patterns for the EXT insn. */
15338 static bool
15339 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15341 HOST_WIDE_INT location;
15342 rtx offset;
15344 /* The first element always refers to the first vector.
15345 Check if the extracted indices are increasing by one. */
15346 if (d->vec_flags == VEC_SVE_PRED
15347 || !d->perm[0].is_constant (&location)
15348 || !d->perm.series_p (0, 1, location, 1))
15349 return false;
15351 /* Success! */
15352 if (d->testing_p)
15353 return true;
15355 /* The case where (location == 0) is a no-op for both big- and little-endian,
15356 and is removed by the mid-end at optimization levels -O1 and higher.
15358 We don't need a big-endian lane correction for SVE; see the comment
15359 at the head of aarch64-sve.md for details. */
15360 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15362 /* After setup, we want the high elements of the first vector (stored
15363 at the LSB end of the register), and the low elements of the second
15364 vector (stored at the MSB end of the register). So swap. */
15365 std::swap (d->op0, d->op1);
15366 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15367 to_constant () is safe since this is restricted to Advanced SIMD
15368 vectors. */
15369 location = d->perm.length ().to_constant () - location;
15372 offset = GEN_INT (location);
15373 emit_set_insn (d->target,
15374 gen_rtx_UNSPEC (d->vmode,
15375 gen_rtvec (3, d->op0, d->op1, offset),
15376 UNSPEC_EXT));
15377 return true;
15380 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15381 within each 64-bit, 32-bit or 16-bit granule. */
15383 static bool
15384 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15386 HOST_WIDE_INT diff;
15387 unsigned int i, size, unspec;
15388 machine_mode pred_mode;
15390 if (d->vec_flags == VEC_SVE_PRED
15391 || !d->one_vector_p
15392 || !d->perm[0].is_constant (&diff))
15393 return false;
15395 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15396 if (size == 8)
15398 unspec = UNSPEC_REV64;
15399 pred_mode = VNx2BImode;
15401 else if (size == 4)
15403 unspec = UNSPEC_REV32;
15404 pred_mode = VNx4BImode;
15406 else if (size == 2)
15408 unspec = UNSPEC_REV16;
15409 pred_mode = VNx8BImode;
15411 else
15412 return false;
15414 unsigned int step = diff + 1;
15415 for (i = 0; i < step; ++i)
15416 if (!d->perm.series_p (i, step, diff - i, step))
15417 return false;
15419 /* Success! */
15420 if (d->testing_p)
15421 return true;
15423 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15424 if (d->vec_flags == VEC_SVE_DATA)
15426 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15427 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15428 UNSPEC_MERGE_PTRUE);
15430 emit_set_insn (d->target, src);
15431 return true;
15434 /* Recognize patterns for the REV insn, which reverses elements within
15435 a full vector. */
15437 static bool
15438 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15440 poly_uint64 nelt = d->perm.length ();
15442 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15443 return false;
15445 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15446 return false;
15448 /* Success! */
15449 if (d->testing_p)
15450 return true;
15452 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15453 emit_set_insn (d->target, src);
15454 return true;
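/* Recognize permutations that broadcast a single element of the input
   vector, which can be implemented with the DUP instruction (or its SVE
   equivalent).  */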
15457 static bool
15458 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15460 rtx out = d->target;
15461 rtx in0;
15462 HOST_WIDE_INT elt;
15463 machine_mode vmode = d->vmode;
15464 rtx lane;
15466 if (d->vec_flags == VEC_SVE_PRED
15467 || d->perm.encoding ().encoded_nelts () != 1
15468 || !d->perm[0].is_constant (&elt))
15469 return false;
15471 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15472 return false;
15474 /* Success! */
15475 if (d->testing_p)
15476 return true;
15478 /* The generic preparation in aarch64_expand_vec_perm_const_1
15479 swaps the operand order and the permute indices if it finds
15480 d->perm[0] to be in the second operand. Thus, we can always
15481 use d->op0 and need not do any extra arithmetic to get the
15482 correct lane number. */
15483 in0 = d->op0;
15484 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15486 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15487 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15488 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15489 return true;
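/* Recognize any remaining constant permutation of QImode vectors and
   implement it with a TBL instruction, materializing the selector as a
   constant vector.  */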
15492 static bool
15493 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15495 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15496 machine_mode vmode = d->vmode;
15498 /* Make sure that the indices are constant. */
15499 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15500 for (unsigned int i = 0; i < encoded_nelts; ++i)
15501 if (!d->perm[i].is_constant ())
15502 return false;
15504 if (d->testing_p)
15505 return true;
15507 /* Generic code will try constant permutation twice. Once with the
15508 original mode and again with the elements lowered to QImode.
15509 So wait and don't do the selector expansion ourselves. */
15510 if (vmode != V8QImode && vmode != V16QImode)
15511 return false;
15513 /* to_constant is safe since this routine is specific to Advanced SIMD
15514 vectors. */
15515 unsigned int nelt = d->perm.length ().to_constant ();
15516 for (unsigned int i = 0; i < nelt; ++i)
15517 /* If big-endian and two vectors we end up with a weird mixed-endian
15518 mode on NEON. Reverse the index within each word but not the word
15519 itself. to_constant is safe because we checked is_constant above. */
15520 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15521 ? d->perm[i].to_constant () ^ (nelt - 1)
15522 : d->perm[i].to_constant ());
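/* Illustration (not in the original source): with nelt == 16, XORing an
   index with nelt - 1 flips only its low four bits, so index 3 (lane 3 of
   the first vector) becomes 12 and index 19 (lane 3 of the second vector)
   becomes 28; which vector is selected is preserved.  */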
15524 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15525 sel = force_reg (vmode, sel);
15527 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15528 return true;
15531 /* Try to implement D using an SVE TBL instruction. */
15533 static bool
15534 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15536 unsigned HOST_WIDE_INT nelt;
15538 /* Permuting two variable-length vectors could overflow the
15539 index range. */
15540 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15541 return false;
15543 if (d->testing_p)
15544 return true;
15546 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15547 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15548 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15549 return true;
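/* Try to expand the constant permutation described by D.  The special-case
   recognizers above are tried first; if none of them match, fall back to a
   general table-based permutation.  Return true on success.  */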
15552 static bool
15553 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15555 /* The pattern matching functions above are written to look for a small
15556 number to begin the sequence (0, 1, N/2). If we begin with an index
15557 from the second operand, we can swap the operands. */
15558 poly_int64 nelt = d->perm.length ();
15559 if (known_ge (d->perm[0], nelt))
15561 d->perm.rotate_inputs (1);
15562 std::swap (d->op0, d->op1);
15565 if ((d->vec_flags == VEC_ADVSIMD
15566 || d->vec_flags == VEC_SVE_DATA
15567 || d->vec_flags == VEC_SVE_PRED)
15568 && known_gt (nelt, 1))
15570 if (aarch64_evpc_rev_local (d))
15571 return true;
15572 else if (aarch64_evpc_rev_global (d))
15573 return true;
15574 else if (aarch64_evpc_ext (d))
15575 return true;
15576 else if (aarch64_evpc_dup (d))
15577 return true;
15578 else if (aarch64_evpc_zip (d))
15579 return true;
15580 else if (aarch64_evpc_uzp (d))
15581 return true;
15582 else if (aarch64_evpc_trn (d))
15583 return true;
15584 if (d->vec_flags == VEC_SVE_DATA)
15585 return aarch64_evpc_sve_tbl (d);
15586 else if (d->vec_flags == VEC_ADVSIMD)
15587 return aarch64_evpc_tbl (d);
15589 return false;
15592 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15594 static bool
15595 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15596 rtx op1, const vec_perm_indices &sel)
15598 struct expand_vec_perm_d d;
15600 /* Check whether the mask can be applied to a single vector. */
15601 if (op0 && rtx_equal_p (op0, op1))
15602 d.one_vector_p = true;
15603 else if (sel.all_from_input_p (0))
15605 d.one_vector_p = true;
15606 op1 = op0;
15608 else if (sel.all_from_input_p (1))
15610 d.one_vector_p = true;
15611 op0 = op1;
15613 else
15614 d.one_vector_p = false;
15616 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15617 sel.nelts_per_input ());
15618 d.vmode = vmode;
15619 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15620 d.target = target;
15621 d.op0 = op0;
15622 d.op1 = op1;
15623 d.testing_p = !target;
15625 if (!d.testing_p)
15626 return aarch64_expand_vec_perm_const_1 (&d);
15628 rtx_insn *last = get_last_insn ();
15629 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15630 gcc_assert (last == get_last_insn ());
15632 return ret;
15635 /* Generate a byte permute mask for a register of mode MODE,
15636 which has NUNITS units. */
15639 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15641 /* We have to reverse each vector because we don't have
15642 a permuted load that can reverse-load according to ABI rules. */
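/* Illustration (not in the original source): for V8HImode, usize == 2 and
   nunits == 8, so the selector bytes are 1, 0, 3, 2, ..., 15, 14; the byte
   order is reversed within each element while the elements themselves stay
   in place.  */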
15643 rtx mask;
15644 rtvec v = rtvec_alloc (16);
15645 unsigned int i, j;
15646 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15648 gcc_assert (BYTES_BIG_ENDIAN);
15649 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15651 for (i = 0; i < nunits; i++)
15652 for (j = 0; j < usize; j++)
15653 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15654 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15655 return force_reg (V16QImode, mask);
15658 /* Return true if X is a valid second operand for the SVE instruction
15659 that implements integer comparison OP_CODE. */
15661 static bool
15662 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15664 if (register_operand (x, VOIDmode))
15665 return true;
15667 switch (op_code)
15669 case LTU:
15670 case LEU:
15671 case GEU:
15672 case GTU:
15673 return aarch64_sve_cmp_immediate_p (x, false);
15674 case LT:
15675 case LE:
15676 case GE:
15677 case GT:
15678 case NE:
15679 case EQ:
15680 return aarch64_sve_cmp_immediate_p (x, true);
15681 default:
15682 gcc_unreachable ();
15686 /* Return the UNSPEC_COND_* code for comparison CODE. */
15688 static unsigned int
15689 aarch64_unspec_cond_code (rtx_code code)
15691 switch (code)
15693 case NE:
15694 return UNSPEC_COND_NE;
15695 case EQ:
15696 return UNSPEC_COND_EQ;
15697 case LT:
15698 return UNSPEC_COND_LT;
15699 case GT:
15700 return UNSPEC_COND_GT;
15701 case LE:
15702 return UNSPEC_COND_LE;
15703 case GE:
15704 return UNSPEC_COND_GE;
15705 case LTU:
15706 return UNSPEC_COND_LO;
15707 case GTU:
15708 return UNSPEC_COND_HI;
15709 case LEU:
15710 return UNSPEC_COND_LS;
15711 case GEU:
15712 return UNSPEC_COND_HS;
15713 case UNORDERED:
15714 return UNSPEC_COND_UO;
15715 default:
15716 gcc_unreachable ();
15720 /* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
15721 where <X> is the operation associated with comparison CODE. */
15723 static rtx
15724 aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
15725 rtx pred, rtx op0, rtx op1)
15727 rtvec vec = gen_rtvec (3, pred, op0, op1);
15728 return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
15731 /* Expand an SVE integer comparison:
15733 TARGET = CODE (OP0, OP1). */
15735 void
15736 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15738 machine_mode pred_mode = GET_MODE (target);
15739 machine_mode data_mode = GET_MODE (op0);
15741 if (!aarch64_sve_cmp_operand_p (code, op1))
15742 op1 = force_reg (data_mode, op1);
15744 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15745 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
15746 emit_insn (gen_set_clobber_cc (target, unspec));
15749 /* Emit an instruction:
15751 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15753 where <X> is the operation associated with comparison CODE. */
15755 static void
15756 aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
15757 rtx pred, rtx op0, rtx op1)
15759 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
15760 emit_set_insn (target, unspec);
15763 /* Emit:
15765 (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
15766 (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
15767 (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))
15769 where <Xi> is the operation associated with comparison CODEi. */
15771 static void
15772 aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
15773 machine_mode pred_mode, rtx ptrue,
15774 rtx op0, rtx op1)
15776 rtx tmp1 = gen_reg_rtx (pred_mode);
15777 aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
15778 rtx tmp2 = gen_reg_rtx (pred_mode);
15779 aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
15780 emit_set_insn (target, gen_rtx_AND (pred_mode,
15781 gen_rtx_IOR (pred_mode, tmp1, tmp2),
15782 ptrue));
15785 /* If CAN_INVERT_P, emit an instruction:
15787 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15789 where <X> is the operation associated with comparison CODE. Otherwise
15790 emit:
15792 (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15793 (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))
15795 where the second instruction sets TARGET to the inverse of TMP. */
15797 static void
15798 aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
15799 machine_mode pred_mode, rtx ptrue, rtx pred,
15800 rtx op0, rtx op1, bool can_invert_p)
15802 if (can_invert_p)
15803 aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
15804 else
15806 rtx tmp = gen_reg_rtx (pred_mode);
15807 aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
15808 emit_set_insn (target, gen_rtx_AND (pred_mode,
15809 gen_rtx_NOT (pred_mode, tmp),
15810 ptrue));
15814 /* Expand an SVE floating-point comparison:
15816 TARGET = CODE (OP0, OP1)
15818 If CAN_INVERT_P is true, the caller can also handle inverted results;
15819 return true if the result is in fact inverted. */
15821 bool
15822 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15823 rtx op0, rtx op1, bool can_invert_p)
15825 machine_mode pred_mode = GET_MODE (target);
15826 machine_mode data_mode = GET_MODE (op0);
15828 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15829 switch (code)
15831 case UNORDERED:
15832 /* UNORDERED has no immediate form. */
15833 op1 = force_reg (data_mode, op1);
15834 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15835 return false;
15837 case LT:
15838 case LE:
15839 case GT:
15840 case GE:
15841 case EQ:
15842 case NE:
15843 /* There is native support for the comparison. */
15844 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15845 return false;
15847 case ORDERED:
15848 /* There is native support for the inverse comparison. */
15849 op1 = force_reg (data_mode, op1);
15850 aarch64_emit_inverted_unspec_cond (target, UNORDERED,
15851 pred_mode, ptrue, ptrue, op0, op1,
15852 can_invert_p);
15853 return can_invert_p;
15855 case LTGT:
15856 /* This is a trapping operation (LT or GT). */
15857 aarch64_emit_unspec_cond_or (target, LT, GT, pred_mode, ptrue, op0, op1);
15858 return false;
15860 case UNEQ:
15861 if (!flag_trapping_math)
15863 /* This would trap for signaling NaNs. */
15864 op1 = force_reg (data_mode, op1);
15865 aarch64_emit_unspec_cond_or (target, UNORDERED, EQ,
15866 pred_mode, ptrue, op0, op1);
15867 return false;
15869 /* fall through */
15871 case UNLT:
15872 case UNLE:
15873 case UNGT:
15874 case UNGE:
15876 rtx ordered = ptrue;
15877 if (flag_trapping_math)
15879 /* Only compare the elements that are known to be ordered. */
15880 ordered = gen_reg_rtx (pred_mode);
15881 op1 = force_reg (data_mode, op1);
15882 aarch64_emit_inverted_unspec_cond (ordered, UNORDERED, pred_mode,
15883 ptrue, ptrue, op0, op1, false);
15885 if (code == UNEQ)
15886 code = NE;
15887 else
15888 code = reverse_condition_maybe_unordered (code);
15889 aarch64_emit_inverted_unspec_cond (target, code, pred_mode, ptrue,
15890 ordered, op0, op1, can_invert_p);
15891 return can_invert_p;
15894 default:
15895 gcc_unreachable ();
15899 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15900 of the data being selected and CMP_MODE is the mode of the values being
15901 compared. */
15903 void
15904 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15905 rtx *ops)
15907 machine_mode pred_mode
15908 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15909 GET_MODE_SIZE (cmp_mode)).require ();
15910 rtx pred = gen_reg_rtx (pred_mode);
15911 if (FLOAT_MODE_P (cmp_mode))
15913 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15914 ops[4], ops[5], true))
15915 std::swap (ops[1], ops[2]);
15917 else
15918 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15920 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15921 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15924 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15925 true. However due to issues with register allocation it is preferable
15926 to avoid tying integer scalar and FP scalar modes. Executing integer
15927 operations in general registers is better than treating them as scalar
15928 vector operations. This reduces latency and avoids redundant int<->FP
15929 moves. So tie modes if they are either the same class, or vector modes
15930 with other vector modes, vector structs or any scalar mode. */
15932 static bool
15933 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15935 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15936 return true;
15938 /* We specifically want to allow elements of "structure" modes to
15939 be tieable to the structure. This more general condition allows
15940 other rarer situations too. The reason we don't extend this to
15941 predicate modes is that there are no predicate structure modes
15942 nor any specific instructions for extracting part of a predicate
15943 register. */
15944 if (aarch64_vector_data_mode_p (mode1)
15945 && aarch64_vector_data_mode_p (mode2))
15946 return true;
15948 /* Also allow any scalar modes with vectors. */
15949 if (aarch64_vector_mode_supported_p (mode1)
15950 || aarch64_vector_mode_supported_p (mode2))
15951 return true;
15953 return false;
15956 /* Return a new RTX holding the result of moving POINTER forward by
15957 AMOUNT bytes. */
15959 static rtx
15960 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15962 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15964 return adjust_automodify_address (pointer, GET_MODE (pointer),
15965 next, amount);
15968 /* Return a new RTX holding the result of moving POINTER forward by the
15969 size of the mode it points to. */
15971 static rtx
15972 aarch64_progress_pointer (rtx pointer)
15974 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15977 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15978 MODE bytes. */
15980 static void
15981 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15982 machine_mode mode)
15984 rtx reg = gen_reg_rtx (mode);
15986 /* "Cast" the pointers to the correct mode. */
15987 *src = adjust_address (*src, mode, 0);
15988 *dst = adjust_address (*dst, mode, 0);
15989 /* Emit the memcpy. */
15990 emit_move_insn (reg, *src);
15991 emit_move_insn (*dst, reg);
15992 /* Move the pointers forward. */
15993 *src = aarch64_progress_pointer (*src);
15994 *dst = aarch64_progress_pointer (*dst);
15997 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15998 we succeed, otherwise return false. */
16000 bool
16001 aarch64_expand_movmem (rtx *operands)
16003 unsigned int n;
16004 rtx dst = operands[0];
16005 rtx src = operands[1];
16006 rtx base;
16007 bool speed_p = !optimize_function_for_size_p (cfun);
16009 /* When optimizing for size, give a better estimate of the length of a
16010 memcpy call, but use the default otherwise. */
16011 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16013 /* We can't do anything smart if the amount to copy is not constant. */
16014 if (!CONST_INT_P (operands[2]))
16015 return false;
16017 n = UINTVAL (operands[2]);
16019 /* Try to keep the number of instructions low. For cases below 16 bytes we
16020 need to make at most two moves. For cases above 16 bytes it will be one
16021 move for each 16 byte chunk, then at most two additional moves. */
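/* Illustration (not in the original source): with this strategy a 27-byte
   copy becomes one 16-byte (TImode) chunk, one 8-byte (DImode) chunk and a
   final 4-byte (SImode) copy that overlaps the previous chunk by one
   byte.  */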
16022 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16023 return false;
16025 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16026 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16028 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16029 src = adjust_automodify_address (src, VOIDmode, base, 0);
16031 /* Simple cases.  Copy 0-3 bytes as (if applicable) a 2-byte chunk, then a
16032 1-byte chunk. */
16033 if (n < 4)
16035 if (n >= 2)
16037 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16038 n -= 2;
16041 if (n == 1)
16042 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16044 return true;
16047 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16048 4-byte chunk, partially overlapping with the previously copied chunk. */
16049 if (n < 8)
16051 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16052 n -= 4;
16053 if (n > 0)
16055 int move = n - 4;
16057 src = aarch64_move_pointer (src, move);
16058 dst = aarch64_move_pointer (dst, move);
16059 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16061 return true;
16064 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16065 them, then (if applicable) an 8-byte chunk. */
16066 while (n >= 8)
16068 if (n / 16)
16070 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16071 n -= 16;
16073 else
16075 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16076 n -= 8;
16080 /* Finish the final bytes of the copy. We can always do this in one
16081 instruction. We either copy the exact amount we need, or partially
16082 overlap with the previous chunk we copied and copy 8 bytes. */
16083 if (n == 0)
16084 return true;
16085 else if (n == 1)
16086 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16087 else if (n == 2)
16088 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16089 else if (n == 4)
16090 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16091 else
16093 if (n == 3)
16095 src = aarch64_move_pointer (src, -1);
16096 dst = aarch64_move_pointer (dst, -1);
16097 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16099 else
16101 int move = n - 8;
16103 src = aarch64_move_pointer (src, move);
16104 dst = aarch64_move_pointer (dst, move);
16105 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16109 return true;
16112 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16113 SImode stores. Handle the case when the constant has identical
16114 bottom and top halves. This is beneficial when the two stores can be
16115 merged into an STP and we avoid synthesising potentially expensive
16116 immediates twice. Return true if such a split is possible. */
16118 bool
16119 aarch64_split_dimode_const_store (rtx dst, rtx src)
16121 rtx lo = gen_lowpart (SImode, src);
16122 rtx hi = gen_highpart_mode (SImode, DImode, src);
16124 bool size_p = optimize_function_for_size_p (cfun);
16126 if (!rtx_equal_p (lo, hi))
16127 return false;
16129 unsigned int orig_cost
16130 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16131 unsigned int lo_cost
16132 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16134 /* We want to transform:
16135 MOV x1, 49370
16136 MOVK x1, 0x140, lsl 16
16137 MOVK x1, 0xc0da, lsl 32
16138 MOVK x1, 0x140, lsl 48
16139 STR x1, [x0]
16140 into:
16141 MOV w1, 49370
16142 MOVK w1, 0x140, lsl 16
16143 STP w1, w1, [x0]
16144 So we want to perform this only when we save two instructions
16145 or more. When optimizing for size, however, accept any code size
16146 savings we can. */
16147 if (size_p && orig_cost <= lo_cost)
16148 return false;
16150 if (!size_p
16151 && (orig_cost <= lo_cost + 1))
16152 return false;
16154 rtx mem_lo = adjust_address (dst, SImode, 0);
16155 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16156 return false;
16158 rtx tmp_reg = gen_reg_rtx (SImode);
16159 aarch64_expand_mov_immediate (tmp_reg, lo);
16160 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16161 /* Don't emit an explicit store pair as this may not always be profitable.
16162 Let the sched-fusion logic decide whether to merge them. */
16163 emit_move_insn (mem_lo, tmp_reg);
16164 emit_move_insn (mem_hi, tmp_reg);
16166 return true;
16169 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16171 static unsigned HOST_WIDE_INT
16172 aarch64_asan_shadow_offset (void)
16174 return (HOST_WIDE_INT_1 << 36);
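/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison of a
   conditional-compare sequence: *PREP_SEQ receives the insns that prepare
   the operands and *GEN_SEQ the comparison itself.  Return an rtx that
   compares the CC register against zero, or NULL_RTX if the comparison
   cannot be handled.  */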
16177 static rtx
16178 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16179 int code, tree treeop0, tree treeop1)
16181 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16182 rtx op0, op1;
16183 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16184 insn_code icode;
16185 struct expand_operand ops[4];
16187 start_sequence ();
16188 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16190 op_mode = GET_MODE (op0);
16191 if (op_mode == VOIDmode)
16192 op_mode = GET_MODE (op1);
16194 switch (op_mode)
16196 case E_QImode:
16197 case E_HImode:
16198 case E_SImode:
16199 cmp_mode = SImode;
16200 icode = CODE_FOR_cmpsi;
16201 break;
16203 case E_DImode:
16204 cmp_mode = DImode;
16205 icode = CODE_FOR_cmpdi;
16206 break;
16208 case E_SFmode:
16209 cmp_mode = SFmode;
16210 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16211 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16212 break;
16214 case E_DFmode:
16215 cmp_mode = DFmode;
16216 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16217 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16218 break;
16220 default:
16221 end_sequence ();
16222 return NULL_RTX;
16225 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16226 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16227 if (!op0 || !op1)
16229 end_sequence ();
16230 return NULL_RTX;
16232 *prep_seq = get_insns ();
16233 end_sequence ();
16235 create_fixed_operand (&ops[0], op0);
16236 create_fixed_operand (&ops[1], op1);
16238 start_sequence ();
16239 if (!maybe_expand_insn (icode, 2, ops))
16241 end_sequence ();
16242 return NULL_RTX;
16244 *gen_seq = get_insns ();
16245 end_sequence ();
16247 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16248 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
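/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent comparison of a
   conditional-compare sequence as a conditional compare predicated on the
   result of PREV, combining the two conditions according to BIT_CODE
   (AND or IOR).  Return the new comparison rtx, or NULL_RTX on failure.  */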
16251 static rtx
16252 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16253 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16255 rtx op0, op1, target;
16256 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16257 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16258 insn_code icode;
16259 struct expand_operand ops[6];
16260 int aarch64_cond;
16262 push_to_sequence (*prep_seq);
16263 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16265 op_mode = GET_MODE (op0);
16266 if (op_mode == VOIDmode)
16267 op_mode = GET_MODE (op1);
16269 switch (op_mode)
16271 case E_QImode:
16272 case E_HImode:
16273 case E_SImode:
16274 cmp_mode = SImode;
16275 icode = CODE_FOR_ccmpsi;
16276 break;
16278 case E_DImode:
16279 cmp_mode = DImode;
16280 icode = CODE_FOR_ccmpdi;
16281 break;
16283 case E_SFmode:
16284 cmp_mode = SFmode;
16285 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16286 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16287 break;
16289 case E_DFmode:
16290 cmp_mode = DFmode;
16291 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16292 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16293 break;
16295 default:
16296 end_sequence ();
16297 return NULL_RTX;
16300 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16301 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16302 if (!op0 || !op1)
16304 end_sequence ();
16305 return NULL_RTX;
16307 *prep_seq = get_insns ();
16308 end_sequence ();
16310 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16311 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16313 if (bit_code != AND)
16315 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16316 GET_MODE (XEXP (prev, 0))),
16317 VOIDmode, XEXP (prev, 0), const0_rtx);
16318 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16321 create_fixed_operand (&ops[0], XEXP (prev, 0));
16322 create_fixed_operand (&ops[1], target);
16323 create_fixed_operand (&ops[2], op0);
16324 create_fixed_operand (&ops[3], op1);
16325 create_fixed_operand (&ops[4], prev);
16326 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16328 push_to_sequence (*gen_seq);
16329 if (!maybe_expand_insn (icode, 6, ops))
16331 end_sequence ();
16332 return NULL_RTX;
16335 *gen_seq = get_insns ();
16336 end_sequence ();
16338 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16341 #undef TARGET_GEN_CCMP_FIRST
16342 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16344 #undef TARGET_GEN_CCMP_NEXT
16345 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16347 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16348 instruction fusion of some sort. */
16350 static bool
16351 aarch64_macro_fusion_p (void)
16353 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16357 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16358 should be kept together during scheduling. */
16360 static bool
16361 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16363 rtx set_dest;
16364 rtx prev_set = single_set (prev);
16365 rtx curr_set = single_set (curr);
16366 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16367 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16369 if (!aarch64_macro_fusion_p ())
16370 return false;
16372 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16374 /* We are trying to match:
16375 prev (mov) == (set (reg r0) (const_int imm16))
16376 curr (movk) == (set (zero_extract (reg r0)
16377 (const_int 16)
16378 (const_int 16))
16379 (const_int imm16_1)) */
16381 set_dest = SET_DEST (curr_set);
16383 if (GET_CODE (set_dest) == ZERO_EXTRACT
16384 && CONST_INT_P (SET_SRC (curr_set))
16385 && CONST_INT_P (SET_SRC (prev_set))
16386 && CONST_INT_P (XEXP (set_dest, 2))
16387 && INTVAL (XEXP (set_dest, 2)) == 16
16388 && REG_P (XEXP (set_dest, 0))
16389 && REG_P (SET_DEST (prev_set))
16390 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16392 return true;
16396 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16399 /* We're trying to match:
16400 prev (adrp) == (set (reg r1)
16401 (high (symbol_ref ("SYM"))))
16402 curr (add) == (set (reg r0)
16403 (lo_sum (reg r1)
16404 (symbol_ref ("SYM"))))
16405 Note that r0 need not necessarily be the same as r1, especially
16406 during pre-regalloc scheduling. */
16408 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16409 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16411 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16412 && REG_P (XEXP (SET_SRC (curr_set), 0))
16413 && REGNO (XEXP (SET_SRC (curr_set), 0))
16414 == REGNO (SET_DEST (prev_set))
16415 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16416 XEXP (SET_SRC (curr_set), 1)))
16417 return true;
16421 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16424 /* We're trying to match:
16425 prev (movk) == (set (zero_extract (reg r0)
16426 (const_int 16)
16427 (const_int 32))
16428 (const_int imm16_1))
16429 curr (movk) == (set (zero_extract (reg r0)
16430 (const_int 16)
16431 (const_int 48))
16432 (const_int imm16_2)) */
16434 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16435 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16436 && REG_P (XEXP (SET_DEST (prev_set), 0))
16437 && REG_P (XEXP (SET_DEST (curr_set), 0))
16438 && REGNO (XEXP (SET_DEST (prev_set), 0))
16439 == REGNO (XEXP (SET_DEST (curr_set), 0))
16440 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16441 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16442 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16443 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16444 && CONST_INT_P (SET_SRC (prev_set))
16445 && CONST_INT_P (SET_SRC (curr_set)))
16446 return true;
16449 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16451 /* We're trying to match:
16452 prev (adrp) == (set (reg r0)
16453 (high (symbol_ref ("SYM"))))
16454 curr (ldr) == (set (reg r1)
16455 (mem (lo_sum (reg r0)
16456 (symbol_ref ("SYM")))))
16458 curr (ldr) == (set (reg r1)
16459 (zero_extend (mem
16460 (lo_sum (reg r0)
16461 (symbol_ref ("SYM")))))) */
16462 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16463 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16465 rtx curr_src = SET_SRC (curr_set);
16467 if (GET_CODE (curr_src) == ZERO_EXTEND)
16468 curr_src = XEXP (curr_src, 0);
16470 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16471 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16472 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16473 == REGNO (SET_DEST (prev_set))
16474 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16475 XEXP (SET_SRC (prev_set), 0)))
16476 return true;
16480 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16481 && aarch_crypto_can_dual_issue (prev, curr))
16482 return true;
16484 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16485 && any_condjump_p (curr))
16487 enum attr_type prev_type = get_attr_type (prev);
16489 unsigned int condreg1, condreg2;
16490 rtx cc_reg_1;
16491 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16492 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16494 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16495 && prev
16496 && modified_in_p (cc_reg_1, prev))
16498 /* FIXME: this misses some cases that are considered simple arithmetic
16499 instructions for ThunderX.  Simple shifts are missed here. */
16500 if (prev_type == TYPE_ALUS_SREG
16501 || prev_type == TYPE_ALUS_IMM
16502 || prev_type == TYPE_LOGICS_REG
16503 || prev_type == TYPE_LOGICS_IMM)
16504 return true;
16508 if (prev_set
16509 && curr_set
16510 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16511 && any_condjump_p (curr))
16513 /* We're trying to match:
16514 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16515 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16516 (const_int 0))
16517 (label_ref ("SYM"))
16518 (pc)) */
16519 if (SET_DEST (curr_set) == (pc_rtx)
16520 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16521 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16522 && REG_P (SET_DEST (prev_set))
16523 && REGNO (SET_DEST (prev_set))
16524 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16526 /* Fuse ALU operations followed by conditional branch instruction. */
16527 switch (get_attr_type (prev))
16529 case TYPE_ALU_IMM:
16530 case TYPE_ALU_SREG:
16531 case TYPE_ADC_REG:
16532 case TYPE_ADC_IMM:
16533 case TYPE_ADCS_REG:
16534 case TYPE_ADCS_IMM:
16535 case TYPE_LOGIC_REG:
16536 case TYPE_LOGIC_IMM:
16537 case TYPE_CSEL:
16538 case TYPE_ADR:
16539 case TYPE_MOV_IMM:
16540 case TYPE_SHIFT_REG:
16541 case TYPE_SHIFT_IMM:
16542 case TYPE_BFM:
16543 case TYPE_RBIT:
16544 case TYPE_REV:
16545 case TYPE_EXTEND:
16546 return true;
16548 default:;
16553 return false;
16556 /* Return true iff the instruction fusion described by OP is enabled. */
16558 bool
16559 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16561 return (aarch64_tune_params.fusible_ops & op) != 0;
16564 /* If MEM is in the form [base+offset], extract the two parts of the
16565 address and store them in BASE and OFFSET; otherwise return false
16566 after clearing BASE and OFFSET. */
16568 bool
16569 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16571 rtx addr;
16573 gcc_assert (MEM_P (mem));
16575 addr = XEXP (mem, 0);
16577 if (REG_P (addr))
16579 *base = addr;
16580 *offset = const0_rtx;
16581 return true;
16584 if (GET_CODE (addr) == PLUS
16585 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16587 *base = XEXP (addr, 0);
16588 *offset = XEXP (addr, 1);
16589 return true;
16592 *base = NULL_RTX;
16593 *offset = NULL_RTX;
16595 return false;
16598 /* Types for scheduling fusion. */
16599 enum sched_fusion_type
16601 SCHED_FUSION_NONE = 0,
16602 SCHED_FUSION_LD_SIGN_EXTEND,
16603 SCHED_FUSION_LD_ZERO_EXTEND,
16604 SCHED_FUSION_LD,
16605 SCHED_FUSION_ST,
16606 SCHED_FUSION_NUM
16609 /* If INSN is a load or store whose address is in the form [base+offset],
16610 extract the two parts and store them in BASE and OFFSET.  Return the
16611 scheduling fusion type of INSN. */
16613 static enum sched_fusion_type
16614 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16616 rtx x, dest, src;
16617 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16619 gcc_assert (INSN_P (insn));
16620 x = PATTERN (insn);
16621 if (GET_CODE (x) != SET)
16622 return SCHED_FUSION_NONE;
16624 src = SET_SRC (x);
16625 dest = SET_DEST (x);
16627 machine_mode dest_mode = GET_MODE (dest);
16629 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16630 return SCHED_FUSION_NONE;
16632 if (GET_CODE (src) == SIGN_EXTEND)
16634 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16635 src = XEXP (src, 0);
16636 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16637 return SCHED_FUSION_NONE;
16639 else if (GET_CODE (src) == ZERO_EXTEND)
16641 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16642 src = XEXP (src, 0);
16643 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16644 return SCHED_FUSION_NONE;
16647 if (GET_CODE (src) == MEM && REG_P (dest))
16648 extract_base_offset_in_addr (src, base, offset);
16649 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16651 fusion = SCHED_FUSION_ST;
16652 extract_base_offset_in_addr (dest, base, offset);
16654 else
16655 return SCHED_FUSION_NONE;
16657 if (*base == NULL_RTX || *offset == NULL_RTX)
16658 fusion = SCHED_FUSION_NONE;
16660 return fusion;
16663 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16665 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16666 and PRI are only calculated for these instructions.  For other instructions,
16667 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
16668 types of instruction fusion can be added by returning different priorities.
16670 It's important that irrelevant instructions get the largest FUSION_PRI. */
16672 static void
16673 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16674 int *fusion_pri, int *pri)
16676 int tmp, off_val;
16677 rtx base, offset;
16678 enum sched_fusion_type fusion;
16680 gcc_assert (INSN_P (insn));
16682 tmp = max_pri - 1;
16683 fusion = fusion_load_store (insn, &base, &offset);
16684 if (fusion == SCHED_FUSION_NONE)
16686 *pri = tmp;
16687 *fusion_pri = tmp;
16688 return;
16691 /* Set FUSION_PRI according to fusion type and base register. */
16692 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16694 /* Calculate PRI. */
16695 tmp /= 2;
16697 /* INSN with smaller offset goes first. */
16698 off_val = (int)(INTVAL (offset));
16699 if (off_val >= 0)
16700 tmp -= (off_val & 0xfffff);
16701 else
16702 tmp += ((- off_val) & 0xfffff);
16704 *pri = tmp;
16705 return;
16708 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16709 Adjust priority of sha1h instructions so they are scheduled before
16710 other SHA1 instructions. */
16712 static int
16713 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16715 rtx x = PATTERN (insn);
16717 if (GET_CODE (x) == SET)
16719 x = SET_SRC (x);
16721 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16722 return priority + 10;
16725 return priority;
16728 /* Given OPERANDS of consecutive load/store, check if we can merge
16729 them into ldp/stp. LOAD is true if they are load instructions.
16730 MODE is the mode of memory operands. */
16732 bool
16733 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16734 machine_mode mode)
16736 HOST_WIDE_INT offval_1, offval_2, msize;
16737 enum reg_class rclass_1, rclass_2;
16738 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16740 if (load)
16742 mem_1 = operands[1];
16743 mem_2 = operands[3];
16744 reg_1 = operands[0];
16745 reg_2 = operands[2];
16746 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16747 if (REGNO (reg_1) == REGNO (reg_2))
16748 return false;
16750 else
16752 mem_1 = operands[0];
16753 mem_2 = operands[2];
16754 reg_1 = operands[1];
16755 reg_2 = operands[3];
16758 /* The mems cannot be volatile. */
16759 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16760 return false;
16762 /* If we have SImode and slow unaligned ldp,
16763 check that the alignment is at least 8 bytes. */
16764 if (mode == SImode
16765 && (aarch64_tune_params.extra_tuning_flags
16766 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16767 && !optimize_size
16768 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16769 return false;
16771 /* Check if the addresses are in the form of [base+offset]. */
16772 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16773 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16774 return false;
16775 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16776 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16777 return false;
16779 /* Check if the bases are the same. */
16780 if (!rtx_equal_p (base_1, base_2))
16781 return false;
16783 offval_1 = INTVAL (offset_1);
16784 offval_2 = INTVAL (offset_2);
16785 /* We should only be trying this for fixed-sized modes. There is no
16786 SVE LDP/STP instruction. */
16787 msize = GET_MODE_SIZE (mode).to_constant ();
16788 /* Check if the offsets are consecutive. */
16789 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16790 return false;
16792 /* Check if the addresses are clobbered by load. */
16793 if (load)
16795 if (reg_mentioned_p (reg_1, mem_1))
16796 return false;
16798 /* In increasing order, the last load can clobber the address. */
16799 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16800 return false;
16803 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16804 rclass_1 = FP_REGS;
16805 else
16806 rclass_1 = GENERAL_REGS;
16808 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16809 rclass_2 = FP_REGS;
16810 else
16811 rclass_2 = GENERAL_REGS;
16813 /* Check if the registers are of the same class. */
16814 if (rclass_1 != rclass_2)
16815 return false;
16817 return true;
16820 /* Given OPERANDS of consecutive load/store, check if we can merge
16821 them into ldp/stp by adjusting the offset. LOAD is true if they
16822 are load instructions. MODE is the mode of memory operands.
16824 Given consecutive stores such as:
16826 str w1, [xb, 0x100]
16827 str w1, [xb, 0x104]
16828 str w1, [xb, 0x108]
16829 str w1, [xb, 0x10c]
16831 Though the offsets are out of the range supported by stp, we can
16832 still pair them after adjusting the offset, like:
16834 add scratch, xb, 0x100
16835 stp w1, w1, [scratch]
16836 stp w1, w1, [scratch, 0x8]
16838 The peephole patterns detecting this opportunity should guarantee
16839 the scratch register is available. */
16841 bool
16842 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16843 scalar_mode mode)
16845 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16846 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16847 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16848 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16850 if (load)
16852 reg_1 = operands[0];
16853 mem_1 = operands[1];
16854 reg_2 = operands[2];
16855 mem_2 = operands[3];
16856 reg_3 = operands[4];
16857 mem_3 = operands[5];
16858 reg_4 = operands[6];
16859 mem_4 = operands[7];
16860 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16861 && REG_P (reg_3) && REG_P (reg_4));
16862 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16863 return false;
16865 else
16867 mem_1 = operands[0];
16868 reg_1 = operands[1];
16869 mem_2 = operands[2];
16870 reg_2 = operands[3];
16871 mem_3 = operands[4];
16872 reg_3 = operands[5];
16873 mem_4 = operands[6];
16874 reg_4 = operands[7];
16876 /* Skip if the memory operand is by itself valid for ldp/stp. */
16877 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16878 return false;
16880 /* The mems cannot be volatile. */
16881 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16882 || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
16883 return false;
16885 /* Check if the addresses are in the form of [base+offset]. */
16886 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16887 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16888 return false;
16889 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16890 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16891 return false;
16892 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16893 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16894 return false;
16895 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16896 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16897 return false;
16899 /* Check if the bases are the same. */
16900 if (!rtx_equal_p (base_1, base_2)
16901 || !rtx_equal_p (base_2, base_3)
16902 || !rtx_equal_p (base_3, base_4))
16903 return false;
16905 offval_1 = INTVAL (offset_1);
16906 offval_2 = INTVAL (offset_2);
16907 offval_3 = INTVAL (offset_3);
16908 offval_4 = INTVAL (offset_4);
16909 msize = GET_MODE_SIZE (mode);
16910 /* Check if the offsets are consecutive. */
16911 if ((offval_1 != (offval_2 + msize)
16912 || offval_1 != (offval_3 + msize * 2)
16913 || offval_1 != (offval_4 + msize * 3))
16914 && (offval_4 != (offval_3 + msize)
16915 || offval_4 != (offval_2 + msize * 2)
16916 || offval_4 != (offval_1 + msize * 3)))
16917 return false;
16919 /* Check if the addresses are clobbered by load. */
16920 if (load)
16922 if (reg_mentioned_p (reg_1, mem_1)
16923 || reg_mentioned_p (reg_2, mem_2)
16924 || reg_mentioned_p (reg_3, mem_3))
16925 return false;
16927 /* In increasing order, the last load can clobber the address. */
16928 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
16929 return false;
16932 /* If we have SImode and slow unaligned ldp,
16933 check that the alignment is at least 8 bytes. */
16934 if (mode == SImode
16935 && (aarch64_tune_params.extra_tuning_flags
16936 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16937 && !optimize_size
16938 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16939 return false;
16941 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16942 rclass_1 = FP_REGS;
16943 else
16944 rclass_1 = GENERAL_REGS;
16946 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16947 rclass_2 = FP_REGS;
16948 else
16949 rclass_2 = GENERAL_REGS;
16951 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
16952 rclass_3 = FP_REGS;
16953 else
16954 rclass_3 = GENERAL_REGS;
16956 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
16957 rclass_4 = FP_REGS;
16958 else
16959 rclass_4 = GENERAL_REGS;
16961 /* Check if the registers are of the same class. */
16962 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
16963 return false;
16965 return true;
16968 /* Given OPERANDS of consecutive load/store, this function pairs them
16969 into ldp/stp after adjusting the offset. It depends on the fact
16970 that addresses of load/store instructions are in increasing order.
16971 MODE is the mode of memory operands. CODE is the rtl operator
16972 which should be applied to all memory operands, it's SIGN_EXTEND,
16973 ZERO_EXTEND or UNKNOWN. */
16975 bool
16976 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
16977 scalar_mode mode, RTX_CODE code)
16979 rtx base, offset, t1, t2;
16980 rtx mem_1, mem_2, mem_3, mem_4;
16981 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
16983 if (load)
16985 mem_1 = operands[1];
16986 mem_2 = operands[3];
16987 mem_3 = operands[5];
16988 mem_4 = operands[7];
16990 else
16992 mem_1 = operands[0];
16993 mem_2 = operands[2];
16994 mem_3 = operands[4];
16995 mem_4 = operands[6];
16996 gcc_assert (code == UNKNOWN);
16999 extract_base_offset_in_addr (mem_1, &base, &offset);
17000 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
17002 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
17003 msize = GET_MODE_SIZE (mode);
17004 stp_off_limit = msize * 0x40;
17005 off_val = INTVAL (offset);
17006 abs_off = (off_val < 0) ? -off_val : off_val;
17007 new_off = abs_off % stp_off_limit;
17008 adj_off = abs_off - new_off;
17010 /* Further adjust to make sure all offsets are OK. */
17011 if ((new_off + msize * 2) >= stp_off_limit)
17013 adj_off += stp_off_limit;
17014 new_off -= stp_off_limit;
17017 /* Make sure the adjustment can be done with ADD/SUB instructions. */
17018 if (adj_off >= 0x1000)
17019 return false;
17021 if (off_val < 0)
17023 adj_off = -adj_off;
17024 new_off = -new_off;
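/* Illustration (not in the original source): for the SImode example in the
   comment above, msize == 4 gives stp_off_limit == 0x100, so an offset of
   0x100 yields adj_off == 0x100 and new_off == 0: one "add scratch, xb,
   0x100" followed by pair accesses at offsets 0 and 8.  */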
17027 /* Create new memory references. */
17028 mem_1 = change_address (mem_1, VOIDmode,
17029 plus_constant (DImode, operands[8], new_off));
17031 /* Check if the adjusted address is OK for ldp/stp. */
17032 if (!aarch64_mem_pair_operand (mem_1, mode))
17033 return false;
17035 msize = GET_MODE_SIZE (mode);
17036 mem_2 = change_address (mem_2, VOIDmode,
17037 plus_constant (DImode,
17038 operands[8],
17039 new_off + msize));
17040 mem_3 = change_address (mem_3, VOIDmode,
17041 plus_constant (DImode,
17042 operands[8],
17043 new_off + msize * 2));
17044 mem_4 = change_address (mem_4, VOIDmode,
17045 plus_constant (DImode,
17046 operands[8],
17047 new_off + msize * 3));
17049 if (code == ZERO_EXTEND)
17051 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17052 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17053 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17054 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17056 else if (code == SIGN_EXTEND)
17058 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17059 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17060 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17061 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17064 if (load)
17066 operands[1] = mem_1;
17067 operands[3] = mem_2;
17068 operands[5] = mem_3;
17069 operands[7] = mem_4;
17071 else
17073 operands[0] = mem_1;
17074 operands[2] = mem_2;
17075 operands[4] = mem_3;
17076 operands[6] = mem_4;
17079 /* Emit adjusting instruction. */
17080 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
17081 /* Emit ldp/stp instructions. */
17082 t1 = gen_rtx_SET (operands[0], operands[1]);
17083 t2 = gen_rtx_SET (operands[2], operands[3]);
17084 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17085 t1 = gen_rtx_SET (operands[4], operands[5]);
17086 t2 = gen_rtx_SET (operands[6], operands[7]);
17087 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17088 return true;
17091 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17092 it isn't worth branching around empty masked ops (including masked
17093 stores). */
17095 static bool
17096 aarch64_empty_mask_is_expensive (unsigned)
17098 return false;
17101 /* Return true if a pseudo register should be created and used to hold
17102 the GOT address for PIC code. */
17104 bool
17105 aarch64_use_pseudo_pic_reg (void)
17107 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17110 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17112 static int
17113 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17115 switch (XINT (x, 1))
17117 case UNSPEC_GOTSMALLPIC:
17118 case UNSPEC_GOTSMALLPIC28K:
17119 case UNSPEC_GOTTINYPIC:
17120 return 0;
17121 default:
17122 break;
17125 return default_unspec_may_trap_p (x, flags);
17129 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17130 return the log2 of that value. Otherwise return -1. */
17133 aarch64_fpconst_pow_of_2 (rtx x)
17135 const REAL_VALUE_TYPE *r;
17137 if (!CONST_DOUBLE_P (x))
17138 return -1;
17140 r = CONST_DOUBLE_REAL_VALUE (x);
17142 if (REAL_VALUE_NEGATIVE (*r)
17143 || REAL_VALUE_ISNAN (*r)
17144 || REAL_VALUE_ISINF (*r)
17145 || !real_isinteger (r, DFmode))
17146 return -1;
17148 return exact_log2 (real_to_integer (r));
17151 /* If X is a vector of equal CONST_DOUBLE values and that value is
17152 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17155 aarch64_vec_fpconst_pow_of_2 (rtx x)
17157 int nelts;
17158 if (GET_CODE (x) != CONST_VECTOR
17159 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17160 return -1;
17162 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17163 return -1;
17165 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17166 if (firstval <= 0)
17167 return -1;
17169 for (int i = 1; i < nelts; i++)
17170 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17171 return -1;
17173 return firstval;
17176 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17177 to float.
17179 __fp16 always promotes through this hook.
17180 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17181 through the generic excess precision logic rather than here. */
17183 static tree
17184 aarch64_promoted_type (const_tree t)
17186 if (SCALAR_FLOAT_TYPE_P (t)
17187 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17188 return float_type_node;
17190 return NULL_TREE;
17193 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17195 static bool
17196 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17197 optimization_type opt_type)
17199 switch (op)
17201 case rsqrt_optab:
17202 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17204 default:
17205 return true;
17209 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17211 static unsigned int
17212 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17213 int *offset)
17215 /* Polynomial invariant 1 == (VG / 2) - 1. */
17216 gcc_assert (i == 1);
17217 *factor = 2;
17218 *offset = 1;
17219 return AARCH64_DWARF_VG;
17222 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17223 if MODE is HFmode, and punt to the generic implementation otherwise. */
17225 static bool
17226 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17228 return (mode == HFmode
17229 ? true
17230 : default_libgcc_floating_mode_supported_p (mode));
17233 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17234 if MODE is HFmode, and punt to the generic implementation otherwise. */
17236 static bool
17237 aarch64_scalar_mode_supported_p (scalar_mode mode)
17239 return (mode == HFmode
17240 ? true
17241 : default_scalar_mode_supported_p (mode));
17244 /* Set the value of FLT_EVAL_METHOD.
17245 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17247 0: evaluate all operations and constants, whose semantic type has at
17248 most the range and precision of type float, to the range and
17249 precision of float; evaluate all other operations and constants to
17250 the range and precision of the semantic type;
17252 N, where _FloatN is a supported interchange floating type:
17253 evaluate all operations and constants, whose semantic type has at
17254 most the range and precision of _FloatN type, to the range and
17255 precision of the _FloatN type; evaluate all other operations and
17256 constants to the range and precision of the semantic type;
17258 If we have the ARMv8.2-A extensions then we support _Float16 in native
17259 precision, so we should set this to 16. Otherwise, we support the type,
17260 but want to evaluate expressions in float precision, so set this to
17261 0. */
17263 static enum flt_eval_method
17264 aarch64_excess_precision (enum excess_precision_type type)
17266 switch (type)
17268 case EXCESS_PRECISION_TYPE_FAST:
17269 case EXCESS_PRECISION_TYPE_STANDARD:
17270 /* We can calculate either in 16-bit range and precision or
17271 32-bit range and precision. Make that decision based on whether
17272 we have native support for the ARMv8.2-A 16-bit floating-point
17273 instructions or not. */
17274 return (TARGET_FP_F16INST
17275 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17276 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17277 case EXCESS_PRECISION_TYPE_IMPLICIT:
17278 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17279 default:
17280 gcc_unreachable ();
17282 return FLT_EVAL_METHOD_UNPREDICTABLE;
17285 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17286 scheduled for speculative execution. Reject the long-running division
17287 and square-root instructions. */
17289 static bool
17290 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17292 switch (get_attr_type (insn))
17294 case TYPE_SDIV:
17295 case TYPE_UDIV:
17296 case TYPE_FDIVS:
17297 case TYPE_FDIVD:
17298 case TYPE_FSQRTS:
17299 case TYPE_FSQRTD:
17300 case TYPE_NEON_FP_SQRT_S:
17301 case TYPE_NEON_FP_SQRT_D:
17302 case TYPE_NEON_FP_SQRT_S_Q:
17303 case TYPE_NEON_FP_SQRT_D_Q:
17304 case TYPE_NEON_FP_DIV_S:
17305 case TYPE_NEON_FP_DIV_D:
17306 case TYPE_NEON_FP_DIV_S_Q:
17307 case TYPE_NEON_FP_DIV_D_Q:
17308 return false;
17309 default:
17310 return true;
17314 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17316 static int
17317 aarch64_compute_pressure_classes (reg_class *classes)
17318 {
17319   int i = 0;
17320   classes[i++] = GENERAL_REGS;
17321   classes[i++] = FP_REGS;
17322   /* PR_REGS isn't a useful pressure class because many predicate pseudo
17323      registers need to go in PR_LO_REGS at some point during their
17324      lifetime.  Splitting it into two halves has the effect of making
17325      all predicates count against PR_LO_REGS, so that we try whenever
17326      possible to restrict the number of live predicates to 8.  This
17327      greatly reduces the amount of spilling in certain loops.  */
17328   classes[i++] = PR_LO_REGS;
17329   classes[i++] = PR_HI_REGS;
17330   return i;
17331 }
17333 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17335 static bool
17336 aarch64_can_change_mode_class (machine_mode from,
17337                                machine_mode to, reg_class_t)
17338 {
17339   if (BYTES_BIG_ENDIAN)
17340     {
17341       bool from_sve_p = aarch64_sve_data_mode_p (from);
17342       bool to_sve_p = aarch64_sve_data_mode_p (to);
17344       /* Don't allow changes between SVE data modes and non-SVE modes.
17345          See the comment at the head of aarch64-sve.md for details.  */
17346       if (from_sve_p != to_sve_p)
17347         return false;
17349       /* Don't allow changes in element size: lane 0 of the new vector
17350          would not then be lane 0 of the old vector.  See the comment
17351          above aarch64_maybe_expand_sve_subreg_move for a more detailed
17352          description.
17354          In the worst case, this forces a register to be spilled in
17355          one mode and reloaded in the other, which handles the
17356          endianness correctly.  */
17357       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17358         return false;
17359     }
17360   return true;
17361 }
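/* By way of example (illustrative, not exhaustive): on a big-endian
   target a subreg-style change from VNx4SImode to VNx8HImode is rejected
   by the element-size check above, because lane 0 of the HI vector would
   no longer be lane 0 of the SI vector, whereas a change between SVE
   modes with equal element sizes, such as VNx4SImode and VNx4SFmode,
   remains allowed.  */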
17363 /* Implement TARGET_EARLY_REMAT_MODES. */
17365 static void
17366 aarch64_select_early_remat_modes (sbitmap modes)
17367 {
17368   /* SVE values are not normally live across a call, so it should be
17369      worth doing early rematerialization even in VL-specific mode.  */
17370   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17371     {
17372       machine_mode mode = (machine_mode) i;
17373       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17374       if (vec_flags & VEC_ANY_SVE)
17375         bitmap_set_bit (modes, i);
17376     }
17377 }
17379 /* Target-specific selftests. */
17381 #if CHECKING_P
17383 namespace selftest {
17385 /* Selftest for the RTL loader.
17386 Verify that the RTL loader copes with a dump from
17387 print_rtx_function. This is essentially just a test that class
17388 function_reader can handle a real dump, but it also verifies
17389 that lookup_reg_by_dump_name correctly handles hard regs.
17390 The presence of hard reg names in the dump means that the test is
17391 target-specific, hence it is in this file. */
17393 static void
17394 aarch64_test_loading_full_dump ()
17395 {
17396   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17398   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17400   rtx_insn *insn_1 = get_insn_by_uid (1);
17401   ASSERT_EQ (NOTE, GET_CODE (insn_1));
17403   rtx_insn *insn_15 = get_insn_by_uid (15);
17404   ASSERT_EQ (INSN, GET_CODE (insn_15));
17405   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17407   /* Verify crtl->return_rtx.  */
17408   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17409   ASSERT_EQ (0, REGNO (crtl->return_rtx));
17410   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17411 }
17413 /* Run all target-specific selftests. */
17415 static void
17416 aarch64_run_selftests (void)
17417 {
17418   aarch64_test_loading_full_dump ();
17419 }
17421 } // namespace selftest
17423 #endif /* #if CHECKING_P */
17425 #undef TARGET_ADDRESS_COST
17426 #define TARGET_ADDRESS_COST aarch64_address_cost
17428 /* This hook determines whether unnamed bitfields affect the alignment
17429    of the containing structure.  The hook returns true if the structure
17430    should inherit the alignment requirements of an unnamed bitfield's
17431    type.  */
17432 #undef TARGET_ALIGN_ANON_BITFIELD
17433 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
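/* For example (an illustrative sketch): with this hook returning true,

     struct s { char c; int : 0; char d; };

   is laid out with the 4-byte alignment of the unnamed bit-field's
   declared type (int), as the AAPCS64 expects for unnamed bit-fields.  */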
17435 #undef TARGET_ASM_ALIGNED_DI_OP
17436 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17438 #undef TARGET_ASM_ALIGNED_HI_OP
17439 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17441 #undef TARGET_ASM_ALIGNED_SI_OP
17442 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17444 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17445 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17446 hook_bool_const_tree_hwi_hwi_const_tree_true
17448 #undef TARGET_ASM_FILE_START
17449 #define TARGET_ASM_FILE_START aarch64_start_file
17451 #undef TARGET_ASM_OUTPUT_MI_THUNK
17452 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17454 #undef TARGET_ASM_SELECT_RTX_SECTION
17455 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17457 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17458 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17460 #undef TARGET_BUILD_BUILTIN_VA_LIST
17461 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17463 #undef TARGET_CALLEE_COPIES
17464 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17466 #undef TARGET_CAN_ELIMINATE
17467 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17469 #undef TARGET_CAN_INLINE_P
17470 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17472 #undef TARGET_CANNOT_FORCE_CONST_MEM
17473 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17475 #undef TARGET_CASE_VALUES_THRESHOLD
17476 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17478 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17479 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17481 /* Only the least significant bit is used for initialization guard
17482 variables. */
17483 #undef TARGET_CXX_GUARD_MASK_BIT
17484 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
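/* Roughly speaking, the inline check emitted for a function-local static
   then tests only bit 0 of the guard variable, along the lines of

     if (!(guard & 1) && __cxa_guard_acquire (&guard))
       { ... run the constructor ...; __cxa_guard_release (&guard); }

   rather than testing the whole first byte (a sketch of the intent, not
   the exact code GCC emits).  */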
17486 #undef TARGET_C_MODE_FOR_SUFFIX
17487 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17489 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17490 #undef TARGET_DEFAULT_TARGET_FLAGS
17491 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17492 #endif
17494 #undef TARGET_CLASS_MAX_NREGS
17495 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17497 #undef TARGET_BUILTIN_DECL
17498 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17500 #undef TARGET_BUILTIN_RECIPROCAL
17501 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17503 #undef TARGET_C_EXCESS_PRECISION
17504 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17506 #undef TARGET_EXPAND_BUILTIN
17507 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17509 #undef TARGET_EXPAND_BUILTIN_VA_START
17510 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17512 #undef TARGET_FOLD_BUILTIN
17513 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17515 #undef TARGET_FUNCTION_ARG
17516 #define TARGET_FUNCTION_ARG aarch64_function_arg
17518 #undef TARGET_FUNCTION_ARG_ADVANCE
17519 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17521 #undef TARGET_FUNCTION_ARG_BOUNDARY
17522 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17524 #undef TARGET_FUNCTION_ARG_PADDING
17525 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17527 #undef TARGET_GET_RAW_RESULT_MODE
17528 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17529 #undef TARGET_GET_RAW_ARG_MODE
17530 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17532 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17533 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17535 #undef TARGET_FUNCTION_VALUE
17536 #define TARGET_FUNCTION_VALUE aarch64_function_value
17538 #undef TARGET_FUNCTION_VALUE_REGNO_P
17539 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17541 #undef TARGET_GIMPLE_FOLD_BUILTIN
17542 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17544 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17545 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17547 #undef TARGET_INIT_BUILTINS
17548 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17550 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17551 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17552 aarch64_ira_change_pseudo_allocno_class
17554 #undef TARGET_LEGITIMATE_ADDRESS_P
17555 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17557 #undef TARGET_LEGITIMATE_CONSTANT_P
17558 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17560 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17561 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17562 aarch64_legitimize_address_displacement
17564 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17565 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17567 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17568 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17569 aarch64_libgcc_floating_mode_supported_p
17571 #undef TARGET_MANGLE_TYPE
17572 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17574 #undef TARGET_MEMORY_MOVE_COST
17575 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17577 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17578 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17580 #undef TARGET_MUST_PASS_IN_STACK
17581 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17583 /* This target hook should return true if accesses to volatile bitfields
17584 should use the narrowest mode possible. It should return false if these
17585 accesses should use the bitfield container type. */
17586 #undef TARGET_NARROW_VOLATILE_BITFIELD
17587 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
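/* As a sketch of the difference: given

     struct s { volatile int f : 8; } x;

   returning false here means a read of x.f is performed as a 32-bit
   access to the int container rather than as a single-byte load.  */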
17589 #undef TARGET_OPTION_OVERRIDE
17590 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17592 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17593 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17594 aarch64_override_options_after_change
17596 #undef TARGET_OPTION_SAVE
17597 #define TARGET_OPTION_SAVE aarch64_option_save
17599 #undef TARGET_OPTION_RESTORE
17600 #define TARGET_OPTION_RESTORE aarch64_option_restore
17602 #undef TARGET_OPTION_PRINT
17603 #define TARGET_OPTION_PRINT aarch64_option_print
17605 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17606 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17608 #undef TARGET_SET_CURRENT_FUNCTION
17609 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17611 #undef TARGET_PASS_BY_REFERENCE
17612 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17614 #undef TARGET_PREFERRED_RELOAD_CLASS
17615 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17617 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17618 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17620 #undef TARGET_PROMOTED_TYPE
17621 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17623 #undef TARGET_SECONDARY_RELOAD
17624 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17626 #undef TARGET_SHIFT_TRUNCATION_MASK
17627 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17629 #undef TARGET_SETUP_INCOMING_VARARGS
17630 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17632 #undef TARGET_STRUCT_VALUE_RTX
17633 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17635 #undef TARGET_REGISTER_MOVE_COST
17636 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17638 #undef TARGET_RETURN_IN_MEMORY
17639 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17641 #undef TARGET_RETURN_IN_MSB
17642 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17644 #undef TARGET_RTX_COSTS
17645 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17647 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17648 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17650 #undef TARGET_SCHED_ISSUE_RATE
17651 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17653 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17654 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17655 aarch64_sched_first_cycle_multipass_dfa_lookahead
17657 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17658 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17659 aarch64_first_cycle_multipass_dfa_lookahead_guard
17661 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17662 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17663 aarch64_get_separate_components
17665 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17666 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17667 aarch64_components_for_bb
17669 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17670 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17671 aarch64_disqualify_components
17673 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17674 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17675 aarch64_emit_prologue_components
17677 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17678 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17679 aarch64_emit_epilogue_components
17681 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17682 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17683 aarch64_set_handled_components
17685 #undef TARGET_TRAMPOLINE_INIT
17686 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17688 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17689 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17691 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17692 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17694 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17695 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17696 aarch64_builtin_support_vector_misalignment
17698 #undef TARGET_ARRAY_MODE
17699 #define TARGET_ARRAY_MODE aarch64_array_mode
17701 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17702 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17704 #undef TARGET_VECTORIZE_ADD_STMT_COST
17705 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17707 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17708 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17709 aarch64_builtin_vectorization_cost
17711 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17712 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17714 #undef TARGET_VECTORIZE_BUILTINS
17715 #define TARGET_VECTORIZE_BUILTINS
17717 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17718 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17719 aarch64_builtin_vectorized_function
17721 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17722 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17723 aarch64_autovectorize_vector_sizes
17725 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17726 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17727 aarch64_atomic_assign_expand_fenv
17729 /* Section anchor support. */
17731 #undef TARGET_MIN_ANCHOR_OFFSET
17732 #define TARGET_MIN_ANCHOR_OFFSET -256
17734 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17735 byte offset; we can do much more for larger data types, but have no way
17736 to determine the size of the access. We assume accesses are aligned. */
17737 #undef TARGET_MAX_ANCHOR_OFFSET
17738 #define TARGET_MAX_ANCHOR_OFFSET 4095
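/* As a rough illustration, these bounds match the immediate ranges of the
   basic addressing modes when the access size is unknown:

     ldrb w0, [x1, #4095]    // unsigned 12-bit byte offset, at most 4095
     ldur x0, [x1, #-256]    // signed 9-bit unscaled offset, at least -256

   Larger offsets are only valid for suitably sized and aligned
   accesses.  */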
17740 #undef TARGET_VECTOR_ALIGNMENT
17741 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17743 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17744 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17745 aarch64_vectorize_preferred_vector_alignment
17746 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17747 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17748 aarch64_simd_vector_alignment_reachable
17750 /* vec_perm support. */
17752 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17753 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17754 aarch64_vectorize_vec_perm_const
17756 #undef TARGET_VECTORIZE_GET_MASK_MODE
17757 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17758 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17759 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17760 aarch64_empty_mask_is_expensive
17762 #undef TARGET_INIT_LIBFUNCS
17763 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17765 #undef TARGET_FIXED_CONDITION_CODE_REGS
17766 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17768 #undef TARGET_FLAGS_REGNUM
17769 #define TARGET_FLAGS_REGNUM CC_REGNUM
17771 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17772 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17774 #undef TARGET_ASAN_SHADOW_OFFSET
17775 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17777 #undef TARGET_LEGITIMIZE_ADDRESS
17778 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17780 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17781 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17783 #undef TARGET_CAN_USE_DOLOOP_P
17784 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17786 #undef TARGET_SCHED_ADJUST_PRIORITY
17787 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17789 #undef TARGET_SCHED_MACRO_FUSION_P
17790 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17792 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17793 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17795 #undef TARGET_SCHED_FUSION_PRIORITY
17796 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17798 #undef TARGET_UNSPEC_MAY_TRAP_P
17799 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17801 #undef TARGET_USE_PSEUDO_PIC_REG
17802 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17804 #undef TARGET_PRINT_OPERAND
17805 #define TARGET_PRINT_OPERAND aarch64_print_operand
17807 #undef TARGET_PRINT_OPERAND_ADDRESS
17808 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17810 #undef TARGET_OPTAB_SUPPORTED_P
17811 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17813 #undef TARGET_OMIT_STRUCT_RETURN_REG
17814 #define TARGET_OMIT_STRUCT_RETURN_REG true
17816 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17817 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17818 aarch64_dwarf_poly_indeterminate_value
17820 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
17821 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17822 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
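/* That is, 4 == (1 << 2): a function pointer with bit 2 set is taken to
   point at a descriptor rather than directly at code.  */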
17824 #undef TARGET_HARD_REGNO_NREGS
17825 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17826 #undef TARGET_HARD_REGNO_MODE_OK
17827 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17829 #undef TARGET_MODES_TIEABLE_P
17830 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17832 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17833 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17834 aarch64_hard_regno_call_part_clobbered
17836 #undef TARGET_CONSTANT_ALIGNMENT
17837 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17839 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17840 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17842 #undef TARGET_CAN_CHANGE_MODE_CLASS
17843 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17845 #undef TARGET_SELECT_EARLY_REMAT_MODES
17846 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17848 #if CHECKING_P
17849 #undef TARGET_RUN_TARGET_SELFTESTS
17850 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17851 #endif /* #if CHECKING_P */
17853 struct gcc_target targetm = TARGET_INITIALIZER;
17855 #include "gt-aarch64.h"