[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 4b5183b602b8786307deb8e3d8056323028b50a2
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC
102 A constant symbolic address, in the pc-relative literal pool. */
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Support for command line parsing of boolean flags in the tuning
224 structures. */
225 struct aarch64_flag_desc
227 const char* name;
228 unsigned int flag;
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
235 { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL },
238 { NULL, AARCH64_FUSE_NOTHING }
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
245 { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL },
248 { NULL, AARCH64_EXTRA_TUNE_NONE }
251 /* Tuning parameters. */
253 static const struct cpu_addrcost_table generic_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
288 1, /* hi */
289 0, /* si */
290 0, /* di */
291 1, /* ti */
293 1, /* pre_modify */
294 0, /* post_modify */
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
298 0, /* imm_offset */
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
304 1, /* hi */
305 1, /* si */
306 1, /* di */
307 2, /* ti */
309 0, /* pre_modify */
310 0, /* post_modify */
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
314 0, /* imm_offset */
317 static const struct cpu_regmove_cost generic_regmove_cost =
319 1, /* GP2GP */
320 /* Avoid the use of slow int<->fp moves for spilling by setting
321 their cost higher than memmov_cost. */
322 5, /* GP2FP */
323 5, /* FP2GP */
324 2 /* FP2FP */
327 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of slow int<->fp moves for spilling by setting
331 their cost higher than memmov_cost. */
332 5, /* GP2FP */
333 5, /* FP2GP */
334 2 /* FP2FP */
337 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost (the actual costs are 4 and 9). */
352 9, /* GP2FP */
353 9, /* FP2GP */
354 1 /* FP2FP */
357 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 2, /* GP2GP */
360 2, /* GP2FP */
361 6, /* FP2GP */
362 4 /* FP2FP */
365 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 8, /* GP2FP */
371 8, /* FP2GP */
372 2 /* FP2FP */
375 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 2, /* GP2GP */
378 /* Avoid the use of int<->fp moves for spilling. */
379 6, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of int<->fp moves for spilling. */
388 8, /* GP2FP */
389 8, /* FP2GP */
390 4 /* FP2FP */
393 /* Generic costs for vector insn classes. */
394 static const struct cpu_vector_cost generic_vector_cost =
396 1, /* scalar_int_stmt_cost */
397 1, /* scalar_fp_stmt_cost */
398 1, /* scalar_load_cost */
399 1, /* scalar_store_cost */
400 1, /* vec_int_stmt_cost */
401 1, /* vec_fp_stmt_cost */
402 2, /* vec_permute_cost */
403 1, /* vec_to_scalar_cost */
404 1, /* scalar_to_vec_cost */
405 1, /* vec_align_load_cost */
406 1, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 3, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 /* ThunderX costs for vector insn classes. */
414 static const struct cpu_vector_cost thunderx_vector_cost =
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 3, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 4, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 4, /* vec_permute_cost */
423 2, /* vec_to_scalar_cost */
424 2, /* scalar_to_vec_cost */
425 3, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 5, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 3 /* cond_not_taken_branch_cost */
433 /* Generic costs for vector insn classes. */
434 static const struct cpu_vector_cost cortexa57_vector_cost =
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 4, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 2, /* vec_int_stmt_cost */
441 2, /* vec_fp_stmt_cost */
442 3, /* vec_permute_cost */
443 8, /* vec_to_scalar_cost */
444 8, /* scalar_to_vec_cost */
445 4, /* vec_align_load_cost */
446 4, /* vec_unalign_load_cost */
447 1, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 1, /* cond_taken_branch_cost */
450 1 /* cond_not_taken_branch_cost */
453 static const struct cpu_vector_cost exynosm1_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 5, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 3, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 3, /* vec_to_scalar_cost */
463 3, /* scalar_to_vec_cost */
464 5, /* vec_align_load_cost */
465 5, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* Generic costs for vector insn classes. */
473 static const struct cpu_vector_cost xgene1_vector_cost =
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 2, /* vec_int_stmt_cost */
480 2, /* vec_fp_stmt_cost */
481 2, /* vec_permute_cost */
482 4, /* vec_to_scalar_cost */
483 4, /* scalar_to_vec_cost */
484 10, /* vec_align_load_cost */
485 10, /* vec_unalign_load_cost */
486 2, /* vec_unalign_store_cost */
487 2, /* vec_store_cost */
488 2, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Costs for vector insn classes for Vulcan. */
493 static const struct cpu_vector_cost thunderx2t99_vector_cost =
495 1, /* scalar_int_stmt_cost */
496 6, /* scalar_fp_stmt_cost */
497 4, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 5, /* vec_int_stmt_cost */
500 6, /* vec_fp_stmt_cost */
501 3, /* vec_permute_cost */
502 6, /* vec_to_scalar_cost */
503 5, /* scalar_to_vec_cost */
504 8, /* vec_align_load_cost */
505 8, /* vec_unalign_load_cost */
506 4, /* vec_unalign_store_cost */
507 4, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Generic costs for branch instructions. */
513 static const struct cpu_branch_cost generic_branch_cost =
515 1, /* Predictable. */
516 3 /* Unpredictable. */
519 /* Generic approximation modes. */
520 static const cpu_approx_modes generic_approx_modes =
522 AARCH64_APPROX_NONE, /* division */
523 AARCH64_APPROX_NONE, /* sqrt */
524 AARCH64_APPROX_NONE /* recip_sqrt */
527 /* Approximation modes for Exynos M1. */
528 static const cpu_approx_modes exynosm1_approx_modes =
530 AARCH64_APPROX_NONE, /* division */
531 AARCH64_APPROX_ALL, /* sqrt */
532 AARCH64_APPROX_ALL /* recip_sqrt */
535 /* Approximation modes for X-Gene 1. */
536 static const cpu_approx_modes xgene1_approx_modes =
538 AARCH64_APPROX_NONE, /* division */
539 AARCH64_APPROX_NONE, /* sqrt */
540 AARCH64_APPROX_ALL /* recip_sqrt */
543 /* Generic prefetch settings (which disable prefetch). */
544 static const cpu_prefetch_tune generic_prefetch_tune =
546 0, /* num_slots */
547 -1, /* l1_cache_size */
548 -1, /* l1_cache_line_size */
549 -1, /* l2_cache_size */
550 -1 /* default_opt_level */
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 -1 /* default_opt_level */
562 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
564 4, /* num_slots */
565 32, /* l1_cache_size */
566 64, /* l1_cache_line_size */
567 1024, /* l2_cache_size */
568 -1 /* default_opt_level */
571 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
573 8, /* num_slots */
574 32, /* l1_cache_size */
575 128, /* l1_cache_line_size */
576 16*1024, /* l2_cache_size */
577 3 /* default_opt_level */
580 static const cpu_prefetch_tune thunderx_prefetch_tune =
582 8, /* num_slots */
583 32, /* l1_cache_size */
584 128, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
591 8, /* num_slots */
592 32, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 256, /* l2_cache_size */
595 -1 /* default_opt_level */
598 static const struct tune_params generic_tunings =
600 &cortexa57_extra_costs,
601 &generic_addrcost_table,
602 &generic_regmove_cost,
603 &generic_vector_cost,
604 &generic_branch_cost,
605 &generic_approx_modes,
606 4, /* memmov_cost */
607 2, /* issue_rate */
608 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
609 8, /* function_align. */
610 4, /* jump_align. */
611 8, /* loop_align. */
612 2, /* int_reassoc_width. */
613 4, /* fp_reassoc_width. */
614 1, /* vec_reassoc_width. */
615 2, /* min_div_recip_mul_sf. */
616 2, /* min_div_recip_mul_df. */
617 0, /* max_case_values. */
618 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
619 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
620 &generic_prefetch_tune
623 static const struct tune_params cortexa35_tunings =
625 &cortexa53_extra_costs,
626 &generic_addrcost_table,
627 &cortexa53_regmove_cost,
628 &generic_vector_cost,
629 &generic_branch_cost,
630 &generic_approx_modes,
631 4, /* memmov_cost */
632 1, /* issue_rate */
633 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
634 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
635 16, /* function_align. */
636 4, /* jump_align. */
637 8, /* loop_align. */
638 2, /* int_reassoc_width. */
639 4, /* fp_reassoc_width. */
640 1, /* vec_reassoc_width. */
641 2, /* min_div_recip_mul_sf. */
642 2, /* min_div_recip_mul_df. */
643 0, /* max_case_values. */
644 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
645 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
646 &generic_prefetch_tune
649 static const struct tune_params cortexa53_tunings =
651 &cortexa53_extra_costs,
652 &generic_addrcost_table,
653 &cortexa53_regmove_cost,
654 &generic_vector_cost,
655 &generic_branch_cost,
656 &generic_approx_modes,
657 4, /* memmov_cost */
658 2, /* issue_rate */
659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
660 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
661 16, /* function_align. */
662 4, /* jump_align. */
663 8, /* loop_align. */
664 2, /* int_reassoc_width. */
665 4, /* fp_reassoc_width. */
666 1, /* vec_reassoc_width. */
667 2, /* min_div_recip_mul_sf. */
668 2, /* min_div_recip_mul_df. */
669 0, /* max_case_values. */
670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
671 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
672 &generic_prefetch_tune
675 static const struct tune_params cortexa57_tunings =
677 &cortexa57_extra_costs,
678 &generic_addrcost_table,
679 &cortexa57_regmove_cost,
680 &cortexa57_vector_cost,
681 &generic_branch_cost,
682 &generic_approx_modes,
683 4, /* memmov_cost */
684 3, /* issue_rate */
685 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
686 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
687 16, /* function_align. */
688 4, /* jump_align. */
689 8, /* loop_align. */
690 2, /* int_reassoc_width. */
691 4, /* fp_reassoc_width. */
692 1, /* vec_reassoc_width. */
693 2, /* min_div_recip_mul_sf. */
694 2, /* min_div_recip_mul_df. */
695 0, /* max_case_values. */
696 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
697 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
698 &generic_prefetch_tune
701 static const struct tune_params cortexa72_tunings =
703 &cortexa57_extra_costs,
704 &generic_addrcost_table,
705 &cortexa57_regmove_cost,
706 &cortexa57_vector_cost,
707 &generic_branch_cost,
708 &generic_approx_modes,
709 4, /* memmov_cost */
710 3, /* issue_rate */
711 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
712 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
713 16, /* function_align. */
714 4, /* jump_align. */
715 8, /* loop_align. */
716 2, /* int_reassoc_width. */
717 4, /* fp_reassoc_width. */
718 1, /* vec_reassoc_width. */
719 2, /* min_div_recip_mul_sf. */
720 2, /* min_div_recip_mul_df. */
721 0, /* max_case_values. */
722 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
723 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
724 &generic_prefetch_tune
727 static const struct tune_params cortexa73_tunings =
729 &cortexa57_extra_costs,
730 &generic_addrcost_table,
731 &cortexa57_regmove_cost,
732 &cortexa57_vector_cost,
733 &generic_branch_cost,
734 &generic_approx_modes,
735 4, /* memmov_cost. */
736 2, /* issue_rate. */
737 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
738 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
739 16, /* function_align. */
740 4, /* jump_align. */
741 8, /* loop_align. */
742 2, /* int_reassoc_width. */
743 4, /* fp_reassoc_width. */
744 1, /* vec_reassoc_width. */
745 2, /* min_div_recip_mul_sf. */
746 2, /* min_div_recip_mul_df. */
747 0, /* max_case_values. */
748 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
749 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
750 &generic_prefetch_tune
755 static const struct tune_params exynosm1_tunings =
757 &exynosm1_extra_costs,
758 &exynosm1_addrcost_table,
759 &exynosm1_regmove_cost,
760 &exynosm1_vector_cost,
761 &generic_branch_cost,
762 &exynosm1_approx_modes,
763 4, /* memmov_cost */
764 3, /* issue_rate */
765 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
766 4, /* function_align. */
767 4, /* jump_align. */
768 4, /* loop_align. */
769 2, /* int_reassoc_width. */
770 4, /* fp_reassoc_width. */
771 1, /* vec_reassoc_width. */
772 2, /* min_div_recip_mul_sf. */
773 2, /* min_div_recip_mul_df. */
774 48, /* max_case_values. */
775 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
776 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
777 &exynosm1_prefetch_tune
780 static const struct tune_params thunderxt88_tunings =
782 &thunderx_extra_costs,
783 &generic_addrcost_table,
784 &thunderx_regmove_cost,
785 &thunderx_vector_cost,
786 &generic_branch_cost,
787 &generic_approx_modes,
788 6, /* memmov_cost */
789 2, /* issue_rate */
790 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
791 8, /* function_align. */
792 8, /* jump_align. */
793 8, /* loop_align. */
794 2, /* int_reassoc_width. */
795 4, /* fp_reassoc_width. */
796 1, /* vec_reassoc_width. */
797 2, /* min_div_recip_mul_sf. */
798 2, /* min_div_recip_mul_df. */
799 0, /* max_case_values. */
800 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
801 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
802 &thunderxt88_prefetch_tune
805 static const struct tune_params thunderx_tunings =
807 &thunderx_extra_costs,
808 &generic_addrcost_table,
809 &thunderx_regmove_cost,
810 &thunderx_vector_cost,
811 &generic_branch_cost,
812 &generic_approx_modes,
813 6, /* memmov_cost */
814 2, /* issue_rate */
815 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
816 8, /* function_align. */
817 8, /* jump_align. */
818 8, /* loop_align. */
819 2, /* int_reassoc_width. */
820 4, /* fp_reassoc_width. */
821 1, /* vec_reassoc_width. */
822 2, /* min_div_recip_mul_sf. */
823 2, /* min_div_recip_mul_df. */
824 0, /* max_case_values. */
825 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
826 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
827 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
828 &thunderx_prefetch_tune
831 static const struct tune_params xgene1_tunings =
833 &xgene1_extra_costs,
834 &xgene1_addrcost_table,
835 &xgene1_regmove_cost,
836 &xgene1_vector_cost,
837 &generic_branch_cost,
838 &xgene1_approx_modes,
839 6, /* memmov_cost */
840 4, /* issue_rate */
841 AARCH64_FUSE_NOTHING, /* fusible_ops */
842 16, /* function_align. */
843 8, /* jump_align. */
844 16, /* loop_align. */
845 2, /* int_reassoc_width. */
846 4, /* fp_reassoc_width. */
847 1, /* vec_reassoc_width. */
848 2, /* min_div_recip_mul_sf. */
849 2, /* min_div_recip_mul_df. */
850 0, /* max_case_values. */
851 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
853 &generic_prefetch_tune
856 static const struct tune_params qdf24xx_tunings =
858 &qdf24xx_extra_costs,
859 &generic_addrcost_table,
860 &qdf24xx_regmove_cost,
861 &generic_vector_cost,
862 &generic_branch_cost,
863 &generic_approx_modes,
864 4, /* memmov_cost */
865 4, /* issue_rate */
866 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
867 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
868 16, /* function_align. */
869 8, /* jump_align. */
870 16, /* loop_align. */
871 2, /* int_reassoc_width. */
872 4, /* fp_reassoc_width. */
873 1, /* vec_reassoc_width. */
874 2, /* min_div_recip_mul_sf. */
875 2, /* min_div_recip_mul_df. */
876 0, /* max_case_values. */
877 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
878 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
879 &qdf24xx_prefetch_tune
882 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
883 for now. */
884 static const struct tune_params saphira_tunings =
886 &generic_extra_costs,
887 &generic_addrcost_table,
888 &generic_regmove_cost,
889 &generic_vector_cost,
890 &generic_branch_cost,
891 &generic_approx_modes,
892 4, /* memmov_cost */
893 4, /* issue_rate */
894 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
895 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
896 16, /* function_align. */
897 8, /* jump_align. */
898 16, /* loop_align. */
899 2, /* int_reassoc_width. */
900 4, /* fp_reassoc_width. */
901 1, /* vec_reassoc_width. */
902 2, /* min_div_recip_mul_sf. */
903 2, /* min_div_recip_mul_df. */
904 0, /* max_case_values. */
905 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
906 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
907 &generic_prefetch_tune
910 static const struct tune_params thunderx2t99_tunings =
912 &thunderx2t99_extra_costs,
913 &thunderx2t99_addrcost_table,
914 &thunderx2t99_regmove_cost,
915 &thunderx2t99_vector_cost,
916 &generic_branch_cost,
917 &generic_approx_modes,
918 4, /* memmov_cost. */
919 4, /* issue_rate. */
920 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
921 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
922 16, /* function_align. */
923 8, /* jump_align. */
924 16, /* loop_align. */
925 3, /* int_reassoc_width. */
926 2, /* fp_reassoc_width. */
927 2, /* vec_reassoc_width. */
928 2, /* min_div_recip_mul_sf. */
929 2, /* min_div_recip_mul_df. */
930 0, /* max_case_values. */
931 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
932 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
933 &thunderx2t99_prefetch_tune
936 /* Support for fine-grained override of the tuning structures. */
937 struct aarch64_tuning_override_function
939 const char* name;
940 void (*parse_override)(const char*, struct tune_params*);
943 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
944 static void aarch64_parse_tune_string (const char*, struct tune_params*);
946 static const struct aarch64_tuning_override_function
947 aarch64_tuning_override_functions[] =
949 { "fuse", aarch64_parse_fuse_string },
950 { "tune", aarch64_parse_tune_string },
951 { NULL, NULL }
954 /* A processor implementing AArch64. */
955 struct processor
957 const char *const name;
958 enum aarch64_processor ident;
959 enum aarch64_processor sched_core;
960 enum aarch64_arch arch;
961 unsigned architecture_version;
962 const unsigned long flags;
963 const struct tune_params *const tune;
966 /* Architectures implementing AArch64. */
967 static const struct processor all_architectures[] =
969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
970 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
971 #include "aarch64-arches.def"
972 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
975 /* Processor cores implementing AArch64. */
976 static const struct processor all_cores[] =
978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
979 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
980 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
981 FLAGS, &COSTS##_tunings},
982 #include "aarch64-cores.def"
983 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
984 AARCH64_FL_FOR_ARCH8, &generic_tunings},
985 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
989 /* Target specification. These are populated by the -march, -mtune, -mcpu
990 handling code or by target attributes. */
991 static const struct processor *selected_arch;
992 static const struct processor *selected_cpu;
993 static const struct processor *selected_tune;
995 /* The current tuning set. */
996 struct tune_params aarch64_tune_params = generic_tunings;
998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1000 /* An ISA extension in the co-processor and main instruction set space. */
1001 struct aarch64_option_extension
1003 const char *const name;
1004 const unsigned long flags_on;
1005 const unsigned long flags_off;
1008 typedef enum aarch64_cond_code
1010 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1011 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1012 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1014 aarch64_cc;
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1018 /* The condition codes of the processor, and the inverse function. */
1019 static const char * const aarch64_condition_codes[] =
1021 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1025 /* Generate code to enable conditional branches in functions over 1 MiB. */
1026 const char *
1027 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1028 const char * branch_format)
1030 rtx_code_label * tmp_label = gen_label_rtx ();
1031 char label_buf[256];
1032 char buffer[128];
1033 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1034 CODE_LABEL_NUMBER (tmp_label));
1035 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1036 rtx dest_label = operands[pos_label];
1037 operands[pos_label] = tmp_label;
1039 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1040 output_asm_insn (buffer, operands);
1042 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1043 operands[pos_label] = dest_label;
1044 output_asm_insn (buffer, operands);
1045 return "";
1048 void
1049 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1051 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1052 if (TARGET_GENERAL_REGS_ONLY)
1053 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1054 else
1055 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1061 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1063 irrespectively of its cost results in bad allocations with many redundant
1064 int<->FP moves which are expensive on various cores.
1065 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1067 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1068 Otherwise set the allocno class depending on the mode.
1069 The result of this is that it is no longer inefficient to have a higher
1070 memory move cost than the register move cost.
1073 static reg_class_t
1074 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1075 reg_class_t best_class)
1077 machine_mode mode;
1079 if (allocno_class != ALL_REGS)
1080 return allocno_class;
1082 if (best_class != ALL_REGS)
1083 return best_class;
1085 mode = PSEUDO_REGNO_MODE (regno);
1086 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1089 static unsigned int
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1092 if (GET_MODE_UNIT_SIZE (mode) == 4)
1093 return aarch64_tune_params.min_div_recip_mul_sf;
1094 return aarch64_tune_params.min_div_recip_mul_df;
1097 /* Return the reassociation width of treeop OPC with mode MODE. */
1098 static int
1099 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1101 if (VECTOR_MODE_P (mode))
1102 return aarch64_tune_params.vec_reassoc_width;
1103 if (INTEGRAL_MODE_P (mode))
1104 return aarch64_tune_params.int_reassoc_width;
1105 /* Avoid reassociating floating point addition so we emit more FMAs. */
1106 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1107 return aarch64_tune_params.fp_reassoc_width;
1108 return 1;
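/* For example, with the generic tunings above this returns 2 for integer
   modes, 4 for floating-point modes (other than additions, which are kept
   at width 1 so that more FMAs are formed) and 1 for vector modes.
   (Illustrative note; the values come from the tuning tables earlier in
   this file.)  */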
1111 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1112 unsigned
1113 aarch64_dbx_register_number (unsigned regno)
1115 if (GP_REGNUM_P (regno))
1116 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1117 else if (regno == SP_REGNUM)
1118 return AARCH64_DWARF_SP;
1119 else if (FP_REGNUM_P (regno))
1120 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1121 else if (PR_REGNUM_P (regno))
1122 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1123 else if (regno == VG_REGNUM)
1124 return AARCH64_DWARF_VG;
1126 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1127 equivalent DWARF register. */
1128 return DWARF_FRAME_REGISTERS;
1131 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1132 static bool
1133 aarch64_advsimd_struct_mode_p (machine_mode mode)
1135 return (TARGET_SIMD
1136 && (mode == OImode || mode == CImode || mode == XImode));
1139 /* Return true if MODE is an SVE predicate mode. */
1140 static bool
1141 aarch64_sve_pred_mode_p (machine_mode mode)
1143 return (TARGET_SVE
1144 && (mode == VNx16BImode
1145 || mode == VNx8BImode
1146 || mode == VNx4BImode
1147 || mode == VNx2BImode));
1150 /* Three mutually-exclusive flags describing a vector or predicate type. */
1151 const unsigned int VEC_ADVSIMD = 1;
1152 const unsigned int VEC_SVE_DATA = 2;
1153 const unsigned int VEC_SVE_PRED = 4;
1154 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1155 a structure of 2, 3 or 4 vectors. */
1156 const unsigned int VEC_STRUCT = 8;
1157 /* Useful combinations of the above. */
1158 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1159 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1161 /* Return a set of flags describing the vector properties of mode MODE.
1162 Ignore modes that are not supported by the current target. */
1163 static unsigned int
1164 aarch64_classify_vector_mode (machine_mode mode)
1166 if (aarch64_advsimd_struct_mode_p (mode))
1167 return VEC_ADVSIMD | VEC_STRUCT;
1169 if (aarch64_sve_pred_mode_p (mode))
1170 return VEC_SVE_PRED;
1172 scalar_mode inner = GET_MODE_INNER (mode);
1173 if (VECTOR_MODE_P (mode)
1174 && (inner == QImode
1175 || inner == HImode
1176 || inner == HFmode
1177 || inner == SImode
1178 || inner == SFmode
1179 || inner == DImode
1180 || inner == DFmode))
1182 if (TARGET_SVE)
1184 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1185 return VEC_SVE_DATA;
1186 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1187 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1188 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1189 return VEC_SVE_DATA | VEC_STRUCT;
1192 /* This includes V1DF but not V1DI (which doesn't exist). */
1193 if (TARGET_SIMD
1194 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1195 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1196 return VEC_ADVSIMD;
1199 return 0;
1202 /* Return true if MODE is any of the data vector modes, including
1203 structure modes. */
1204 static bool
1205 aarch64_vector_data_mode_p (machine_mode mode)
1207 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1210 /* Return true if MODE is an SVE data vector mode; either a single vector
1211 or a structure of vectors. */
1212 static bool
1213 aarch64_sve_data_mode_p (machine_mode mode)
1215 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1218 /* Implement target hook TARGET_ARRAY_MODE. */
1219 static opt_machine_mode
1220 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1222 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1223 && IN_RANGE (nelems, 2, 4))
1224 return mode_for_vector (GET_MODE_INNER (mode),
1225 GET_MODE_NUNITS (mode) * nelems);
1227 return opt_machine_mode ();
1230 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1231 static bool
1232 aarch64_array_mode_supported_p (machine_mode mode,
1233 unsigned HOST_WIDE_INT nelems)
1235 if (TARGET_SIMD
1236 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1237 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1238 && (nelems >= 2 && nelems <= 4))
1239 return true;
1241 return false;
1244 /* Return the SVE predicate mode to use for elements that have
1245 ELEM_NBYTES bytes, if such a mode exists. */
1247 opt_machine_mode
1248 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1250 if (TARGET_SVE)
1252 if (elem_nbytes == 1)
1253 return VNx16BImode;
1254 if (elem_nbytes == 2)
1255 return VNx8BImode;
1256 if (elem_nbytes == 4)
1257 return VNx4BImode;
1258 if (elem_nbytes == 8)
1259 return VNx2BImode;
1261 return opt_machine_mode ();
1264 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1266 static opt_machine_mode
1267 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1269 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1271 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1272 machine_mode pred_mode;
1273 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1274 return pred_mode;
1277 return default_get_mask_mode (nunits, nbytes);
1280 /* Implement TARGET_HARD_REGNO_NREGS. */
1282 static unsigned int
1283 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1285 /* ??? Logically we should only need to provide a value when
1286 HARD_REGNO_MODE_OK says that the combination is valid,
1287 but at the moment we need to handle all modes. Just ignore
1288 any runtime parts for registers that can't store them. */
1289 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1290 switch (aarch64_regno_regclass (regno))
1292 case FP_REGS:
1293 case FP_LO_REGS:
1294 if (aarch64_sve_data_mode_p (mode))
1295 return exact_div (GET_MODE_SIZE (mode),
1296 BYTES_PER_SVE_VECTOR).to_constant ();
1297 return CEIL (lowest_size, UNITS_PER_VREG);
1298 case PR_REGS:
1299 case PR_LO_REGS:
1300 case PR_HI_REGS:
1301 return 1;
1302 default:
1303 return CEIL (lowest_size, UNITS_PER_WORD);
1305 gcc_unreachable ();
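/* For example, a TImode value (16 bytes) occupies two general registers
   (CEIL (16, UNITS_PER_WORD)) but a single vector register
   (CEIL (16, UNITS_PER_VREG)), and an SVE data vector occupies one vector
   register per full SVE vector regardless of the runtime vector length.
   (Example added for clarity.)  */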
1308 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1310 static bool
1311 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1313 if (GET_MODE_CLASS (mode) == MODE_CC)
1314 return regno == CC_REGNUM;
1316 if (regno == VG_REGNUM)
1317 /* This must have the same size as _Unwind_Word. */
1318 return mode == DImode;
1320 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1321 if (vec_flags & VEC_SVE_PRED)
1322 return PR_REGNUM_P (regno);
1324 if (PR_REGNUM_P (regno))
1325 return 0;
1327 if (regno == SP_REGNUM)
1328 /* The purpose of comparing with ptr_mode is to support the
1329 global register variable associated with the stack pointer
1330 register via the syntax of asm ("wsp") in ILP32. */
1331 return mode == Pmode || mode == ptr_mode;
1333 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1334 return mode == Pmode;
1336 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1337 return true;
1339 if (FP_REGNUM_P (regno))
1341 if (vec_flags & VEC_STRUCT)
1342 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1343 else
1344 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1347 return false;
1350 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1351 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1352 clobbers the top 64 bits when restoring the bottom 64 bits. */
1354 static bool
1355 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1357 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
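/* For example, a TFmode (16-byte) value live in V8 across a call is
   partially clobbered: the AAPCS64 only requires callees to preserve the
   low 64 bits of V8-V15, so the upper half must be assumed lost.
   (Example added for clarity.)  */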
1360 /* Implement REGMODE_NATURAL_SIZE. */
1361 poly_uint64
1362 aarch64_regmode_natural_size (machine_mode mode)
1364 /* The natural size for SVE data modes is one SVE data vector,
1365 and similarly for predicates. We can't independently modify
1366 anything smaller than that. */
1367 /* ??? For now, only do this for variable-width SVE registers.
1368 Doing it for constant-sized registers breaks lower-subreg.c. */
1369 /* ??? And once that's fixed, we should probably have similar
1370 code for Advanced SIMD. */
1371 if (!aarch64_sve_vg.is_constant ())
1373 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1374 if (vec_flags & VEC_SVE_PRED)
1375 return BYTES_PER_SVE_PRED;
1376 if (vec_flags & VEC_SVE_DATA)
1377 return BYTES_PER_SVE_VECTOR;
1379 return UNITS_PER_WORD;
1382 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1383 machine_mode
1384 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1385 machine_mode mode)
1387 /* The predicate mode determines which bits are significant and
1388 which are "don't care". Decreasing the number of lanes would
1389 lose data while increasing the number of lanes would make bits
1390 unnecessarily significant. */
1391 if (PR_REGNUM_P (regno))
1392 return mode;
1393 if (known_ge (GET_MODE_SIZE (mode), 4))
1394 return mode;
1395 else
1396 return SImode;
1399 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1400 that strcpy from constants will be faster. */
1402 static HOST_WIDE_INT
1403 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1405 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1406 return MAX (align, BITS_PER_WORD);
1407 return align;
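/* For example, a string constant that would otherwise be byte-aligned is
   given at least BITS_PER_WORD (64-bit) alignment when not optimizing for
   size, so word-at-a-time copies from it start out aligned.  (Example
   added for clarity.)  */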
1410 /* Return true if calls to DECL should be treated as
1411 long-calls (ie called via a register). */
1412 static bool
1413 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1415 return false;
1418 /* Return true if calls to symbol-ref SYM should be treated as
1419 long-calls (i.e., called via a register). */
1420 bool
1421 aarch64_is_long_call_p (rtx sym)
1423 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1426 /* Return true if calls to symbol-ref SYM should not go through
1427 plt stubs. */
1429 bool
1430 aarch64_is_noplt_call_p (rtx sym)
1432 const_tree decl = SYMBOL_REF_DECL (sym);
1434 if (flag_pic
1435 && decl
1436 && (!flag_plt
1437 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1438 && !targetm.binds_local_p (decl))
1439 return true;
1441 return false;
1444 /* Return true if the offsets to a zero/sign-extract operation
1445 represent an expression that matches an extend operation. The
1446 operands represent the parameters from
1448 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1449 bool
1450 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1451 rtx extract_imm)
1453 HOST_WIDE_INT mult_val, extract_val;
1455 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1456 return false;
1458 mult_val = INTVAL (mult_imm);
1459 extract_val = INTVAL (extract_imm);
1461 if (extract_val > 8
1462 && extract_val < GET_MODE_BITSIZE (mode)
1463 && exact_log2 (extract_val & ~7) > 0
1464 && (extract_val & 7) <= 4
1465 && mult_val == (1 << (extract_val & 7)))
1466 return true;
1468 return false;
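/* Worked example (not part of the original source): with MODE == DImode,
   EXTRACT_IMM == 34 and MULT_IMM == 4 the test above succeeds, since
   34 = 32 + 2 encodes a 32-bit value extended and then shifted left by 2,
   and 4 == 1 << 2 supplies the matching multiplier -- the shape produced
   for extended-register operands such as "sxtw #2".  In general the checks
   require EXTRACT_IMM = width + shift and MULT_IMM = 1 << shift.  */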
1471 /* Emit an insn that's a simple single-set. Both the operands must be
1472 known to be valid. */
1473 inline static rtx_insn *
1474 emit_set_insn (rtx x, rtx y)
1476 return emit_insn (gen_rtx_SET (x, y));
1479 /* X and Y are two things to compare using CODE. Emit the compare insn and
1480 return the rtx for register 0 in the proper mode. */
1482 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1484 machine_mode mode = SELECT_CC_MODE (code, x, y);
1485 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1487 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1488 return cc_reg;
1491 /* Build the SYMBOL_REF for __tls_get_addr. */
1493 static GTY(()) rtx tls_get_addr_libfunc;
1496 aarch64_tls_get_addr (void)
1498 if (!tls_get_addr_libfunc)
1499 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1500 return tls_get_addr_libfunc;
1503 /* Return the TLS model to use for ADDR. */
1505 static enum tls_model
1506 tls_symbolic_operand_type (rtx addr)
1508 enum tls_model tls_kind = TLS_MODEL_NONE;
1509 if (GET_CODE (addr) == CONST)
1511 poly_int64 addend;
1512 rtx sym = strip_offset (addr, &addend);
1513 if (GET_CODE (sym) == SYMBOL_REF)
1514 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1516 else if (GET_CODE (addr) == SYMBOL_REF)
1517 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1519 return tls_kind;
1522 /* We allow lo_sum expressions in our legitimate addresses so that
1523 combine can take care of combining addresses where necessary, but
1524 for generation purposes we generate the address
1525 as:
1526 RTL Absolute
1527 tmp = hi (symbol_ref); adrp x1, foo
1528 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1531 PIC TLS
1532 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1533 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1534 bl __tls_get_addr
1537 Load TLS symbol, depending on TLS mechanism and TLS access model.
1539 Global Dynamic - Traditional TLS:
1540 adrp tmp, :tlsgd:imm
1541 add dest, tmp, #:tlsgd_lo12:imm
1542 bl __tls_get_addr
1544 Global Dynamic - TLS Descriptors:
1545 adrp dest, :tlsdesc:imm
1546 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1547 add dest, dest, #:tlsdesc_lo12:imm
1548 blr tmp
1549 mrs tp, tpidr_el0
1550 add dest, dest, tp
1552 Initial Exec:
1553 mrs tp, tpidr_el0
1554 adrp tmp, :gottprel:imm
1555 ldr dest, [tmp, #:gottprel_lo12:imm]
1556 add dest, dest, tp
1558 Local Exec:
1559 mrs tp, tpidr_el0
1560 add t0, tp, #:tprel_hi12:imm, lsl #12
1561 add t0, t0, #:tprel_lo12_nc:imm
1564 static void
1565 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1566 enum aarch64_symbol_type type)
1568 switch (type)
1570 case SYMBOL_SMALL_ABSOLUTE:
1572 /* In ILP32, the mode of dest can be either SImode or DImode. */
1573 rtx tmp_reg = dest;
1574 machine_mode mode = GET_MODE (dest);
1576 gcc_assert (mode == Pmode || mode == ptr_mode);
1578 if (can_create_pseudo_p ())
1579 tmp_reg = gen_reg_rtx (mode);
1581 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1582 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1583 return;
1586 case SYMBOL_TINY_ABSOLUTE:
1587 emit_insn (gen_rtx_SET (dest, imm));
1588 return;
1590 case SYMBOL_SMALL_GOT_28K:
1592 machine_mode mode = GET_MODE (dest);
1593 rtx gp_rtx = pic_offset_table_rtx;
1594 rtx insn;
1595 rtx mem;
1597 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1598 here before rtl expansion. Tree IVOPTS will generate rtl patterns to
1599 decide rtx costs, in which case pic_offset_table_rtx is not
1600 initialized. In that case there is no need to generate the first adrp
1601 instruction, as the final cost for global variable access is
1602 one instruction. */
1603 if (gp_rtx != NULL)
1605 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we are
1606 using the page base as the GOT base, the first page may be wasted;
1607 in the worst scenario, there is only 28K of space for the GOT).
1609 The generated instruction sequence for accessing a global variable is:
1612 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1614 Only one instruction is needed, but we must initialize
1615 pic_offset_table_rtx properly. We generate an initialization insn for
1616 every global access, and allow CSE to remove all redundant ones.
1618 The final instruction sequence will look like the following
1619 for multiple global variable accesses.
1621 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1623 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1624 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1625 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1626 ... */
1628 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1629 crtl->uses_pic_offset_table = 1;
1630 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1632 if (mode != GET_MODE (gp_rtx))
1633 gp_rtx = gen_lowpart (mode, gp_rtx);
1637 if (mode == ptr_mode)
1639 if (mode == DImode)
1640 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1641 else
1642 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1644 mem = XVECEXP (SET_SRC (insn), 0, 0);
1646 else
1648 gcc_assert (mode == Pmode);
1650 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1651 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1654 /* The operand is expected to be a MEM. Whenever the related insn
1655 pattern changes, the code above which calculates MEM should be
1656 updated. */
1657 gcc_assert (GET_CODE (mem) == MEM);
1658 MEM_READONLY_P (mem) = 1;
1659 MEM_NOTRAP_P (mem) = 1;
1660 emit_insn (insn);
1661 return;
1664 case SYMBOL_SMALL_GOT_4G:
1666 /* In ILP32, the mode of dest can be either SImode or DImode,
1667 while the got entry is always of SImode size. The mode of
1668 dest depends on how dest is used: if dest is assigned to a
1669 pointer (e.g. in the memory), it has SImode; it may have
1670 DImode if dest is dereferenced to access the memory.
1671 This is why we have to handle three different ldr_got_small
1672 patterns here (two patterns for ILP32). */
1674 rtx insn;
1675 rtx mem;
1676 rtx tmp_reg = dest;
1677 machine_mode mode = GET_MODE (dest);
1679 if (can_create_pseudo_p ())
1680 tmp_reg = gen_reg_rtx (mode);
1682 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1683 if (mode == ptr_mode)
1685 if (mode == DImode)
1686 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1687 else
1688 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1690 mem = XVECEXP (SET_SRC (insn), 0, 0);
1692 else
1694 gcc_assert (mode == Pmode);
1696 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1697 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1700 gcc_assert (GET_CODE (mem) == MEM);
1701 MEM_READONLY_P (mem) = 1;
1702 MEM_NOTRAP_P (mem) = 1;
1703 emit_insn (insn);
1704 return;
1707 case SYMBOL_SMALL_TLSGD:
1709 rtx_insn *insns;
1710 machine_mode mode = GET_MODE (dest);
1711 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1713 start_sequence ();
1714 if (TARGET_ILP32)
1715 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1716 else
1717 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1718 insns = get_insns ();
1719 end_sequence ();
1721 RTL_CONST_CALL_P (insns) = 1;
1722 emit_libcall_block (insns, dest, result, imm);
1723 return;
1726 case SYMBOL_SMALL_TLSDESC:
1728 machine_mode mode = GET_MODE (dest);
1729 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1730 rtx tp;
1732 gcc_assert (mode == Pmode || mode == ptr_mode);
1734 /* In ILP32, the got entry is always of SImode size. Unlike
1735 small GOT, the dest is fixed at reg 0. */
1736 if (TARGET_ILP32)
1737 emit_insn (gen_tlsdesc_small_si (imm));
1738 else
1739 emit_insn (gen_tlsdesc_small_di (imm));
1740 tp = aarch64_load_tp (NULL);
1742 if (mode != Pmode)
1743 tp = gen_lowpart (mode, tp);
1745 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1746 if (REG_P (dest))
1747 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1748 return;
1751 case SYMBOL_SMALL_TLSIE:
1753 /* In ILP32, the mode of dest can be either SImode or DImode,
1754 while the got entry is always of SImode size. The mode of
1755 dest depends on how dest is used: if dest is assigned to a
1756 pointer (e.g. in the memory), it has SImode; it may have
1757 DImode if dest is dereferenced to access the memory.
1758 This is why we have to handle three different tlsie_small
1759 patterns here (two patterns for ILP32). */
1760 machine_mode mode = GET_MODE (dest);
1761 rtx tmp_reg = gen_reg_rtx (mode);
1762 rtx tp = aarch64_load_tp (NULL);
1764 if (mode == ptr_mode)
1766 if (mode == DImode)
1767 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1768 else
1770 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1771 tp = gen_lowpart (mode, tp);
1774 else
1776 gcc_assert (mode == Pmode);
1777 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1780 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1781 if (REG_P (dest))
1782 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1783 return;
1786 case SYMBOL_TLSLE12:
1787 case SYMBOL_TLSLE24:
1788 case SYMBOL_TLSLE32:
1789 case SYMBOL_TLSLE48:
1791 machine_mode mode = GET_MODE (dest);
1792 rtx tp = aarch64_load_tp (NULL);
1794 if (mode != Pmode)
1795 tp = gen_lowpart (mode, tp);
1797 switch (type)
1799 case SYMBOL_TLSLE12:
1800 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1801 (dest, tp, imm));
1802 break;
1803 case SYMBOL_TLSLE24:
1804 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1805 (dest, tp, imm));
1806 break;
1807 case SYMBOL_TLSLE32:
1808 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1809 (dest, imm));
1810 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1811 (dest, dest, tp));
1812 break;
1813 case SYMBOL_TLSLE48:
1814 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1815 (dest, imm));
1816 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1817 (dest, dest, tp));
1818 break;
1819 default:
1820 gcc_unreachable ();
1823 if (REG_P (dest))
1824 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1825 return;
1828 case SYMBOL_TINY_GOT:
1829 emit_insn (gen_ldr_got_tiny (dest, imm));
1830 return;
1832 case SYMBOL_TINY_TLSIE:
1834 machine_mode mode = GET_MODE (dest);
1835 rtx tp = aarch64_load_tp (NULL);
1837 if (mode == ptr_mode)
1839 if (mode == DImode)
1840 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1841 else
1843 tp = gen_lowpart (mode, tp);
1844 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1847 else
1849 gcc_assert (mode == Pmode);
1850 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1853 if (REG_P (dest))
1854 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1855 return;
1858 default:
1859 gcc_unreachable ();
1863 /* Emit a move from SRC to DEST. Assume that the move expanders can
1864 handle all moves if !can_create_pseudo_p (). The distinction is
1865 important because, unlike emit_move_insn, the move expanders know
1866 how to force Pmode objects into the constant pool even when the
1867 constant pool address is not itself legitimate. */
1868 static rtx
1869 aarch64_emit_move (rtx dest, rtx src)
1871 return (can_create_pseudo_p ()
1872 ? emit_move_insn (dest, src)
1873 : emit_move_insn_1 (dest, src));
1876 /* Split a 128-bit move operation into two 64-bit move operations,
1877 taking care to handle partial overlap of register to register
1878 copies. Special cases are needed when moving between GP regs and
1879 FP regs. SRC can be a register, constant or memory; DST a register
1880 or memory. If either operand is memory it must not have any side
1881 effects. */
1882 void
1883 aarch64_split_128bit_move (rtx dst, rtx src)
1885 rtx dst_lo, dst_hi;
1886 rtx src_lo, src_hi;
1888 machine_mode mode = GET_MODE (dst);
1890 gcc_assert (mode == TImode || mode == TFmode);
1891 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1892 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1894 if (REG_P (dst) && REG_P (src))
1896 int src_regno = REGNO (src);
1897 int dst_regno = REGNO (dst);
1899 /* Handle FP <-> GP regs. */
1900 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1902 src_lo = gen_lowpart (word_mode, src);
1903 src_hi = gen_highpart (word_mode, src);
1905 if (mode == TImode)
1907 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1908 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1910 else
1912 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1913 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1915 return;
1917 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1919 dst_lo = gen_lowpart (word_mode, dst);
1920 dst_hi = gen_highpart (word_mode, dst);
1922 if (mode == TImode)
1924 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1925 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1927 else
1929 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1930 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1932 return;
1936 dst_lo = gen_lowpart (word_mode, dst);
1937 dst_hi = gen_highpart (word_mode, dst);
1938 src_lo = gen_lowpart (word_mode, src);
1939 src_hi = gen_highpart_mode (word_mode, mode, src);
1941 /* At most one pairing may overlap. */
1942 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1944 aarch64_emit_move (dst_hi, src_hi);
1945 aarch64_emit_move (dst_lo, src_lo);
1947 else
1949 aarch64_emit_move (dst_lo, src_lo);
1950 aarch64_emit_move (dst_hi, src_hi);
1954 bool
1955 aarch64_split_128bit_move_p (rtx dst, rtx src)
1957 return (! REG_P (src)
1958 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1961 /* Split a complex SIMD combine. */
1963 void
1964 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1966 machine_mode src_mode = GET_MODE (src1);
1967 machine_mode dst_mode = GET_MODE (dst);
1969 gcc_assert (VECTOR_MODE_P (dst_mode));
1970 gcc_assert (register_operand (dst, dst_mode)
1971 && register_operand (src1, src_mode)
1972 && register_operand (src2, src_mode));
1974 rtx (*gen) (rtx, rtx, rtx);
1976 switch (src_mode)
1978 case E_V8QImode:
1979 gen = gen_aarch64_simd_combinev8qi;
1980 break;
1981 case E_V4HImode:
1982 gen = gen_aarch64_simd_combinev4hi;
1983 break;
1984 case E_V2SImode:
1985 gen = gen_aarch64_simd_combinev2si;
1986 break;
1987 case E_V4HFmode:
1988 gen = gen_aarch64_simd_combinev4hf;
1989 break;
1990 case E_V2SFmode:
1991 gen = gen_aarch64_simd_combinev2sf;
1992 break;
1993 case E_DImode:
1994 gen = gen_aarch64_simd_combinedi;
1995 break;
1996 case E_DFmode:
1997 gen = gen_aarch64_simd_combinedf;
1998 break;
1999 default:
2000 gcc_unreachable ();
2003 emit_insn (gen (dst, src1, src2));
2004 return;
2007 /* Split a complex SIMD move. */
2009 void
2010 aarch64_split_simd_move (rtx dst, rtx src)
2012 machine_mode src_mode = GET_MODE (src);
2013 machine_mode dst_mode = GET_MODE (dst);
2015 gcc_assert (VECTOR_MODE_P (dst_mode));
2017 if (REG_P (dst) && REG_P (src))
2019 rtx (*gen) (rtx, rtx);
2021 gcc_assert (VECTOR_MODE_P (src_mode));
2023 switch (src_mode)
2025 case E_V16QImode:
2026 gen = gen_aarch64_split_simd_movv16qi;
2027 break;
2028 case E_V8HImode:
2029 gen = gen_aarch64_split_simd_movv8hi;
2030 break;
2031 case E_V4SImode:
2032 gen = gen_aarch64_split_simd_movv4si;
2033 break;
2034 case E_V2DImode:
2035 gen = gen_aarch64_split_simd_movv2di;
2036 break;
2037 case E_V8HFmode:
2038 gen = gen_aarch64_split_simd_movv8hf;
2039 break;
2040 case E_V4SFmode:
2041 gen = gen_aarch64_split_simd_movv4sf;
2042 break;
2043 case E_V2DFmode:
2044 gen = gen_aarch64_split_simd_movv2df;
2045 break;
2046 default:
2047 gcc_unreachable ();
2050 emit_insn (gen (dst, src));
2051 return;
2055 bool
2056 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2057 machine_mode ymode, rtx y)
2059 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2060 gcc_assert (r != NULL);
2061 return rtx_equal_p (x, r);
2065 static rtx
2066 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2068 if (can_create_pseudo_p ())
2069 return force_reg (mode, value);
2070 else
2072 gcc_assert (x);
2073 aarch64_emit_move (x, value);
2074 return x;
2078 /* Return true if we can move VALUE into a register using a single
2079 CNT[BHWD] instruction. */
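/* Illustrative examples (added for exposition): the poly_int64 (2, 2)
   -- the number of doublewords in an SVE vector -- satisfies the test
   and corresponds to a plain CNTD; (32, 32) also passes and corresponds
   to "cnth ..., all, mul #4" (or equivalently "cntb ..., all, mul #2");
   (3, 3) fails because the factor is odd.  */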
2081 static bool
2082 aarch64_sve_cnt_immediate_p (poly_int64 value)
2084 HOST_WIDE_INT factor = value.coeffs[0];
2085 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2086 return (value.coeffs[1] == factor
2087 && IN_RANGE (factor, 2, 16 * 16)
2088 && (factor & 1) == 0
2089 && factor <= 16 * (factor & -factor));
2092 /* Likewise for rtx X. */
2094 bool
2095 aarch64_sve_cnt_immediate_p (rtx x)
2097 poly_int64 value;
2098 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2101 /* Return the asm string for an instruction with a CNT-like vector size
2102 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2103 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2104 first part of the operands template (the part that comes before the
2105 vector size itself). FACTOR is the number of quadwords.
2106 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2107 If it is zero, we can use any element size. */
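/* For example (illustrative): a FACTOR of 8 with NELTS_PER_VQ == 0 picks
   the halfword form and produces "cnth\t%x0", while a FACTOR of 32
   produces "cntb\t%x0, all, mul #2".  */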
2109 static char *
2110 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2111 unsigned int factor,
2112 unsigned int nelts_per_vq)
2114 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2116 if (nelts_per_vq == 0)
2117 /* There is some overlap in the ranges of the four CNT instructions.
2118 Here we always use the smallest possible element size, so that the
2119 multiplier is 1 wherever possible. */
2120 nelts_per_vq = factor & -factor;
2121 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2122 gcc_assert (IN_RANGE (shift, 1, 4));
2123 char suffix = "dwhb"[shift - 1];
2125 factor >>= shift;
2126 unsigned int written;
2127 if (factor == 1)
2128 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2129 prefix, suffix, operands);
2130 else
2131 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2132 prefix, suffix, operands, factor);
2133 gcc_assert (written < sizeof (buffer));
2134 return buffer;
2137 /* Return the asm string for an instruction with a CNT-like vector size
2138 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2139 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2140 first part of the operands template (the part that comes before the
2141 vector size itself). X is the value of the vector size operand,
2142 as a polynomial integer rtx. */
2144 char *
2145 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2146 rtx x)
2148 poly_int64 value = rtx_to_poly_int64 (x);
2149 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2150 return aarch64_output_sve_cnt_immediate (prefix, operands,
2151 value.coeffs[1], 0);
2154 /* Return true if we can add VALUE to a register using a single ADDVL
2155 or ADDPL instruction. */
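/* Illustrative examples: an offset of (16, 16) -- one full vector -- can
   be added with ADDVL #1, and (2, 2) -- one predicate width -- with
   ADDPL #1.  (6, 6) needs ADDPL #3, while (512, 512) is out of range
   for both forms.  */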
2157 static bool
2158 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2160 HOST_WIDE_INT factor = value.coeffs[0];
2161 if (factor == 0 || value.coeffs[1] != factor)
2162 return false;
2163 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2164 and a value of 16 is one vector width. */
2165 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2166 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2169 /* Likewise for rtx X. */
2171 bool
2172 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2174 poly_int64 value;
2175 return (poly_int_rtx_p (x, &value)
2176 && aarch64_sve_addvl_addpl_immediate_p (value));
2179 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2180 and storing the result in operand 0. */
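/* For example (illustrative): if DEST and BASE are the same general
   register and OFFSET is (2, 2), the INC/DEC shortcut below produces
   "incd\t%x0"; for a different destination the same offset produces
   "addpl\t%x0, %x1, #1".  */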
2182 char *
2183 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2185 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2186 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2187 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2189 /* Use INC or DEC if possible. */
2190 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2192 if (aarch64_sve_cnt_immediate_p (offset_value))
2193 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2194 offset_value.coeffs[1], 0);
2195 if (aarch64_sve_cnt_immediate_p (-offset_value))
2196 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2197 -offset_value.coeffs[1], 0);
2200 int factor = offset_value.coeffs[1];
2201 if ((factor & 15) == 0)
2202 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2203 else
2204 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2205 return buffer;
2208 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2209 instruction. If it is, store the number of elements in each vector
2210 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2211 factor in *FACTOR_OUT (if nonnull). */
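/* Illustrative example: a VNx4SI constant in which every element is the
   poly_int (4, 4) -- the number of 32-bit elements in a vector -- is
   accepted with *FACTOR_OUT == 4 and *NELTS_PER_VQ_OUT == 4 and maps to
   INCW; a duplicate of (1, 1) is rejected because the factor is not a
   multiple of the element count per quadword.  */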
2213 bool
2214 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2215 unsigned int *nelts_per_vq_out)
2217 rtx elt;
2218 poly_int64 value;
2220 if (!const_vec_duplicate_p (x, &elt)
2221 || !poly_int_rtx_p (elt, &value))
2222 return false;
2224 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2225 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2226 /* There's no vector INCB. */
2227 return false;
2229 HOST_WIDE_INT factor = value.coeffs[0];
2230 if (value.coeffs[1] != factor)
2231 return false;
2233 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2234 if ((factor % nelts_per_vq) != 0
2235 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2236 return false;
2238 if (factor_out)
2239 *factor_out = factor;
2240 if (nelts_per_vq_out)
2241 *nelts_per_vq_out = nelts_per_vq;
2242 return true;
2245 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2246 instruction. */
2248 bool
2249 aarch64_sve_inc_dec_immediate_p (rtx x)
2251 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2254 /* Return the asm template for an SVE vector INC or DEC instruction.
2255 OPERANDS gives the operands before the vector count and X is the
2256 value of the vector count operand itself. */
2258 char *
2259 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2261 int factor;
2262 unsigned int nelts_per_vq;
2263 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2264 gcc_unreachable ();
2265 if (factor < 0)
2266 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2267 nelts_per_vq);
2268 else
2269 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2270 nelts_per_vq);
2273 static int
2274 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2275 scalar_int_mode mode)
2277 int i;
2278 unsigned HOST_WIDE_INT val, val2, mask;
2279 int one_match, zero_match;
2280 int num_insns;
2282 val = INTVAL (imm);
2284 if (aarch64_move_imm (val, mode))
2286 if (generate)
2287 emit_insn (gen_rtx_SET (dest, imm));
2288 return 1;
2291 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2292 (with XXXX non-zero). In that case check to see if the move can be done in
2293 a smaller mode. */
2294 val2 = val & 0xffffffff;
2295 if (mode == DImode
2296 && aarch64_move_imm (val2, SImode)
2297 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2299 if (generate)
2300 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2302 /* Check if we have to emit a second instruction by checking to see
2303 if any of the upper 32 bits of the original DI mode value is set. */
2304 if (val == val2)
2305 return 1;
2307 i = (val >> 48) ? 48 : 32;
2309 if (generate)
2310 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2311 GEN_INT ((val >> i) & 0xffff)));
2313 return 2;
2316 if ((val >> 32) == 0 || mode == SImode)
2318 if (generate)
2320 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2321 if (mode == SImode)
2322 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2323 GEN_INT ((val >> 16) & 0xffff)));
2324 else
2325 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2326 GEN_INT ((val >> 16) & 0xffff)));
2328 return 2;
2331 /* Remaining cases are all for DImode. */
2333 mask = 0xffff;
2334 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2335 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2336 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2337 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2339 if (zero_match != 2 && one_match != 2)
2341 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2342 For a 64-bit bitmask try whether changing 16 bits to all ones or
2343 zeroes creates a valid bitmask. To check any repeated bitmask,
2344 try using 16 bits from the other 32-bit half of val. */
2346 for (i = 0; i < 64; i += 16, mask <<= 16)
2348 val2 = val & ~mask;
2349 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2350 break;
2351 val2 = val | mask;
2352 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2353 break;
2354 val2 = val2 & ~mask;
2355 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2356 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2357 break;
2359 if (i != 64)
2361 if (generate)
2363 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2364 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2365 GEN_INT ((val >> i) & 0xffff)));
2367 return 2;
2371 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2372 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2373 otherwise skip zero bits. */
2375 num_insns = 1;
2376 mask = 0xffff;
2377 val2 = one_match > zero_match ? ~val : val;
2378 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2380 if (generate)
2381 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2382 ? (val | ~(mask << i))
2383 : (val & (mask << i)))));
2384 for (i += 16; i < 64; i += 16)
2386 if ((val2 & (mask << i)) == 0)
2387 continue;
2388 if (generate)
2389 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2390 GEN_INT ((val >> i) & 0xffff)));
2391 num_insns ++;
2394 return num_insns;
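/* Worked example (illustrative): for the DImode constant 0x12345678 the
   code above emits a MOV of 0x5678 followed by a MOVK inserting 0x1234
   at bit 16 (two instructions); a fully general constant such as
   0x1234567812345678 needs a MOV plus three MOVKs (four instructions).  */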
2397 /* Return whether imm is a 128-bit immediate which is simple enough to
2398 expand inline. */
2399 bool
2400 aarch64_mov128_immediate (rtx imm)
2402 if (GET_CODE (imm) == CONST_INT)
2403 return true;
2405 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2407 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2408 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2410 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2411 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2415 /* Return the number of temporary registers that aarch64_add_offset_1
2416 would need to add OFFSET to a register. */
2418 static unsigned int
2419 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2421 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2424 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2425 a non-polynomial OFFSET. MODE is the mode of the addition.
2426 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2427 be set and CFA adjustments added to the generated instructions.
2429 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2430 temporary if register allocation is already complete. This temporary
2431 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2432 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2433 the immediate again.
2435 Since this function may be used to adjust the stack pointer, we must
2436 ensure that it cannot cause transient stack deallocation (for example
2437 by first incrementing SP and then decrementing when adjusting by a
2438 large immediate). */
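/* Worked example (illustrative): an OFFSET of 0x123456 is below the
   24-bit limit but is not a valid move immediate, so it is handled by
   the two-addition path below: "add dest, src, #0x456" followed by
   "add dest, dest, #0x123000".  */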
2440 static void
2441 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2442 rtx src, HOST_WIDE_INT offset, rtx temp1,
2443 bool frame_related_p, bool emit_move_imm)
2445 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2446 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2448 HOST_WIDE_INT moffset = abs_hwi (offset);
2449 rtx_insn *insn;
2451 if (!moffset)
2453 if (!rtx_equal_p (dest, src))
2455 insn = emit_insn (gen_rtx_SET (dest, src));
2456 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2458 return;
2461 /* Single instruction adjustment. */
2462 if (aarch64_uimm12_shift (moffset))
2464 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2465 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2466 return;
2469 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2470 and either:
2472 a) the offset cannot be loaded by a 16-bit move or
2473 b) there is no spare register into which we can move it. */
2474 if (moffset < 0x1000000
2475 && ((!temp1 && !can_create_pseudo_p ())
2476 || !aarch64_move_imm (moffset, mode)))
2478 HOST_WIDE_INT low_off = moffset & 0xfff;
2480 low_off = offset < 0 ? -low_off : low_off;
2481 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2482 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2483 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2484 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2485 return;
2488 /* Emit a move immediate if required and an addition/subtraction. */
2489 if (emit_move_imm)
2491 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2492 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2494 insn = emit_insn (offset < 0
2495 ? gen_sub3_insn (dest, src, temp1)
2496 : gen_add3_insn (dest, src, temp1));
2497 if (frame_related_p)
2499 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2500 rtx adj = plus_constant (mode, src, offset);
2501 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2505 /* Return the number of temporary registers that aarch64_add_offset
2506 would need to move OFFSET into a register or add OFFSET to a register;
2507 ADD_P is true if we want the latter rather than the former. */
2509 static unsigned int
2510 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2512 /* This follows the same structure as aarch64_add_offset. */
2513 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2514 return 0;
2516 unsigned int count = 0;
2517 HOST_WIDE_INT factor = offset.coeffs[1];
2518 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2519 poly_int64 poly_offset (factor, factor);
2520 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2521 /* Need one register for the ADDVL/ADDPL result. */
2522 count += 1;
2523 else if (factor != 0)
2525 factor = abs (factor);
2526 if (factor > 16 * (factor & -factor))
2527 /* Need one register for the CNT result and one for the multiplication
2528 factor. If necessary, the second temporary can be reused for the
2529 constant part of the offset. */
2530 return 2;
2531 /* Need one register for the CNT result (which might then
2532 be shifted). */
2533 count += 1;
2535 return count + aarch64_add_offset_1_temporaries (constant);
2538 /* If X can be represented as a poly_int64, return the number
2539 of temporaries that are required to add it to a register.
2540 Return -1 otherwise. */
2543 aarch64_add_offset_temporaries (rtx x)
2545 poly_int64 offset;
2546 if (!poly_int_rtx_p (x, &offset))
2547 return -1;
2548 return aarch64_offset_temporaries (true, offset);
2551 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2552 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2553 be set and CFA adjustments added to the generated instructions.
2555 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2556 temporary if register allocation is already complete. This temporary
2557 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2558 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2559 false to avoid emitting the immediate again.
2561 TEMP2, if nonnull, is a second temporary register that doesn't
2562 overlap either DEST or REG.
2564 Since this function may be used to adjust the stack pointer, we must
2565 ensure that it cannot cause transient stack deallocation (for example
2566 by first incrementing SP and then decrementing when adjusting by a
2567 large immediate). */
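/* Worked example (illustrative): a poly_int64 OFFSET of (48, 32) -- two
   SVE vectors plus 16 bytes -- is split into FACTOR == 32 and
   CONSTANT == 16, and is typically emitted as "addvl dest, src, #2"
   followed by an immediate addition of 16.  */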
2569 static void
2570 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2571 poly_int64 offset, rtx temp1, rtx temp2,
2572 bool frame_related_p, bool emit_move_imm = true)
2574 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2575 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2576 gcc_assert (temp1 == NULL_RTX
2577 || !frame_related_p
2578 || !reg_overlap_mentioned_p (temp1, dest));
2579 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2581 /* Try using ADDVL or ADDPL to add the whole value. */
2582 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2584 rtx offset_rtx = gen_int_mode (offset, mode);
2585 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2586 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2587 return;
2590 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2591 SVE vector register, over and above the minimum size of 128 bits.
2592 This is equivalent to half the value returned by CNTD with a
2593 vector shape of ALL. */
2594 HOST_WIDE_INT factor = offset.coeffs[1];
2595 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2597 /* Try using ADDVL or ADDPL to add the VG-based part. */
2598 poly_int64 poly_offset (factor, factor);
2599 if (src != const0_rtx
2600 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2602 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2603 if (frame_related_p)
2605 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2606 RTX_FRAME_RELATED_P (insn) = true;
2607 src = dest;
2609 else
2611 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2612 src = aarch64_force_temporary (mode, temp1, addr);
2613 temp1 = temp2;
2614 temp2 = NULL_RTX;
2617 /* Otherwise use a CNT-based sequence. */
2618 else if (factor != 0)
2620 /* Use a subtraction if we have a negative factor. */
2621 rtx_code code = PLUS;
2622 if (factor < 0)
2624 factor = -factor;
2625 code = MINUS;
2628 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2629 into the multiplication. */
2630 rtx val;
2631 int shift = 0;
2632 if (factor & 1)
2633 /* Use a right shift by 1. */
2634 shift = -1;
2635 else
2636 factor /= 2;
2637 HOST_WIDE_INT low_bit = factor & -factor;
2638 if (factor <= 16 * low_bit)
2640 if (factor > 16 * 8)
2642 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2643 the value with the minimum multiplier and shift it into
2644 position. */
2645 int extra_shift = exact_log2 (low_bit);
2646 shift += extra_shift;
2647 factor >>= extra_shift;
2649 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2651 else
2653 /* Use CNTD, then multiply it by FACTOR. */
2654 val = gen_int_mode (poly_int64 (2, 2), mode);
2655 val = aarch64_force_temporary (mode, temp1, val);
2657 /* Go back to using a negative multiplication factor if we have
2658 no register from which to subtract. */
2659 if (code == MINUS && src == const0_rtx)
2661 factor = -factor;
2662 code = PLUS;
2664 rtx coeff1 = gen_int_mode (factor, mode);
2665 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2666 val = gen_rtx_MULT (mode, val, coeff1);
2669 if (shift > 0)
2671 /* Multiply by 1 << SHIFT. */
2672 val = aarch64_force_temporary (mode, temp1, val);
2673 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2675 else if (shift == -1)
2677 /* Divide by 2. */
2678 val = aarch64_force_temporary (mode, temp1, val);
2679 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2682 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2683 if (src != const0_rtx)
2685 val = aarch64_force_temporary (mode, temp1, val);
2686 val = gen_rtx_fmt_ee (code, mode, src, val);
2688 else if (code == MINUS)
2690 val = aarch64_force_temporary (mode, temp1, val);
2691 val = gen_rtx_NEG (mode, val);
2694 if (constant == 0 || frame_related_p)
2696 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2697 if (frame_related_p)
2699 RTX_FRAME_RELATED_P (insn) = true;
2700 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2701 gen_rtx_SET (dest, plus_constant (Pmode, src,
2702 poly_offset)));
2704 src = dest;
2705 if (constant == 0)
2706 return;
2708 else
2710 src = aarch64_force_temporary (mode, temp1, val);
2711 temp1 = temp2;
2712 temp2 = NULL_RTX;
2715 emit_move_imm = true;
2718 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2719 frame_related_p, emit_move_imm);
2722 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2723 than a poly_int64. */
2725 void
2726 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2727 rtx offset_rtx, rtx temp1, rtx temp2)
2729 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2730 temp1, temp2, false);
2733 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2734 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2735 if TEMP1 already contains abs (DELTA). */
2737 static inline void
2738 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2740 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2741 temp1, temp2, true, emit_move_imm);
2744 /* Subtract DELTA from the stack pointer, marking the instructions
2745 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2746 if nonnull. */
2748 static inline void
2749 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2751 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2752 temp1, temp2, frame_related_p);
2755 /* Set DEST to (vec_series BASE STEP). */
2757 static void
2758 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2760 machine_mode mode = GET_MODE (dest);
2761 scalar_mode inner = GET_MODE_INNER (mode);
2763 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2764 if (!aarch64_sve_index_immediate_p (base))
2765 base = force_reg (inner, base);
2766 if (!aarch64_sve_index_immediate_p (step))
2767 step = force_reg (inner, step);
2769 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2772 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2773 integer of mode SRC_MODE. Return true on success. */
2775 static bool
2776 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2777 rtx src)
2779 /* If the constant is smaller than 128 bits, we can do the move
2780 using a vector of SRC_MODEs. */
2781 if (src_mode != TImode)
2783 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2784 GET_MODE_SIZE (src_mode));
2785 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2786 emit_move_insn (gen_lowpart (dup_mode, dest),
2787 gen_const_vec_duplicate (dup_mode, src));
2788 return true;
2791 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2792 src = force_const_mem (src_mode, src);
2793 if (!src)
2794 return false;
2796 /* Make sure that the address is legitimate. */
2797 if (!aarch64_sve_ld1r_operand_p (src))
2799 rtx addr = force_reg (Pmode, XEXP (src, 0));
2800 src = replace_equiv_address (src, addr);
2803 machine_mode mode = GET_MODE (dest);
2804 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2805 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2806 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2807 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2808 emit_insn (gen_rtx_SET (dest, src));
2809 return true;
2812 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2813 isn't a simple duplicate or series. */
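/* Illustrative example: the VNx4SI constant { 1, 2, 3, 4, 1, 2, 3, 4, ... }
   has four patterns of one element each, so its repeating 128-bit block
   can be loaded with a widened duplicate (LD1RQ); constants that cannot
   be handled that way are built pattern by pattern and interleaved with
   ZIP1 below.  */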
2815 static void
2816 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2818 machine_mode mode = GET_MODE (src);
2819 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2820 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2821 gcc_assert (npatterns > 1);
2823 if (nelts_per_pattern == 1)
2825 /* The constant is a repeating sequence of at least two elements,
2826 where the repeating elements occupy no more than 128 bits.
2827 Get an integer representation of the replicated value. */
2828 scalar_int_mode int_mode;
2829 if (BYTES_BIG_ENDIAN)
2830 /* For now, always use LD1RQ to load the value on big-endian
2831 targets, since the handling of smaller integers includes a
2832 subreg that is semantically an element reverse. */
2833 int_mode = TImode;
2834 else
2836 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2837 gcc_assert (int_bits <= 128);
2838 int_mode = int_mode_for_size (int_bits, 0).require ();
2840 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2841 if (int_value
2842 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2843 return;
2846 /* Expand each pattern individually. */
2847 rtx_vector_builder builder;
2848 auto_vec<rtx, 16> vectors (npatterns);
2849 for (unsigned int i = 0; i < npatterns; ++i)
2851 builder.new_vector (mode, 1, nelts_per_pattern);
2852 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2853 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2854 vectors.quick_push (force_reg (mode, builder.build ()));
2857 /* Use permutes to interleave the separate vectors. */
2858 while (npatterns > 1)
2860 npatterns /= 2;
2861 for (unsigned int i = 0; i < npatterns; ++i)
2863 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2864 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2865 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2866 vectors[i] = tmp;
2869 gcc_assert (vectors[0] == dest);
2872 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2873 is a pattern that can be used to set DEST to a replicated scalar
2874 element. */
2876 void
2877 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2878 rtx (*gen_vec_duplicate) (rtx, rtx))
2880 machine_mode mode = GET_MODE (dest);
2882 /* Check on what type of symbol it is. */
2883 scalar_int_mode int_mode;
2884 if ((GET_CODE (imm) == SYMBOL_REF
2885 || GET_CODE (imm) == LABEL_REF
2886 || GET_CODE (imm) == CONST
2887 || GET_CODE (imm) == CONST_POLY_INT)
2888 && is_a <scalar_int_mode> (mode, &int_mode))
2890 rtx mem;
2891 poly_int64 offset;
2892 HOST_WIDE_INT const_offset;
2893 enum aarch64_symbol_type sty;
2895 /* If we have (const (plus symbol offset)), separate out the offset
2896 before we start classifying the symbol. */
2897 rtx base = strip_offset (imm, &offset);
2899 /* We must always add an offset involving VL separately, rather than
2900 folding it into the relocation. */
2901 if (!offset.is_constant (&const_offset))
2903 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2904 emit_insn (gen_rtx_SET (dest, imm));
2905 else
2907 /* Do arithmetic on 32-bit values if the result is smaller
2908 than that. */
2909 if (partial_subreg_p (int_mode, SImode))
2911 /* It is invalid to do symbol calculations in modes
2912 narrower than SImode. */
2913 gcc_assert (base == const0_rtx);
2914 dest = gen_lowpart (SImode, dest);
2915 int_mode = SImode;
2917 if (base != const0_rtx)
2919 base = aarch64_force_temporary (int_mode, dest, base);
2920 aarch64_add_offset (int_mode, dest, base, offset,
2921 NULL_RTX, NULL_RTX, false);
2923 else
2924 aarch64_add_offset (int_mode, dest, base, offset,
2925 dest, NULL_RTX, false);
2927 return;
2930 sty = aarch64_classify_symbol (base, const_offset);
2931 switch (sty)
2933 case SYMBOL_FORCE_TO_MEM:
2934 if (const_offset != 0
2935 && targetm.cannot_force_const_mem (int_mode, imm))
2937 gcc_assert (can_create_pseudo_p ());
2938 base = aarch64_force_temporary (int_mode, dest, base);
2939 aarch64_add_offset (int_mode, dest, base, const_offset,
2940 NULL_RTX, NULL_RTX, false);
2941 return;
2944 mem = force_const_mem (ptr_mode, imm);
2945 gcc_assert (mem);
2947 /* If we aren't generating PC relative literals, then
2948 we need to expand the literal pool access carefully.
2949 This is something that needs to be done in a number
2950 of places, so could well live as a separate function. */
2951 if (!aarch64_pcrelative_literal_loads)
2953 gcc_assert (can_create_pseudo_p ());
2954 base = gen_reg_rtx (ptr_mode);
2955 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2956 if (ptr_mode != Pmode)
2957 base = convert_memory_address (Pmode, base);
2958 mem = gen_rtx_MEM (ptr_mode, base);
2961 if (int_mode != ptr_mode)
2962 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2964 emit_insn (gen_rtx_SET (dest, mem));
2966 return;
2968 case SYMBOL_SMALL_TLSGD:
2969 case SYMBOL_SMALL_TLSDESC:
2970 case SYMBOL_SMALL_TLSIE:
2971 case SYMBOL_SMALL_GOT_28K:
2972 case SYMBOL_SMALL_GOT_4G:
2973 case SYMBOL_TINY_GOT:
2974 case SYMBOL_TINY_TLSIE:
2975 if (const_offset != 0)
2977 gcc_assert (can_create_pseudo_p ());
2978 base = aarch64_force_temporary (int_mode, dest, base);
2979 aarch64_add_offset (int_mode, dest, base, const_offset,
2980 NULL_RTX, NULL_RTX, false);
2981 return;
2983 /* FALLTHRU */
2985 case SYMBOL_SMALL_ABSOLUTE:
2986 case SYMBOL_TINY_ABSOLUTE:
2987 case SYMBOL_TLSLE12:
2988 case SYMBOL_TLSLE24:
2989 case SYMBOL_TLSLE32:
2990 case SYMBOL_TLSLE48:
2991 aarch64_load_symref_appropriately (dest, imm, sty);
2992 return;
2994 default:
2995 gcc_unreachable ();
2999 if (!CONST_INT_P (imm))
3001 rtx base, step, value;
3002 if (GET_CODE (imm) == HIGH
3003 || aarch64_simd_valid_immediate (imm, NULL))
3004 emit_insn (gen_rtx_SET (dest, imm));
3005 else if (const_vec_series_p (imm, &base, &step))
3006 aarch64_expand_vec_series (dest, base, step);
3007 else if (const_vec_duplicate_p (imm, &value))
3009 /* If the constant is out of range of an SVE vector move,
3010 load it from memory if we can, otherwise move it into
3011 a register and use a DUP. */
3012 scalar_mode inner_mode = GET_MODE_INNER (mode);
3013 rtx op = force_const_mem (inner_mode, value);
3014 if (!op)
3015 op = force_reg (inner_mode, value);
3016 else if (!aarch64_sve_ld1r_operand_p (op))
3018 rtx addr = force_reg (Pmode, XEXP (op, 0));
3019 op = replace_equiv_address (op, addr);
3021 emit_insn (gen_vec_duplicate (dest, op));
3023 else if (GET_CODE (imm) == CONST_VECTOR
3024 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3025 aarch64_expand_sve_const_vector (dest, imm);
3026 else
3028 rtx mem = force_const_mem (mode, imm);
3029 gcc_assert (mem);
3030 emit_move_insn (dest, mem);
3033 return;
3036 aarch64_internal_mov_immediate (dest, imm, true,
3037 as_a <scalar_int_mode> (mode));
3040 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3041 that is known to contain PTRUE. */
3043 void
3044 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3046 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3047 gen_rtvec (2, pred, src),
3048 UNSPEC_MERGE_PTRUE)));
3051 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3052 operand is in memory. In this case we need to use the predicated LD1
3053 and ST1 instead of LDR and STR, both for correctness on big-endian
3054 targets and because LD1 and ST1 support a wider range of addressing modes.
3055 PRED_MODE is the mode of the predicate.
3057 See the comment at the head of aarch64-sve.md for details about the
3058 big-endian handling. */
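/* For example (illustrative): an SVE copy in which both DEST and SRC are
   memory is done via a fresh register: a predicated LD1 into the
   temporary followed by a predicated ST1 to DEST, both under an all-true
   predicate of mode PRED_MODE.  */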
3060 void
3061 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3063 machine_mode mode = GET_MODE (dest);
3064 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3065 if (!register_operand (src, mode)
3066 && !register_operand (dest, mode))
3068 rtx tmp = gen_reg_rtx (mode);
3069 if (MEM_P (src))
3070 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3071 else
3072 emit_move_insn (tmp, src);
3073 src = tmp;
3075 aarch64_emit_sve_pred_move (dest, ptrue, src);
3078 /* Called only on big-endian targets. See whether an SVE vector move
3079 from SRC to DEST is effectively a REV[BHW] instruction, because at
3080 least one operand is a subreg of an SVE vector that has wider or
3081 narrower elements. Return true and emit the instruction if so.
3083 For example:
3085 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3087 represents a VIEW_CONVERT between the following vectors, viewed
3088 in memory order:
3090 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3091 R1: { [0], [1], [2], [3], ... }
3093 The high part of lane X in R2 should therefore correspond to lane X*2
3094 of R1, but the register representations are:
3096 msb lsb
3097 R2: ...... [1].high [1].low [0].high [0].low
3098 R1: ...... [3] [2] [1] [0]
3100 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3101 We therefore need a reverse operation to swap the high and low values
3102 around.
3104 This is purely an optimization. Without it we would spill the
3105 subreg operand to the stack in one mode and reload it in the
3106 other mode, which has the same effect as the REV. */
3108 bool
3109 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3111 gcc_assert (BYTES_BIG_ENDIAN);
3112 if (GET_CODE (dest) == SUBREG)
3113 dest = SUBREG_REG (dest);
3114 if (GET_CODE (src) == SUBREG)
3115 src = SUBREG_REG (src);
3117 /* The optimization handles two single SVE REGs with different element
3118 sizes. */
3119 if (!REG_P (dest)
3120 || !REG_P (src)
3121 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3122 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3123 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3124 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3125 return false;
3127 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3128 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3129 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3130 UNSPEC_REV_SUBREG);
3131 emit_insn (gen_rtx_SET (dest, unspec));
3132 return true;
3135 /* Return a copy of X with mode MODE, without changing its other
3136 attributes. Unlike gen_lowpart, this doesn't care whether the
3137 mode change is valid. */
3139 static rtx
3140 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3142 if (GET_MODE (x) == mode)
3143 return x;
3145 x = shallow_copy_rtx (x);
3146 set_mode_and_regno (x, mode, REGNO (x));
3147 return x;
3150 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3151 operands. */
3153 void
3154 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3156 /* Decide which REV operation we need. The mode with narrower elements
3157 determines the mode of the operands and the mode with the wider
3158 elements determines the reverse width. */
3159 machine_mode mode_with_wider_elts = GET_MODE (dest);
3160 machine_mode mode_with_narrower_elts = GET_MODE (src);
3161 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3162 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3163 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3165 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3166 unsigned int unspec;
3167 if (wider_bytes == 8)
3168 unspec = UNSPEC_REV64;
3169 else if (wider_bytes == 4)
3170 unspec = UNSPEC_REV32;
3171 else if (wider_bytes == 2)
3172 unspec = UNSPEC_REV16;
3173 else
3174 gcc_unreachable ();
3175 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3177 /* Emit:
3179 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3180 UNSPEC_MERGE_PTRUE))
3182 with the appropriate modes. */
3183 ptrue = gen_lowpart (pred_mode, ptrue);
3184 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3185 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3186 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3187 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3188 UNSPEC_MERGE_PTRUE);
3189 emit_insn (gen_rtx_SET (dest, src));
3192 static bool
3193 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3194 tree exp ATTRIBUTE_UNUSED)
3196 /* Currently, always true. */
3197 return true;
3200 /* Implement TARGET_PASS_BY_REFERENCE. */
3202 static bool
3203 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3204 machine_mode mode,
3205 const_tree type,
3206 bool named ATTRIBUTE_UNUSED)
3208 HOST_WIDE_INT size;
3209 machine_mode dummymode;
3210 int nregs;
3212 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3213 if (mode == BLKmode && type)
3214 size = int_size_in_bytes (type);
3215 else
3216 /* No frontends can create types with variable-sized modes, so we
3217 shouldn't be asked to pass or return them. */
3218 size = GET_MODE_SIZE (mode).to_constant ();
3220 /* Aggregates are passed by reference based on their size. */
3221 if (type && AGGREGATE_TYPE_P (type))
3223 size = int_size_in_bytes (type);
3226 /* Variable sized arguments are always returned by reference. */
3227 if (size < 0)
3228 return true;
3230 /* Can this be a candidate to be passed in fp/simd register(s)? */
3231 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3232 &dummymode, &nregs,
3233 NULL))
3234 return false;
3236 /* Arguments which are variable sized or larger than 2 registers are
3237 passed by reference unless they are a homogeneous floating-point
3238 aggregate. */
3239 return size > 2 * UNITS_PER_WORD;
3242 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3243 static bool
3244 aarch64_return_in_msb (const_tree valtype)
3246 machine_mode dummy_mode;
3247 int dummy_int;
3249 /* Never happens in little-endian mode. */
3250 if (!BYTES_BIG_ENDIAN)
3251 return false;
3253 /* Only composite types smaller than or equal to 16 bytes can
3254 be potentially returned in registers. */
3255 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3256 || int_size_in_bytes (valtype) <= 0
3257 || int_size_in_bytes (valtype) > 16)
3258 return false;
3260 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3261 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3262 is always passed/returned in the least significant bits of fp/simd
3263 register(s). */
3264 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3265 &dummy_mode, &dummy_int, NULL))
3266 return false;
3268 return true;
3271 /* Implement TARGET_FUNCTION_VALUE.
3272 Define how to find the value returned by a function. */
3274 static rtx
3275 aarch64_function_value (const_tree type, const_tree func,
3276 bool outgoing ATTRIBUTE_UNUSED)
3278 machine_mode mode;
3279 int unsignedp;
3280 int count;
3281 machine_mode ag_mode;
3283 mode = TYPE_MODE (type);
3284 if (INTEGRAL_TYPE_P (type))
3285 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3287 if (aarch64_return_in_msb (type))
3289 HOST_WIDE_INT size = int_size_in_bytes (type);
3291 if (size % UNITS_PER_WORD != 0)
3293 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3294 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3298 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3299 &ag_mode, &count, NULL))
3301 if (!aarch64_composite_type_p (type, mode))
3303 gcc_assert (count == 1 && mode == ag_mode);
3304 return gen_rtx_REG (mode, V0_REGNUM);
3306 else
3308 int i;
3309 rtx par;
3311 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3312 for (i = 0; i < count; i++)
3314 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3315 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3316 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3317 XVECEXP (par, 0, i) = tmp;
3319 return par;
3322 else
3323 return gen_rtx_REG (mode, R0_REGNUM);
3326 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3327 Return true if REGNO is the number of a hard register in which the values
3328 of a called function may come back. */
3330 static bool
3331 aarch64_function_value_regno_p (const unsigned int regno)
3333 /* Maximum of 16 bytes can be returned in the general registers. Examples
3334 of 16-byte return values are: 128-bit integers and 16-byte small
3335 structures (excluding homogeneous floating-point aggregates). */
3336 if (regno == R0_REGNUM || regno == R1_REGNUM)
3337 return true;
3339 /* Up to four fp/simd registers can return a function value, e.g. a
3340 homogeneous floating-point aggregate having four members. */
3341 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3342 return TARGET_FLOAT;
3344 return false;
3347 /* Implement TARGET_RETURN_IN_MEMORY.
3349 If the type T of the result of a function is such that
3350 void func (T arg)
3351 would require that arg be passed as a value in a register (or set of
3352 registers) according to the parameter passing rules, then the result
3353 is returned in the same registers as would be used for such an
3354 argument. */
3356 static bool
3357 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3359 HOST_WIDE_INT size;
3360 machine_mode ag_mode;
3361 int count;
3363 if (!AGGREGATE_TYPE_P (type)
3364 && TREE_CODE (type) != COMPLEX_TYPE
3365 && TREE_CODE (type) != VECTOR_TYPE)
3366 /* Simple scalar types are always returned in registers. */
3367 return false;
3369 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3370 type,
3371 &ag_mode,
3372 &count,
3373 NULL))
3374 return false;
3376 /* Types larger than 2 registers are returned in memory. */
3377 size = int_size_in_bytes (type);
3378 return (size < 0 || size > 2 * UNITS_PER_WORD);
3381 static bool
3382 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3383 const_tree type, int *nregs)
3385 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3386 return aarch64_vfp_is_call_or_return_candidate (mode,
3387 type,
3388 &pcum->aapcs_vfp_rmode,
3389 nregs,
3390 NULL);
3393 /* Given MODE and TYPE of a function argument, return the alignment in
3394 bits. The idea is to suppress any stronger alignment requested by
3395 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3396 This is a helper function for local use only. */
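/* For example (illustrative): a structure whose only member is an
   __int128 yields 128-bit alignment here, which in turn triggers the
   even-register rounding of rule C.8 when the structure is passed in
   general registers.  */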
3398 static unsigned int
3399 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3401 if (!type)
3402 return GET_MODE_ALIGNMENT (mode);
3404 if (integer_zerop (TYPE_SIZE (type)))
3405 return 0;
3407 gcc_assert (TYPE_MODE (type) == mode);
3409 if (!AGGREGATE_TYPE_P (type))
3410 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3412 if (TREE_CODE (type) == ARRAY_TYPE)
3413 return TYPE_ALIGN (TREE_TYPE (type));
3415 unsigned int alignment = 0;
3416 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3417 if (TREE_CODE (field) == FIELD_DECL)
3418 alignment = std::max (alignment, DECL_ALIGN (field));
3420 return alignment;
3423 /* Layout a function argument according to the AAPCS64 rules. The rule
3424 numbers refer to the rule numbers in the AAPCS64. */
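/* Illustrative example: a homogeneous aggregate of four floats is passed
   in four consecutive SIMD/FP registers (the C1 - C5 path below) when at
   least four are still free; otherwise the NSRN is set to 8 and the
   argument goes to the stack.  */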
3426 static void
3427 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3428 const_tree type,
3429 bool named ATTRIBUTE_UNUSED)
3431 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3432 int ncrn, nvrn, nregs;
3433 bool allocate_ncrn, allocate_nvrn;
3434 HOST_WIDE_INT size;
3436 /* We need to do this once per argument. */
3437 if (pcum->aapcs_arg_processed)
3438 return;
3440 pcum->aapcs_arg_processed = true;
3442 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3443 if (type)
3444 size = int_size_in_bytes (type);
3445 else
3446 /* No frontends can create types with variable-sized modes, so we
3447 shouldn't be asked to pass or return them. */
3448 size = GET_MODE_SIZE (mode).to_constant ();
3449 size = ROUND_UP (size, UNITS_PER_WORD);
3451 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3452 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3453 mode,
3454 type,
3455 &nregs);
3457 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3458 The following code thus handles passing by SIMD/FP registers first. */
3460 nvrn = pcum->aapcs_nvrn;
3462 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3463 and homogeneous short-vector aggregates (HVA). */
3464 if (allocate_nvrn)
3466 if (!TARGET_FLOAT)
3467 aarch64_err_no_fpadvsimd (mode, "argument");
3469 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3471 pcum->aapcs_nextnvrn = nvrn + nregs;
3472 if (!aarch64_composite_type_p (type, mode))
3474 gcc_assert (nregs == 1);
3475 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3477 else
3479 rtx par;
3480 int i;
3481 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3482 for (i = 0; i < nregs; i++)
3484 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3485 V0_REGNUM + nvrn + i);
3486 rtx offset = gen_int_mode
3487 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3488 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3489 XVECEXP (par, 0, i) = tmp;
3491 pcum->aapcs_reg = par;
3493 return;
3495 else
3497 /* C.3 NSRN is set to 8. */
3498 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3499 goto on_stack;
3503 ncrn = pcum->aapcs_ncrn;
3504 nregs = size / UNITS_PER_WORD;
3506 /* C6 - C9, though the sign and zero extension semantics are
3507 handled elsewhere. This is the case where the argument fits
3508 entirely in general registers. */
3509 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3512 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3514 /* C.8 if the argument has an alignment of 16 then the NGRN is
3515 rounded up to the next even number. */
3516 if (nregs == 2
3517 && ncrn % 2
3518 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3519 comparison is there because for > 16 * BITS_PER_UNIT
3520 alignment nregs should be > 2 and therefore it should be
3521 passed by reference rather than value. */
3522 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3524 ++ncrn;
3525 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3528 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3529 A reg is still generated for it, but the caller should be smart
3530 enough not to use it. */
3531 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3532 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3533 else
3535 rtx par;
3536 int i;
3538 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3539 for (i = 0; i < nregs; i++)
3541 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3542 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3543 GEN_INT (i * UNITS_PER_WORD));
3544 XVECEXP (par, 0, i) = tmp;
3546 pcum->aapcs_reg = par;
3549 pcum->aapcs_nextncrn = ncrn + nregs;
3550 return;
3553 /* C.11 */
3554 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3556 /* The argument is passed on stack; record the needed number of words for
3557 this argument and align the total size if necessary. */
3558 on_stack:
3559 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3561 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3562 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3563 16 / UNITS_PER_WORD);
3564 return;
3567 /* Implement TARGET_FUNCTION_ARG. */
3569 static rtx
3570 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3571 const_tree type, bool named)
3573 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3574 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3576 if (mode == VOIDmode)
3577 return NULL_RTX;
3579 aarch64_layout_arg (pcum_v, mode, type, named);
3580 return pcum->aapcs_reg;
3583 void
3584 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3585 const_tree fntype ATTRIBUTE_UNUSED,
3586 rtx libname ATTRIBUTE_UNUSED,
3587 const_tree fndecl ATTRIBUTE_UNUSED,
3588 unsigned n_named ATTRIBUTE_UNUSED)
3590 pcum->aapcs_ncrn = 0;
3591 pcum->aapcs_nvrn = 0;
3592 pcum->aapcs_nextncrn = 0;
3593 pcum->aapcs_nextnvrn = 0;
3594 pcum->pcs_variant = ARM_PCS_AAPCS64;
3595 pcum->aapcs_reg = NULL_RTX;
3596 pcum->aapcs_arg_processed = false;
3597 pcum->aapcs_stack_words = 0;
3598 pcum->aapcs_stack_size = 0;
3600 if (!TARGET_FLOAT
3601 && fndecl && TREE_PUBLIC (fndecl)
3602 && fntype && fntype != error_mark_node)
3604 const_tree type = TREE_TYPE (fntype);
3605 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3606 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3607 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3608 &mode, &nregs, NULL))
3609 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3611 return;
3614 static void
3615 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3616 machine_mode mode,
3617 const_tree type,
3618 bool named)
3620 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3621 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3623 aarch64_layout_arg (pcum_v, mode, type, named);
3624 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3625 != (pcum->aapcs_stack_words != 0));
3626 pcum->aapcs_arg_processed = false;
3627 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3628 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3629 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3630 pcum->aapcs_stack_words = 0;
3631 pcum->aapcs_reg = NULL_RTX;
3635 bool
3636 aarch64_function_arg_regno_p (unsigned regno)
3638 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3639 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3642 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3643 PARM_BOUNDARY bits of alignment, but will be given anything up
3644 to STACK_BOUNDARY bits if the type requires it. This makes sure
3645 that both before and after the layout of each argument, the Next
3646 Stacked Argument Address (NSAA) will have a minimum alignment of
3647 8 bytes. */
3649 static unsigned int
3650 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3652 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3653 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3656 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3658 static fixed_size_mode
3659 aarch64_get_reg_raw_mode (int regno)
3661 if (TARGET_SVE && FP_REGNUM_P (regno))
3662 /* Don't use the SVE part of the register for __builtin_apply and
3663 __builtin_return. The SVE registers aren't used by the normal PCS,
3664 so using them there would be a waste of time. The PCS extensions
3665 for SVE types are fundamentally incompatible with the
3666 __builtin_return/__builtin_apply interface. */
3667 return as_a <fixed_size_mode> (V16QImode);
3668 return default_get_reg_raw_mode (regno);
3671 /* Implement TARGET_FUNCTION_ARG_PADDING.
3673 Small aggregate types are placed in the lowest memory address.
3675 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3677 static pad_direction
3678 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3680 /* On little-endian targets, the least significant byte of every stack
3681 argument is passed at the lowest byte address of the stack slot. */
3682 if (!BYTES_BIG_ENDIAN)
3683 return PAD_UPWARD;
3685 /* Otherwise, integral, floating-point and pointer types are padded downward:
3686 the least significant byte of a stack argument is passed at the highest
3687 byte address of the stack slot. */
3688 if (type
3689 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3690 || POINTER_TYPE_P (type))
3691 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3692 return PAD_DOWNWARD;
3694 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3695 return PAD_UPWARD;
3698 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3700 It specifies padding for the last (may also be the only)
3701 element of a block move between registers and memory. Assuming
3702 the block is in memory, padding upward means that the last
3703 element is padded after its most significant byte, while with
3704 downward padding the last element is padded at its least
3705 significant byte side.
3707 Small aggregates and small complex types are always padded
3708 upwards.
3710 We don't need to worry about homogeneous floating-point or
3711 short-vector aggregates; their move is not affected by the
3712 padding direction determined here. Regardless of endianness,
3713 each element of such an aggregate is put in the least
3714 significant bits of a fp/simd register.
3716 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3717 register has useful data, and return the opposite if the most
3718 significant byte does. */
3720 bool
3721 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3722 bool first ATTRIBUTE_UNUSED)
3725 /* Small composite types are always padded upward. */
3726 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3728 HOST_WIDE_INT size;
3729 if (type)
3730 size = int_size_in_bytes (type);
3731 else
3732 /* No frontends can create types with variable-sized modes, so we
3733 shouldn't be asked to pass or return them. */
3734 size = GET_MODE_SIZE (mode).to_constant ();
3735 if (size < 2 * UNITS_PER_WORD)
3736 return true;
3739 /* Otherwise, use the default padding. */
3740 return !BYTES_BIG_ENDIAN;
3743 static scalar_int_mode
3744 aarch64_libgcc_cmp_return_mode (void)
3746 return SImode;
3749 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3751 /* We use the 12-bit shifted immediate arithmetic instructions so values
3752 must be multiple of (1 << 12), i.e. 4096. */
3753 #define ARITH_FACTOR 4096
3755 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3756 #error Cannot use simple address calculation for stack probing
3757 #endif
3759 /* The pair of scratch registers used for stack probing. */
3760 #define PROBE_STACK_FIRST_REG 9
3761 #define PROBE_STACK_SECOND_REG 10
3763 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3764 inclusive. These are offsets from the current stack pointer. */
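/* Worked example (illustrative, assuming PROBE_INTERVAL is 4kB): a
   constant SIZE of 16kB takes the second branch below and probes at
   FIRST + 4kB, FIRST + 8kB, FIRST + 12kB and finally FIRST + SIZE;
   larger constant sizes use the run-time probe loop instead, and
   non-constant (SVE) sizes are rejected with sorry ().  */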
3766 static void
3767 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3769 HOST_WIDE_INT size;
3770 if (!poly_size.is_constant (&size))
3772 sorry ("stack probes for SVE frames");
3773 return;
3776 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3778 /* See the same assertion on PROBE_INTERVAL above. */
3779 gcc_assert ((first % ARITH_FACTOR) == 0);
3781 /* See if we have a constant small number of probes to generate. If so,
3782 that's the easy case. */
3783 if (size <= PROBE_INTERVAL)
3785 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3787 emit_set_insn (reg1,
3788 plus_constant (Pmode,
3789 stack_pointer_rtx, -(first + base)));
3790 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3793 /* The run-time loop is made up of 8 insns in the generic case while the
3794 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
3795 else if (size <= 4 * PROBE_INTERVAL)
3797 HOST_WIDE_INT i, rem;
3799 emit_set_insn (reg1,
3800 plus_constant (Pmode,
3801 stack_pointer_rtx,
3802 -(first + PROBE_INTERVAL)));
3803 emit_stack_probe (reg1);
3805 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3806 it exceeds SIZE. If only two probes are needed, this will not
3807 generate any code. Then probe at FIRST + SIZE. */
3808 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3810 emit_set_insn (reg1,
3811 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3812 emit_stack_probe (reg1);
3815 rem = size - (i - PROBE_INTERVAL);
3816 if (rem > 256)
3818 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3820 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3821 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3823 else
3824 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3827 /* Otherwise, do the same as above, but in a loop. Note that we must be
3828 extra careful with variables wrapping around because we might be at
3829 the very top (or the very bottom) of the address space and we have
3830 to be able to handle this case properly; in particular, we use an
3831 equality test for the loop condition. */
3832 else
3834 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3836 /* Step 1: round SIZE to the previous multiple of the interval. */
3838 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3841 /* Step 2: compute initial and final value of the loop counter. */
3843 /* TEST_ADDR = SP + FIRST. */
3844 emit_set_insn (reg1,
3845 plus_constant (Pmode, stack_pointer_rtx, -first));
3847 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3848 HOST_WIDE_INT adjustment = - (first + rounded_size);
3849 if (! aarch64_uimm12_shift (adjustment))
3851 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3852 true, Pmode);
3853 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3855 else
3856 emit_set_insn (reg2,
3857 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3859 /* Step 3: the loop
3863 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3864 probe at TEST_ADDR
3866 while (TEST_ADDR != LAST_ADDR)
3868 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3869 until it is equal to ROUNDED_SIZE. */
3871 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3874 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3875 that SIZE is equal to ROUNDED_SIZE. */
3877 if (size != rounded_size)
3879 HOST_WIDE_INT rem = size - rounded_size;
3881 if (rem > 256)
3883 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3885 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3886 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3888 else
3889 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3893 /* Make sure nothing is scheduled before we are done. */
3894 emit_insn (gen_blockage ());
3897 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3898 absolute addresses. */
3900 const char *
3901 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3903 static int labelno = 0;
3904 char loop_lab[32];
3905 rtx xops[2];
3907 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3909 /* Loop. */
3910 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3912 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3913 xops[0] = reg1;
3914 xops[1] = GEN_INT (PROBE_INTERVAL);
3915 output_asm_insn ("sub\t%0, %0, %1", xops);
3917 /* Probe at TEST_ADDR. */
3918 output_asm_insn ("str\txzr, [%0]", xops);
3920 /* Test if TEST_ADDR == LAST_ADDR. */
3921 xops[1] = reg2;
3922 output_asm_insn ("cmp\t%0, %1", xops);
3924 /* Branch. */
3925 fputs ("\tb.ne\t", asm_out_file);
3926 assemble_name_raw (asm_out_file, loop_lab);
3927 fputc ('\n', asm_out_file);
3929 return "";
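/* With the usual PROBE_INTERVAL of 4096 and the scratch registers x9/x10
   chosen above, the emitted loop looks roughly like:

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0

   i.e. one probe per 4kB page until TEST_ADDR reaches LAST_ADDR.  */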
3932 /* Mark the registers that need to be saved by the callee and calculate
3933 the size of the callee-saved registers area and frame record (both FP
3934 and LR may be omitted). */
3935 static void
3936 aarch64_layout_frame (void)
3938 HOST_WIDE_INT offset = 0;
3939 int regno, last_fp_reg = INVALID_REGNUM;
3941 if (reload_completed && cfun->machine->frame.laid_out)
3942 return;
3944 /* Force a frame chain for EH returns so the return address is at FP+8. */
3945 cfun->machine->frame.emit_frame_chain
3946 = frame_pointer_needed || crtl->calls_eh_return;
3948 /* Emit a frame chain if the frame pointer is enabled.
3949 If -momit-leaf-frame-pointer is used, do not use a frame chain
3950 in leaf functions which do not use LR. */
3951 if (flag_omit_frame_pointer == 2
3952 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3953 && !df_regs_ever_live_p (LR_REGNUM)))
3954 cfun->machine->frame.emit_frame_chain = true;
3956 #define SLOT_NOT_REQUIRED (-2)
3957 #define SLOT_REQUIRED (-1)
3959 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3960 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3962 /* First mark all the registers that really need to be saved... */
3963 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3964 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3966 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3967 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3969 /* ... that includes the eh data registers (if needed)... */
3970 if (crtl->calls_eh_return)
3971 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3972 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3973 = SLOT_REQUIRED;
3975 /* ... and any callee saved register that dataflow says is live. */
3976 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3977 if (df_regs_ever_live_p (regno)
3978 && (regno == R30_REGNUM
3979 || !call_used_regs[regno]))
3980 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3982 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3983 if (df_regs_ever_live_p (regno)
3984 && !call_used_regs[regno])
3986 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3987 last_fp_reg = regno;
3990 if (cfun->machine->frame.emit_frame_chain)
3992 /* FP and LR are placed in the linkage record. */
3993 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
3994 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
3995 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
3996 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
3997 offset = 2 * UNITS_PER_WORD;
4000 /* Now assign stack slots for them. */
4001 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4002 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4004 cfun->machine->frame.reg_offset[regno] = offset;
4005 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4006 cfun->machine->frame.wb_candidate1 = regno;
4007 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4008 cfun->machine->frame.wb_candidate2 = regno;
4009 offset += UNITS_PER_WORD;
4012 HOST_WIDE_INT max_int_offset = offset;
4013 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4014 bool has_align_gap = offset != max_int_offset;
4016 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4017 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4019 /* If there is an alignment gap between integer and fp callee-saves,
4020 allocate the last fp register to it if possible. */
4021 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4023 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4024 break;
4027 cfun->machine->frame.reg_offset[regno] = offset;
4028 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4029 cfun->machine->frame.wb_candidate1 = regno;
4030 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4031 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4032 cfun->machine->frame.wb_candidate2 = regno;
4033 offset += UNITS_PER_WORD;
4036 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4038 cfun->machine->frame.saved_regs_size = offset;
4040 HOST_WIDE_INT varargs_and_saved_regs_size
4041 = offset + cfun->machine->frame.saved_varargs_size;
4043 cfun->machine->frame.hard_fp_offset
4044 = aligned_upper_bound (varargs_and_saved_regs_size
4045 + get_frame_size (),
4046 STACK_BOUNDARY / BITS_PER_UNIT);
4048 /* Both these values are already aligned. */
4049 gcc_assert (multiple_p (crtl->outgoing_args_size,
4050 STACK_BOUNDARY / BITS_PER_UNIT));
4051 cfun->machine->frame.frame_size
4052 = (cfun->machine->frame.hard_fp_offset
4053 + crtl->outgoing_args_size);
4055 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4057 cfun->machine->frame.initial_adjust = 0;
4058 cfun->machine->frame.final_adjust = 0;
4059 cfun->machine->frame.callee_adjust = 0;
4060 cfun->machine->frame.callee_offset = 0;
4062 HOST_WIDE_INT max_push_offset = 0;
4063 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4064 max_push_offset = 512;
4065 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4066 max_push_offset = 256;
4068 HOST_WIDE_INT const_size, const_fp_offset;
4069 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4070 && const_size < max_push_offset
4071 && known_eq (crtl->outgoing_args_size, 0))
4073 /* Simple, small frame with no outgoing arguments:
4074 stp reg1, reg2, [sp, -frame_size]!
4075 stp reg3, reg4, [sp, 16] */
4076 cfun->machine->frame.callee_adjust = const_size;
4078 else if (known_lt (crtl->outgoing_args_size
4079 + cfun->machine->frame.saved_regs_size, 512)
4080 && !(cfun->calls_alloca
4081 && known_lt (cfun->machine->frame.hard_fp_offset,
4082 max_push_offset)))
4084 /* Frame with small outgoing arguments:
4085 sub sp, sp, frame_size
4086 stp reg1, reg2, [sp, outgoing_args_size]
4087 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4088 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4089 cfun->machine->frame.callee_offset
4090 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4092 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4093 && const_fp_offset < max_push_offset)
4095 /* Frame with large outgoing arguments but a small local area:
4096 stp reg1, reg2, [sp, -hard_fp_offset]!
4097 stp reg3, reg4, [sp, 16]
4098 sub sp, sp, outgoing_args_size */
4099 cfun->machine->frame.callee_adjust = const_fp_offset;
4100 cfun->machine->frame.final_adjust
4101 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4103 else
4105 /* Frame with large local area and outgoing arguments using frame pointer:
4106 sub sp, sp, hard_fp_offset
4107 stp x29, x30, [sp, 0]
4108 add x29, sp, 0
4109 stp reg3, reg4, [sp, 16]
4110 sub sp, sp, outgoing_args_size */
4111 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4112 cfun->machine->frame.final_adjust
4113 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4116 cfun->machine->frame.laid_out = true;
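/* As a small worked example, assuming a frame chain is emitted, there are
   no varargs, no other callee-saves and 16 bytes of locals:
   saved_regs_size is 16 (the FP/LR pair), hard_fp_offset and frame_size
   are both 32, which is below the 512-byte push limit and there are no
   outgoing arguments, so the first case above applies and the whole frame
   is allocated by a single "stp x29, x30, [sp, -32]!".  */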
4119 /* Return true if the register REGNO is saved on entry to
4120 the current function. */
4122 static bool
4123 aarch64_register_saved_on_entry (int regno)
4125 return cfun->machine->frame.reg_offset[regno] >= 0;
4128 /* Return the next register, from REGNO up to LIMIT, that the callee
4129 needs to save. */
4131 static unsigned
4132 aarch64_next_callee_save (unsigned regno, unsigned limit)
4134 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4135 regno ++;
4136 return regno;
4139 /* Push the register number REGNO of mode MODE to the stack with write-back
4140 adjusting the stack by ADJUSTMENT. */
4142 static void
4143 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4144 HOST_WIDE_INT adjustment)
4146 rtx base_rtx = stack_pointer_rtx;
4147 rtx insn, reg, mem;
4149 reg = gen_rtx_REG (mode, regno);
4150 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4151 plus_constant (Pmode, base_rtx, -adjustment));
4152 mem = gen_frame_mem (mode, mem);
4154 insn = emit_move_insn (mem, reg);
4155 RTX_FRAME_RELATED_P (insn) = 1;
4158 /* Generate and return an instruction to store the pair of registers
4159 REG and REG2 of mode MODE to location BASE with write-back adjusting
4160 the stack location BASE by ADJUSTMENT. */
4162 static rtx
4163 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4164 HOST_WIDE_INT adjustment)
4166 switch (mode)
4168 case E_DImode:
4169 return gen_storewb_pairdi_di (base, base, reg, reg2,
4170 GEN_INT (-adjustment),
4171 GEN_INT (UNITS_PER_WORD - adjustment));
4172 case E_DFmode:
4173 return gen_storewb_pairdf_di (base, base, reg, reg2,
4174 GEN_INT (-adjustment),
4175 GEN_INT (UNITS_PER_WORD - adjustment));
4176 default:
4177 gcc_unreachable ();
4181 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4182 stack pointer by ADJUSTMENT. */
4184 static void
4185 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4187 rtx_insn *insn;
4188 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4190 if (regno2 == INVALID_REGNUM)
4191 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4193 rtx reg1 = gen_rtx_REG (mode, regno1);
4194 rtx reg2 = gen_rtx_REG (mode, regno2);
4196 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4197 reg2, adjustment));
4198 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4199 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4200 RTX_FRAME_RELATED_P (insn) = 1;
4203 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
4204 adjusting it by ADJUSTMENT afterwards. */
4206 static rtx
4207 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4208 HOST_WIDE_INT adjustment)
4210 switch (mode)
4212 case E_DImode:
4213 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4214 GEN_INT (UNITS_PER_WORD));
4215 case E_DFmode:
4216 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4217 GEN_INT (UNITS_PER_WORD));
4218 default:
4219 gcc_unreachable ();
4223 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4224 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4225 into CFI_OPS. */
4227 static void
4228 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4229 rtx *cfi_ops)
4231 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4232 rtx reg1 = gen_rtx_REG (mode, regno1);
4234 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4236 if (regno2 == INVALID_REGNUM)
4238 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4239 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4240 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4242 else
4244 rtx reg2 = gen_rtx_REG (mode, regno2);
4245 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4246 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4247 reg2, adjustment));
4251 /* Generate and return a store pair instruction of mode MODE to store
4252 register REG1 to MEM1 and register REG2 to MEM2. */
4254 static rtx
4255 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4256 rtx reg2)
4258 switch (mode)
4260 case E_DImode:
4261 return gen_store_pairdi (mem1, reg1, mem2, reg2);
4263 case E_DFmode:
4264 return gen_store_pairdf (mem1, reg1, mem2, reg2);
4266 default:
4267 gcc_unreachable ();
4271 /* Generate and return a load pair instruction of mode MODE to load register
4272 REG1 from MEM1 and register REG2 from MEM2. */
4274 static rtx
4275 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4276 rtx mem2)
4278 switch (mode)
4280 case E_DImode:
4281 return gen_load_pairdi (reg1, mem1, reg2, mem2);
4283 case E_DFmode:
4284 return gen_load_pairdf (reg1, mem1, reg2, mem2);
4286 default:
4287 gcc_unreachable ();
4291 /* Return TRUE if return address signing should be enabled for the current
4292 function, otherwise return FALSE. */
4294 bool
4295 aarch64_return_address_signing_enabled (void)
4297 /* This function should only be called after the frame is laid out. */
4298 gcc_assert (cfun->machine->frame.laid_out);
4300 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4301 if its LR is pushed onto the stack. */
4302 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4303 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4304 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4307 /* Emit code to save the callee-saved registers from register number START
4308 to LIMIT to the stack at the location starting at offset START_OFFSET,
4309 skipping any write-back candidates if SKIP_WB is true. */
4311 static void
4312 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4313 unsigned start, unsigned limit, bool skip_wb)
4315 rtx_insn *insn;
4316 unsigned regno;
4317 unsigned regno2;
4319 for (regno = aarch64_next_callee_save (start, limit);
4320 regno <= limit;
4321 regno = aarch64_next_callee_save (regno + 1, limit))
4323 rtx reg, mem;
4324 poly_int64 offset;
4326 if (skip_wb
4327 && (regno == cfun->machine->frame.wb_candidate1
4328 || regno == cfun->machine->frame.wb_candidate2))
4329 continue;
4331 if (cfun->machine->reg_is_wrapped_separately[regno])
4332 continue;
4334 reg = gen_rtx_REG (mode, regno);
4335 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4336 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4337 offset));
4339 regno2 = aarch64_next_callee_save (regno + 1, limit);
4341 if (regno2 <= limit
4342 && !cfun->machine->reg_is_wrapped_separately[regno2]
4343 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4344 == cfun->machine->frame.reg_offset[regno2]))
4347 rtx reg2 = gen_rtx_REG (mode, regno2);
4348 rtx mem2;
4350 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4351 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4352 offset));
4353 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4354 reg2));
4356 /* The first part of a frame-related parallel insn is
4357 always assumed to be relevant to the frame
4358 calculations; subsequent parts are only
4359 frame-related if explicitly marked. */
4360 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4361 regno = regno2;
4363 else
4364 insn = emit_move_insn (mem, reg);
4366 RTX_FRAME_RELATED_P (insn) = 1;
4370 /* Emit code to restore the callee registers of mode MODE from register
4371 number START up to and including LIMIT. Restore from the stack offset
4372 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4373 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4375 static void
4376 aarch64_restore_callee_saves (machine_mode mode,
4377 poly_int64 start_offset, unsigned start,
4378 unsigned limit, bool skip_wb, rtx *cfi_ops)
4380 rtx base_rtx = stack_pointer_rtx;
4381 unsigned regno;
4382 unsigned regno2;
4383 poly_int64 offset;
4385 for (regno = aarch64_next_callee_save (start, limit);
4386 regno <= limit;
4387 regno = aarch64_next_callee_save (regno + 1, limit))
4389 if (cfun->machine->reg_is_wrapped_separately[regno])
4390 continue;
4392 rtx reg, mem;
4394 if (skip_wb
4395 && (regno == cfun->machine->frame.wb_candidate1
4396 || regno == cfun->machine->frame.wb_candidate2))
4397 continue;
4399 reg = gen_rtx_REG (mode, regno);
4400 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4401 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4403 regno2 = aarch64_next_callee_save (regno + 1, limit);
4405 if (regno2 <= limit
4406 && !cfun->machine->reg_is_wrapped_separately[regno2]
4407 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4408 == cfun->machine->frame.reg_offset[regno2]))
4410 rtx reg2 = gen_rtx_REG (mode, regno2);
4411 rtx mem2;
4413 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4414 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4415 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4417 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4418 regno = regno2;
4420 else
4421 emit_move_insn (reg, mem);
4422 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4426 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4427 of MODE. */
4429 static inline bool
4430 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4432 HOST_WIDE_INT multiple;
4433 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4434 && IN_RANGE (multiple, -8, 7));
4437 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4438 of MODE. */
4440 static inline bool
4441 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4443 HOST_WIDE_INT multiple;
4444 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4445 && IN_RANGE (multiple, 0, 63));
4448 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4449 of MODE. */
4451 bool
4452 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4454 HOST_WIDE_INT multiple;
4455 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4456 && IN_RANGE (multiple, -64, 63));
4459 /* Return true if OFFSET is a signed 9-bit value. */
4461 static inline bool
4462 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4463 poly_int64 offset)
4465 HOST_WIDE_INT const_offset;
4466 return (offset.is_constant (&const_offset)
4467 && IN_RANGE (const_offset, -256, 255));
4470 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4471 of MODE. */
4473 static inline bool
4474 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4476 HOST_WIDE_INT multiple;
4477 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4478 && IN_RANGE (multiple, -256, 255));
4481 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4482 of MODE. */
4484 static inline bool
4485 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4487 HOST_WIDE_INT multiple;
4488 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4489 && IN_RANGE (multiple, 0, 4095));
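/* For instance, for DImode accesses (8-byte units) the predicates above
   accept the following byte offsets: -64..56 for the 4-bit signed scaled
   form, 0..504 for the 6-bit unsigned scaled form, -512..504 for the
   7-bit signed scaled form, -2048..2040 for the 9-bit signed scaled form,
   0..32760 for the 12-bit unsigned scaled form (all in steps of 8), and
   -256..255 at any byte granularity for the 9-bit unscaled form.  */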
4492 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4494 static sbitmap
4495 aarch64_get_separate_components (void)
4497 aarch64_layout_frame ();
4499 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4500 bitmap_clear (components);
4502 /* The registers we need saved to the frame. */
4503 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4504 if (aarch64_register_saved_on_entry (regno))
4506 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4507 if (!frame_pointer_needed)
4508 offset += cfun->machine->frame.frame_size
4509 - cfun->machine->frame.hard_fp_offset;
4510 /* Check that we can access the stack slot of the register with one
4511 direct load with no adjustments needed. */
4512 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4513 bitmap_set_bit (components, regno);
4516 /* Don't mess with the hard frame pointer. */
4517 if (frame_pointer_needed)
4518 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4520 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4521 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4522 /* If aarch64_layout_frame has chosen registers to store/restore with
4523 writeback don't interfere with them to avoid having to output explicit
4524 stack adjustment instructions. */
4525 if (reg2 != INVALID_REGNUM)
4526 bitmap_clear_bit (components, reg2);
4527 if (reg1 != INVALID_REGNUM)
4528 bitmap_clear_bit (components, reg1);
4530 bitmap_clear_bit (components, LR_REGNUM);
4531 bitmap_clear_bit (components, SP_REGNUM);
4533 return components;
4536 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4538 static sbitmap
4539 aarch64_components_for_bb (basic_block bb)
4541 bitmap in = DF_LIVE_IN (bb);
4542 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4543 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4545 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4546 bitmap_clear (components);
4548 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4549 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4550 if ((!call_used_regs[regno])
4551 && (bitmap_bit_p (in, regno)
4552 || bitmap_bit_p (gen, regno)
4553 || bitmap_bit_p (kill, regno)))
4555 unsigned regno2, offset, offset2;
4556 bitmap_set_bit (components, regno);
4558 /* If there is a callee-save at an adjacent offset, add it as well
4559 to increase the use of LDP/STP. */
4560 offset = cfun->machine->frame.reg_offset[regno];
4561 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4563 if (regno2 <= LAST_SAVED_REGNUM)
4565 offset2 = cfun->machine->frame.reg_offset[regno2];
4566 if ((offset & ~8) == (offset2 & ~8))
4567 bitmap_set_bit (components, regno2);
4571 return components;
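/* For example, if x19 is live in the block and was given offset 16 by
   aarch64_layout_frame, while x20 sits at the adjacent offset 24, then
   x20 is added to the set as well so that the two saves can later be
   combined into a single STP/LDP.  */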
4574 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4575 Nothing to do for aarch64. */
4577 static void
4578 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4582 /* Return the next set bit in BMP from START onwards. Return the total number
4583 of bits in BMP if no set bit is found at or after START. */
4585 static unsigned int
4586 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4588 unsigned int nbits = SBITMAP_SIZE (bmp);
4589 if (start == nbits)
4590 return start;
4592 gcc_assert (start < nbits);
4593 for (unsigned int i = start; i < nbits; i++)
4594 if (bitmap_bit_p (bmp, i))
4595 return i;
4597 return nbits;
4600 /* Do the work for aarch64_emit_prologue_components and
4601 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4602 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4603 for these components or the epilogue sequence. That is, it determines
4604 whether we should emit stores or loads and what kind of CFA notes to attach
4605 to the insns. Otherwise the logic for the two sequences is very
4606 similar. */
4608 static void
4609 aarch64_process_components (sbitmap components, bool prologue_p)
4611 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4612 ? HARD_FRAME_POINTER_REGNUM
4613 : STACK_POINTER_REGNUM);
4615 unsigned last_regno = SBITMAP_SIZE (components);
4616 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4617 rtx_insn *insn = NULL;
4619 while (regno != last_regno)
4621 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
4622 so DFmode for the vector registers is enough. */
4623 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4624 rtx reg = gen_rtx_REG (mode, regno);
4625 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4626 if (!frame_pointer_needed)
4627 offset += cfun->machine->frame.frame_size
4628 - cfun->machine->frame.hard_fp_offset;
4629 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4630 rtx mem = gen_frame_mem (mode, addr);
4632 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4633 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4634 /* No more registers to handle after REGNO.
4635 Emit a single save/restore and exit. */
4636 if (regno2 == last_regno)
4638 insn = emit_insn (set);
4639 RTX_FRAME_RELATED_P (insn) = 1;
4640 if (prologue_p)
4641 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4642 else
4643 add_reg_note (insn, REG_CFA_RESTORE, reg);
4644 break;
4647 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4648 /* The next register is not of the same class or its offset is not
4649 mergeable with the current one into a pair. */
4650 if (!satisfies_constraint_Ump (mem)
4651 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4652 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4653 GET_MODE_SIZE (mode)))
4655 insn = emit_insn (set);
4656 RTX_FRAME_RELATED_P (insn) = 1;
4657 if (prologue_p)
4658 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4659 else
4660 add_reg_note (insn, REG_CFA_RESTORE, reg);
4662 regno = regno2;
4663 continue;
4666 /* REGNO2 can be saved/restored in a pair with REGNO. */
4667 rtx reg2 = gen_rtx_REG (mode, regno2);
4668 if (!frame_pointer_needed)
4669 offset2 += cfun->machine->frame.frame_size
4670 - cfun->machine->frame.hard_fp_offset;
4671 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4672 rtx mem2 = gen_frame_mem (mode, addr2);
4673 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4674 : gen_rtx_SET (reg2, mem2);
4676 if (prologue_p)
4677 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4678 else
4679 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4681 RTX_FRAME_RELATED_P (insn) = 1;
4682 if (prologue_p)
4684 add_reg_note (insn, REG_CFA_OFFSET, set);
4685 add_reg_note (insn, REG_CFA_OFFSET, set2);
4687 else
4689 add_reg_note (insn, REG_CFA_RESTORE, reg);
4690 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4693 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4697 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4699 static void
4700 aarch64_emit_prologue_components (sbitmap components)
4702 aarch64_process_components (components, true);
4705 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4707 static void
4708 aarch64_emit_epilogue_components (sbitmap components)
4710 aarch64_process_components (components, false);
4713 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4715 static void
4716 aarch64_set_handled_components (sbitmap components)
4718 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4719 if (bitmap_bit_p (components, regno))
4720 cfun->machine->reg_is_wrapped_separately[regno] = true;
4723 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4724 is saved at BASE + OFFSET. */
4726 static void
4727 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4728 rtx base, poly_int64 offset)
4730 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4731 add_reg_note (insn, REG_CFA_EXPRESSION,
4732 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4735 /* AArch64 stack frames generated by this compiler look like:
4737 +-------------------------------+
4739 | incoming stack arguments |
4741 +-------------------------------+
4742 | | <-- incoming stack pointer (aligned)
4743 | callee-allocated save area |
4744 | for register varargs |
4746 +-------------------------------+
4747 | local variables | <-- frame_pointer_rtx
4749 +-------------------------------+
4750 | padding0 | \
4751 +-------------------------------+ |
4752 | callee-saved registers | | frame.saved_regs_size
4753 +-------------------------------+ |
4754 | LR' | |
4755 +-------------------------------+ |
4756 | FP' | / <- hard_frame_pointer_rtx (aligned)
4757 +-------------------------------+
4758 | dynamic allocation |
4759 +-------------------------------+
4760 | padding |
4761 +-------------------------------+
4762 | outgoing stack arguments | <-- arg_pointer
4764 +-------------------------------+
4765 | | <-- stack_pointer_rtx (aligned)
4767 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4768 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4769 unchanged. */
4771 /* Generate the prologue instructions for entry into a function.
4772 Establish the stack frame by decreasing the stack pointer with a
4773 properly calculated size and, if necessary, create a frame record
4774 filled with the values of LR and previous frame pointer. The
4775 current FP is also set up if it is in use. */
4777 void
4778 aarch64_expand_prologue (void)
4780 aarch64_layout_frame ();
4782 poly_int64 frame_size = cfun->machine->frame.frame_size;
4783 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4784 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4785 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4786 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4787 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4788 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4789 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4790 rtx_insn *insn;
4792 /* Sign return address for functions. */
4793 if (aarch64_return_address_signing_enabled ())
4795 insn = emit_insn (gen_pacisp ());
4796 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4797 RTX_FRAME_RELATED_P (insn) = 1;
4800 if (flag_stack_usage_info)
4801 current_function_static_stack_size = constant_lower_bound (frame_size);
4803 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4805 if (crtl->is_leaf && !cfun->calls_alloca)
4807 if (maybe_gt (frame_size, PROBE_INTERVAL)
4808 && maybe_gt (frame_size, get_stack_check_protect ()))
4809 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4810 (frame_size
4811 - get_stack_check_protect ()));
4813 else if (maybe_gt (frame_size, 0))
4814 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4817 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4818 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4820 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4822 if (callee_adjust != 0)
4823 aarch64_push_regs (reg1, reg2, callee_adjust);
4825 if (emit_frame_chain)
4827 poly_int64 reg_offset = callee_adjust;
4828 if (callee_adjust == 0)
4830 reg1 = R29_REGNUM;
4831 reg2 = R30_REGNUM;
4832 reg_offset = callee_offset;
4833 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4835 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4836 stack_pointer_rtx, callee_offset,
4837 ip1_rtx, ip0_rtx, frame_pointer_needed);
4838 if (frame_pointer_needed && !frame_size.is_constant ())
4840 /* Variable-sized frames need to describe the save slot
4841 address using DW_CFA_expression rather than DW_CFA_offset.
4842 This means that, without taking further action, the
4843 locations of the registers that we've already saved would
4844 remain based on the stack pointer even after we redefine
4845 the CFA based on the frame pointer. We therefore need new
4846 DW_CFA_expressions to re-express the save slots with addresses
4847 based on the frame pointer. */
4848 rtx_insn *insn = get_last_insn ();
4849 gcc_assert (RTX_FRAME_RELATED_P (insn));
4851 /* Add an explicit CFA definition if this was previously
4852 implicit. */
4853 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4855 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4856 callee_offset);
4857 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4858 gen_rtx_SET (hard_frame_pointer_rtx, src));
4861 /* Change the save slot expressions for the registers that
4862 we've already saved. */
4863 reg_offset -= callee_offset;
4864 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4865 reg_offset + UNITS_PER_WORD);
4866 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4867 reg_offset);
4869 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4872 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4873 callee_adjust != 0 || emit_frame_chain);
4874 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4875 callee_adjust != 0 || emit_frame_chain);
4876 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4879 /* Return TRUE if we can use a simple_return insn.
4881 This function checks whether the callee-saved stack is empty, which
4882 means no restore actions are needed. The pro_and_epilogue pass uses
4883 this to check whether the shrink-wrapping optimization is feasible. */
4885 bool
4886 aarch64_use_return_insn_p (void)
4888 if (!reload_completed)
4889 return false;
4891 if (crtl->profile)
4892 return false;
4894 aarch64_layout_frame ();
4896 return known_eq (cfun->machine->frame.frame_size, 0);
4899 /* Generate the epilogue instructions for returning from a function.
4900 This is almost exactly the reverse of the prologue sequence, except
4901 that we need to insert barriers to avoid scheduling loads that read
4902 from a deallocated stack, and we optimize the unwind records by
4903 emitting them all together if possible. */
4904 void
4905 aarch64_expand_epilogue (bool for_sibcall)
4907 aarch64_layout_frame ();
4909 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4910 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4911 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4912 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4913 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4914 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4915 rtx cfi_ops = NULL;
4916 rtx_insn *insn;
4917 /* A stack clash protection prologue may not have left IP0_REGNUM or
4918 IP1_REGNUM in a usable state. The same is true for allocations
4919 with an SVE component, since we then need both temporary registers
4920 for each allocation. */
4921 bool can_inherit_p = (initial_adjust.is_constant ()
4922 && final_adjust.is_constant ()
4923 && !flag_stack_clash_protection);
4925 /* We need a memory barrier to prevent reads from the deallocated stack. */
4926 bool need_barrier_p
4927 = maybe_ne (get_frame_size ()
4928 + cfun->machine->frame.saved_varargs_size, 0);
4930 /* Emit a barrier to prevent loads from a deallocated stack. */
4931 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4932 || cfun->calls_alloca
4933 || crtl->calls_eh_return)
4935 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4936 need_barrier_p = false;
4939 /* Restore the stack pointer from the frame pointer if it may not
4940 be the same as the stack pointer. */
4941 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4942 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4943 if (frame_pointer_needed
4944 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4945 /* If writeback is used when restoring callee-saves, the CFA
4946 is restored on the instruction doing the writeback. */
4947 aarch64_add_offset (Pmode, stack_pointer_rtx,
4948 hard_frame_pointer_rtx, -callee_offset,
4949 ip1_rtx, ip0_rtx, callee_adjust == 0);
4950 else
4951 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4952 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4954 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4955 callee_adjust != 0, &cfi_ops);
4956 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4957 callee_adjust != 0, &cfi_ops);
4959 if (need_barrier_p)
4960 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4962 if (callee_adjust != 0)
4963 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4965 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4967 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4968 insn = get_last_insn ();
4969 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4970 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4971 RTX_FRAME_RELATED_P (insn) = 1;
4972 cfi_ops = NULL;
4975 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4976 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4978 if (cfi_ops)
4980 /* Emit delayed restores and reset the CFA to be SP. */
4981 insn = get_last_insn ();
4982 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
4983 REG_NOTES (insn) = cfi_ops;
4984 RTX_FRAME_RELATED_P (insn) = 1;
4987 /* We prefer to emit the combined return/authenticate instruction RETAA;
4988 however, there are three cases in which we must instead emit an explicit
4989 authentication instruction.
4991 1) Sibcalls don't return in a normal way, so if we're about to call one
4992 we must authenticate.
4994 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
4995 generating code for !TARGET_ARMV8_3 we can't use it and must
4996 explicitly authenticate.
4998 3) On an eh_return path we make extra stack adjustments to update the
4999 canonical frame address to be the exception handler's CFA. We want
5000 to authenticate using the CFA of the function which calls eh_return. */
5002 if (aarch64_return_address_signing_enabled ()
5003 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5005 insn = emit_insn (gen_autisp ());
5006 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5007 RTX_FRAME_RELATED_P (insn) = 1;
5010 /* Stack adjustment for exception handler. */
5011 if (crtl->calls_eh_return)
5013 /* We need to unwind the stack by the offset computed by
5014 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5015 to be SP; letting the CFA move during this adjustment
5016 is just as correct as retaining the CFA from the body
5017 of the function. Therefore, do nothing special. */
5018 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5021 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5022 if (!for_sibcall)
5023 emit_jump_insn (ret_rtx);
5026 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5027 normally or return to a previous frame after unwinding.
5029 An EH return uses a single shared return sequence. The epilogue is
5030 exactly like a normal epilogue except that it has an extra input
5031 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5032 that must be applied after the frame has been destroyed. An extra label
5033 is inserted before the epilogue which initializes this register to zero,
5034 and this is the entry point for a normal return.
5036 An actual EH return updates the return address, initializes the stack
5037 adjustment and jumps directly into the epilogue (bypassing the zeroing
5038 of the adjustment). Since the return address is typically saved on the
5039 stack when a function makes a call, the saved LR must be updated outside
5040 the epilogue.
5042 This poses problems as the store is generated well before the epilogue,
5043 so the offset of LR is not known yet. Also optimizations will remove the
5044 store as it appears dead, even after the epilogue is generated (as the
5045 base or offset for loading LR is different in many cases).
5047 To avoid these problems this implementation forces the frame pointer
5048 in eh_return functions so that the location of LR is fixed and known early.
5049 It also marks the store volatile, so no optimization is permitted to
5050 remove the store. */
5052 aarch64_eh_return_handler_rtx (void)
5054 rtx tmp = gen_frame_mem (Pmode,
5055 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5057 /* Mark the store volatile, so no optimization is permitted to remove it. */
5058 MEM_VOLATILE_P (tmp) = true;
5059 return tmp;
5062 /* Output code to add DELTA to the first argument, and then jump
5063 to FUNCTION. Used for C++ multiple inheritance. */
5064 static void
5065 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5066 HOST_WIDE_INT delta,
5067 HOST_WIDE_INT vcall_offset,
5068 tree function)
5070 /* The this pointer is always in x0. Note that this differs from
5071 Arm where the this pointer may be bumped to r1 if r0 is required
5072 to return a pointer to an aggregate. On AArch64 a result value
5073 pointer will be in x8. */
5074 int this_regno = R0_REGNUM;
5075 rtx this_rtx, temp0, temp1, addr, funexp;
5076 rtx_insn *insn;
5078 reload_completed = 1;
5079 emit_note (NOTE_INSN_PROLOGUE_END);
5081 this_rtx = gen_rtx_REG (Pmode, this_regno);
5082 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5083 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5085 if (vcall_offset == 0)
5086 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5087 else
5089 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5091 addr = this_rtx;
5092 if (delta != 0)
5094 if (delta >= -256 && delta < 256)
5095 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5096 plus_constant (Pmode, this_rtx, delta));
5097 else
5098 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5099 temp1, temp0, false);
5102 if (Pmode == ptr_mode)
5103 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5104 else
5105 aarch64_emit_move (temp0,
5106 gen_rtx_ZERO_EXTEND (Pmode,
5107 gen_rtx_MEM (ptr_mode, addr)));
5109 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5110 addr = plus_constant (Pmode, temp0, vcall_offset);
5111 else
5113 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5114 Pmode);
5115 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5118 if (Pmode == ptr_mode)
5119 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5120 else
5121 aarch64_emit_move (temp1,
5122 gen_rtx_SIGN_EXTEND (Pmode,
5123 gen_rtx_MEM (ptr_mode, addr)));
5125 emit_insn (gen_add2_insn (this_rtx, temp1));
5128 /* Generate a tail call to the target function. */
5129 if (!TREE_USED (function))
5131 assemble_external (function);
5132 TREE_USED (function) = 1;
5134 funexp = XEXP (DECL_RTL (function), 0);
5135 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5136 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5137 SIBLING_CALL_P (insn) = 1;
5139 insn = get_insns ();
5140 shorten_branches (insn);
5141 final_start_function (insn, file, 1);
5142 final (insn, file, 1);
5143 final_end_function ();
5145 /* Stop pretending to be a post-reload pass. */
5146 reload_completed = 0;
5149 static bool
5150 aarch64_tls_referenced_p (rtx x)
5152 if (!TARGET_HAVE_TLS)
5153 return false;
5154 subrtx_iterator::array_type array;
5155 FOR_EACH_SUBRTX (iter, array, x, ALL)
5157 const_rtx x = *iter;
5158 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5159 return true;
5160 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5161 TLS offsets, not real symbol references. */
5162 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5163 iter.skip_subrtxes ();
5165 return false;
5169 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5170 a left shift of 0 or 12 bits. */
5171 bool
5172 aarch64_uimm12_shift (HOST_WIDE_INT val)
5174 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5175 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
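/* For example, 0xfff and 0xfff000 both satisfy this test (shift 0 and
   shift 12 respectively), whereas 0x1001 does not, because its set bits
   straddle the two 12-bit fields.  */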
5180 /* Return true if val is an immediate that can be loaded into a
5181 register by a MOVZ instruction. */
5182 static bool
5183 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5185 if (GET_MODE_SIZE (mode) > 4)
5187 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5188 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5189 return 1;
5191 else
5193 /* Ignore sign extension. */
5194 val &= (HOST_WIDE_INT) 0xffffffff;
5196 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5197 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5200 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5201 64-bit (DImode) integer. */
5203 static unsigned HOST_WIDE_INT
5204 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5206 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5207 while (size < 64)
5209 val &= (HOST_WIDE_INT_1U << size) - 1;
5210 val |= val << size;
5211 size *= 2;
5213 return val;
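/* For example, replicating the 8-bit element value 0xab gives
   0xabababababababab, and replicating the SImode value 0x0000ffff gives
   0x0000ffff0000ffff.  */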
5216 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5218 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5220 0x0000000100000001ull,
5221 0x0001000100010001ull,
5222 0x0101010101010101ull,
5223 0x1111111111111111ull,
5224 0x5555555555555555ull,
5228 /* Return true if val is a valid bitmask immediate. */
5230 bool
5231 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5233 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5234 int bits;
5236 /* Check for a single sequence of one bits and return quickly if so.
5237 The special cases of all ones and all zeroes return false. */
5238 val = aarch64_replicate_bitmask_imm (val_in, mode);
5239 tmp = val + (val & -val);
5241 if (tmp == (tmp & -tmp))
5242 return (val + 1) > 1;
5244 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5245 if (mode == SImode)
5246 val = (val << 32) | (val & 0xffffffff);
5248 /* Invert if the immediate doesn't start with a zero bit - this means we
5249 only need to search for sequences of one bits. */
5250 if (val & 1)
5251 val = ~val;
5253 /* Find the first set bit and set tmp to val with the first sequence of one
5254 bits removed. Return success if there is a single sequence of ones. */
5255 first_one = val & -val;
5256 tmp = val & (val + first_one);
5258 if (tmp == 0)
5259 return true;
5261 /* Find the next set bit and compute the difference in bit position. */
5262 next_one = tmp & -tmp;
5263 bits = clz_hwi (first_one) - clz_hwi (next_one);
5264 mask = val ^ tmp;
5266 /* Check the bit position difference is a power of 2, and that the first
5267 sequence of one bits fits within 'bits' bits. */
5268 if ((mask >> bits) != 0 || bits != (bits & -bits))
5269 return false;
5271 /* Check the sequence of one bits is repeated 64/bits times. */
5272 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
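/* For instance, 0x00ff00ff00ff00ff is accepted: it is a run of eight ones
   repeated in every 16-bit chunk, so it can be encoded as a logical
   (bitmask) immediate.  An arbitrary constant such as 0x1234, whose set
   bits do not form a single repeating run, is rejected.  */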
5275 /* Create a mask of ones covering the range from the lowest to the highest
5276 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
5278 unsigned HOST_WIDE_INT
5279 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5281 int lowest_bit_set = ctz_hwi (val_in);
5282 int highest_bit_set = floor_log2 (val_in);
5283 gcc_assert (val_in != 0);
5285 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5286 (HOST_WIDE_INT_1U << lowest_bit_set));
5289 /* Create a constant in which all bits outside the range from the lowest to
5290 the highest bit set in VAL_IN are set to 1. */
5292 unsigned HOST_WIDE_INT
5293 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5295 return val_in | ~aarch64_and_split_imm1 (val_in);
5298 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5300 bool
5301 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5303 scalar_int_mode int_mode;
5304 if (!is_a <scalar_int_mode> (mode, &int_mode))
5305 return false;
5307 if (aarch64_bitmask_imm (val_in, int_mode))
5308 return false;
5310 if (aarch64_move_imm (val_in, int_mode))
5311 return false;
5313 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5315 return aarch64_bitmask_imm (imm2, int_mode);
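/* As a worked example, 0x0000f000000ff000 contains two separate runs of
   ones and so is not itself a valid bitmask immediate, but
   aarch64_and_split_imm1 gives 0x0000fffffffff000 (the span from the
   lowest to the highest set bit) and aarch64_and_split_imm2 gives
   0xfffff000000fffff (that constant with everything outside the span set
   to 1).  Both of those are valid bitmask immediates and their AND
   reproduces the original constant, so an AND with it can be done as two
   AND-immediate instructions.  */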
5318 /* Return true if val is an immediate that can be loaded into a
5319 register in a single instruction. */
5320 bool
5321 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5323 scalar_int_mode int_mode;
5324 if (!is_a <scalar_int_mode> (mode, &int_mode))
5325 return false;
5327 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5328 return 1;
5329 return aarch64_bitmask_imm (val, int_mode);
5332 static bool
5333 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5335 rtx base, offset;
5337 if (GET_CODE (x) == HIGH)
5338 return true;
5340 /* There's no way to calculate VL-based values using relocations. */
5341 subrtx_iterator::array_type array;
5342 FOR_EACH_SUBRTX (iter, array, x, ALL)
5343 if (GET_CODE (*iter) == CONST_POLY_INT)
5344 return true;
5346 split_const (x, &base, &offset);
5347 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5349 if (aarch64_classify_symbol (base, INTVAL (offset))
5350 != SYMBOL_FORCE_TO_MEM)
5351 return true;
5352 else
5353 /* Avoid generating a 64-bit relocation in ILP32; leave it
5354 to aarch64_expand_mov_immediate to handle it properly. */
5355 return mode != ptr_mode;
5358 return aarch64_tls_referenced_p (x);
5361 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5362 The expansion for a table switch is quite expensive due to the number
5363 of instructions, the table lookup and the hard-to-predict indirect jump.
5364 When optimizing for speed with -O3 enabled, use the per-core tuning if
5365 set, otherwise use tables for > 16 cases as a tradeoff between size and
5366 performance. When optimizing for size, use the default setting. */
5368 static unsigned int
5369 aarch64_case_values_threshold (void)
5371 /* Use the specified limit for the number of cases before using jump
5372 tables at higher optimization levels. */
5373 if (optimize > 2
5374 && selected_cpu->tune->max_case_values != 0)
5375 return selected_cpu->tune->max_case_values;
5376 else
5377 return optimize_size ? default_case_values_threshold () : 17;
5380 /* Return true if register REGNO is a valid index register.
5381 STRICT_P is true if REG_OK_STRICT is in effect. */
5383 bool
5384 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5386 if (!HARD_REGISTER_NUM_P (regno))
5388 if (!strict_p)
5389 return true;
5391 if (!reg_renumber)
5392 return false;
5394 regno = reg_renumber[regno];
5396 return GP_REGNUM_P (regno);
5399 /* Return true if register REGNO is a valid base register.
5400 STRICT_P is true if REG_OK_STRICT is in effect. */
5402 bool
5403 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5405 if (!HARD_REGISTER_NUM_P (regno))
5407 if (!strict_p)
5408 return true;
5410 if (!reg_renumber)
5411 return false;
5413 regno = reg_renumber[regno];
5416 /* The fake registers will be eliminated to either the stack or
5417 hard frame pointer, both of which are usually valid base registers.
5418 Reload deals with the cases where the eliminated form isn't valid. */
5419 return (GP_REGNUM_P (regno)
5420 || regno == SP_REGNUM
5421 || regno == FRAME_POINTER_REGNUM
5422 || regno == ARG_POINTER_REGNUM);
5425 /* Return true if X is a valid base register.
5426 STRICT_P is true if REG_OK_STRICT is in effect. */
5428 static bool
5429 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5431 if (!strict_p
5432 && GET_CODE (x) == SUBREG
5433 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5434 x = SUBREG_REG (x);
5436 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5439 /* Return true if address offset is a valid index. If it is, fill in INFO
5440 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5442 static bool
5443 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5444 machine_mode mode, bool strict_p)
5446 enum aarch64_address_type type;
5447 rtx index;
5448 int shift;
5450 /* (reg:P) */
5451 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5452 && GET_MODE (x) == Pmode)
5454 type = ADDRESS_REG_REG;
5455 index = x;
5456 shift = 0;
5458 /* (sign_extend:DI (reg:SI)) */
5459 else if ((GET_CODE (x) == SIGN_EXTEND
5460 || GET_CODE (x) == ZERO_EXTEND)
5461 && GET_MODE (x) == DImode
5462 && GET_MODE (XEXP (x, 0)) == SImode)
5464 type = (GET_CODE (x) == SIGN_EXTEND)
5465 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5466 index = XEXP (x, 0);
5467 shift = 0;
5469 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5470 else if (GET_CODE (x) == MULT
5471 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5472 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5473 && GET_MODE (XEXP (x, 0)) == DImode
5474 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5475 && CONST_INT_P (XEXP (x, 1)))
5477 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5478 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5479 index = XEXP (XEXP (x, 0), 0);
5480 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5482 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5483 else if (GET_CODE (x) == ASHIFT
5484 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5485 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5486 && GET_MODE (XEXP (x, 0)) == DImode
5487 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5488 && CONST_INT_P (XEXP (x, 1)))
5490 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5491 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5492 index = XEXP (XEXP (x, 0), 0);
5493 shift = INTVAL (XEXP (x, 1));
5495 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5496 else if ((GET_CODE (x) == SIGN_EXTRACT
5497 || GET_CODE (x) == ZERO_EXTRACT)
5498 && GET_MODE (x) == DImode
5499 && GET_CODE (XEXP (x, 0)) == MULT
5500 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5501 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5503 type = (GET_CODE (x) == SIGN_EXTRACT)
5504 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5505 index = XEXP (XEXP (x, 0), 0);
5506 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5507 if (INTVAL (XEXP (x, 1)) != 32 + shift
5508 || INTVAL (XEXP (x, 2)) != 0)
5509 shift = -1;
5511 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5512 (const_int 0xffffffff<<shift)) */
5513 else if (GET_CODE (x) == AND
5514 && GET_MODE (x) == DImode
5515 && GET_CODE (XEXP (x, 0)) == MULT
5516 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5517 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5518 && CONST_INT_P (XEXP (x, 1)))
5520 type = ADDRESS_REG_UXTW;
5521 index = XEXP (XEXP (x, 0), 0);
5522 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5523 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5524 shift = -1;
5526 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5527 else if ((GET_CODE (x) == SIGN_EXTRACT
5528 || GET_CODE (x) == ZERO_EXTRACT)
5529 && GET_MODE (x) == DImode
5530 && GET_CODE (XEXP (x, 0)) == ASHIFT
5531 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5532 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5534 type = (GET_CODE (x) == SIGN_EXTRACT)
5535 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5536 index = XEXP (XEXP (x, 0), 0);
5537 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5538 if (INTVAL (XEXP (x, 1)) != 32 + shift
5539 || INTVAL (XEXP (x, 2)) != 0)
5540 shift = -1;
5542 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5543 (const_int 0xffffffff<<shift)) */
5544 else if (GET_CODE (x) == AND
5545 && GET_MODE (x) == DImode
5546 && GET_CODE (XEXP (x, 0)) == ASHIFT
5547 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5548 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5549 && CONST_INT_P (XEXP (x, 1)))
5551 type = ADDRESS_REG_UXTW;
5552 index = XEXP (XEXP (x, 0), 0);
5553 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5554 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5555 shift = -1;
5557 /* (mult:P (reg:P) (const_int scale)) */
5558 else if (GET_CODE (x) == MULT
5559 && GET_MODE (x) == Pmode
5560 && GET_MODE (XEXP (x, 0)) == Pmode
5561 && CONST_INT_P (XEXP (x, 1)))
5563 type = ADDRESS_REG_REG;
5564 index = XEXP (x, 0);
5565 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5567 /* (ashift:P (reg:P) (const_int shift)) */
5568 else if (GET_CODE (x) == ASHIFT
5569 && GET_MODE (x) == Pmode
5570 && GET_MODE (XEXP (x, 0)) == Pmode
5571 && CONST_INT_P (XEXP (x, 1)))
5573 type = ADDRESS_REG_REG;
5574 index = XEXP (x, 0);
5575 shift = INTVAL (XEXP (x, 1));
5577 else
5578 return false;
5580 if (!strict_p
5581 && GET_CODE (index) == SUBREG
5582 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5583 index = SUBREG_REG (index);
5585 if (aarch64_sve_data_mode_p (mode))
5587 if (type != ADDRESS_REG_REG
5588 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5589 return false;
5591 else
5593 if (shift != 0
5594 && !(IN_RANGE (shift, 1, 3)
5595 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5596 return false;
5599 if (REG_P (index)
5600 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5602 info->type = type;
5603 info->offset = index;
5604 info->shift = shift;
5605 return true;
5608 return false;
5611 /* Return true if MODE is one of the modes for which we
5612 support LDP/STP operations. */
5614 static bool
5615 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5617 return mode == SImode || mode == DImode
5618 || mode == SFmode || mode == DFmode
5619 || (aarch64_vector_mode_supported_p (mode)
5620 && known_eq (GET_MODE_SIZE (mode), 8));
5623 /* Return true if REGNO is a virtual pointer register, or an eliminable
5624 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5625 include stack_pointer or hard_frame_pointer. */
5626 static bool
5627 virt_or_elim_regno_p (unsigned regno)
5629 return ((regno >= FIRST_VIRTUAL_REGISTER
5630 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5631 || regno == FRAME_POINTER_REGNUM
5632 || regno == ARG_POINTER_REGNUM);
5635 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5636 If it is, fill in INFO appropriately. STRICT_P is true if
5637 REG_OK_STRICT is in effect. */
5639 static bool
5640 aarch64_classify_address (struct aarch64_address_info *info,
5641 rtx x, machine_mode mode, bool strict_p,
5642 aarch64_addr_query_type type = ADDR_QUERY_M)
5644 enum rtx_code code = GET_CODE (x);
5645 rtx op0, op1;
5646 poly_int64 offset;
5648 HOST_WIDE_INT const_size;
5650 /* On BE, we use load/store pair for all large int mode load/stores.
5651 TI/TFmode may also use a load/store pair. */
5652 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5653 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5654 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5655 || mode == TImode
5656 || mode == TFmode
5657 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5659 bool allow_reg_index_p = (!load_store_pair_p
5660 && (known_lt (GET_MODE_SIZE (mode), 16)
5661 || vec_flags == VEC_ADVSIMD
5662 || vec_flags == VEC_SVE_DATA));
5664 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5665 [Rn, #offset, MUL VL]. */
5666 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5667 && (code != REG && code != PLUS))
5668 return false;
5670 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5671 REG addressing. */
5672 if (advsimd_struct_p
5673 && !BYTES_BIG_ENDIAN
5674 && (code != POST_INC && code != REG))
5675 return false;
5677 gcc_checking_assert (GET_MODE (x) == VOIDmode
5678 || SCALAR_INT_MODE_P (GET_MODE (x)));
5680 switch (code)
5682 case REG:
5683 case SUBREG:
5684 info->type = ADDRESS_REG_IMM;
5685 info->base = x;
5686 info->offset = const0_rtx;
5687 info->const_offset = 0;
5688 return aarch64_base_register_rtx_p (x, strict_p);
5690 case PLUS:
5691 op0 = XEXP (x, 0);
5692 op1 = XEXP (x, 1);
5694 if (! strict_p
5695 && REG_P (op0)
5696 && virt_or_elim_regno_p (REGNO (op0))
5697 && poly_int_rtx_p (op1, &offset))
5699 info->type = ADDRESS_REG_IMM;
5700 info->base = op0;
5701 info->offset = op1;
5702 info->const_offset = offset;
5704 return true;
5707 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5708 && aarch64_base_register_rtx_p (op0, strict_p)
5709 && poly_int_rtx_p (op1, &offset))
5711 info->type = ADDRESS_REG_IMM;
5712 info->base = op0;
5713 info->offset = op1;
5714 info->const_offset = offset;
5716 /* TImode and TFmode values are allowed in both pairs of X
5717 registers and individual Q registers. The available
5718 address modes are:
5719 X,X: 7-bit signed scaled offset
5720 Q: 9-bit signed offset
5721 We conservatively require an offset representable in either mode.
5722 When performing the check for pairs of X registers i.e. LDP/STP
5723 pass down DImode since that is the natural size of the LDP/STP
5724 instruction memory accesses. */
5725 if (mode == TImode || mode == TFmode)
5726 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5727 && (offset_9bit_signed_unscaled_p (mode, offset)
5728 || offset_12bit_unsigned_scaled_p (mode, offset)));
5730 /* A 7-bit offset check because OImode will emit an ldp/stp
5731 instruction (only big endian will get here).
5732 For ldp/stp instructions, the offset is scaled for the size of a
5733 single element of the pair. */
5734 if (mode == OImode)
5735 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5737 /* Three 9/12-bit offset checks because CImode will emit three
5738 ldr/str instructions (only big endian will get here). */
5739 if (mode == CImode)
5740 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5741 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5742 || offset_12bit_unsigned_scaled_p (V16QImode,
5743 offset + 32)));
5745 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5746 instructions (only big endian will get here). */
5747 if (mode == XImode)
5748 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5749 && aarch64_offset_7bit_signed_scaled_p (TImode,
5750 offset + 32));
5752 /* Make "m" use the LD1 offset range for SVE data modes, so
5753 that pre-RTL optimizers like ivopts will optimize for that range
5754 instead of the wider LDR/STR range. */
5755 if (vec_flags == VEC_SVE_DATA)
5756 return (type == ADDR_QUERY_M
5757 ? offset_4bit_signed_scaled_p (mode, offset)
5758 : offset_9bit_signed_scaled_p (mode, offset));
5760 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5762 poly_int64 end_offset = (offset
5763 + GET_MODE_SIZE (mode)
5764 - BYTES_PER_SVE_VECTOR);
5765 return (type == ADDR_QUERY_M
5766 ? offset_4bit_signed_scaled_p (mode, offset)
5767 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5768 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5769 end_offset)));
5772 if (vec_flags == VEC_SVE_PRED)
5773 return offset_9bit_signed_scaled_p (mode, offset);
5775 if (load_store_pair_p)
5776 return ((known_eq (GET_MODE_SIZE (mode), 4)
5777 || known_eq (GET_MODE_SIZE (mode), 8))
5778 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5779 else
5780 return (offset_9bit_signed_unscaled_p (mode, offset)
5781 || offset_12bit_unsigned_scaled_p (mode, offset));
5784 if (allow_reg_index_p)
5786 /* Look for base + (scaled/extended) index register. */
5787 if (aarch64_base_register_rtx_p (op0, strict_p)
5788 && aarch64_classify_index (info, op1, mode, strict_p))
5790 info->base = op0;
5791 return true;
5793 if (aarch64_base_register_rtx_p (op1, strict_p)
5794 && aarch64_classify_index (info, op0, mode, strict_p))
5796 info->base = op1;
5797 return true;
5801 return false;
5803 case POST_INC:
5804 case POST_DEC:
5805 case PRE_INC:
5806 case PRE_DEC:
5807 info->type = ADDRESS_REG_WB;
5808 info->base = XEXP (x, 0);
5809 info->offset = NULL_RTX;
5810 return aarch64_base_register_rtx_p (info->base, strict_p);
5812 case POST_MODIFY:
5813 case PRE_MODIFY:
5814 info->type = ADDRESS_REG_WB;
5815 info->base = XEXP (x, 0);
5816 if (GET_CODE (XEXP (x, 1)) == PLUS
5817 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5818 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5819 && aarch64_base_register_rtx_p (info->base, strict_p))
5821 info->offset = XEXP (XEXP (x, 1), 1);
5822 info->const_offset = offset;
5824 /* TImode and TFmode values are allowed in both pairs of X
5825 registers and individual Q registers. The available
5826 address modes are:
5827 X,X: 7-bit signed scaled offset
5828 Q: 9-bit signed offset
5829 We conservatively require an offset representable in either mode.  */
5831 if (mode == TImode || mode == TFmode)
5832 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5833 && offset_9bit_signed_unscaled_p (mode, offset));
5835 if (load_store_pair_p)
5836 return ((known_eq (GET_MODE_SIZE (mode), 4)
5837 || known_eq (GET_MODE_SIZE (mode), 8))
5838 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5839 else
5840 return offset_9bit_signed_unscaled_p (mode, offset);
5842 return false;
5844 case CONST:
5845 case SYMBOL_REF:
5846 case LABEL_REF:
5847 /* load literal: pc-relative constant pool entry. Only supported
5848 for SI mode or larger. */
5849 info->type = ADDRESS_SYMBOLIC;
5851 if (!load_store_pair_p
5852 && GET_MODE_SIZE (mode).is_constant (&const_size)
5853 && const_size >= 4)
5855 rtx sym, addend;
5857 split_const (x, &sym, &addend);
5858 return ((GET_CODE (sym) == LABEL_REF
5859 || (GET_CODE (sym) == SYMBOL_REF
5860 && CONSTANT_POOL_ADDRESS_P (sym)
5861 && aarch64_pcrelative_literal_loads)));
5863 return false;
5865 case LO_SUM:
5866 info->type = ADDRESS_LO_SUM;
5867 info->base = XEXP (x, 0);
5868 info->offset = XEXP (x, 1);
5869 if (allow_reg_index_p
5870 && aarch64_base_register_rtx_p (info->base, strict_p))
5872 rtx sym, offs;
5873 split_const (info->offset, &sym, &offs);
5874 if (GET_CODE (sym) == SYMBOL_REF
5875 && (aarch64_classify_symbol (sym, INTVAL (offs))
5876 == SYMBOL_SMALL_ABSOLUTE))
5878 /* The symbol and offset must be aligned to the access size. */
5879 unsigned int align;
5881 if (CONSTANT_POOL_ADDRESS_P (sym))
5882 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5883 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5885 tree exp = SYMBOL_REF_DECL (sym);
5886 align = TYPE_ALIGN (TREE_TYPE (exp));
5887 align = aarch64_constant_alignment (exp, align);
5889 else if (SYMBOL_REF_DECL (sym))
5890 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5891 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5892 && SYMBOL_REF_BLOCK (sym) != NULL)
5893 align = SYMBOL_REF_BLOCK (sym)->alignment;
5894 else
5895 align = BITS_PER_UNIT;
5897 poly_int64 ref_size = GET_MODE_SIZE (mode);
5898 if (known_eq (ref_size, 0))
5899 ref_size = GET_MODE_SIZE (DImode);
5901 return (multiple_p (INTVAL (offs), ref_size)
5902 && multiple_p (align / BITS_PER_UNIT, ref_size));
5905 return false;
5907 default:
5908 return false;
5912 /* Return true if the address X is valid for a PRFM instruction.
5913 STRICT_P is true if we should do strict checking with
5914 aarch64_classify_address. */
5916 bool
5917 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5919 struct aarch64_address_info addr;
5921 /* PRFM accepts the same addresses as DImode... */
5922 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5923 if (!res)
5924 return false;
5926 /* ... except writeback forms. */
5927 return addr.type != ADDRESS_REG_WB;
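/* Return true if X is a symbolic address, i.e. a SYMBOL_REF or LABEL_REF,
   possibly with a constant offset added.  */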
5930 bool
5931 aarch64_symbolic_address_p (rtx x)
5933 rtx offset;
5935 split_const (x, &x, &offset);
5936 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5939 /* Classify the base of symbolic expression X. */
5941 enum aarch64_symbol_type
5942 aarch64_classify_symbolic_expression (rtx x)
5944 rtx offset;
5946 split_const (x, &x, &offset);
5947 return aarch64_classify_symbol (x, INTVAL (offset));
5951 /* Return TRUE if X is a legitimate address for accessing memory in
5952 mode MODE. */
5953 static bool
5954 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5956 struct aarch64_address_info addr;
5958 return aarch64_classify_address (&addr, x, mode, strict_p);
5961 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5962 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5963 bool
5964 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5965 aarch64_addr_query_type type)
5967 struct aarch64_address_info addr;
5969 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5972 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5974 static bool
5975 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
5976 poly_int64 orig_offset,
5977 machine_mode mode)
5979 HOST_WIDE_INT size;
5980 if (GET_MODE_SIZE (mode).is_constant (&size))
5982 HOST_WIDE_INT const_offset, second_offset;
5984 /* A general SVE offset is A * VQ + B. Remove the A component from
5985 coefficient 0 in order to get the constant B. */
5986 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
5988 /* Split an out-of-range address displacement into a base and
5989 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
5990 range otherwise, to increase the opportunities for sharing the same
5991 base address between accesses of different sizes. Unaligned accesses
5992 use the signed 9-bit range; TImode/TFmode use the intersection of the
5993 signed scaled 7-bit and signed 9-bit offset ranges. */
5994 if (mode == TImode || mode == TFmode)
5995 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
5996 else if ((const_offset & (size - 1)) != 0)
5997 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
5998 else
5999 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6001 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6002 return false;
6004 /* Split the offset into second_offset and the rest. */
6005 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6006 *offset2 = gen_int_mode (second_offset, Pmode);
6007 return true;
6009 else
6011 /* Get the mode we should use as the basis of the range. For structure
6012 modes this is the mode of one vector. */
6013 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6014 machine_mode step_mode
6015 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6017 /* Get the "mul vl" multiplier we'd like to use. */
6018 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6019 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6020 if (vec_flags & VEC_SVE_DATA)
6021 /* LDR supports a 9-bit range, but the move patterns for
6022 structure modes require all vectors to be in range of the
6023 same base. The simplest way of accommodating that while still
6024 promoting reuse of anchor points between different modes is
6025 to use an 8-bit range unconditionally. */
6026 vnum = ((vnum + 128) & 255) - 128;
6027 else
6028 /* Predicates are only handled singly, so we might as well use
6029 the full range. */
6030 vnum = ((vnum + 256) & 511) - 256;
6031 if (vnum == 0)
6032 return false;
6034 /* Convert the "mul vl" multiplier into a byte offset. */
6035 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6036 if (known_eq (second_offset, orig_offset))
6037 return false;
6039 /* Split the offset into second_offset and the rest. */
6040 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6041 *offset2 = gen_int_mode (second_offset, Pmode);
6042 return true;
6046 /* Return the binary representation of floating point constant VALUE in INTVAL.
6047 If the value cannot be converted, return false without setting INTVAL.
6048 The conversion is done in the mode of VALUE. */
6049 bool
6050 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6053 /* We make a general exception for 0. */
6054 if (aarch64_float_const_zero_rtx_p (value))
6056 *intval = 0;
6057 return true;
6060 scalar_float_mode mode;
6061 if (GET_CODE (value) != CONST_DOUBLE
6062 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6063 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6064 /* Only support up to DF mode. */
6065 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6066 return false;
6068 unsigned HOST_WIDE_INT ival = 0;
6070 long res[2];
6071 real_to_target (res,
6072 CONST_DOUBLE_REAL_VALUE (value),
6073 REAL_MODE_FORMAT (mode));
6075 if (mode == DFmode)
6077 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6078 ival = zext_hwi (res[order], 32);
6079 ival |= (zext_hwi (res[1 - order], 32) << 32);
6081 else
6082 ival = zext_hwi (res[0], 32);
6084 *intval = ival;
6085 return true;
6088 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6089 single MOV(+MOVK) followed by an FMOV. */
6090 bool
6091 aarch64_float_const_rtx_p (rtx x)
6093 machine_mode mode = GET_MODE (x);
6094 if (mode == VOIDmode)
6095 return false;
6097 /* Determine whether it's cheaper to write float constants as
6098 mov/movk pairs rather than as ldr/adrp pairs. */
6099 unsigned HOST_WIDE_INT ival;
6101 if (GET_CODE (x) == CONST_DOUBLE
6102 && SCALAR_FLOAT_MODE_P (mode)
6103 && aarch64_reinterpret_float_as_int (x, &ival))
6105 scalar_int_mode imode = (mode == HFmode
6106 ? SImode
6107 : int_mode_for_mode (mode).require ());
6108 int num_instr = aarch64_internal_mov_immediate
6109 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6110 return num_instr < 3;
6113 return false;
6116 /* Return TRUE if rtx X is the immediate constant 0.0. */
6117 bool
6118 aarch64_float_const_zero_rtx_p (rtx x)
6120 if (GET_MODE (x) == VOIDmode)
6121 return false;
6123 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6124 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6125 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6128 /* Return TRUE if rtx X is an immediate constant that fits in a single
6129 MOVI immediate operation. */
6130 bool
6131 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6133 if (!TARGET_SIMD)
6134 return false;
6136 machine_mode vmode;
6137 scalar_int_mode imode;
6138 unsigned HOST_WIDE_INT ival;
6140 if (GET_CODE (x) == CONST_DOUBLE
6141 && SCALAR_FLOAT_MODE_P (mode))
6143 if (!aarch64_reinterpret_float_as_int (x, &ival))
6144 return false;
6146 /* We make a general exception for 0. */
6147 if (aarch64_float_const_zero_rtx_p (x))
6148 return true;
6150 imode = int_mode_for_mode (mode).require ();
6152 else if (GET_CODE (x) == CONST_INT
6153 && is_a <scalar_int_mode> (mode, &imode))
6154 ival = INTVAL (x);
6155 else
6156 return false;
6158 /* Use a 64-bit mode for everything except DI/DF mode, where we use
6159 a 128-bit vector mode. */
6160 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6162 vmode = aarch64_simd_container_mode (imode, width);
6163 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6165 return aarch64_simd_valid_immediate (v_op, NULL);
6169 /* Return the fixed registers used for condition codes. */
6171 static bool
6172 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6174 *p1 = CC_REGNUM;
6175 *p2 = INVALID_REGNUM;
6176 return true;
6179 /* This function is used by the call expanders of the machine description.
6180 RESULT is the register in which the result is returned. It's NULL for
6181 "call" and "sibcall".
6182 MEM is the location of the function call.
6183 SIBCALL indicates whether this function call is a normal call or a sibling call.
6184 It will generate a different pattern accordingly. */
6186 void
6187 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6189 rtx call, callee, tmp;
6190 rtvec vec;
6191 machine_mode mode;
6193 gcc_assert (MEM_P (mem));
6194 callee = XEXP (mem, 0);
6195 mode = GET_MODE (callee);
6196 gcc_assert (mode == Pmode);
6198 /* Decide if we should generate indirect calls by loading the
6199 address of the callee into a register before performing
6200 the branch-and-link. */
6201 if (SYMBOL_REF_P (callee)
6202 ? (aarch64_is_long_call_p (callee)
6203 || aarch64_is_noplt_call_p (callee))
6204 : !REG_P (callee))
6205 XEXP (mem, 0) = force_reg (mode, callee);
6207 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6209 if (result != NULL_RTX)
6210 call = gen_rtx_SET (result, call);
6212 if (sibcall)
6213 tmp = ret_rtx;
6214 else
6215 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6217 vec = gen_rtvec (2, call, tmp);
6218 call = gen_rtx_PARALLEL (VOIDmode, vec);
6220 aarch64_emit_call_insn (call);
6223 /* Emit call insn with PAT and do aarch64-specific handling. */
6225 void
6226 aarch64_emit_call_insn (rtx pat)
6228 rtx insn = emit_call_insn (pat);
6230 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6231 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6232 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
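/* Implement SELECT_CC_MODE.  Return the CC mode to use when comparing
   rtxes X and Y with comparison code CODE.  */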
6235 machine_mode
6236 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6238 /* Floating point compares return CCFPmode for EQ, NE, ORDERED, UNORDERED
6239 and the UN* comparisons, and CCFPEmode for LT, LE, GT, GE and LTGT. */
6240 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6242 switch (code)
6244 case EQ:
6245 case NE:
6246 case UNORDERED:
6247 case ORDERED:
6248 case UNLT:
6249 case UNLE:
6250 case UNGT:
6251 case UNGE:
6252 case UNEQ:
6253 return CCFPmode;
6255 case LT:
6256 case LE:
6257 case GT:
6258 case GE:
6259 case LTGT:
6260 return CCFPEmode;
6262 default:
6263 gcc_unreachable ();
6267 /* Equality comparisons of short modes against zero can be performed
6268 using the TST instruction with the appropriate bitmask. */
6269 if (y == const0_rtx && REG_P (x)
6270 && (code == EQ || code == NE)
6271 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6272 return CC_NZmode;
6274 /* Similarly, comparisons of zero_extends from shorter modes can
6275 be performed using an ANDS with an immediate mask. */
6276 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6277 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6278 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6279 && (code == EQ || code == NE))
6280 return CC_NZmode;
6282 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6283 && y == const0_rtx
6284 && (code == EQ || code == NE || code == LT || code == GE)
6285 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6286 || GET_CODE (x) == NEG
6287 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6288 && CONST_INT_P (XEXP (x, 2)))))
6289 return CC_NZmode;
6291 /* A compare with a shifted operand. Because of canonicalization,
6292 the comparison will have to be swapped when we emit the assembly
6293 code. */
6294 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6295 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6296 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6297 || GET_CODE (x) == LSHIFTRT
6298 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6299 return CC_SWPmode;
6301 /* Similarly for a negated operand, but we can only do this for
6302 equalities. */
6303 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6304 && (REG_P (y) || GET_CODE (y) == SUBREG)
6305 && (code == EQ || code == NE)
6306 && GET_CODE (x) == NEG)
6307 return CC_Zmode;
6309 /* A test for unsigned overflow. */
6310 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6311 && code == NE
6312 && GET_CODE (x) == PLUS
6313 && GET_CODE (y) == ZERO_EXTEND)
6314 return CC_Cmode;
6316 /* For everything else, return CCmode. */
6317 return CCmode;
6320 static int
6321 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
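/* Return the AArch64 condition code that implements comparison rtx X,
   or -1 if there is no single condition code for it.  */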
6324 aarch64_get_condition_code (rtx x)
6326 machine_mode mode = GET_MODE (XEXP (x, 0));
6327 enum rtx_code comp_code = GET_CODE (x);
6329 if (GET_MODE_CLASS (mode) != MODE_CC)
6330 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6331 return aarch64_get_condition_code_1 (mode, comp_code);
6334 static int
6335 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6337 switch (mode)
6339 case E_CCFPmode:
6340 case E_CCFPEmode:
6341 switch (comp_code)
6343 case GE: return AARCH64_GE;
6344 case GT: return AARCH64_GT;
6345 case LE: return AARCH64_LS;
6346 case LT: return AARCH64_MI;
6347 case NE: return AARCH64_NE;
6348 case EQ: return AARCH64_EQ;
6349 case ORDERED: return AARCH64_VC;
6350 case UNORDERED: return AARCH64_VS;
6351 case UNLT: return AARCH64_LT;
6352 case UNLE: return AARCH64_LE;
6353 case UNGT: return AARCH64_HI;
6354 case UNGE: return AARCH64_PL;
6355 default: return -1;
6357 break;
6359 case E_CCmode:
6360 switch (comp_code)
6362 case NE: return AARCH64_NE;
6363 case EQ: return AARCH64_EQ;
6364 case GE: return AARCH64_GE;
6365 case GT: return AARCH64_GT;
6366 case LE: return AARCH64_LE;
6367 case LT: return AARCH64_LT;
6368 case GEU: return AARCH64_CS;
6369 case GTU: return AARCH64_HI;
6370 case LEU: return AARCH64_LS;
6371 case LTU: return AARCH64_CC;
6372 default: return -1;
6374 break;
6376 case E_CC_SWPmode:
6377 switch (comp_code)
6379 case NE: return AARCH64_NE;
6380 case EQ: return AARCH64_EQ;
6381 case GE: return AARCH64_LE;
6382 case GT: return AARCH64_LT;
6383 case LE: return AARCH64_GE;
6384 case LT: return AARCH64_GT;
6385 case GEU: return AARCH64_LS;
6386 case GTU: return AARCH64_CC;
6387 case LEU: return AARCH64_CS;
6388 case LTU: return AARCH64_HI;
6389 default: return -1;
6391 break;
6393 case E_CC_NZmode:
6394 switch (comp_code)
6396 case NE: return AARCH64_NE;
6397 case EQ: return AARCH64_EQ;
6398 case GE: return AARCH64_PL;
6399 case LT: return AARCH64_MI;
6400 default: return -1;
6402 break;
6404 case E_CC_Zmode:
6405 switch (comp_code)
6407 case NE: return AARCH64_NE;
6408 case EQ: return AARCH64_EQ;
6409 default: return -1;
6411 break;
6413 case E_CC_Cmode:
6414 switch (comp_code)
6416 case NE: return AARCH64_CS;
6417 case EQ: return AARCH64_CC;
6418 default: return -1;
6420 break;
6422 default:
6423 return -1;
6426 return -1;
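/* Return true if X is a CONST_VECTOR that duplicates a single CONST_INT
   whose value is in the range [MINVAL, MAXVAL].  */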
6429 bool
6430 aarch64_const_vec_all_same_in_range_p (rtx x,
6431 HOST_WIDE_INT minval,
6432 HOST_WIDE_INT maxval)
6434 rtx elt;
6435 return (const_vec_duplicate_p (x, &elt)
6436 && CONST_INT_P (elt)
6437 && IN_RANGE (INTVAL (elt), minval, maxval));
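/* Return true if X is a constant vector in which every element equals VAL.  */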
6440 bool
6441 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6443 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6446 /* Return true if VEC is a constant in which every element is in the range
6447 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6449 static bool
6450 aarch64_const_vec_all_in_range_p (rtx vec,
6451 HOST_WIDE_INT minval,
6452 HOST_WIDE_INT maxval)
6454 if (GET_CODE (vec) != CONST_VECTOR
6455 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6456 return false;
6458 int nunits;
6459 if (!CONST_VECTOR_STEPPED_P (vec))
6460 nunits = const_vector_encoded_nelts (vec);
6461 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6462 return false;
6464 for (int i = 0; i < nunits; i++)
6466 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6467 if (!CONST_INT_P (vec_elem)
6468 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6469 return false;
6471 return true;
6474 /* N Z C V. */
6475 #define AARCH64_CC_V 1
6476 #define AARCH64_CC_C (1 << 1)
6477 #define AARCH64_CC_Z (1 << 2)
6478 #define AARCH64_CC_N (1 << 3)
6480 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6481 static const int aarch64_nzcv_codes[] =
6483 0, /* EQ, Z == 1. */
6484 AARCH64_CC_Z, /* NE, Z == 0. */
6485 0, /* CS, C == 1. */
6486 AARCH64_CC_C, /* CC, C == 0. */
6487 0, /* MI, N == 1. */
6488 AARCH64_CC_N, /* PL, N == 0. */
6489 0, /* VS, V == 1. */
6490 AARCH64_CC_V, /* VC, V == 0. */
6491 0, /* HI, C == 1 && Z == 0. */
6492 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6493 AARCH64_CC_V, /* GE, N == V. */
6494 0, /* LT, N != V. */
6495 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6496 0, /* LE, !(Z == 0 && N == V). */
6497 0, /* AL, Any. */
6498 0 /* NV, Any. */
6501 /* Print floating-point vector immediate operand X to F, negating it
6502 first if NEGATE is true. Return true on success, false if it isn't
6503 a constant we can handle. */
6505 static bool
6506 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6508 rtx elt;
6510 if (!const_vec_duplicate_p (x, &elt))
6511 return false;
6513 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6514 if (negate)
6515 r = real_value_negate (&r);
6517 /* We only handle the SVE single-bit immediates here. */
6518 if (real_equal (&r, &dconst0))
6519 asm_fprintf (f, "0.0");
6520 else if (real_equal (&r, &dconst1))
6521 asm_fprintf (f, "1.0");
6522 else if (real_equal (&r, &dconsthalf))
6523 asm_fprintf (f, "0.5");
6524 else
6525 return false;
6527 return true;
6530 /* Return the equivalent letter for size. */
6531 static char
6532 sizetochar (int size)
6534 switch (size)
6536 case 64: return 'd';
6537 case 32: return 's';
6538 case 16: return 'h';
6539 case 8 : return 'b';
6540 default: gcc_unreachable ();
6544 /* Print operand X to file F in a target specific manner according to CODE.
6545 The acceptable formatting commands given by CODE are:
6546 'c': An integer or symbol address without a preceding #
6547 sign.
6548 'C': Take the duplicated element in a vector constant
6549 and print it in hex.
6550 'D': Take the duplicated element in a vector constant
6551 and print it as an unsigned integer, in decimal.
6552 'e': Print the sign/zero-extend size as a character 8->b,
6553 16->h, 32->w.
6554 'p': Prints N such that 2^N == X (X must be power of 2 and
6555 const int).
6556 'P': Print the number of non-zero bits in X (a const_int).
6557 'H': Print the higher numbered register of a pair (TImode)
6558 of regs.
6559 'm': Print a condition (eq, ne, etc).
6560 'M': Same as 'm', but invert condition.
6561 'N': Take the duplicated element in a vector constant
6562 and print the negative of it in decimal.
6563 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6564 'S/T/U/V': Print a FP/SIMD register name for a register list.
6565 The register printed is the FP/SIMD register name
6566 of X + 0/1/2/3 for S/T/U/V.
6567 'R': Print a scalar FP/SIMD register name + 1.
6568 'X': Print bottom 16 bits of integer constant in hex.
6569 'w/x': Print a general register name or the zero register
6570 (32-bit or 64-bit).
6571 '0': Print a normal operand; if it's a general register,
6572 then we assume DImode.
6573 'k': Print NZCV for conditional compare instructions.
6574 'A': Output address constant representing the first
6575 argument of X, specifying a relocation offset
6576 if appropriate.
6577 'L': Output constant address specified by X
6578 with a relocation offset if appropriate.
6579 'G': Prints address of X, specifying a PC relative
6580 relocation mode if appropriate.
6581 'y': Output address of LDP or STP - this is used for
6582 some LDP/STPs which don't use a PARALLEL in their
6583 pattern (so the mode needs to be adjusted).
6584 'z': Output address of a typical LDP or STP. */
6586 static void
6587 aarch64_print_operand (FILE *f, rtx x, int code)
6589 rtx elt;
6590 switch (code)
6592 case 'c':
6593 switch (GET_CODE (x))
6595 case CONST_INT:
6596 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6597 break;
6599 case SYMBOL_REF:
6600 output_addr_const (f, x);
6601 break;
6603 case CONST:
6604 if (GET_CODE (XEXP (x, 0)) == PLUS
6605 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6607 output_addr_const (f, x);
6608 break;
6610 /* Fall through. */
6612 default:
6613 output_operand_lossage ("unsupported operand for code '%c'", code);
6615 break;
6617 case 'e':
6619 int n;
6621 if (!CONST_INT_P (x)
6622 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6624 output_operand_lossage ("invalid operand for '%%%c'", code);
6625 return;
6628 switch (n)
6630 case 3:
6631 fputc ('b', f);
6632 break;
6633 case 4:
6634 fputc ('h', f);
6635 break;
6636 case 5:
6637 fputc ('w', f);
6638 break;
6639 default:
6640 output_operand_lossage ("invalid operand for '%%%c'", code);
6641 return;
6644 break;
6646 case 'p':
6648 int n;
6650 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6652 output_operand_lossage ("invalid operand for '%%%c'", code);
6653 return;
6656 asm_fprintf (f, "%d", n);
6658 break;
6660 case 'P':
6661 if (!CONST_INT_P (x))
6663 output_operand_lossage ("invalid operand for '%%%c'", code);
6664 return;
6667 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6668 break;
6670 case 'H':
6671 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6673 output_operand_lossage ("invalid operand for '%%%c'", code);
6674 return;
6677 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6678 break;
6680 case 'M':
6681 case 'm':
6683 int cond_code;
6684 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6685 if (x == const_true_rtx)
6687 if (code == 'M')
6688 fputs ("nv", f);
6689 return;
6692 if (!COMPARISON_P (x))
6694 output_operand_lossage ("invalid operand for '%%%c'", code);
6695 return;
6698 cond_code = aarch64_get_condition_code (x);
6699 gcc_assert (cond_code >= 0);
6700 if (code == 'M')
6701 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6702 fputs (aarch64_condition_codes[cond_code], f);
6704 break;
6706 case 'N':
6707 if (!const_vec_duplicate_p (x, &elt))
6709 output_operand_lossage ("invalid vector constant");
6710 return;
6713 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6714 asm_fprintf (f, "%wd", -INTVAL (elt));
6715 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6716 && aarch64_print_vector_float_operand (f, x, true))
6718 else
6720 output_operand_lossage ("invalid vector constant");
6721 return;
6723 break;
6725 case 'b':
6726 case 'h':
6727 case 's':
6728 case 'd':
6729 case 'q':
6730 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6732 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6733 return;
6735 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6736 break;
6738 case 'S':
6739 case 'T':
6740 case 'U':
6741 case 'V':
6742 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6744 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6745 return;
6747 asm_fprintf (f, "%c%d",
6748 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6749 REGNO (x) - V0_REGNUM + (code - 'S'));
6750 break;
6752 case 'R':
6753 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6755 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6756 return;
6758 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6759 break;
6761 case 'X':
6762 if (!CONST_INT_P (x))
6764 output_operand_lossage ("invalid operand for '%%%c'", code);
6765 return;
6767 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6768 break;
6770 case 'C':
6772 /* Print a replicated constant in hex. */
6773 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6775 output_operand_lossage ("invalid operand for '%%%c'", code);
6776 return;
6778 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6779 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6781 break;
6783 case 'D':
6785 /* Print a replicated constant in decimal, treating it as
6786 unsigned. */
6787 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6789 output_operand_lossage ("invalid operand for '%%%c'", code);
6790 return;
6792 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6793 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6795 break;
6797 case 'w':
6798 case 'x':
6799 if (x == const0_rtx
6800 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6802 asm_fprintf (f, "%czr", code);
6803 break;
6806 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6808 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6809 break;
6812 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6814 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6815 break;
6818 /* Fall through */
6820 case 0:
6821 if (x == NULL)
6823 output_operand_lossage ("missing operand");
6824 return;
6827 switch (GET_CODE (x))
6829 case REG:
6830 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6832 if (REG_NREGS (x) == 1)
6833 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6834 else
6836 char suffix
6837 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6838 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6839 REGNO (x) - V0_REGNUM, suffix,
6840 END_REGNO (x) - V0_REGNUM - 1, suffix);
6843 else
6844 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6845 break;
6847 case MEM:
6848 output_address (GET_MODE (x), XEXP (x, 0));
6849 break;
6851 case LABEL_REF:
6852 case SYMBOL_REF:
6853 output_addr_const (asm_out_file, x);
6854 break;
6856 case CONST_INT:
6857 asm_fprintf (f, "%wd", INTVAL (x));
6858 break;
6860 case CONST:
6861 if (!VECTOR_MODE_P (GET_MODE (x)))
6863 output_addr_const (asm_out_file, x);
6864 break;
6866 /* fall through */
6868 case CONST_VECTOR:
6869 if (!const_vec_duplicate_p (x, &elt))
6871 output_operand_lossage ("invalid vector constant");
6872 return;
6875 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6876 asm_fprintf (f, "%wd", INTVAL (elt));
6877 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6878 && aarch64_print_vector_float_operand (f, x, false))
6880 else
6882 output_operand_lossage ("invalid vector constant");
6883 return;
6885 break;
6887 case CONST_DOUBLE:
6888 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6889 be getting CONST_DOUBLEs holding integers. */
6890 gcc_assert (GET_MODE (x) != VOIDmode);
6891 if (aarch64_float_const_zero_rtx_p (x))
6893 fputc ('0', f);
6894 break;
6896 else if (aarch64_float_const_representable_p (x))
6898 #define buf_size 20
6899 char float_buf[buf_size] = {'\0'};
6900 real_to_decimal_for_mode (float_buf,
6901 CONST_DOUBLE_REAL_VALUE (x),
6902 buf_size, buf_size,
6903 1, GET_MODE (x));
6904 asm_fprintf (asm_out_file, "%s", float_buf);
6905 break;
6906 #undef buf_size
6908 output_operand_lossage ("invalid constant");
6909 return;
6910 default:
6911 output_operand_lossage ("invalid operand");
6912 return;
6914 break;
6916 case 'A':
6917 if (GET_CODE (x) == HIGH)
6918 x = XEXP (x, 0);
6920 switch (aarch64_classify_symbolic_expression (x))
6922 case SYMBOL_SMALL_GOT_4G:
6923 asm_fprintf (asm_out_file, ":got:");
6924 break;
6926 case SYMBOL_SMALL_TLSGD:
6927 asm_fprintf (asm_out_file, ":tlsgd:");
6928 break;
6930 case SYMBOL_SMALL_TLSDESC:
6931 asm_fprintf (asm_out_file, ":tlsdesc:");
6932 break;
6934 case SYMBOL_SMALL_TLSIE:
6935 asm_fprintf (asm_out_file, ":gottprel:");
6936 break;
6938 case SYMBOL_TLSLE24:
6939 asm_fprintf (asm_out_file, ":tprel:");
6940 break;
6942 case SYMBOL_TINY_GOT:
6943 gcc_unreachable ();
6944 break;
6946 default:
6947 break;
6949 output_addr_const (asm_out_file, x);
6950 break;
6952 case 'L':
6953 switch (aarch64_classify_symbolic_expression (x))
6955 case SYMBOL_SMALL_GOT_4G:
6956 asm_fprintf (asm_out_file, ":lo12:");
6957 break;
6959 case SYMBOL_SMALL_TLSGD:
6960 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6961 break;
6963 case SYMBOL_SMALL_TLSDESC:
6964 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6965 break;
6967 case SYMBOL_SMALL_TLSIE:
6968 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6969 break;
6971 case SYMBOL_TLSLE12:
6972 asm_fprintf (asm_out_file, ":tprel_lo12:");
6973 break;
6975 case SYMBOL_TLSLE24:
6976 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6977 break;
6979 case SYMBOL_TINY_GOT:
6980 asm_fprintf (asm_out_file, ":got:");
6981 break;
6983 case SYMBOL_TINY_TLSIE:
6984 asm_fprintf (asm_out_file, ":gottprel:");
6985 break;
6987 default:
6988 break;
6990 output_addr_const (asm_out_file, x);
6991 break;
6993 case 'G':
6994 switch (aarch64_classify_symbolic_expression (x))
6996 case SYMBOL_TLSLE24:
6997 asm_fprintf (asm_out_file, ":tprel_hi12:");
6998 break;
6999 default:
7000 break;
7002 output_addr_const (asm_out_file, x);
7003 break;
7005 case 'k':
7007 HOST_WIDE_INT cond_code;
7009 if (!CONST_INT_P (x))
7011 output_operand_lossage ("invalid operand for '%%%c'", code);
7012 return;
7015 cond_code = INTVAL (x);
7016 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7017 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7019 break;
7021 case 'y':
7022 case 'z':
7024 machine_mode mode = GET_MODE (x);
7026 if (GET_CODE (x) != MEM
7027 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7029 output_operand_lossage ("invalid operand for '%%%c'", code);
7030 return;
7033 if (code == 'y')
7034 /* LDP/STP which uses a single double-width memory operand.
7035 Adjust the mode to appear like a typical LDP/STP.
7036 Currently this is supported for 16-byte accesses only. */
7037 mode = DFmode;
7039 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7040 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7042 break;
7044 default:
7045 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7046 return;
7050 /* Print address 'x' of a memory access with mode 'mode'.
7051 'type' is the aarch64_addr_query_type context passed down to
7052 aarch64_classify_address (for example ADDR_QUERY_LDP_STP for LDP/STP). */
7053 static bool
7054 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7055 aarch64_addr_query_type type)
7057 struct aarch64_address_info addr;
7058 unsigned int size;
7060 /* Check all addresses are Pmode - including ILP32. */
7061 if (GET_MODE (x) != Pmode)
7062 output_operand_lossage ("invalid address mode");
7064 if (aarch64_classify_address (&addr, x, mode, true, type))
7065 switch (addr.type)
7067 case ADDRESS_REG_IMM:
7068 if (known_eq (addr.const_offset, 0))
7069 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7070 else if (aarch64_sve_data_mode_p (mode))
7072 HOST_WIDE_INT vnum
7073 = exact_div (addr.const_offset,
7074 BYTES_PER_SVE_VECTOR).to_constant ();
7075 asm_fprintf (f, "[%s, #%wd, mul vl]",
7076 reg_names[REGNO (addr.base)], vnum);
7078 else if (aarch64_sve_pred_mode_p (mode))
7080 HOST_WIDE_INT vnum
7081 = exact_div (addr.const_offset,
7082 BYTES_PER_SVE_PRED).to_constant ();
7083 asm_fprintf (f, "[%s, #%wd, mul vl]",
7084 reg_names[REGNO (addr.base)], vnum);
7086 else
7087 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7088 INTVAL (addr.offset));
7089 return true;
7091 case ADDRESS_REG_REG:
7092 if (addr.shift == 0)
7093 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7094 reg_names [REGNO (addr.offset)]);
7095 else
7096 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7097 reg_names [REGNO (addr.offset)], addr.shift);
7098 return true;
7100 case ADDRESS_REG_UXTW:
7101 if (addr.shift == 0)
7102 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7103 REGNO (addr.offset) - R0_REGNUM);
7104 else
7105 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7106 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7107 return true;
7109 case ADDRESS_REG_SXTW:
7110 if (addr.shift == 0)
7111 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7112 REGNO (addr.offset) - R0_REGNUM);
7113 else
7114 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7115 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7116 return true;
7118 case ADDRESS_REG_WB:
7119 /* Writeback is only supported for fixed-width modes. */
7120 size = GET_MODE_SIZE (mode).to_constant ();
7121 switch (GET_CODE (x))
7123 case PRE_INC:
7124 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7125 return true;
7126 case POST_INC:
7127 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7128 return true;
7129 case PRE_DEC:
7130 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7131 return true;
7132 case POST_DEC:
7133 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7134 return true;
7135 case PRE_MODIFY:
7136 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7137 INTVAL (addr.offset));
7138 return true;
7139 case POST_MODIFY:
7140 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7141 INTVAL (addr.offset));
7142 return true;
7143 default:
7144 break;
7146 break;
7148 case ADDRESS_LO_SUM:
7149 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7150 output_addr_const (f, addr.offset);
7151 asm_fprintf (f, "]");
7152 return true;
7154 case ADDRESS_SYMBOLIC:
7155 output_addr_const (f, x);
7156 return true;
7159 return false;
7162 /* Print address 'x' of an LDP/STP with mode 'mode'. */
7163 static bool
7164 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7166 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7169 /* Print address 'x' of a memory access with mode 'mode'. */
7170 static void
7171 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7173 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7174 output_addr_const (f, x);
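/* Return true if X mentions a label (contains a LABEL_REF), ignoring the
   LABEL_REFs inside UNSPEC_TLS operands, which are really constant offsets.  */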
7177 bool
7178 aarch64_label_mentioned_p (rtx x)
7180 const char *fmt;
7181 int i;
7183 if (GET_CODE (x) == LABEL_REF)
7184 return true;
7186 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7187 referencing instruction, but they are constant offsets, not
7188 symbols. */
7189 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7190 return false;
7192 fmt = GET_RTX_FORMAT (GET_CODE (x));
7193 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7195 if (fmt[i] == 'E')
7197 int j;
7199 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7200 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7201 return 1;
7203 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7204 return 1;
7207 return 0;
7210 /* Implement REGNO_REG_CLASS. */
7212 enum reg_class
7213 aarch64_regno_regclass (unsigned regno)
7215 if (GP_REGNUM_P (regno))
7216 return GENERAL_REGS;
7218 if (regno == SP_REGNUM)
7219 return STACK_REG;
7221 if (regno == FRAME_POINTER_REGNUM
7222 || regno == ARG_POINTER_REGNUM)
7223 return POINTER_REGS;
7225 if (FP_REGNUM_P (regno))
7226 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7228 if (PR_REGNUM_P (regno))
7229 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7231 return NO_REGS;
7234 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7235 If OFFSET is out of range, return an offset of an anchor point
7236 that is in range. Return 0 otherwise. */
7238 static HOST_WIDE_INT
7239 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7240 machine_mode mode)
7242 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7243 if (size > 16)
7244 return (offset + 0x400) & ~0x7f0;
7246 /* For offsets that aren't a multiple of the access size, the limit is
7247 -256...255. */
7248 if (offset & (size - 1))
7250 /* BLKmode typically uses LDP of X-registers. */
7251 if (mode == BLKmode)
7252 return (offset + 512) & ~0x3ff;
7253 return (offset + 0x100) & ~0x1ff;
7256 /* Small negative offsets are supported. */
7257 if (IN_RANGE (offset, -256, 0))
7258 return 0;
7260 if (mode == TImode || mode == TFmode)
7261 return (offset + 0x100) & ~0x1ff;
7263 /* Otherwise, use the unsigned 12-bit offset range scaled by the access size. */
7264 return offset & (~0xfff * size);
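/* Implement TARGET_LEGITIMIZE_ADDRESS.  */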
7267 static rtx
7268 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7270 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7271 where mask is selected by alignment and size of the offset.
7272 We try to pick as large a range for the offset as possible to
7273 maximize the chance of a CSE. However, for aligned addresses
7274 we limit the range to 4k so that structures with different sized
7275 elements are likely to use the same base. We need to be careful
7276 not to split a CONST for some forms of address expression, otherwise
7277 it will generate sub-optimal code. */
7279 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7281 rtx base = XEXP (x, 0);
7282 rtx offset_rtx = XEXP (x, 1);
7283 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7285 if (GET_CODE (base) == PLUS)
7287 rtx op0 = XEXP (base, 0);
7288 rtx op1 = XEXP (base, 1);
7290 /* Force any scaling into a temp for CSE. */
7291 op0 = force_reg (Pmode, op0);
7292 op1 = force_reg (Pmode, op1);
7294 /* Let the pointer register be in op0. */
7295 if (REG_POINTER (op1))
7296 std::swap (op0, op1);
7298 /* If the pointer is virtual or frame related, then we know that
7299 virtual register instantiation or register elimination is going
7300 to apply a second constant. We want the two constants folded
7301 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7302 if (virt_or_elim_regno_p (REGNO (op0)))
7304 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7305 NULL_RTX, true, OPTAB_DIRECT);
7306 return gen_rtx_PLUS (Pmode, base, op1);
7309 /* Otherwise, in order to encourage CSE (and thence loop strength
7310 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7311 base = expand_binop (Pmode, add_optab, op0, op1,
7312 NULL_RTX, true, OPTAB_DIRECT);
7313 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7316 HOST_WIDE_INT size;
7317 if (GET_MODE_SIZE (mode).is_constant (&size))
7319 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7320 mode);
7321 if (base_offset != 0)
7323 base = plus_constant (Pmode, base, base_offset);
7324 base = force_operand (base, NULL_RTX);
7325 return plus_constant (Pmode, base, offset - base_offset);
7330 return x;
7333 /* Return the reload icode required for a constant pool access in mode MODE. */
7334 static enum insn_code
7335 aarch64_constant_pool_reload_icode (machine_mode mode)
7337 switch (mode)
7339 case E_SFmode:
7340 return CODE_FOR_aarch64_reload_movcpsfdi;
7342 case E_DFmode:
7343 return CODE_FOR_aarch64_reload_movcpdfdi;
7345 case E_TFmode:
7346 return CODE_FOR_aarch64_reload_movcptfdi;
7348 case E_V8QImode:
7349 return CODE_FOR_aarch64_reload_movcpv8qidi;
7351 case E_V16QImode:
7352 return CODE_FOR_aarch64_reload_movcpv16qidi;
7354 case E_V4HImode:
7355 return CODE_FOR_aarch64_reload_movcpv4hidi;
7357 case E_V8HImode:
7358 return CODE_FOR_aarch64_reload_movcpv8hidi;
7360 case E_V2SImode:
7361 return CODE_FOR_aarch64_reload_movcpv2sidi;
7363 case E_V4SImode:
7364 return CODE_FOR_aarch64_reload_movcpv4sidi;
7366 case E_V2DImode:
7367 return CODE_FOR_aarch64_reload_movcpv2didi;
7369 case E_V2DFmode:
7370 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7372 default:
7373 gcc_unreachable ();
7376 gcc_unreachable ();
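/* Implement TARGET_SECONDARY_RELOAD.  */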
7378 static reg_class_t
7379 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7380 reg_class_t rclass,
7381 machine_mode mode,
7382 secondary_reload_info *sri)
7384 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7385 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7386 comment at the head of aarch64-sve.md for more details about the
7387 big-endian handling. */
7388 if (BYTES_BIG_ENDIAN
7389 && reg_class_subset_p (rclass, FP_REGS)
7390 && !((REG_P (x) && HARD_REGISTER_P (x))
7391 || aarch64_simd_valid_immediate (x, NULL))
7392 && aarch64_sve_data_mode_p (mode))
7394 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7395 return NO_REGS;
7398 /* If we have to disable direct literal pool loads and stores because the
7399 function is too big, then we need a scratch register. */
7400 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7401 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7402 || targetm.vector_mode_supported_p (GET_MODE (x)))
7403 && !aarch64_pcrelative_literal_loads)
7405 sri->icode = aarch64_constant_pool_reload_icode (mode);
7406 return NO_REGS;
7409 /* Without the TARGET_SIMD instructions we cannot move a Q register
7410 to a Q register directly. We need a scratch. */
7411 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7412 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7413 && reg_class_subset_p (rclass, FP_REGS))
7415 if (mode == TFmode)
7416 sri->icode = CODE_FOR_aarch64_reload_movtf;
7417 else if (mode == TImode)
7418 sri->icode = CODE_FOR_aarch64_reload_movti;
7419 return NO_REGS;
7422 /* A TFmode or TImode memory access should be handled via an FP_REG
7423 because AArch64 has richer addressing modes for LDR/STR instructions
7424 than LDP/STP instructions. */
7425 if (TARGET_FLOAT && rclass == GENERAL_REGS
7426 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7427 return FP_REGS;
7429 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7430 return GENERAL_REGS;
7432 return NO_REGS;
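/* Implement TARGET_CAN_ELIMINATE.  Return true if register FROM can be
   eliminated in favour of register TO.  */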
7435 static bool
7436 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7438 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7440 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7441 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7442 if (frame_pointer_needed)
7443 return to == HARD_FRAME_POINTER_REGNUM;
7444 return true;
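/* Implement INITIAL_ELIMINATION_OFFSET.  Return the offset to add when
   eliminating register FROM in favour of register TO, based on the
   current frame layout.  */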
7447 poly_int64
7448 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7450 aarch64_layout_frame ();
7452 if (to == HARD_FRAME_POINTER_REGNUM)
7454 if (from == ARG_POINTER_REGNUM)
7455 return cfun->machine->frame.hard_fp_offset;
7457 if (from == FRAME_POINTER_REGNUM)
7458 return cfun->machine->frame.hard_fp_offset
7459 - cfun->machine->frame.locals_offset;
7462 if (to == STACK_POINTER_REGNUM)
7464 if (from == FRAME_POINTER_REGNUM)
7465 return cfun->machine->frame.frame_size
7466 - cfun->machine->frame.locals_offset;
7469 return cfun->machine->frame.frame_size;
7472 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7473 previous frame. */
7476 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7478 if (count != 0)
7479 return const0_rtx;
7480 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
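/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.  Output the assembler code
   for the trampoline template to F.  */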
7484 static void
7485 aarch64_asm_trampoline_template (FILE *f)
7487 if (TARGET_ILP32)
7489 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7490 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7492 else
7494 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7495 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7497 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7498 assemble_aligned_integer (4, const0_rtx);
7499 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7500 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
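/* Implement TARGET_TRAMPOLINE_INIT.  Initialize the trampoline M_TRAMP
   so that it calls FNDECL with CHAIN_VALUE as the static chain.  */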
7503 static void
7504 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7506 rtx fnaddr, mem, a_tramp;
7507 const int tramp_code_sz = 16;
7509 /* Don't need to copy the trailing D-words; we fill those in below. */
7510 emit_block_move (m_tramp, assemble_trampoline_template (),
7511 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7512 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7513 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7514 if (GET_MODE (fnaddr) != ptr_mode)
7515 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7516 emit_move_insn (mem, fnaddr);
7518 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7519 emit_move_insn (mem, chain_value);
7521 /* XXX We should really define a "clear_cache" pattern and use
7522 gen_clear_cache(). */
7523 a_tramp = XEXP (m_tramp, 0);
7524 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7525 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7526 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7527 ptr_mode);
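/* Implement TARGET_CLASS_MAX_NREGS.  Return the maximum number of
   registers from class REGCLASS needed to hold a value of mode MODE.  */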
7530 static unsigned char
7531 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7533 /* ??? Logically we should only need to provide a value when
7534 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7535 can hold MODE, but at the moment we need to handle all modes.
7536 Just ignore any runtime parts for registers that can't store them. */
7537 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7538 unsigned int nregs;
7539 switch (regclass)
7541 case TAILCALL_ADDR_REGS:
7542 case POINTER_REGS:
7543 case GENERAL_REGS:
7544 case ALL_REGS:
7545 case POINTER_AND_FP_REGS:
7546 case FP_REGS:
7547 case FP_LO_REGS:
7548 if (aarch64_sve_data_mode_p (mode)
7549 && constant_multiple_p (GET_MODE_SIZE (mode),
7550 BYTES_PER_SVE_VECTOR, &nregs))
7551 return nregs;
7552 return (aarch64_vector_data_mode_p (mode)
7553 ? CEIL (lowest_size, UNITS_PER_VREG)
7554 : CEIL (lowest_size, UNITS_PER_WORD));
7555 case STACK_REG:
7556 case PR_REGS:
7557 case PR_LO_REGS:
7558 case PR_HI_REGS:
7559 return 1;
7561 case NO_REGS:
7562 return 0;
7564 default:
7565 break;
7567 gcc_unreachable ();
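/* Implement TARGET_PREFERRED_RELOAD_CLASS.  */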
7570 static reg_class_t
7571 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7573 if (regclass == POINTER_REGS)
7574 return GENERAL_REGS;
7576 if (regclass == STACK_REG)
7578 if (REG_P(x)
7579 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7580 return regclass;
7582 return NO_REGS;
7585 /* Register elimination can result in a request for
7586 SP+constant->FP_REGS. We cannot support such operations, which
7587 use SP as source and an FP_REG as destination, so reject them
7588 outright. */
7589 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7591 rtx lhs = XEXP (x, 0);
7593 /* Look through a possible SUBREG introduced by ILP32. */
7594 if (GET_CODE (lhs) == SUBREG)
7595 lhs = SUBREG_REG (lhs);
7597 gcc_assert (REG_P (lhs));
7598 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7599 POINTER_REGS));
7600 return NO_REGS;
7603 return regclass;
7606 void
7607 aarch64_asm_output_labelref (FILE* f, const char *name)
7609 asm_fprintf (f, "%U%s", name);
7612 static void
7613 aarch64_elf_asm_constructor (rtx symbol, int priority)
7615 if (priority == DEFAULT_INIT_PRIORITY)
7616 default_ctor_section_asm_out_constructor (symbol, priority);
7617 else
7619 section *s;
7620 /* While priority is known to be in the range [0, 65535], and so 18
7621 bytes would be enough, the compiler might not know that. To avoid
7622 a -Wformat-truncation false positive, use a larger size. */
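/* For example, priority 100 selects the section ".init_array.00100".  */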
7623 char buf[23];
7624 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7625 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7626 switch_to_section (s);
7627 assemble_align (POINTER_SIZE);
7628 assemble_aligned_integer (POINTER_BYTES, symbol);
7632 static void
7633 aarch64_elf_asm_destructor (rtx symbol, int priority)
7635 if (priority == DEFAULT_INIT_PRIORITY)
7636 default_dtor_section_asm_out_destructor (symbol, priority);
7637 else
7639 section *s;
7640 /* While priority is known to be in the range [0, 65535], and so 18
7641 bytes would be enough, the compiler might not know that. To avoid
7642 a -Wformat-truncation false positive, use a larger size. */
7643 char buf[23];
7644 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7645 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7646 switch_to_section (s);
7647 assemble_align (POINTER_SIZE);
7648 assemble_aligned_integer (POINTER_BYTES, symbol);
7652 const char*
7653 aarch64_output_casesi (rtx *operands)
7655 char buf[100];
7656 char label[100];
7657 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7658 int index;
7659 static const char *const patterns[4][2] =
7662 "ldrb\t%w3, [%0,%w1,uxtw]",
7663 "add\t%3, %4, %w3, sxtb #2"
7666 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7667 "add\t%3, %4, %w3, sxth #2"
7670 "ldr\t%w3, [%0,%w1,uxtw #2]",
7671 "add\t%3, %4, %w3, sxtw #2"
7673 /* We assume that DImode is only generated when not optimizing and
7674 that we don't really need 64-bit address offsets. That would
7675 imply an object file with 8GB of code in a single function! */
7677 "ldr\t%w3, [%0,%w1,uxtw #2]",
7678 "add\t%3, %4, %w3, sxtw #2"
7682 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7684 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7685 index = exact_log2 (GET_MODE_SIZE (mode));
7687 gcc_assert (index >= 0 && index <= 3);
7689 /* Need to implement table size reduction, by changing the code below. */
7690 output_asm_insn (patterns[index][0], operands);
7691 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7692 snprintf (buf, sizeof (buf),
7693 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7694 output_asm_insn (buf, operands);
7695 output_asm_insn (patterns[index][1], operands);
7696 output_asm_insn ("br\t%3", operands);
7697 assemble_label (asm_out_file, label);
7698 return "";
7702 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7703 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7704 operator. */
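/* For example, SHIFT == 2 with MASK == 0x3fc (0xff << 2) returns 8, the
   UXTB case; a mask that is not an 8/16/32-bit block shifted by SHIFT
   returns 0.  */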
7707 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7709 if (shift >= 0 && shift <= 3)
7711 int size;
7712 for (size = 8; size <= 32; size *= 2)
7714 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7715 if (mask == bits << shift)
7716 return size;
7719 return 0;
7722 /* Constant pools are per-function only when PC-relative
7723 literal loads are enabled or we are in the large memory
7724 model. */
7726 static inline bool
7727 aarch64_can_use_per_function_literal_pools_p (void)
7729 return (aarch64_pcrelative_literal_loads
7730 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7733 static bool
7734 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7736 /* We can't use blocks for constants when we're using a per-function
7737 constant pool. */
7738 return !aarch64_can_use_per_function_literal_pools_p ();
7741 /* Select appropriate section for constants depending
7742 on where we place literal pools. */
7744 static section *
7745 aarch64_select_rtx_section (machine_mode mode,
7746 rtx x,
7747 unsigned HOST_WIDE_INT align)
7749 if (aarch64_can_use_per_function_literal_pools_p ())
7750 return function_section (current_function_decl);
7752 return default_elf_select_rtx_section (mode, x, align);
7755 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7756 void
7757 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7758 HOST_WIDE_INT offset)
7760 /* When using per-function literal pools, we must ensure that any code
7761 section is aligned to the minimal instruction length, lest we get
7762 errors from the assembler re "unaligned instructions". */
7763 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7764 ASM_OUTPUT_ALIGN (f, 2);
7767 /* Costs. */
7769 /* Helper function for rtx cost calculation. Strip a shift expression
7770 from X. Returns the inner operand if successful, or the original
7771 expression on failure. */
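/* For example, (ashift (reg:DI x1) (const_int 3)) and the equivalent
   (mult (reg:DI x1) (const_int 8)) both strip down to (reg:DI x1).  */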
7772 static rtx
7773 aarch64_strip_shift (rtx x)
7775 rtx op = x;
7777 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7778 we can convert both to ROR during final output. */
7779 if ((GET_CODE (op) == ASHIFT
7780 || GET_CODE (op) == ASHIFTRT
7781 || GET_CODE (op) == LSHIFTRT
7782 || GET_CODE (op) == ROTATERT
7783 || GET_CODE (op) == ROTATE)
7784 && CONST_INT_P (XEXP (op, 1)))
7785 return XEXP (op, 0);
7787 if (GET_CODE (op) == MULT
7788 && CONST_INT_P (XEXP (op, 1))
7789 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7790 return XEXP (op, 0);
7792 return x;
7795 /* Helper function for rtx cost calculation. Strip an extend
7796 expression from X. Returns the inner operand if successful, or the
7797 original expression on failure. We deal with a number of possible
7798 canonicalization variations here. If STRIP_SHIFT is true, then
7799 we can strip off a shift also. */
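/* For example, (zero_extend:DI (reg:SI w1)) strips to (reg:SI w1); with
   STRIP_SHIFT an outer (ashift ... (const_int 1..4)) is removed first,
   matching the optional shift of the extended-register forms.  */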
7800 static rtx
7801 aarch64_strip_extend (rtx x, bool strip_shift)
7803 scalar_int_mode mode;
7804 rtx op = x;
7806 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7807 return op;
7809 /* Zero and sign extraction of a widened value. */
7810 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7811 && XEXP (op, 2) == const0_rtx
7812 && GET_CODE (XEXP (op, 0)) == MULT
7813 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7814 XEXP (op, 1)))
7815 return XEXP (XEXP (op, 0), 0);
7817 /* It can also be represented (for zero-extend) as an AND with an
7818 immediate. */
7819 if (GET_CODE (op) == AND
7820 && GET_CODE (XEXP (op, 0)) == MULT
7821 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7822 && CONST_INT_P (XEXP (op, 1))
7823 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7824 INTVAL (XEXP (op, 1))) != 0)
7825 return XEXP (XEXP (op, 0), 0);
7827 /* Now handle extended register, as this may also have an optional
7828 left shift by 1..4. */
7829 if (strip_shift
7830 && GET_CODE (op) == ASHIFT
7831 && CONST_INT_P (XEXP (op, 1))
7832 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7833 op = XEXP (op, 0);
7835 if (GET_CODE (op) == ZERO_EXTEND
7836 || GET_CODE (op) == SIGN_EXTEND)
7837 op = XEXP (op, 0);
7839 if (op != x)
7840 return op;
7842 return x;
7845 /* Return true iff CODE is a shift supported in combination
7846 with arithmetic instructions. */
7848 static bool
7849 aarch64_shift_p (enum rtx_code code)
7851 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7855 /* Return true iff X is a cheap shift without a sign extend. */
7857 static bool
7858 aarch64_cheap_mult_shift_p (rtx x)
7860 rtx op0, op1;
7862 op0 = XEXP (x, 0);
7863 op1 = XEXP (x, 1);
7865 if (!(aarch64_tune_params.extra_tuning_flags
7866 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7867 return false;
7869 if (GET_CODE (op0) == SIGN_EXTEND)
7870 return false;
7872 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7873 && UINTVAL (op1) <= 4)
7874 return true;
7876 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7877 return false;
7879 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7881 if (l2 > 0 && l2 <= 4)
7882 return true;
7884 return false;
7887 /* Helper function for rtx cost calculation. Calculate the cost of
7888 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7889 Return the calculated cost of the expression, recursing manually into
7890 operands where needed. */
7892 static int
7893 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7895 rtx op0, op1;
7896 const struct cpu_cost_table *extra_cost
7897 = aarch64_tune_params.insn_extra_cost;
7898 int cost = 0;
7899 bool compound_p = (outer == PLUS || outer == MINUS);
7900 machine_mode mode = GET_MODE (x);
7902 gcc_checking_assert (code == MULT);
7904 op0 = XEXP (x, 0);
7905 op1 = XEXP (x, 1);
7907 if (VECTOR_MODE_P (mode))
7908 mode = GET_MODE_INNER (mode);
7910 /* Integer multiply/fma. */
7911 if (GET_MODE_CLASS (mode) == MODE_INT)
7913 /* The multiply will be canonicalized as a shift, cost it as such. */
7914 if (aarch64_shift_p (GET_CODE (x))
7915 || (CONST_INT_P (op1)
7916 && exact_log2 (INTVAL (op1)) > 0))
7918 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7919 || GET_CODE (op0) == SIGN_EXTEND;
7920 if (speed)
7922 if (compound_p)
7924 /* If the shift is considered cheap,
7925 then don't add any cost. */
7926 if (aarch64_cheap_mult_shift_p (x))
7928 else if (REG_P (op1))
7929 /* ARITH + shift-by-register. */
7930 cost += extra_cost->alu.arith_shift_reg;
7931 else if (is_extend)
7932 /* ARITH + extended register. We don't have a cost field
7933 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7934 cost += extra_cost->alu.extend_arith;
7935 else
7936 /* ARITH + shift-by-immediate. */
7937 cost += extra_cost->alu.arith_shift;
7939 else
7940 /* LSL (immediate). */
7941 cost += extra_cost->alu.shift;
7944 /* Strip extends as we will have costed them in the case above. */
7945 if (is_extend)
7946 op0 = aarch64_strip_extend (op0, true);
7948 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7950 return cost;
7953 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7954 compound, then let the cases below handle it. After all, MNEG is a
7955 special-case alias of MSUB. */
7956 if (GET_CODE (op0) == NEG)
7958 op0 = XEXP (op0, 0);
7959 compound_p = true;
7962 /* Integer multiplies or FMAs have zero/sign extending variants. */
7963 if ((GET_CODE (op0) == ZERO_EXTEND
7964 && GET_CODE (op1) == ZERO_EXTEND)
7965 || (GET_CODE (op0) == SIGN_EXTEND
7966 && GET_CODE (op1) == SIGN_EXTEND))
7968 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7969 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7971 if (speed)
7973 if (compound_p)
7974 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7975 cost += extra_cost->mult[0].extend_add;
7976 else
7977 /* MUL/SMULL/UMULL. */
7978 cost += extra_cost->mult[0].extend;
7981 return cost;
7984 /* This is either an integer multiply or a MADD. In both cases
7985 we want to recurse and cost the operands. */
7986 cost += rtx_cost (op0, mode, MULT, 0, speed);
7987 cost += rtx_cost (op1, mode, MULT, 1, speed);
7989 if (speed)
7991 if (compound_p)
7992 /* MADD/MSUB. */
7993 cost += extra_cost->mult[mode == DImode].add;
7994 else
7995 /* MUL. */
7996 cost += extra_cost->mult[mode == DImode].simple;
7999 return cost;
8001 else
8003 if (speed)
8005 /* Floating-point FMA/FMUL can also support negations of the
8006 operands, unless the rounding mode is upward or downward in
8007 which case FNMUL differs from FMUL with operand negation. */
8008 bool neg0 = GET_CODE (op0) == NEG;
8009 bool neg1 = GET_CODE (op1) == NEG;
8010 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8012 if (neg0)
8013 op0 = XEXP (op0, 0);
8014 if (neg1)
8015 op1 = XEXP (op1, 0);
8018 if (compound_p)
8019 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8020 cost += extra_cost->fp[mode == DFmode].fma;
8021 else
8022 /* FMUL/FNMUL. */
8023 cost += extra_cost->fp[mode == DFmode].mult;
8026 cost += rtx_cost (op0, mode, MULT, 0, speed);
8027 cost += rtx_cost (op1, mode, MULT, 1, speed);
8028 return cost;
8032 static int
8033 aarch64_address_cost (rtx x,
8034 machine_mode mode,
8035 addr_space_t as ATTRIBUTE_UNUSED,
8036 bool speed)
8038 enum rtx_code c = GET_CODE (x);
8039 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8040 struct aarch64_address_info info;
8041 int cost = 0;
8042 info.shift = 0;
8044 if (!aarch64_classify_address (&info, x, mode, false))
8046 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8048 /* This is a CONST or SYMBOL ref which will be split
8049 in a different way depending on the code model in use.
8050 Cost it through the generic infrastructure. */
8051 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8052 /* Divide through by the cost of one instruction to
8053 bring it to the same units as the address costs. */
8054 cost_symbol_ref /= COSTS_N_INSNS (1);
8055 /* The cost is then the cost of preparing the address,
8056 followed by an immediate (possibly 0) offset. */
8057 return cost_symbol_ref + addr_cost->imm_offset;
8059 else
8061 /* This is most likely a jump table from a case
8062 statement. */
8063 return addr_cost->register_offset;
8067 switch (info.type)
8069 case ADDRESS_LO_SUM:
8070 case ADDRESS_SYMBOLIC:
8071 case ADDRESS_REG_IMM:
8072 cost += addr_cost->imm_offset;
8073 break;
8075 case ADDRESS_REG_WB:
8076 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8077 cost += addr_cost->pre_modify;
8078 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8079 cost += addr_cost->post_modify;
8080 else
8081 gcc_unreachable ();
8083 break;
8085 case ADDRESS_REG_REG:
8086 cost += addr_cost->register_offset;
8087 break;
8089 case ADDRESS_REG_SXTW:
8090 cost += addr_cost->register_sextend;
8091 break;
8093 case ADDRESS_REG_UXTW:
8094 cost += addr_cost->register_zextend;
8095 break;
8097 default:
8098 gcc_unreachable ();
8102 if (info.shift > 0)
8104 /* For the sake of calculating the cost of the shifted register
8105 component, we can treat same sized modes in the same way. */
8106 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8107 cost += addr_cost->addr_scale_costs.hi;
8108 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8109 cost += addr_cost->addr_scale_costs.si;
8110 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8111 cost += addr_cost->addr_scale_costs.di;
8112 else
8113 /* We can't tell, or this is a 128-bit vector. */
8114 cost += addr_cost->addr_scale_costs.ti;
8117 return cost;
8120 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8121 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8122 to be taken. */
8125 aarch64_branch_cost (bool speed_p, bool predictable_p)
8127 /* When optimizing for speed, use the cost of unpredictable branches. */
8128 const struct cpu_branch_cost *branch_costs =
8129 aarch64_tune_params.branch_costs;
8131 if (!speed_p || predictable_p)
8132 return branch_costs->predictable;
8133 else
8134 return branch_costs->unpredictable;
8137 /* Return true if the RTX X in mode MODE is a zero or sign extract
8138 usable in an ADD or SUB (extended register) instruction. */
8139 static bool
8140 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8142 /* Catch add with a sign extract.
8143 This is add_<optab><mode>_multp2. */
8144 if (GET_CODE (x) == SIGN_EXTRACT
8145 || GET_CODE (x) == ZERO_EXTRACT)
8147 rtx op0 = XEXP (x, 0);
8148 rtx op1 = XEXP (x, 1);
8149 rtx op2 = XEXP (x, 2);
8151 if (GET_CODE (op0) == MULT
8152 && CONST_INT_P (op1)
8153 && op2 == const0_rtx
8154 && CONST_INT_P (XEXP (op0, 1))
8155 && aarch64_is_extend_from_extract (mode,
8156 XEXP (op0, 1),
8157 op1))
8159 return true;
8162 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8163 No shift. */
8164 else if (GET_CODE (x) == SIGN_EXTEND
8165 || GET_CODE (x) == ZERO_EXTEND)
8166 return REG_P (XEXP (x, 0));
8168 return false;
8171 static bool
8172 aarch64_frint_unspec_p (unsigned int u)
8174 switch (u)
8176 case UNSPEC_FRINTZ:
8177 case UNSPEC_FRINTP:
8178 case UNSPEC_FRINTM:
8179 case UNSPEC_FRINTA:
8180 case UNSPEC_FRINTN:
8181 case UNSPEC_FRINTX:
8182 case UNSPEC_FRINTI:
8183 return true;
8185 default:
8186 return false;
8190 /* Return true iff X is an rtx that will match an extr instruction,
8191 i.e. one described by the *extr<mode>5_insn family of patterns.
8192 *RES_OP0 and *RES_OP1 will be set to the operands of the shifts
8193 involved on success and will be NULL_RTX otherwise. */
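/* For example, in DImode
     (ior (ashift (reg:DI x0) (const_int 48))
          (lshiftrt (reg:DI x1) (const_int 16)))
   matches because the shift amounts sum to 64; *RES_OP0 is set to x0 and
   *RES_OP1 to x1.  */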
8195 static bool
8196 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8198 rtx op0, op1;
8199 scalar_int_mode mode;
8200 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8201 return false;
8203 *res_op0 = NULL_RTX;
8204 *res_op1 = NULL_RTX;
8206 if (GET_CODE (x) != IOR)
8207 return false;
8209 op0 = XEXP (x, 0);
8210 op1 = XEXP (x, 1);
8212 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8213 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8215 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8216 if (GET_CODE (op1) == ASHIFT)
8217 std::swap (op0, op1);
8219 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8220 return false;
8222 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8223 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8225 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8226 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8228 *res_op0 = XEXP (op0, 0);
8229 *res_op1 = XEXP (op1, 0);
8230 return true;
8234 return false;
8237 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8238 storing it in *COST. Result is true if the total cost of the operation
8239 has now been calculated. */
8240 static bool
8241 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8243 rtx inner;
8244 rtx comparator;
8245 enum rtx_code cmpcode;
8247 if (COMPARISON_P (op0))
8249 inner = XEXP (op0, 0);
8250 comparator = XEXP (op0, 1);
8251 cmpcode = GET_CODE (op0);
8253 else
8255 inner = op0;
8256 comparator = const0_rtx;
8257 cmpcode = NE;
8260 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8262 /* Conditional branch. */
8263 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8264 return true;
8265 else
8267 if (cmpcode == NE || cmpcode == EQ)
8269 if (comparator == const0_rtx)
8271 /* TBZ/TBNZ/CBZ/CBNZ. */
8272 if (GET_CODE (inner) == ZERO_EXTRACT)
8273 /* TBZ/TBNZ. */
8274 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8275 ZERO_EXTRACT, 0, speed);
8276 else
8277 /* CBZ/CBNZ. */
8278 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8280 return true;
8283 else if (cmpcode == LT || cmpcode == GE)
8285 /* TBZ/TBNZ. */
8286 if (comparator == const0_rtx)
8287 return true;
8291 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8293 /* CCMP. */
8294 if (GET_CODE (op1) == COMPARE)
8296 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8297 if (XEXP (op1, 1) == const0_rtx)
8298 *cost += 1;
8299 if (speed)
8301 machine_mode mode = GET_MODE (XEXP (op1, 0));
8302 const struct cpu_cost_table *extra_cost
8303 = aarch64_tune_params.insn_extra_cost;
8305 if (GET_MODE_CLASS (mode) == MODE_INT)
8306 *cost += extra_cost->alu.arith;
8307 else
8308 *cost += extra_cost->fp[mode == DFmode].compare;
8310 return true;
8313 /* It's a conditional operation based on the status flags,
8314 so it must be some flavor of CSEL. */
8316 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8317 if (GET_CODE (op1) == NEG
8318 || GET_CODE (op1) == NOT
8319 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8320 op1 = XEXP (op1, 0);
8321 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8323 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8324 op1 = XEXP (op1, 0);
8325 op2 = XEXP (op2, 0);
8328 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8329 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8330 return true;
8333 /* We don't know what this is, cost all operands. */
8334 return false;
8337 /* Check whether X is a bitfield operation of the form shift + extend that
8338 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8339 operand to which the bitfield operation is applied. Otherwise return
8340 NULL_RTX. */
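/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI w0) (const_int 3)))
   returns (reg:HI w0) and corresponds to UBFX, while a sign extend of an
   ashift of a QImode or HImode value corresponds to SBFIZ.  */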
8342 static rtx
8343 aarch64_extend_bitfield_pattern_p (rtx x)
8345 rtx_code outer_code = GET_CODE (x);
8346 machine_mode outer_mode = GET_MODE (x);
8348 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8349 && outer_mode != SImode && outer_mode != DImode)
8350 return NULL_RTX;
8352 rtx inner = XEXP (x, 0);
8353 rtx_code inner_code = GET_CODE (inner);
8354 machine_mode inner_mode = GET_MODE (inner);
8355 rtx op = NULL_RTX;
8357 switch (inner_code)
8359 case ASHIFT:
8360 if (CONST_INT_P (XEXP (inner, 1))
8361 && (inner_mode == QImode || inner_mode == HImode))
8362 op = XEXP (inner, 0);
8363 break;
8364 case LSHIFTRT:
8365 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8366 && (inner_mode == QImode || inner_mode == HImode))
8367 op = XEXP (inner, 0);
8368 break;
8369 case ASHIFTRT:
8370 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8371 && (inner_mode == QImode || inner_mode == HImode))
8372 op = XEXP (inner, 0);
8373 break;
8374 default:
8375 break;
8378 return op;
8381 /* Return true if the mask and a shift amount from an RTX of the form
8382 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8383 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
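/* For example, in SImode a mask of 0xff0 and a shift amount of 4 are
   accepted: (0xff0 >> 4) + 1 is a power of two and the low four mask bits
   are clear, so (x << 4) & 0xff0 can become a single UBFIZ.  */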
8385 bool
8386 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8387 rtx shft_amnt)
8389 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8390 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8391 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8392 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8395 /* Calculate the cost of calculating X, storing it in *COST. Result
8396 is true if the total cost of the operation has now been calculated. */
8397 static bool
8398 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8399 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8401 rtx op0, op1, op2;
8402 const struct cpu_cost_table *extra_cost
8403 = aarch64_tune_params.insn_extra_cost;
8404 int code = GET_CODE (x);
8405 scalar_int_mode int_mode;
8407 /* By default, assume that everything has equivalent cost to the
8408 cheapest instruction. Any additional costs are applied as a delta
8409 above this default. */
8410 *cost = COSTS_N_INSNS (1);
8412 switch (code)
8414 case SET:
8415 /* The cost depends entirely on the operands to SET. */
8416 *cost = 0;
8417 op0 = SET_DEST (x);
8418 op1 = SET_SRC (x);
8420 switch (GET_CODE (op0))
8422 case MEM:
8423 if (speed)
8425 rtx address = XEXP (op0, 0);
8426 if (VECTOR_MODE_P (mode))
8427 *cost += extra_cost->ldst.storev;
8428 else if (GET_MODE_CLASS (mode) == MODE_INT)
8429 *cost += extra_cost->ldst.store;
8430 else if (mode == SFmode)
8431 *cost += extra_cost->ldst.storef;
8432 else if (mode == DFmode)
8433 *cost += extra_cost->ldst.stored;
8435 *cost +=
8436 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8437 0, speed));
8440 *cost += rtx_cost (op1, mode, SET, 1, speed);
8441 return true;
8443 case SUBREG:
8444 if (! REG_P (SUBREG_REG (op0)))
8445 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8447 /* Fall through. */
8448 case REG:
8449 /* The cost is one per vector-register copied. */
8450 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8452 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8453 *cost = COSTS_N_INSNS (nregs);
8455 /* const0_rtx is in general free, but we will use an
8456 instruction to set a register to 0. */
8457 else if (REG_P (op1) || op1 == const0_rtx)
8459 /* The cost is 1 per register copied. */
8460 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8461 *cost = COSTS_N_INSNS (nregs);
8463 else
8464 /* Cost is just the cost of the RHS of the set. */
8465 *cost += rtx_cost (op1, mode, SET, 1, speed);
8466 return true;
8468 case ZERO_EXTRACT:
8469 case SIGN_EXTRACT:
8470 /* Bit-field insertion. Strip any redundant widening of
8471 the RHS to meet the width of the target. */
8472 if (GET_CODE (op1) == SUBREG)
8473 op1 = SUBREG_REG (op1);
8474 if ((GET_CODE (op1) == ZERO_EXTEND
8475 || GET_CODE (op1) == SIGN_EXTEND)
8476 && CONST_INT_P (XEXP (op0, 1))
8477 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8478 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8479 op1 = XEXP (op1, 0);
8481 if (CONST_INT_P (op1))
8483 /* MOV immediate is assumed to always be cheap. */
8484 *cost = COSTS_N_INSNS (1);
8486 else
8488 /* BFM. */
8489 if (speed)
8490 *cost += extra_cost->alu.bfi;
8491 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8494 return true;
8496 default:
8497 /* We can't make sense of this, assume default cost. */
8498 *cost = COSTS_N_INSNS (1);
8499 return false;
8501 return false;
8503 case CONST_INT:
8504 /* If an instruction can incorporate a constant within the
8505 instruction, the instruction's expression avoids calling
8506 rtx_cost() on the constant. If rtx_cost() is called on a
8507 constant, then it is usually because the constant must be
8508 moved into a register by one or more instructions.
8510 The exception is constant 0, which can be expressed
8511 as XZR/WZR and is therefore free. The exception to this is
8512 if we have (set (reg) (const0_rtx)) in which case we must cost
8513 the move. However, we can catch that when we cost the SET, so
8514 we don't need to consider that here. */
8515 if (x == const0_rtx)
8516 *cost = 0;
8517 else
8519 /* To an approximation, building any other constant is
8520 proportionally expensive to the number of instructions
8521 required to build that constant. This is true whether we
8522 are compiling for SPEED or otherwise. */
8523 if (!is_a <scalar_int_mode> (mode, &int_mode))
8524 int_mode = word_mode;
8525 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8526 (NULL_RTX, x, false, int_mode));
8528 return true;
8530 case CONST_DOUBLE:
8532 /* First determine number of instructions to do the move
8533 as an integer constant. */
8534 if (!aarch64_float_const_representable_p (x)
8535 && !aarch64_can_const_movi_rtx_p (x, mode)
8536 && aarch64_float_const_rtx_p (x))
8538 unsigned HOST_WIDE_INT ival;
8539 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8540 gcc_assert (succeed);
8542 scalar_int_mode imode = (mode == HFmode
8543 ? SImode
8544 : int_mode_for_mode (mode).require ());
8545 int ncost = aarch64_internal_mov_immediate
8546 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8547 *cost += COSTS_N_INSNS (ncost);
8548 return true;
8551 if (speed)
8553 /* mov[df,sf]_aarch64. */
8554 if (aarch64_float_const_representable_p (x))
8555 /* FMOV (scalar immediate). */
8556 *cost += extra_cost->fp[mode == DFmode].fpconst;
8557 else if (!aarch64_float_const_zero_rtx_p (x))
8559 /* This will be a load from memory. */
8560 if (mode == DFmode)
8561 *cost += extra_cost->ldst.loadd;
8562 else
8563 *cost += extra_cost->ldst.loadf;
8565 else
8566 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8567 or MOV v0.s[0], wzr - neither of which is modeled by the
8568 cost tables. Just use the default cost. */
8573 return true;
8575 case MEM:
8576 if (speed)
8578 /* For loads we want the base cost of a load, plus an
8579 approximation for the additional cost of the addressing
8580 mode. */
8581 rtx address = XEXP (x, 0);
8582 if (VECTOR_MODE_P (mode))
8583 *cost += extra_cost->ldst.loadv;
8584 else if (GET_MODE_CLASS (mode) == MODE_INT)
8585 *cost += extra_cost->ldst.load;
8586 else if (mode == SFmode)
8587 *cost += extra_cost->ldst.loadf;
8588 else if (mode == DFmode)
8589 *cost += extra_cost->ldst.loadd;
8591 *cost +=
8592 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8593 0, speed));
8596 return true;
8598 case NEG:
8599 op0 = XEXP (x, 0);
8601 if (VECTOR_MODE_P (mode))
8603 if (speed)
8605 /* FNEG. */
8606 *cost += extra_cost->vect.alu;
8608 return false;
8611 if (GET_MODE_CLASS (mode) == MODE_INT)
8613 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8614 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8616 /* CSETM. */
8617 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8618 return true;
8621 /* Cost this as SUB wzr, X. */
8622 op0 = CONST0_RTX (mode);
8623 op1 = XEXP (x, 0);
8624 goto cost_minus;
8627 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8629 /* Support (neg(fma...)) as a single instruction only if
8630 sign of zeros is unimportant. This matches the decision
8631 making in aarch64.md. */
8632 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8634 /* FNMADD. */
8635 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8636 return true;
8638 if (GET_CODE (op0) == MULT)
8640 /* FNMUL. */
8641 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8642 return true;
8644 if (speed)
8645 /* FNEG. */
8646 *cost += extra_cost->fp[mode == DFmode].neg;
8647 return false;
8650 return false;
8652 case CLRSB:
8653 case CLZ:
8654 if (speed)
8656 if (VECTOR_MODE_P (mode))
8657 *cost += extra_cost->vect.alu;
8658 else
8659 *cost += extra_cost->alu.clz;
8662 return false;
8664 case COMPARE:
8665 op0 = XEXP (x, 0);
8666 op1 = XEXP (x, 1);
8668 if (op1 == const0_rtx
8669 && GET_CODE (op0) == AND)
8671 x = op0;
8672 mode = GET_MODE (op0);
8673 goto cost_logic;
8676 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8678 /* TODO: A write to the CC flags possibly costs extra, this
8679 needs encoding in the cost tables. */
8681 mode = GET_MODE (op0);
8682 /* ANDS. */
8683 if (GET_CODE (op0) == AND)
8685 x = op0;
8686 goto cost_logic;
8689 if (GET_CODE (op0) == PLUS)
8691 /* ADDS (and CMN alias). */
8692 x = op0;
8693 goto cost_plus;
8696 if (GET_CODE (op0) == MINUS)
8698 /* SUBS. */
8699 x = op0;
8700 goto cost_minus;
8703 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8704 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8705 && CONST_INT_P (XEXP (op0, 2)))
8707 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8708 Handle it here directly rather than going to cost_logic
8709 since we know the immediate generated for the TST is valid
8710 so we can avoid creating an intermediate rtx for it only
8711 for costing purposes. */
8712 if (speed)
8713 *cost += extra_cost->alu.logical;
8715 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8716 ZERO_EXTRACT, 0, speed);
8717 return true;
8720 if (GET_CODE (op1) == NEG)
8722 /* CMN. */
8723 if (speed)
8724 *cost += extra_cost->alu.arith;
8726 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8727 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8728 return true;
8731 /* CMP.
8733 Compare can freely swap the order of operands, and
8734 canonicalization puts the more complex operation first.
8735 But the integer MINUS logic expects the shift/extend
8736 operation in op1. */
8737 if (! (REG_P (op0)
8738 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8740 op0 = XEXP (x, 1);
8741 op1 = XEXP (x, 0);
8743 goto cost_minus;
8746 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8748 /* FCMP. */
8749 if (speed)
8750 *cost += extra_cost->fp[mode == DFmode].compare;
8752 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8754 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8755 /* FCMP supports constant 0.0 for no extra cost. */
8756 return true;
8758 return false;
8761 if (VECTOR_MODE_P (mode))
8763 /* Vector compare. */
8764 if (speed)
8765 *cost += extra_cost->vect.alu;
8767 if (aarch64_float_const_zero_rtx_p (op1))
8769 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8770 cost. */
8771 return true;
8773 return false;
8775 return false;
8777 case MINUS:
8779 op0 = XEXP (x, 0);
8780 op1 = XEXP (x, 1);
8782 cost_minus:
8783 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8785 /* Detect valid immediates. */
8786 if ((GET_MODE_CLASS (mode) == MODE_INT
8787 || (GET_MODE_CLASS (mode) == MODE_CC
8788 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8789 && CONST_INT_P (op1)
8790 && aarch64_uimm12_shift (INTVAL (op1)))
8792 if (speed)
8793 /* SUB(S) (immediate). */
8794 *cost += extra_cost->alu.arith;
8795 return true;
8798 /* Look for SUB (extended register). */
8799 if (is_a <scalar_int_mode> (mode, &int_mode)
8800 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8802 if (speed)
8803 *cost += extra_cost->alu.extend_arith;
8805 op1 = aarch64_strip_extend (op1, true);
8806 *cost += rtx_cost (op1, VOIDmode,
8807 (enum rtx_code) GET_CODE (op1), 0, speed);
8808 return true;
8811 rtx new_op1 = aarch64_strip_extend (op1, false);
8813 /* Cost this as an FMA-alike operation. */
8814 if ((GET_CODE (new_op1) == MULT
8815 || aarch64_shift_p (GET_CODE (new_op1)))
8816 && code != COMPARE)
8818 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8819 (enum rtx_code) code,
8820 speed);
8821 return true;
8824 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8826 if (speed)
8828 if (VECTOR_MODE_P (mode))
8830 /* Vector SUB. */
8831 *cost += extra_cost->vect.alu;
8833 else if (GET_MODE_CLASS (mode) == MODE_INT)
8835 /* SUB(S). */
8836 *cost += extra_cost->alu.arith;
8838 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8840 /* FSUB. */
8841 *cost += extra_cost->fp[mode == DFmode].addsub;
8844 return true;
8847 case PLUS:
8849 rtx new_op0;
8851 op0 = XEXP (x, 0);
8852 op1 = XEXP (x, 1);
8854 cost_plus:
8855 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8856 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8858 /* CSINC. */
8859 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8860 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8861 return true;
8864 if (GET_MODE_CLASS (mode) == MODE_INT
8865 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8866 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8868 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8870 if (speed)
8871 /* ADD (immediate). */
8872 *cost += extra_cost->alu.arith;
8873 return true;
8876 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8878 /* Look for ADD (extended register). */
8879 if (is_a <scalar_int_mode> (mode, &int_mode)
8880 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8882 if (speed)
8883 *cost += extra_cost->alu.extend_arith;
8885 op0 = aarch64_strip_extend (op0, true);
8886 *cost += rtx_cost (op0, VOIDmode,
8887 (enum rtx_code) GET_CODE (op0), 0, speed);
8888 return true;
8891 /* Strip any extend, leave shifts behind as we will
8892 cost them through mult_cost. */
8893 new_op0 = aarch64_strip_extend (op0, false);
8895 if (GET_CODE (new_op0) == MULT
8896 || aarch64_shift_p (GET_CODE (new_op0)))
8898 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8899 speed);
8900 return true;
8903 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8905 if (speed)
8907 if (VECTOR_MODE_P (mode))
8909 /* Vector ADD. */
8910 *cost += extra_cost->vect.alu;
8912 else if (GET_MODE_CLASS (mode) == MODE_INT)
8914 /* ADD. */
8915 *cost += extra_cost->alu.arith;
8917 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8919 /* FADD. */
8920 *cost += extra_cost->fp[mode == DFmode].addsub;
8923 return true;
8926 case BSWAP:
8927 *cost = COSTS_N_INSNS (1);
8929 if (speed)
8931 if (VECTOR_MODE_P (mode))
8932 *cost += extra_cost->vect.alu;
8933 else
8934 *cost += extra_cost->alu.rev;
8936 return false;
8938 case IOR:
8939 if (aarch_rev16_p (x))
8941 *cost = COSTS_N_INSNS (1);
8943 if (speed)
8945 if (VECTOR_MODE_P (mode))
8946 *cost += extra_cost->vect.alu;
8947 else
8948 *cost += extra_cost->alu.rev;
8950 return true;
8953 if (aarch64_extr_rtx_p (x, &op0, &op1))
8955 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8956 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8957 if (speed)
8958 *cost += extra_cost->alu.shift;
8960 return true;
8962 /* Fall through. */
8963 case XOR:
8964 case AND:
8965 cost_logic:
8966 op0 = XEXP (x, 0);
8967 op1 = XEXP (x, 1);
8969 if (VECTOR_MODE_P (mode))
8971 if (speed)
8972 *cost += extra_cost->vect.alu;
8973 return true;
8976 if (code == AND
8977 && GET_CODE (op0) == MULT
8978 && CONST_INT_P (XEXP (op0, 1))
8979 && CONST_INT_P (op1)
8980 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
8981 INTVAL (op1)) != 0)
8983 /* This is a UBFM/SBFM. */
8984 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
8985 if (speed)
8986 *cost += extra_cost->alu.bfx;
8987 return true;
8990 if (is_int_mode (mode, &int_mode))
8992 if (CONST_INT_P (op1))
8994 /* We have a mask + shift version of a UBFIZ
8995 i.e. the *andim_ashift<mode>_bfiz pattern. */
8996 if (GET_CODE (op0) == ASHIFT
8997 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
8998 XEXP (op0, 1)))
9000 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9001 (enum rtx_code) code, 0, speed);
9002 if (speed)
9003 *cost += extra_cost->alu.bfx;
9005 return true;
9007 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9009 /* We possibly get the immediate for free; this is not
9010 modelled. */
9011 *cost += rtx_cost (op0, int_mode,
9012 (enum rtx_code) code, 0, speed);
9013 if (speed)
9014 *cost += extra_cost->alu.logical;
9016 return true;
9019 else
9021 rtx new_op0 = op0;
9023 /* Handle ORN, EON, or BIC. */
9024 if (GET_CODE (op0) == NOT)
9025 op0 = XEXP (op0, 0);
9027 new_op0 = aarch64_strip_shift (op0);
9029 /* If we had a shift on op0 then this is a logical-shift-
9030 by-register/immediate operation. Otherwise, this is just
9031 a logical operation. */
9032 if (speed)
9034 if (new_op0 != op0)
9036 /* Shift by immediate. */
9037 if (CONST_INT_P (XEXP (op0, 1)))
9038 *cost += extra_cost->alu.log_shift;
9039 else
9040 *cost += extra_cost->alu.log_shift_reg;
9042 else
9043 *cost += extra_cost->alu.logical;
9046 /* In both cases we want to cost both operands. */
9047 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9048 0, speed);
9049 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9050 1, speed);
9052 return true;
9055 return false;
9057 case NOT:
9058 x = XEXP (x, 0);
9059 op0 = aarch64_strip_shift (x);
9061 if (VECTOR_MODE_P (mode))
9063 /* Vector NOT. */
9064 *cost += extra_cost->vect.alu;
9065 return false;
9068 /* MVN-shifted-reg. */
9069 if (op0 != x)
9071 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9073 if (speed)
9074 *cost += extra_cost->alu.log_shift;
9076 return true;
9078 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9079 Handle the second form here taking care that 'a' in the above can
9080 be a shift. */
9081 else if (GET_CODE (op0) == XOR)
9083 rtx newop0 = XEXP (op0, 0);
9084 rtx newop1 = XEXP (op0, 1);
9085 rtx op0_stripped = aarch64_strip_shift (newop0);
9087 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9088 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9090 if (speed)
9092 if (op0_stripped != newop0)
9093 *cost += extra_cost->alu.log_shift;
9094 else
9095 *cost += extra_cost->alu.logical;
9098 return true;
9100 /* MVN. */
9101 if (speed)
9102 *cost += extra_cost->alu.logical;
9104 return false;
9106 case ZERO_EXTEND:
9108 op0 = XEXP (x, 0);
9109 /* If a value is written in SI mode, then zero extended to DI
9110 mode, the operation will in general be free as a write to
9111 a 'w' register implicitly zeroes the upper bits of an 'x'
9112 register. However, if this is
9114 (set (reg) (zero_extend (reg)))
9116 we must cost the explicit register move. */
9117 if (mode == DImode
9118 && GET_MODE (op0) == SImode
9119 && outer == SET)
9121 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9123 /* If OP_COST is non-zero, then the cost of the zero extend
9124 is effectively the cost of the inner operation. Otherwise
9125 we have a MOV instruction and we take the cost from the MOV
9126 itself. This is true independently of whether we are
9127 optimizing for space or time. */
9128 if (op_cost)
9129 *cost = op_cost;
9131 return true;
9133 else if (MEM_P (op0))
9135 /* All loads can zero extend to any size for free. */
9136 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9137 return true;
9140 op0 = aarch64_extend_bitfield_pattern_p (x);
9141 if (op0)
9143 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9144 if (speed)
9145 *cost += extra_cost->alu.bfx;
9146 return true;
9149 if (speed)
9151 if (VECTOR_MODE_P (mode))
9153 /* UMOV. */
9154 *cost += extra_cost->vect.alu;
9156 else
9158 /* We generate an AND instead of UXTB/UXTH. */
9159 *cost += extra_cost->alu.logical;
9162 return false;
9164 case SIGN_EXTEND:
9165 if (MEM_P (XEXP (x, 0)))
9167 /* LDRSH. */
9168 if (speed)
9170 rtx address = XEXP (XEXP (x, 0), 0);
9171 *cost += extra_cost->ldst.load_sign_extend;
9173 *cost +=
9174 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9175 0, speed));
9177 return true;
9180 op0 = aarch64_extend_bitfield_pattern_p (x);
9181 if (op0)
9183 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9184 if (speed)
9185 *cost += extra_cost->alu.bfx;
9186 return true;
9189 if (speed)
9191 if (VECTOR_MODE_P (mode))
9192 *cost += extra_cost->vect.alu;
9193 else
9194 *cost += extra_cost->alu.extend;
9196 return false;
9198 case ASHIFT:
9199 op0 = XEXP (x, 0);
9200 op1 = XEXP (x, 1);
9202 if (CONST_INT_P (op1))
9204 if (speed)
9206 if (VECTOR_MODE_P (mode))
9208 /* Vector shift (immediate). */
9209 *cost += extra_cost->vect.alu;
9211 else
9213 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9214 aliases. */
9215 *cost += extra_cost->alu.shift;
9219 /* We can incorporate zero/sign extend for free. */
9220 if (GET_CODE (op0) == ZERO_EXTEND
9221 || GET_CODE (op0) == SIGN_EXTEND)
9222 op0 = XEXP (op0, 0);
9224 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9225 return true;
9227 else
9229 if (VECTOR_MODE_P (mode))
9231 if (speed)
9232 /* Vector shift (register). */
9233 *cost += extra_cost->vect.alu;
9235 else
9237 if (speed)
9238 /* LSLV. */
9239 *cost += extra_cost->alu.shift_reg;
9241 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9242 && CONST_INT_P (XEXP (op1, 1))
9243 && known_eq (INTVAL (XEXP (op1, 1)),
9244 GET_MODE_BITSIZE (mode) - 1))
9246 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9247 /* We already demanded XEXP (op1, 0) to be REG_P, so
9248 don't recurse into it. */
9249 return true;
9252 return false; /* All arguments need to be in registers. */
9255 case ROTATE:
9256 case ROTATERT:
9257 case LSHIFTRT:
9258 case ASHIFTRT:
9259 op0 = XEXP (x, 0);
9260 op1 = XEXP (x, 1);
9262 if (CONST_INT_P (op1))
9264 /* ASR (immediate) and friends. */
9265 if (speed)
9267 if (VECTOR_MODE_P (mode))
9268 *cost += extra_cost->vect.alu;
9269 else
9270 *cost += extra_cost->alu.shift;
9273 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9274 return true;
9276 else
9278 if (VECTOR_MODE_P (mode))
9280 if (speed)
9281 /* Vector shift (register). */
9282 *cost += extra_cost->vect.alu;
9284 else
9286 if (speed)
9287 /* ASR (register) and friends. */
9288 *cost += extra_cost->alu.shift_reg;
9290 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9291 && CONST_INT_P (XEXP (op1, 1))
9292 && known_eq (INTVAL (XEXP (op1, 1)),
9293 GET_MODE_BITSIZE (mode) - 1))
9295 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9296 /* We already demanded XEXP (op1, 0) to be REG_P, so
9297 don't recurse into it. */
9298 return true;
9301 return false; /* All arguments need to be in registers. */
9304 case SYMBOL_REF:
9306 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9307 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9309 /* LDR. */
9310 if (speed)
9311 *cost += extra_cost->ldst.load;
9313 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9314 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9316 /* ADRP, followed by ADD. */
9317 *cost += COSTS_N_INSNS (1);
9318 if (speed)
9319 *cost += 2 * extra_cost->alu.arith;
9321 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9322 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9324 /* ADR. */
9325 if (speed)
9326 *cost += extra_cost->alu.arith;
9329 if (flag_pic)
9331 /* One extra load instruction, after accessing the GOT. */
9332 *cost += COSTS_N_INSNS (1);
9333 if (speed)
9334 *cost += extra_cost->ldst.load;
9336 return true;
9338 case HIGH:
9339 case LO_SUM:
9340 /* ADRP/ADD (immediate). */
9341 if (speed)
9342 *cost += extra_cost->alu.arith;
9343 return true;
9345 case ZERO_EXTRACT:
9346 case SIGN_EXTRACT:
9347 /* UBFX/SBFX. */
9348 if (speed)
9350 if (VECTOR_MODE_P (mode))
9351 *cost += extra_cost->vect.alu;
9352 else
9353 *cost += extra_cost->alu.bfx;
9356 /* We can trust that the immediates used will be correct (there
9357 are no by-register forms), so we need only cost op0. */
9358 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9359 return true;
9361 case MULT:
9362 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9363 /* aarch64_rtx_mult_cost always handles recursion into its
9364 operands. */
9365 return true;
9367 case MOD:
9368 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9369 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9370 an unconditional negate. This case should only ever be reached through
9371 the set_smod_pow2_cheap check in expmed.c. */
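/* Illustrative expansion (register allocation is arbitrary, and the exact
   insns emitted may differ) for x % 8:
     negs  w1, w0
     and   w0, w0, #7
     and   w1, w1, #7
     csneg w0, w0, w1, mi  */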
9372 if (CONST_INT_P (XEXP (x, 1))
9373 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9374 && (mode == SImode || mode == DImode))
9376 /* We expand to 4 instructions. Reset the baseline. */
9377 *cost = COSTS_N_INSNS (4);
9379 if (speed)
9380 *cost += 2 * extra_cost->alu.logical
9381 + 2 * extra_cost->alu.arith;
9383 return true;
9386 /* Fall-through. */
9387 case UMOD:
9388 if (speed)
9390 /* Slightly prefer UMOD over SMOD. */
9391 if (VECTOR_MODE_P (mode))
9392 *cost += extra_cost->vect.alu;
9393 else if (GET_MODE_CLASS (mode) == MODE_INT)
9394 *cost += (extra_cost->mult[mode == DImode].add
9395 + extra_cost->mult[mode == DImode].idiv
9396 + (code == MOD ? 1 : 0));
9398 return false; /* All arguments need to be in registers. */
9400 case DIV:
9401 case UDIV:
9402 case SQRT:
9403 if (speed)
9405 if (VECTOR_MODE_P (mode))
9406 *cost += extra_cost->vect.alu;
9407 else if (GET_MODE_CLASS (mode) == MODE_INT)
9408 /* There is no integer SQRT, so only DIV and UDIV can get
9409 here. */
9410 *cost += (extra_cost->mult[mode == DImode].idiv
9411 /* Slightly prefer UDIV over SDIV. */
9412 + (code == DIV ? 1 : 0));
9413 else
9414 *cost += extra_cost->fp[mode == DFmode].div;
9416 return false; /* All arguments need to be in registers. */
9418 case IF_THEN_ELSE:
9419 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9420 XEXP (x, 2), cost, speed);
9422 case EQ:
9423 case NE:
9424 case GT:
9425 case GTU:
9426 case LT:
9427 case LTU:
9428 case GE:
9429 case GEU:
9430 case LE:
9431 case LEU:
9433 return false; /* All arguments must be in registers. */
9435 case FMA:
9436 op0 = XEXP (x, 0);
9437 op1 = XEXP (x, 1);
9438 op2 = XEXP (x, 2);
9440 if (speed)
9442 if (VECTOR_MODE_P (mode))
9443 *cost += extra_cost->vect.alu;
9444 else
9445 *cost += extra_cost->fp[mode == DFmode].fma;
9448 /* FMSUB, FNMADD, and FNMSUB are free. */
9449 if (GET_CODE (op0) == NEG)
9450 op0 = XEXP (op0, 0);
9452 if (GET_CODE (op2) == NEG)
9453 op2 = XEXP (op2, 0);
9455 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9456 and the by-element operand as operand 0. */
9457 if (GET_CODE (op1) == NEG)
9458 op1 = XEXP (op1, 0);
9460 /* Catch vector-by-element operations. The by-element operand can
9461 either be (vec_duplicate (vec_select (x))) or just
9462 (vec_select (x)), depending on whether we are multiplying by
9463 a vector or a scalar.
9465 Canonicalization is not very good in these cases, FMA4 will put the
9466 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9467 if (GET_CODE (op0) == VEC_DUPLICATE)
9468 op0 = XEXP (op0, 0);
9469 else if (GET_CODE (op1) == VEC_DUPLICATE)
9470 op1 = XEXP (op1, 0);
9472 if (GET_CODE (op0) == VEC_SELECT)
9473 op0 = XEXP (op0, 0);
9474 else if (GET_CODE (op1) == VEC_SELECT)
9475 op1 = XEXP (op1, 0);
9477 /* If the remaining parameters are not registers,
9478 get the cost to put them into registers. */
9479 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9480 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9481 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9482 return true;
9484 case FLOAT:
9485 case UNSIGNED_FLOAT:
9486 if (speed)
9487 *cost += extra_cost->fp[mode == DFmode].fromint;
9488 return false;
9490 case FLOAT_EXTEND:
9491 if (speed)
9493 if (VECTOR_MODE_P (mode))
9495 /* Vector widen. */
9496 *cost += extra_cost->vect.alu;
9498 else
9499 *cost += extra_cost->fp[mode == DFmode].widen;
9501 return false;
9503 case FLOAT_TRUNCATE:
9504 if (speed)
9506 if (VECTOR_MODE_P (mode))
9508 /* Vector narrow. */
9509 *cost += extra_cost->vect.alu;
9511 else
9512 *cost += extra_cost->fp[mode == DFmode].narrow;
9514 return false;
9516 case FIX:
9517 case UNSIGNED_FIX:
9518 x = XEXP (x, 0);
9519 /* Strip the rounding part. They will all be implemented
9520 by the fcvt* family of instructions anyway. */
9521 if (GET_CODE (x) == UNSPEC)
9523 unsigned int uns_code = XINT (x, 1);
9525 if (uns_code == UNSPEC_FRINTA
9526 || uns_code == UNSPEC_FRINTM
9527 || uns_code == UNSPEC_FRINTN
9528 || uns_code == UNSPEC_FRINTP
9529 || uns_code == UNSPEC_FRINTZ)
9530 x = XVECEXP (x, 0, 0);
9533 if (speed)
9535 if (VECTOR_MODE_P (mode))
9536 *cost += extra_cost->vect.alu;
9537 else
9538 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9541 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9542 fixed-point fcvt. */
9543 if (GET_CODE (x) == MULT
9544 && ((VECTOR_MODE_P (mode)
9545 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9546 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9548 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9549 0, speed);
9550 return true;
9553 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9554 return true;
9556 case ABS:
9557 if (VECTOR_MODE_P (mode))
9559 /* ABS (vector). */
9560 if (speed)
9561 *cost += extra_cost->vect.alu;
9563 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9565 op0 = XEXP (x, 0);
9567 /* FABD, which is analogous to FADD. */
9568 if (GET_CODE (op0) == MINUS)
9570 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9571 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9572 if (speed)
9573 *cost += extra_cost->fp[mode == DFmode].addsub;
9575 return true;
9577 /* Simple FABS is analogous to FNEG. */
9578 if (speed)
9579 *cost += extra_cost->fp[mode == DFmode].neg;
9581 else
9583 /* Integer ABS will either be split to
9584 two arithmetic instructions, or will be an ABS
9585 (scalar), which we don't model. */
9586 *cost = COSTS_N_INSNS (2);
9587 if (speed)
9588 *cost += 2 * extra_cost->alu.arith;
9590 return false;
9592 case SMAX:
9593 case SMIN:
9594 if (speed)
9596 if (VECTOR_MODE_P (mode))
9597 *cost += extra_cost->vect.alu;
9598 else
9600 /* FMAXNM/FMINNM/FMAX/FMIN.
9601 TODO: This may not be accurate for all implementations, but
9602 we do not model this in the cost tables. */
9603 *cost += extra_cost->fp[mode == DFmode].addsub;
9606 return false;
9608 case UNSPEC:
9609 /* The floating point round to integer frint* instructions. */
9610 if (aarch64_frint_unspec_p (XINT (x, 1)))
9612 if (speed)
9613 *cost += extra_cost->fp[mode == DFmode].roundint;
9615 return false;
9618 if (XINT (x, 1) == UNSPEC_RBIT)
9620 if (speed)
9621 *cost += extra_cost->alu.rev;
9623 return false;
9625 break;
9627 case TRUNCATE:
9629 /* Decompose <su>muldi3_highpart. */
9630 if (/* (truncate:DI */
9631 mode == DImode
9632 /* (lshiftrt:TI */
9633 && GET_MODE (XEXP (x, 0)) == TImode
9634 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9635 /* (mult:TI */
9636 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9637 /* (ANY_EXTEND:TI (reg:DI))
9638 (ANY_EXTEND:TI (reg:DI))) */
9639 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9640 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9641 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9642 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9643 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9644 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9645 /* (const_int 64) */
9646 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9647 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9649 /* UMULH/SMULH. */
9650 if (speed)
9651 *cost += extra_cost->mult[mode == DImode].extend;
9652 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9653 mode, MULT, 0, speed);
9654 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9655 mode, MULT, 1, speed);
9656 return true;
9659 /* Fall through. */
9660 default:
9661 break;
9664 if (dump_file
9665 && flag_aarch64_verbose_cost)
9666 fprintf (dump_file,
9667 "\nFailed to cost RTX. Assuming default cost.\n");
9669 return true;
9672 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9673 calculated for X. This cost is stored in *COST. Returns true
9674 if the total cost of X was calculated. */
9675 static bool
9676 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9677 int param, int *cost, bool speed)
9679 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9681 if (dump_file
9682 && flag_aarch64_verbose_cost)
9684 print_rtl_single (dump_file, x);
9685 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9686 speed ? "Hot" : "Cold",
9687 *cost, result ? "final" : "partial");
9690 return result;
9693 static int
9694 aarch64_register_move_cost (machine_mode mode,
9695 reg_class_t from_i, reg_class_t to_i)
9697 enum reg_class from = (enum reg_class) from_i;
9698 enum reg_class to = (enum reg_class) to_i;
9699 const struct cpu_regmove_cost *regmove_cost
9700 = aarch64_tune_params.regmove_cost;
9702 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9703 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9704 to = GENERAL_REGS;
9706 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9707 from = GENERAL_REGS;
9709 /* Moving between GPR and stack cost is the same as GP2GP. */
9710 if ((from == GENERAL_REGS && to == STACK_REG)
9711 || (to == GENERAL_REGS && from == STACK_REG))
9712 return regmove_cost->GP2GP;
9714 /* To/From the stack register, we move via the gprs. */
9715 if (to == STACK_REG || from == STACK_REG)
9716 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9717 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9719 if (known_eq (GET_MODE_SIZE (mode), 16))
9721 /* 128-bit operations on general registers require 2 instructions. */
9722 if (from == GENERAL_REGS && to == GENERAL_REGS)
9723 return regmove_cost->GP2GP * 2;
9724 else if (from == GENERAL_REGS)
9725 return regmove_cost->GP2FP * 2;
9726 else if (to == GENERAL_REGS)
9727 return regmove_cost->FP2GP * 2;
9729 /* When AdvSIMD instructions are disabled it is not possible to move
9730 a 128-bit value directly between Q registers. This is handled in
9731 secondary reload. A general register is used as a scratch to move
9732 the upper DI value and the lower DI value is moved directly,
9733 hence the cost is the sum of three moves. */
9734 if (! TARGET_SIMD)
9735 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9737 return regmove_cost->FP2FP;
9740 if (from == GENERAL_REGS && to == GENERAL_REGS)
9741 return regmove_cost->GP2GP;
9742 else if (from == GENERAL_REGS)
9743 return regmove_cost->GP2FP;
9744 else if (to == GENERAL_REGS)
9745 return regmove_cost->FP2GP;
9747 return regmove_cost->FP2FP;
9750 static int
9751 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9752 reg_class_t rclass ATTRIBUTE_UNUSED,
9753 bool in ATTRIBUTE_UNUSED)
9755 return aarch64_tune_params.memmov_cost;
9758 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9759 to optimize 1.0/sqrt. */
9761 static bool
9762 use_rsqrt_p (machine_mode mode)
9764 return (!flag_trapping_math
9765 && flag_unsafe_math_optimizations
9766 && ((aarch64_tune_params.approx_modes->recip_sqrt
9767 & AARCH64_APPROX_MODE (mode))
9768 || flag_mrecip_low_precision_sqrt));
9771 /* Function to decide when to use the approximate reciprocal square root
9772 builtin. */
9774 static tree
9775 aarch64_builtin_reciprocal (tree fndecl)
9777 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9779 if (!use_rsqrt_p (mode))
9780 return NULL_TREE;
9781 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9784 typedef rtx (*rsqrte_type) (rtx, rtx);
9786 /* Select reciprocal square root initial estimate insn depending on machine
9787 mode. */
9789 static rsqrte_type
9790 get_rsqrte_type (machine_mode mode)
9792 switch (mode)
9794 case E_DFmode: return gen_aarch64_rsqrtedf;
9795 case E_SFmode: return gen_aarch64_rsqrtesf;
9796 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9797 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9798 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9799 default: gcc_unreachable ();
9803 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9805 /* Select reciprocal square root series step insn depending on machine mode. */
9807 static rsqrts_type
9808 get_rsqrts_type (machine_mode mode)
9810 switch (mode)
9812 case E_DFmode: return gen_aarch64_rsqrtsdf;
9813 case E_SFmode: return gen_aarch64_rsqrtssf;
9814 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9815 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9816 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9817 default: gcc_unreachable ();
9821 /* Emit instruction sequence to compute either the approximate square root
9822 or its approximate reciprocal, depending on the flag RECP, and return
9823 whether the sequence was emitted or not. */
9825 bool
9826 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9828 machine_mode mode = GET_MODE (dst);
9830 if (GET_MODE_INNER (mode) == HFmode)
9832 gcc_assert (!recp);
9833 return false;
9836 if (!recp)
9838 if (!(flag_mlow_precision_sqrt
9839 || (aarch64_tune_params.approx_modes->sqrt
9840 & AARCH64_APPROX_MODE (mode))))
9841 return false;
9843 if (flag_finite_math_only
9844 || flag_trapping_math
9845 || !flag_unsafe_math_optimizations
9846 || optimize_function_for_size_p (cfun))
9847 return false;
9849 else
9850 /* Caller assumes we cannot fail. */
9851 gcc_assert (use_rsqrt_p (mode));
9853 machine_mode mmsk = mode_for_int_vector (mode).require ();
9854 rtx xmsk = gen_reg_rtx (mmsk);
9855 if (!recp)
9856 /* When calculating the approximate square root, compare the
9857 argument with 0.0 and create a mask. */
9858 emit_insn (gen_rtx_SET (xmsk,
9859 gen_rtx_NEG (mmsk,
9860 gen_rtx_EQ (mmsk, src,
9861 CONST0_RTX (mode)))));
9863 /* Estimate the approximate reciprocal square root. */
9864 rtx xdst = gen_reg_rtx (mode);
9865 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9867 /* Iterate over the series twice for SF and thrice for DF. */
9868 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9870 /* Optionally iterate over the series once less for faster performance
9871 while sacrificing some accuracy. */
9872 if ((recp && flag_mrecip_low_precision_sqrt)
9873 || (!recp && flag_mlow_precision_sqrt))
9874 iterations--;
9876 /* Iterate over the series to calculate the approximate reciprocal square
9877 root. */
9878 rtx x1 = gen_reg_rtx (mode);
9879 while (iterations--)
9881 rtx x2 = gen_reg_rtx (mode);
9882 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9884 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9886 if (iterations > 0)
9887 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9890 if (!recp)
9892 /* Qualify the approximate reciprocal square root when the argument is
9893 0.0 by squashing the intermediary result to 0.0. */
9894 rtx xtmp = gen_reg_rtx (mmsk);
9895 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9896 gen_rtx_SUBREG (mmsk, xdst, 0)));
9897 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9899 /* Calculate the approximate square root. */
9900 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9903 /* Finalize the approximation. */
9904 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9906 return true;
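/* A minimal scalar sketch (illustrative only, not compiled as part of this
   file) of what the sequence emitted above computes.  The FRSQRTE/FRSQRTS
   stand-ins below are plain-C assumptions: FRSQRTE supplies a rough
   estimate of 1/sqrt(x) and FRSQRTS computes the Newton-Raphson step
   (3 - a * b) / 2.  */
#if 0
#include <math.h>

static double
frsqrte_stand_in (double x)
{
  return 1.0 / sqrt (x);	/* The real insn gives a low-precision estimate.  */
}

static double
frsqrts_stand_in (double a, double b)
{
  return (3.0 - a * b) / 2.0;
}

/* Mirror of aarch64_emit_approx_sqrt: ITERATIONS is 3 for DF, 2 for SF,
   one fewer with the low-precision flags.  Returns an approximation of
   sqrt (x) when RECP is zero, or of 1/sqrt (x) when RECP is nonzero.  */
static double
approx_sqrt_sketch (double x, int iterations, int recp)
{
  double est = frsqrte_stand_in (x);
  double step = 1.0;
  while (iterations--)
    {
      step = frsqrts_stand_in (x, est * est);
      if (iterations > 0)
	est = est * step;
    }
  if (!recp)
    {
      /* Squash the estimate to 0.0 when x == 0.0 (the RTL does this with a
	 compare-and-mask), then multiply by x so x * 1/sqrt(x) = sqrt(x).  */
      if (x == 0.0)
	est = 0.0;
      est = est * x;
    }
  return est * step;
}
#endif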
9909 typedef rtx (*recpe_type) (rtx, rtx);
9911 /* Select reciprocal initial estimate insn depending on machine mode. */
9913 static recpe_type
9914 get_recpe_type (machine_mode mode)
9916 switch (mode)
9918 case E_SFmode: return (gen_aarch64_frecpesf);
9919 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9920 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9921 case E_DFmode: return (gen_aarch64_frecpedf);
9922 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9923 default: gcc_unreachable ();
9927 typedef rtx (*recps_type) (rtx, rtx, rtx);
9929 /* Select reciprocal series step insn depending on machine mode. */
9931 static recps_type
9932 get_recps_type (machine_mode mode)
9934 switch (mode)
9936 case E_SFmode: return (gen_aarch64_frecpssf);
9937 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9938 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9939 case E_DFmode: return (gen_aarch64_frecpsdf);
9940 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9941 default: gcc_unreachable ();
9945 /* Emit the instruction sequence to compute the approximation for the division
9946 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9948 bool
9949 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9951 machine_mode mode = GET_MODE (quo);
9953 if (GET_MODE_INNER (mode) == HFmode)
9954 return false;
9956 bool use_approx_division_p = (flag_mlow_precision_div
9957 || (aarch64_tune_params.approx_modes->division
9958 & AARCH64_APPROX_MODE (mode)));
9960 if (!flag_finite_math_only
9961 || flag_trapping_math
9962 || !flag_unsafe_math_optimizations
9963 || optimize_function_for_size_p (cfun)
9964 || !use_approx_division_p)
9965 return false;
9967 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9968 return false;
9970 /* Estimate the approximate reciprocal. */
9971 rtx xrcp = gen_reg_rtx (mode);
9972 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
9974 /* Iterate over the series twice for SF and thrice for DF. */
9975 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9977 /* Optionally iterate over the series once less for faster performance,
9978 while sacrificing some accuracy. */
9979 if (flag_mlow_precision_div)
9980 iterations--;
9982 /* Iterate over the series to calculate the approximate reciprocal. */
9983 rtx xtmp = gen_reg_rtx (mode);
9984 while (iterations--)
9986 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
9988 if (iterations > 0)
9989 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9992 if (num != CONST1_RTX (mode))
9994 /* As the approximate reciprocal of DEN is already calculated, only
9995 calculate the approximate division when NUM is not 1.0. */
9996 rtx xnum = force_reg (mode, num);
9997 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10000 /* Finalize the approximation. */
10001 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10002 return true;
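/* A minimal scalar sketch (illustrative only, not compiled as part of this
   file) of the sequence emitted above.  The FRECPE/FRECPS stand-ins are
   plain-C assumptions: FRECPE supplies a rough estimate of 1/d and FRECPS
   computes the Newton-Raphson step 2 - a * b.  */
#if 0
static double
frecpe_stand_in (double d)
{
  return 1.0 / d;	/* The real insn gives a low-precision estimate.  */
}

static double
frecps_stand_in (double a, double b)
{
  return 2.0 - a * b;
}

/* Mirror of aarch64_emit_approx_div: ITERATIONS is 3 for DF, 2 for SF,
   one fewer with flag_mlow_precision_div.  */
static double
approx_div_sketch (double num, double den, int iterations)
{
  double rcp = frecpe_stand_in (den);
  double step = 1.0;
  while (iterations--)
    {
      step = frecps_stand_in (den, rcp);
      if (iterations > 0)
	rcp = rcp * step;
    }
  if (num != 1.0)
    rcp = rcp * num;
  return rcp * step;
}
#endif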
10005 /* Return the number of instructions that can be issued per cycle. */
10006 static int
10007 aarch64_sched_issue_rate (void)
10009 return aarch64_tune_params.issue_rate;
10012 static int
10013 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10015 int issue_rate = aarch64_sched_issue_rate ();
10017 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10021 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10022 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10023 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10025 static int
10026 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10027 int ready_index)
10029 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10033 /* Vectorizer cost model target hooks. */
10035 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10036 static int
10037 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10038 tree vectype,
10039 int misalign ATTRIBUTE_UNUSED)
10041 unsigned elements;
10042 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10043 bool fp = false;
10045 if (vectype != NULL)
10046 fp = FLOAT_TYPE_P (vectype);
10048 switch (type_of_cost)
10050 case scalar_stmt:
10051 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10053 case scalar_load:
10054 return costs->scalar_load_cost;
10056 case scalar_store:
10057 return costs->scalar_store_cost;
10059 case vector_stmt:
10060 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10062 case vector_load:
10063 return costs->vec_align_load_cost;
10065 case vector_store:
10066 return costs->vec_store_cost;
10068 case vec_to_scalar:
10069 return costs->vec_to_scalar_cost;
10071 case scalar_to_vec:
10072 return costs->scalar_to_vec_cost;
10074 case unaligned_load:
10075 case vector_gather_load:
10076 return costs->vec_unalign_load_cost;
10078 case unaligned_store:
10079 case vector_scatter_store:
10080 return costs->vec_unalign_store_cost;
10082 case cond_branch_taken:
10083 return costs->cond_taken_branch_cost;
10085 case cond_branch_not_taken:
10086 return costs->cond_not_taken_branch_cost;
10088 case vec_perm:
10089 return costs->vec_permute_cost;
10091 case vec_promote_demote:
10092 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10094 case vec_construct:
10095 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10096 return elements / 2 + 1;
10098 default:
10099 gcc_unreachable ();
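/* For example, under this hook a vec_construct of a V4SF vector
   (TYPE_VECTOR_SUBPARTS == 4) is costed at 4 / 2 + 1 = 3, and one of a
   V2DF vector at 2 / 2 + 1 = 2; the other cases simply return the entry
   from the tuning structure.  */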
10103 /* Implement targetm.vectorize.add_stmt_cost. */
10104 static unsigned
10105 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10106 struct _stmt_vec_info *stmt_info, int misalign,
10107 enum vect_cost_model_location where)
10109 unsigned *cost = (unsigned *) data;
10110 unsigned retval = 0;
10112 if (flag_vect_cost_model)
10114 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10115 int stmt_cost =
10116 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10118 /* Statements in an inner loop relative to the loop being
10119 vectorized are weighted more heavily. The value here is
10120 arbitrary and could potentially be improved with analysis. */
10121 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10122 count *= 50; /* FIXME */
10124 retval = (unsigned) (count * stmt_cost);
10125 cost[where] += retval;
10128 return retval;
10131 static void initialize_aarch64_code_model (struct gcc_options *);
10133 /* Parse the TO_PARSE string and put the architecture struct that it
10134 selects into RES and the architectural features into ISA_FLAGS.
10135 Return an aarch64_parse_opt_result describing the parse result.
10136 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10138 static enum aarch64_parse_opt_result
10139 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10140 unsigned long *isa_flags)
10142 char *ext;
10143 const struct processor *arch;
10144 char *str = (char *) alloca (strlen (to_parse) + 1);
10145 size_t len;
10147 strcpy (str, to_parse);
10149 ext = strchr (str, '+');
10151 if (ext != NULL)
10152 len = ext - str;
10153 else
10154 len = strlen (str);
10156 if (len == 0)
10157 return AARCH64_PARSE_MISSING_ARG;
10160 /* Loop through the list of supported ARCHes to find a match. */
10161 for (arch = all_architectures; arch->name != NULL; arch++)
10163 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10165 unsigned long isa_temp = arch->flags;
10167 if (ext != NULL)
10169 /* TO_PARSE string contains at least one extension. */
10170 enum aarch64_parse_opt_result ext_res
10171 = aarch64_parse_extension (ext, &isa_temp);
10173 if (ext_res != AARCH64_PARSE_OK)
10174 return ext_res;
10176 /* Extension parsing was successful. Confirm the result
10177 arch and ISA flags. */
10178 *res = arch;
10179 *isa_flags = isa_temp;
10180 return AARCH64_PARSE_OK;
10184 /* ARCH name not found in list. */
10185 return AARCH64_PARSE_INVALID_ARG;
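/* For example, given -march=armv8.2-a+fp16, the code above matches the
   first 9 characters against "armv8.2-a" in all_architectures and then
   hands the remaining "+fp16" string to aarch64_parse_extension.  */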
10188 /* Parse the TO_PARSE string and put the CPU struct that it selects into
10189 RES and the architectural features into ISA_FLAGS. Return an aarch64_parse_opt_result
10190 describing the parse result. If there is an error parsing, RES and
10191 ISA_FLAGS are left unchanged. */
10193 static enum aarch64_parse_opt_result
10194 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10195 unsigned long *isa_flags)
10197 char *ext;
10198 const struct processor *cpu;
10199 char *str = (char *) alloca (strlen (to_parse) + 1);
10200 size_t len;
10202 strcpy (str, to_parse);
10204 ext = strchr (str, '+');
10206 if (ext != NULL)
10207 len = ext - str;
10208 else
10209 len = strlen (str);
10211 if (len == 0)
10212 return AARCH64_PARSE_MISSING_ARG;
10215 /* Loop through the list of supported CPUs to find a match. */
10216 for (cpu = all_cores; cpu->name != NULL; cpu++)
10218 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10220 unsigned long isa_temp = cpu->flags;
10223 if (ext != NULL)
10225 /* TO_PARSE string contains at least one extension. */
10226 enum aarch64_parse_opt_result ext_res
10227 = aarch64_parse_extension (ext, &isa_temp);
10229 if (ext_res != AARCH64_PARSE_OK)
10230 return ext_res;
10232 /* Extension parsing was successful. Confirm the result
10233 cpu and ISA flags. */
10234 *res = cpu;
10235 *isa_flags = isa_temp;
10236 return AARCH64_PARSE_OK;
10240 /* CPU name not found in list. */
10241 return AARCH64_PARSE_INVALID_ARG;
10244 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10245 Return an aarch64_parse_opt_result describing the parse result.
10246 If the parsing fails, RES is not changed. */
10248 static enum aarch64_parse_opt_result
10249 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10251 const struct processor *cpu;
10252 char *str = (char *) alloca (strlen (to_parse) + 1);
10254 strcpy (str, to_parse);
10256 /* Loop through the list of supported CPUs to find a match. */
10257 for (cpu = all_cores; cpu->name != NULL; cpu++)
10259 if (strcmp (cpu->name, str) == 0)
10261 *res = cpu;
10262 return AARCH64_PARSE_OK;
10266 /* CPU name not found in list. */
10267 return AARCH64_PARSE_INVALID_ARG;
10270 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10271 described in FLAG. If it is, return the index bit for that fusion type.
10272 If not, error (printing OPTION_NAME) and return zero. */
10274 static unsigned int
10275 aarch64_parse_one_option_token (const char *token,
10276 size_t length,
10277 const struct aarch64_flag_desc *flag,
10278 const char *option_name)
10280 for (; flag->name != NULL; flag++)
10282 if (length == strlen (flag->name)
10283 && !strncmp (flag->name, token, length))
10284 return flag->flag;
10287 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10288 return 0;
10291 /* Parse OPTION which is a comma-separated list of flags to enable.
10292 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10293 default state we inherit from the CPU tuning structures. OPTION_NAME
10294 gives the top-level option we are parsing in the -moverride string,
10295 for use in error messages. */
10297 static unsigned int
10298 aarch64_parse_boolean_options (const char *option,
10299 const struct aarch64_flag_desc *flags,
10300 unsigned int initial_state,
10301 const char *option_name)
10303 const char separator = '.';
10304 const char* specs = option;
10305 const char* ntoken = option;
10306 unsigned int found_flags = initial_state;
10308 while ((ntoken = strchr (specs, separator)))
10310 size_t token_length = ntoken - specs;
10311 unsigned token_ops = aarch64_parse_one_option_token (specs,
10312 token_length,
10313 flags,
10314 option_name);
10315 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10316 in the token stream, reset the supported operations. So:
10318 adrp+add.cmp+branch.none.adrp+add
10320 would have the result of turning on only adrp+add fusion. */
10321 if (!token_ops)
10322 found_flags = 0;
10324 found_flags |= token_ops;
10325 specs = ++ntoken;
10328 /* The string ended with a trailing separator; report it as ill-formed. */
10329 if (!(*specs))
10331 error ("%s string ill-formed\n", option_name);
10332 return 0;
10335 /* We still have one more token to parse. */
10336 size_t token_length = strlen (specs);
10337 unsigned token_ops = aarch64_parse_one_option_token (specs,
10338 token_length,
10339 flags,
10340 option_name);
10341 if (!token_ops)
10342 found_flags = 0;
10344 found_flags |= token_ops;
10345 return found_flags;
10348 /* Support for overriding instruction fusion. */
10350 static void
10351 aarch64_parse_fuse_string (const char *fuse_string,
10352 struct tune_params *tune)
10354 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10355 aarch64_fusible_pairs,
10356 tune->fusible_ops,
10357 "fuse=");
10360 /* Support for overriding other tuning flags. */
10362 static void
10363 aarch64_parse_tune_string (const char *tune_string,
10364 struct tune_params *tune)
10366 tune->extra_tuning_flags
10367 = aarch64_parse_boolean_options (tune_string,
10368 aarch64_tuning_flags,
10369 tune->extra_tuning_flags,
10370 "tune=");
10373 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10374 we understand. If it is, extract the option string and hand it off to
10375 the appropriate function. */
10377 void
10378 aarch64_parse_one_override_token (const char* token,
10379 size_t length,
10380 struct tune_params *tune)
10382 const struct aarch64_tuning_override_function *fn
10383 = aarch64_tuning_override_functions;
10385 const char *option_part = strchr (token, '=');
10386 if (!option_part)
10388 error ("tuning string missing in option (%s)", token);
10389 return;
10392 /* Get the length of the option name. */
10393 length = option_part - token;
10394 /* Skip the '=' to get to the option string. */
10395 option_part++;
10397 for (; fn->name != NULL; fn++)
10399 if (!strncmp (fn->name, token, length))
10401 fn->parse_override (option_part, tune);
10402 return;
10406 error ("unknown tuning option (%s)", token);
10407 return;
10410 /* Validate and clamp the requested TLS size for the code model in use. */
10412 static void
10413 initialize_aarch64_tls_size (struct gcc_options *opts)
10415 if (aarch64_tls_size == 0)
10416 aarch64_tls_size = 24;
10418 switch (opts->x_aarch64_cmodel_var)
10420 case AARCH64_CMODEL_TINY:
10421 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
10422 needs two instructions to address, so we clamp the size to 24 bits. */
10423 if (aarch64_tls_size > 24)
10424 aarch64_tls_size = 24;
10425 break;
10426 case AARCH64_CMODEL_SMALL:
10427 /* The maximum TLS size allowed under small is 4G. */
10428 if (aarch64_tls_size > 32)
10429 aarch64_tls_size = 32;
10430 break;
10431 case AARCH64_CMODEL_LARGE:
10432 /* The maximum TLS size allowed under large is 16E.
10433 FIXME: 16E would need a 64-bit offset, but we only support a 48-bit offset now. */
10434 if (aarch64_tls_size > 48)
10435 aarch64_tls_size = 48;
10436 break;
10437 default:
10438 gcc_unreachable ();
10441 return;
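/* For example, -mtls-size=32 is honoured under -mcmodel=small but clamped
   to 24 under -mcmodel=tiny, and when no size is given the default of 24
   bits fits every code model, so the clamping above is a no-op.  */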
10444 /* Parse STRING looking for options in the format:
10445 string :: option:string
10446 option :: name=substring
10447 name :: {a-z}
10448 substring :: defined by option. */
10450 static void
10451 aarch64_parse_override_string (const char* input_string,
10452 struct tune_params* tune)
10454 const char separator = ':';
10455 size_t string_length = strlen (input_string) + 1;
10456 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10457 char *string = string_root;
10458 strncpy (string, input_string, string_length);
10459 string[string_length - 1] = '\0';
10461 char* ntoken = string;
10463 while ((ntoken = strchr (string, separator)))
10465 size_t token_length = ntoken - string;
10466 /* Make this substring look like a string. */
10467 *ntoken = '\0';
10468 aarch64_parse_one_override_token (string, token_length, tune);
10469 string = ++ntoken;
10472 /* One last option to parse. */
10473 aarch64_parse_one_override_token (string, strlen (string), tune);
10474 free (string_root);
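/* For example, -moverride=fuse=adrp+add.cmp+branch:tune=<flags> is split at
   ':' above into "fuse=adrp+add.cmp+branch" and "tune=<flags>"; each token
   is then matched against aarch64_tuning_override_functions by
   aarch64_parse_one_override_token, which splits it at '=' and hands the
   value to aarch64_parse_fuse_string or aarch64_parse_tune_string, both of
   which split their argument at '.' via aarch64_parse_boolean_options.
   The fusion pair names used here are the ones in aarch64_fusible_pairs.  */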
10478 static void
10479 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10481 /* PR 70044: We have to be careful about being called multiple times for the
10482 same function. This means all changes should be repeatable. */
10484 /* If the frame pointer is enabled, set it to a special value that behaves
10485 similarly to frame pointer omission. If we don't do this, all leaf functions
10486 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10487 If flag_omit_frame_pointer has this special value, we must force the
10488 frame pointer if not in a leaf function. We also need to force it in a
10489 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10490 if (opts->x_flag_omit_frame_pointer == 0)
10491 opts->x_flag_omit_frame_pointer = 2;
10493 /* If not optimizing for size, set the default
10494 alignment to what the target wants. */
10495 if (!opts->x_optimize_size)
10497 if (opts->x_align_loops <= 0)
10498 opts->x_align_loops = aarch64_tune_params.loop_align;
10499 if (opts->x_align_jumps <= 0)
10500 opts->x_align_jumps = aarch64_tune_params.jump_align;
10501 if (opts->x_align_functions <= 0)
10502 opts->x_align_functions = aarch64_tune_params.function_align;
10505 /* We default to no pc-relative literal loads. */
10507 aarch64_pcrelative_literal_loads = false;
10509 /* If -mpc-relative-literal-loads is set on the command line, this
10510 implies that the user asked for PC relative literal loads. */
10511 if (opts->x_pcrelative_literal_loads == 1)
10512 aarch64_pcrelative_literal_loads = true;
10514 /* In the tiny memory model it makes no sense to disallow PC relative
10515 literal pool loads. */
10516 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10517 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10518 aarch64_pcrelative_literal_loads = true;
10520 /* When enabling the lower precision Newton series for the square root, also
10521 enable it for the reciprocal square root, since the latter is an
10522 intermediary step for the former. */
10523 if (flag_mlow_precision_sqrt)
10524 flag_mrecip_low_precision_sqrt = true;
10527 /* 'Unpack' the internal tuning structs and update the options
10528 in OPTS. The caller must have set up selected_tune and selected_arch
10529 as all the other target-specific codegen decisions are
10530 derived from them. */
10532 void
10533 aarch64_override_options_internal (struct gcc_options *opts)
10535 aarch64_tune_flags = selected_tune->flags;
10536 aarch64_tune = selected_tune->sched_core;
10537 /* Make a copy of the tuning parameters attached to the core, which
10538 we may later overwrite. */
10539 aarch64_tune_params = *(selected_tune->tune);
10540 aarch64_architecture_version = selected_arch->architecture_version;
10542 if (opts->x_aarch64_override_tune_string)
10543 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10544 &aarch64_tune_params);
10546 /* This target defaults to strict volatile bitfields. */
10547 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10548 opts->x_flag_strict_volatile_bitfields = 1;
10550 initialize_aarch64_code_model (opts);
10551 initialize_aarch64_tls_size (opts);
10553 int queue_depth = 0;
10554 switch (aarch64_tune_params.autoprefetcher_model)
10556 case tune_params::AUTOPREFETCHER_OFF:
10557 queue_depth = -1;
10558 break;
10559 case tune_params::AUTOPREFETCHER_WEAK:
10560 queue_depth = 0;
10561 break;
10562 case tune_params::AUTOPREFETCHER_STRONG:
10563 queue_depth = max_insn_queue_index + 1;
10564 break;
10565 default:
10566 gcc_unreachable ();
10569 /* We don't mind passing in global_options_set here as we don't use
10570 the *options_set structs anyway. */
10571 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10572 queue_depth,
10573 opts->x_param_values,
10574 global_options_set.x_param_values);
10576 /* Set up parameters to be used in prefetching algorithm. Do not
10577 override the defaults unless we are tuning for a core we have
10578 researched values for. */
10579 if (aarch64_tune_params.prefetch->num_slots > 0)
10580 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10581 aarch64_tune_params.prefetch->num_slots,
10582 opts->x_param_values,
10583 global_options_set.x_param_values);
10584 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10585 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10586 aarch64_tune_params.prefetch->l1_cache_size,
10587 opts->x_param_values,
10588 global_options_set.x_param_values);
10589 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10590 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10591 aarch64_tune_params.prefetch->l1_cache_line_size,
10592 opts->x_param_values,
10593 global_options_set.x_param_values);
10594 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10595 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10596 aarch64_tune_params.prefetch->l2_cache_size,
10597 opts->x_param_values,
10598 global_options_set.x_param_values);
10600 /* Use the alternative scheduling-pressure algorithm by default. */
10601 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10602 opts->x_param_values,
10603 global_options_set.x_param_values);
10605 /* Enable software prefetching at the specified optimization level for
10606 CPUs that have prefetch tuning data. Lower the optimization level threshold by 1
10607 when profiling is enabled. */
10608 if (opts->x_flag_prefetch_loop_arrays < 0
10609 && !opts->x_optimize_size
10610 && aarch64_tune_params.prefetch->default_opt_level >= 0
10611 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10612 opts->x_flag_prefetch_loop_arrays = 1;
10614 aarch64_override_options_after_change_1 (opts);
10617 /* Print a hint with a suggestion for a core or architecture name that
10618 most closely resembles what the user passed in STR. ARCH is true if
10619 the user is asking for an architecture name. ARCH is false if the user
10620 is asking for a core name. */
10622 static void
10623 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10625 auto_vec<const char *> candidates;
10626 const struct processor *entry = arch ? all_architectures : all_cores;
10627 for (; entry->name != NULL; entry++)
10628 candidates.safe_push (entry->name);
10630 #ifdef HAVE_LOCAL_CPU_DETECT
10631 /* Add also "native" as possible value. */
10632 if (arch)
10633 candidates.safe_push ("native");
10634 #endif
10636 char *s;
10637 const char *hint = candidates_list_and_hint (str, s, candidates);
10638 if (hint)
10639 inform (input_location, "valid arguments are: %s;"
10640 " did you mean %qs?", s, hint);
10641 else
10642 inform (input_location, "valid arguments are: %s", s);
10644 XDELETEVEC (s);
10647 /* Print a hint with a suggestion for a core name that most closely resembles
10648 what the user passed in STR. */
10650 inline static void
10651 aarch64_print_hint_for_core (const char *str)
10653 aarch64_print_hint_for_core_or_arch (str, false);
10656 /* Print a hint with a suggestion for an architecture name that most closely
10657 resembles what the user passed in STR. */
10659 inline static void
10660 aarch64_print_hint_for_arch (const char *str)
10662 aarch64_print_hint_for_core_or_arch (str, true);
10665 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10666 specified in STR and throw errors if appropriate. Put the results if
10667 they are valid in RES and ISA_FLAGS. Return whether the option is
10668 valid. */
10670 static bool
10671 aarch64_validate_mcpu (const char *str, const struct processor **res,
10672 unsigned long *isa_flags)
10674 enum aarch64_parse_opt_result parse_res
10675 = aarch64_parse_cpu (str, res, isa_flags);
10677 if (parse_res == AARCH64_PARSE_OK)
10678 return true;
10680 switch (parse_res)
10682 case AARCH64_PARSE_MISSING_ARG:
10683 error ("missing cpu name in %<-mcpu=%s%>", str);
10684 break;
10685 case AARCH64_PARSE_INVALID_ARG:
10686 error ("unknown value %qs for -mcpu", str);
10687 aarch64_print_hint_for_core (str);
10688 break;
10689 case AARCH64_PARSE_INVALID_FEATURE:
10690 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10691 break;
10692 default:
10693 gcc_unreachable ();
10696 return false;
10699 /* Validate a command-line -march option. Parse the arch and extensions
10700 (if any) specified in STR and throw errors if appropriate. Put the
10701 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10702 option is valid. */
10704 static bool
10705 aarch64_validate_march (const char *str, const struct processor **res,
10706 unsigned long *isa_flags)
10708 enum aarch64_parse_opt_result parse_res
10709 = aarch64_parse_arch (str, res, isa_flags);
10711 if (parse_res == AARCH64_PARSE_OK)
10712 return true;
10714 switch (parse_res)
10716 case AARCH64_PARSE_MISSING_ARG:
10717 error ("missing arch name in %<-march=%s%>", str);
10718 break;
10719 case AARCH64_PARSE_INVALID_ARG:
10720 error ("unknown value %qs for -march", str);
10721 aarch64_print_hint_for_arch (str);
10722 break;
10723 case AARCH64_PARSE_INVALID_FEATURE:
10724 error ("invalid feature modifier in %<-march=%s%>", str);
10725 break;
10726 default:
10727 gcc_unreachable ();
10730 return false;
10733 /* Validate a command-line -mtune option. Parse the cpu
10734 specified in STR and throw errors if appropriate. Put the
10735 result, if it is valid, in RES. Return whether the option is
10736 valid. */
10738 static bool
10739 aarch64_validate_mtune (const char *str, const struct processor **res)
10741 enum aarch64_parse_opt_result parse_res
10742 = aarch64_parse_tune (str, res);
10744 if (parse_res == AARCH64_PARSE_OK)
10745 return true;
10747 switch (parse_res)
10749 case AARCH64_PARSE_MISSING_ARG:
10750 error ("missing cpu name in %<-mtune=%s%>", str);
10751 break;
10752 case AARCH64_PARSE_INVALID_ARG:
10753 error ("unknown value %qs for -mtune", str);
10754 aarch64_print_hint_for_core (str);
10755 break;
10756 default:
10757 gcc_unreachable ();
10759 return false;
10762 /* Return the CPU corresponding to the enum CPU.
10763 If it doesn't specify a cpu, return the default. */
10765 static const struct processor *
10766 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10768 if (cpu != aarch64_none)
10769 return &all_cores[cpu];
10771 /* The & 0x3f is to extract the bottom 6 bits that encode the
10772 default cpu as selected by the --with-cpu GCC configure option
10773 in config.gcc.
10774 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10775 flags mechanism should be reworked to make it more sane. */
10776 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
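/* In other words, TARGET_CPU_DEFAULT packs the configure-time core index
   into its bottom 6 bits, with the corresponding default ISA flags in the
   bits above them; aarch64_override_options recovers the flags with
   TARGET_CPU_DEFAULT >> 6.  */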
10779 /* Return the architecture corresponding to the enum ARCH.
10780 If it doesn't specify a valid architecture, return the default. */
10782 static const struct processor *
10783 aarch64_get_arch (enum aarch64_arch arch)
10785 if (arch != aarch64_no_arch)
10786 return &all_architectures[arch];
10788 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10790 return &all_architectures[cpu->arch];
10793 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10795 static poly_uint16
10796 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10798 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10799 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10800 deciding which .md file patterns to use and when deciding whether
10801 something is a legitimate address or constant. */
10802 if (value == SVE_SCALABLE || value == SVE_128)
10803 return poly_uint16 (2, 2);
10804 else
10805 return (int) value / 64;
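/* For example, -msve-vector-bits=256 yields a VG of 4 and
   -msve-vector-bits=512 a VG of 8 (the value divided by the 64 bits per
   granule), while both "scalable" and 128 yield the indeterminate
   poly_uint16 (2, 2).  */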
10808 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10809 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10810 tuning structs. In particular it must set selected_tune and
10811 aarch64_isa_flags that define the available ISA features and tuning
10812 decisions. It must also set selected_arch as this will be used to
10813 output the .arch asm tags for each function. */
10815 static void
10816 aarch64_override_options (void)
10818 unsigned long cpu_isa = 0;
10819 unsigned long arch_isa = 0;
10820 aarch64_isa_flags = 0;
10822 bool valid_cpu = true;
10823 bool valid_tune = true;
10824 bool valid_arch = true;
10826 selected_cpu = NULL;
10827 selected_arch = NULL;
10828 selected_tune = NULL;
10830 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10831 If either of -march or -mtune is given, they override their
10832 respective component of -mcpu. */
10833 if (aarch64_cpu_string)
10834 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10835 &cpu_isa);
10837 if (aarch64_arch_string)
10838 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10839 &arch_isa);
10841 if (aarch64_tune_string)
10842 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10844 /* If the user did not specify a processor, choose the default
10845 one for them. This will be the CPU set during configuration using
10846 --with-cpu, otherwise it is "generic". */
10847 if (!selected_cpu)
10849 if (selected_arch)
10851 selected_cpu = &all_cores[selected_arch->ident];
10852 aarch64_isa_flags = arch_isa;
10853 explicit_arch = selected_arch->arch;
10855 else
10857 /* Get default configure-time CPU. */
10858 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10859 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10862 if (selected_tune)
10863 explicit_tune_core = selected_tune->ident;
10865 /* If both -mcpu and -march are specified check that they are architecturally
10866 compatible, warn if they're not and prefer the -march ISA flags. */
10867 else if (selected_arch)
10869 if (selected_arch->arch != selected_cpu->arch)
10871 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10872 all_architectures[selected_cpu->arch].name,
10873 selected_arch->name);
10875 aarch64_isa_flags = arch_isa;
10876 explicit_arch = selected_arch->arch;
10877 explicit_tune_core = selected_tune ? selected_tune->ident
10878 : selected_cpu->ident;
10880 else
10882 /* -mcpu but no -march. */
10883 aarch64_isa_flags = cpu_isa;
10884 explicit_tune_core = selected_tune ? selected_tune->ident
10885 : selected_cpu->ident;
10886 gcc_assert (selected_cpu);
10887 selected_arch = &all_architectures[selected_cpu->arch];
10888 explicit_arch = selected_arch->arch;
10891 /* Set the arch as well, as we will need it when outputting
10892 the .arch directive in assembly. */
10893 if (!selected_arch)
10895 gcc_assert (selected_cpu);
10896 selected_arch = &all_architectures[selected_cpu->arch];
10899 if (!selected_tune)
10900 selected_tune = selected_cpu;
10902 #ifndef HAVE_AS_MABI_OPTION
10903 /* The compiler may have been configured with 2.23.* binutils, which does
10904 not have support for ILP32. */
10905 if (TARGET_ILP32)
10906 error ("assembler does not support -mabi=ilp32");
10907 #endif
10909 /* Convert -msve-vector-bits to a VG count. */
10910 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10912 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10913 sorry ("return address signing is only supported for -mabi=lp64");
10915 /* Make sure we properly set up the explicit options. */
10916 if ((aarch64_cpu_string && valid_cpu)
10917 || (aarch64_tune_string && valid_tune))
10918 gcc_assert (explicit_tune_core != aarch64_none);
10920 if ((aarch64_cpu_string && valid_cpu)
10921 || (aarch64_arch_string && valid_arch))
10922 gcc_assert (explicit_arch != aarch64_no_arch);
10924 aarch64_override_options_internal (&global_options);
10926 /* Save these options as the default ones in case we push and pop them later
10927 while processing functions with potential target attributes. */
10928 target_option_default_node = target_option_current_node
10929 = build_target_option_node (&global_options);
10932 /* Implement targetm.override_options_after_change. */
10934 static void
10935 aarch64_override_options_after_change (void)
10937 aarch64_override_options_after_change_1 (&global_options);
10940 static struct machine_function *
10941 aarch64_init_machine_status (void)
10943 struct machine_function *machine;
10944 machine = ggc_cleared_alloc<machine_function> ();
10945 return machine;
10948 void
10949 aarch64_init_expanders (void)
10951 init_machine_status = aarch64_init_machine_status;
10954 /* Select the code model to use based on -mcmodel and whether PIC code was requested. */
10955 static void
10956 initialize_aarch64_code_model (struct gcc_options *opts)
10958 if (opts->x_flag_pic)
10960 switch (opts->x_aarch64_cmodel_var)
10962 case AARCH64_CMODEL_TINY:
10963 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10964 break;
10965 case AARCH64_CMODEL_SMALL:
10966 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10967 aarch64_cmodel = (flag_pic == 2
10968 ? AARCH64_CMODEL_SMALL_PIC
10969 : AARCH64_CMODEL_SMALL_SPIC);
10970 #else
10971 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10972 #endif
10973 break;
10974 case AARCH64_CMODEL_LARGE:
10975 sorry ("code model %qs with -f%s", "large",
10976 opts->x_flag_pic > 1 ? "PIC" : "pic");
10977 break;
10978 default:
10979 gcc_unreachable ();
10982 else
10983 aarch64_cmodel = opts->x_aarch64_cmodel_var;
10986 /* Implement TARGET_OPTION_SAVE. */
10988 static void
10989 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10991 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10994 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10995 using the information saved in PTR. */
10997 static void
10998 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11000 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11001 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11002 opts->x_explicit_arch = ptr->x_explicit_arch;
11003 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11004 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11006 aarch64_override_options_internal (opts);
11009 /* Implement TARGET_OPTION_PRINT. */
11011 static void
11012 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11014 const struct processor *cpu
11015 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11016 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11017 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11018 std::string extension
11019 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11021 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11022 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11023 arch->name, extension.c_str ());
11026 static GTY(()) tree aarch64_previous_fndecl;
11028 void
11029 aarch64_reset_previous_fndecl (void)
11031 aarch64_previous_fndecl = NULL;
11034 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11035 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11036 make sure optab availability predicates are recomputed when necessary. */
11038 void
11039 aarch64_save_restore_target_globals (tree new_tree)
11041 if (TREE_TARGET_GLOBALS (new_tree))
11042 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11043 else if (new_tree == target_option_default_node)
11044 restore_target_globals (&default_target_globals);
11045 else
11046 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11049 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11050 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11051 of the function, if such exists. This function may be called multiple
11052 times on a single function so use aarch64_previous_fndecl to avoid
11053 setting up identical state. */
11055 static void
11056 aarch64_set_current_function (tree fndecl)
11058 if (!fndecl || fndecl == aarch64_previous_fndecl)
11059 return;
11061 tree old_tree = (aarch64_previous_fndecl
11062 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11063 : NULL_TREE);
11065 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11067 /* If current function has no attributes but the previous one did,
11068 use the default node. */
11069 if (!new_tree && old_tree)
11070 new_tree = target_option_default_node;
11072 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11073 the default have been handled by aarch64_save_restore_target_globals from
11074 aarch64_pragma_target_parse. */
11075 if (old_tree == new_tree)
11076 return;
11078 aarch64_previous_fndecl = fndecl;
11080 /* First set the target options. */
11081 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11083 aarch64_save_restore_target_globals (new_tree);
11086 /* Enum describing the various ways we can handle attributes.
11087 In many cases we can reuse the generic option handling machinery. */
11089 enum aarch64_attr_opt_type
11091 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11092 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11093 aarch64_attr_enum, /* Attribute sets an enum variable. */
11094 aarch64_attr_custom /* Attribute requires a custom handling function. */
11097 /* All the information needed to handle a target attribute.
11098 NAME is the name of the attribute.
11099 ATTR_TYPE specifies the type of behavior of the attribute as described
11100 in the definition of enum aarch64_attr_opt_type.
11101 ALLOW_NEG is true if the attribute supports a "no-" form.
11102 HANDLER is the function that takes the attribute string as an argument
11103 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11104 OPT_NUM is the enum specifying the option that the attribute modifies.
11105 This is needed for attributes that mirror the behavior of a command-line
11106 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11107 aarch64_attr_enum. */
11109 struct aarch64_attribute_info
11111 const char *name;
11112 enum aarch64_attr_opt_type attr_type;
11113 bool allow_neg;
11114 bool (*handler) (const char *);
11115 enum opt_code opt_num;
11118 /* Handle the ARCH_STR argument to the arch= target attribute. */
11120 static bool
11121 aarch64_handle_attr_arch (const char *str)
11123 const struct processor *tmp_arch = NULL;
11124 enum aarch64_parse_opt_result parse_res
11125 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11127 if (parse_res == AARCH64_PARSE_OK)
11129 gcc_assert (tmp_arch);
11130 selected_arch = tmp_arch;
11131 explicit_arch = selected_arch->arch;
11132 return true;
11135 switch (parse_res)
11137 case AARCH64_PARSE_MISSING_ARG:
11138 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11139 break;
11140 case AARCH64_PARSE_INVALID_ARG:
11141 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11142 aarch64_print_hint_for_arch (str);
11143 break;
11144 case AARCH64_PARSE_INVALID_FEATURE:
11145 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11146 break;
11147 default:
11148 gcc_unreachable ();
11151 return false;
11154 /* Handle the argument CPU_STR to the cpu= target attribute. */
11156 static bool
11157 aarch64_handle_attr_cpu (const char *str)
11159 const struct processor *tmp_cpu = NULL;
11160 enum aarch64_parse_opt_result parse_res
11161 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11163 if (parse_res == AARCH64_PARSE_OK)
11165 gcc_assert (tmp_cpu);
11166 selected_tune = tmp_cpu;
11167 explicit_tune_core = selected_tune->ident;
11169 selected_arch = &all_architectures[tmp_cpu->arch];
11170 explicit_arch = selected_arch->arch;
11171 return true;
11174 switch (parse_res)
11176 case AARCH64_PARSE_MISSING_ARG:
11177 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11178 break;
11179 case AARCH64_PARSE_INVALID_ARG:
11180 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11181 aarch64_print_hint_for_core (str);
11182 break;
11183 case AARCH64_PARSE_INVALID_FEATURE:
11184 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11185 break;
11186 default:
11187 gcc_unreachable ();
11190 return false;
11193 /* Handle the argument STR to the tune= target attribute. */
11195 static bool
11196 aarch64_handle_attr_tune (const char *str)
11198 const struct processor *tmp_tune = NULL;
11199 enum aarch64_parse_opt_result parse_res
11200 = aarch64_parse_tune (str, &tmp_tune);
11202 if (parse_res == AARCH64_PARSE_OK)
11204 gcc_assert (tmp_tune);
11205 selected_tune = tmp_tune;
11206 explicit_tune_core = selected_tune->ident;
11207 return true;
11210 switch (parse_res)
11212 case AARCH64_PARSE_INVALID_ARG:
11213 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11214 aarch64_print_hint_for_core (str);
11215 break;
11216 default:
11217 gcc_unreachable ();
11220 return false;
11223 /* Parse an architecture extensions target attribute string specified in STR.
11224 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11225 if successful. Update aarch64_isa_flags to reflect the ISA features
11226 modified. */
11228 static bool
11229 aarch64_handle_attr_isa_flags (char *str)
11231 enum aarch64_parse_opt_result parse_res;
11232 unsigned long isa_flags = aarch64_isa_flags;
11234 /* We allow "+nothing" in the beginning to clear out all architectural
11235 features if the user wants to handpick specific features. */
11236 if (strncmp ("+nothing", str, 8) == 0)
11238 isa_flags = 0;
11239 str += 8;
11242 parse_res = aarch64_parse_extension (str, &isa_flags);
11244 if (parse_res == AARCH64_PARSE_OK)
11246 aarch64_isa_flags = isa_flags;
11247 return true;
11250 switch (parse_res)
11252 case AARCH64_PARSE_MISSING_ARG:
11253 error ("missing value in %<target()%> pragma or attribute");
11254 break;
11256 case AARCH64_PARSE_INVALID_FEATURE:
11257 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11258 break;
11260 default:
11261 gcc_unreachable ();
11264 return false;
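/* For example, __attribute__ ((target ("+nothing+fp"))) first clears every
   architectural feature bit and then lets aarch64_parse_extension turn the
   FP extension (and anything it implies) back on, whereas
   __attribute__ ((target ("+nosimd"))) starts from the current
   aarch64_isa_flags and removes SIMD.  */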
11267 /* The target attributes that we support. On top of these we also support just
11268 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11269 handled explicitly in aarch64_process_one_target_attr. */
11271 static const struct aarch64_attribute_info aarch64_attributes[] =
11273 { "general-regs-only", aarch64_attr_mask, false, NULL,
11274 OPT_mgeneral_regs_only },
11275 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11276 OPT_mfix_cortex_a53_835769 },
11277 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11278 OPT_mfix_cortex_a53_843419 },
11279 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11280 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11281 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11282 OPT_momit_leaf_frame_pointer },
11283 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11284 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11285 OPT_march_ },
11286 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11287 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11288 OPT_mtune_ },
11289 { "sign-return-address", aarch64_attr_enum, false, NULL,
11290 OPT_msign_return_address_ },
11291 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11294 /* Parse ARG_STR which contains the definition of one target attribute.
11295 Show appropriate errors if any or return true if the attribute is valid. */
11297 static bool
11298 aarch64_process_one_target_attr (char *arg_str)
11300 bool invert = false;
11302 size_t len = strlen (arg_str);
11304 if (len == 0)
11306 error ("malformed %<target()%> pragma or attribute");
11307 return false;
11310 char *str_to_check = (char *) alloca (len + 1);
11311 strcpy (str_to_check, arg_str);
11313 /* Skip leading whitespace. */
11314 while (*str_to_check == ' ' || *str_to_check == '\t')
11315 str_to_check++;
11317 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11318 It is easier to detect and handle it explicitly here rather than going
11319 through the machinery for the rest of the target attributes in this
11320 function. */
11321 if (*str_to_check == '+')
11322 return aarch64_handle_attr_isa_flags (str_to_check);
11324 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11326 invert = true;
11327 str_to_check += 3;
11329 char *arg = strchr (str_to_check, '=');
11331 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11332 and point ARG to "foo". */
11333 if (arg)
11335 *arg = '\0';
11336 arg++;
11338 const struct aarch64_attribute_info *p_attr;
11339 bool found = false;
11340 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11342 /* If the names don't match up, or the user has given an argument
11343 to an attribute that doesn't accept one, or didn't give an argument
11344 to an attribute that expects one, fail to match. */
11345 if (strcmp (str_to_check, p_attr->name) != 0)
11346 continue;
11348 found = true;
11349 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11350 || p_attr->attr_type == aarch64_attr_enum;
11352 if (attr_need_arg_p ^ (arg != NULL))
11354 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11355 return false;
11358 /* If the name matches but the attribute does not allow "no-" versions
11359 then we can't match. */
11360 if (invert && !p_attr->allow_neg)
11362 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11363 return false;
11366 switch (p_attr->attr_type)
11368 /* Has a custom handler registered.
11369 For example, cpu=, arch=, tune=. */
11370 case aarch64_attr_custom:
11371 gcc_assert (p_attr->handler);
11372 if (!p_attr->handler (arg))
11373 return false;
11374 break;
11376 /* Either set or unset a boolean option. */
11377 case aarch64_attr_bool:
11379 struct cl_decoded_option decoded;
11381 generate_option (p_attr->opt_num, NULL, !invert,
11382 CL_TARGET, &decoded);
11383 aarch64_handle_option (&global_options, &global_options_set,
11384 &decoded, input_location);
11385 break;
11387 /* Set or unset a bit in the target_flags. aarch64_handle_option
11388 should know what mask to apply given the option number. */
11389 case aarch64_attr_mask:
11391 struct cl_decoded_option decoded;
11392 /* We only need to specify the option number.
11393 aarch64_handle_option will know which mask to apply. */
11394 decoded.opt_index = p_attr->opt_num;
11395 decoded.value = !invert;
11396 aarch64_handle_option (&global_options, &global_options_set,
11397 &decoded, input_location);
11398 break;
11400 /* Use the option setting machinery to set an option to an enum. */
11401 case aarch64_attr_enum:
11403 gcc_assert (arg);
11404 bool valid;
11405 int value;
11406 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11407 &value, CL_TARGET);
11408 if (valid)
11410 set_option (&global_options, NULL, p_attr->opt_num, value,
11411 NULL, DK_UNSPECIFIED, input_location,
11412 global_dc);
11414 else
11416 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11418 break;
11420 default:
11421 gcc_unreachable ();
11425 /* If we reached here we either have found an attribute and validated
11426 it or didn't match any. If we matched an attribute but its arguments
11427 were malformed we will have returned false already. */
11428 return found;
11431 /* Count how many times the character C appears in
11432 NULL-terminated string STR. */
11434 static unsigned int
11435 num_occurences_in_str (char c, char *str)
11437 unsigned int res = 0;
11438 while (*str != '\0')
11440 if (*str == c)
11441 res++;
11443 str++;
11446 return res;
11449 /* Parse the tree in ARGS that contains the target attribute information
11450 and update the global target options space. */
11452 bool
11453 aarch64_process_target_attr (tree args)
11455 if (TREE_CODE (args) == TREE_LIST)
11459 tree head = TREE_VALUE (args);
11460 if (head)
11462 if (!aarch64_process_target_attr (head))
11463 return false;
11465 args = TREE_CHAIN (args);
11466 } while (args);
11468 return true;
11471 if (TREE_CODE (args) != STRING_CST)
11473 error ("attribute %<target%> argument not a string");
11474 return false;
11477 size_t len = strlen (TREE_STRING_POINTER (args));
11478 char *str_to_check = (char *) alloca (len + 1);
11479 strcpy (str_to_check, TREE_STRING_POINTER (args));
11481 if (len == 0)
11483 error ("malformed %<target()%> pragma or attribute");
11484 return false;
11487 /* Used to catch empty tokens between commas, e.g.
11488 attribute ((target ("attr1,,attr2"))). */
11489 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11491 /* Handle multiple target attributes separated by ','. */
11492 char *token = strtok (str_to_check, ",");
11494 unsigned int num_attrs = 0;
11495 while (token)
11497 num_attrs++;
11498 if (!aarch64_process_one_target_attr (token))
11500 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11501 return false;
11504 token = strtok (NULL, ",");
11507 if (num_attrs != num_commas + 1)
11509 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11510 return false;
11513 return true;
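/* For example, __attribute__ ((target ("arch=armv8-a,strict-align"))) is
   split at ',' into two tokens that are processed independently above,
   while "attr1,,attr2" is rejected because the number of parsed attributes
   (2) does not match the number of commas plus one (3).  */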
11516 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11517 process attribute ((target ("..."))). */
11519 static bool
11520 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11522 struct cl_target_option cur_target;
11523 bool ret;
11524 tree old_optimize;
11525 tree new_target, new_optimize;
11526 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11528 /* If what we're processing is the current pragma string then the
11529 target option node is already stored in target_option_current_node
11530 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11531 having to re-parse the string. This is especially useful to keep
11532 arm_neon.h compile times down since that header contains a lot
11533 of intrinsics enclosed in pragmas. */
11534 if (!existing_target && args == current_target_pragma)
11536 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11537 return true;
11539 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11541 old_optimize = build_optimization_node (&global_options);
11542 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11544 /* If the function changed the optimization levels as well as setting
11545 target options, start with the optimizations specified. */
11546 if (func_optimize && func_optimize != old_optimize)
11547 cl_optimization_restore (&global_options,
11548 TREE_OPTIMIZATION (func_optimize));
11550 /* Save the current target options to restore at the end. */
11551 cl_target_option_save (&cur_target, &global_options);
11553 /* If fndecl already has some target attributes applied to it, unpack
11554 them so that we add this attribute on top of them, rather than
11555 overwriting them. */
11556 if (existing_target)
11558 struct cl_target_option *existing_options
11559 = TREE_TARGET_OPTION (existing_target);
11561 if (existing_options)
11562 cl_target_option_restore (&global_options, existing_options);
11564 else
11565 cl_target_option_restore (&global_options,
11566 TREE_TARGET_OPTION (target_option_current_node));
11568 ret = aarch64_process_target_attr (args);
11570 /* Set up any additional state. */
11571 if (ret)
11573 aarch64_override_options_internal (&global_options);
11574 /* Initialize SIMD builtins if we haven't already.
11575 Set current_target_pragma to NULL for the duration so that
11576 the builtin initialization code doesn't try to tag the functions
11577 being built with the attributes specified by any current pragma, thus
11578 going into an infinite recursion. */
11579 if (TARGET_SIMD)
11581 tree saved_current_target_pragma = current_target_pragma;
11582 current_target_pragma = NULL;
11583 aarch64_init_simd_builtins ();
11584 current_target_pragma = saved_current_target_pragma;
11586 new_target = build_target_option_node (&global_options);
11588 else
11589 new_target = NULL;
11591 new_optimize = build_optimization_node (&global_options);
11593 if (fndecl && ret)
11595 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11597 if (old_optimize != new_optimize)
11598 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11601 cl_target_option_restore (&global_options, &cur_target);
11603 if (old_optimize != new_optimize)
11604 cl_optimization_restore (&global_options,
11605 TREE_OPTIMIZATION (old_optimize));
11606 return ret;
11609 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11610 tri-bool options (yes, no, don't care) and the default value is
11611 DEF, determine whether to reject inlining. */
11613 static bool
11614 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11615 int dont_care, int def)
11617 /* If the callee doesn't care, always allow inlining. */
11618 if (callee == dont_care)
11619 return true;
11621 /* If the caller doesn't care, always allow inlining. */
11622 if (caller == dont_care)
11623 return true;
11625 /* Otherwise, allow inlining if either the callee and caller values
11626 agree, or if the callee is using the default value. */
11627 return (callee == caller || callee == def);
11630 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11631 to inline CALLEE into CALLER based on target-specific info.
11632 Make sure that the caller and callee have compatible architectural
11633 features. Then go through the other possible target attributes
11634 and see if they can block inlining. Try not to reject always_inline
11635 callees unless they are incompatible architecturally. */
11637 static bool
11638 aarch64_can_inline_p (tree caller, tree callee)
11640 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11641 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11643 /* If callee has no option attributes, then it is ok to inline. */
11644 if (!callee_tree)
11645 return true;
11647 struct cl_target_option *caller_opts
11648 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11649 : target_option_default_node);
11651 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11654 /* Callee's ISA flags should be a subset of the caller's. */
11655 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11656 != callee_opts->x_aarch64_isa_flags)
11657 return false;
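/* Illustrative flag values for the subset test above: a caller built
   with +fp+simd+crypto can inline a callee tagged with only +simd,
   since ANDing the two flag sets reproduces the callee's flags; a
   callee requiring +sve under a caller without SVE fails the test.  */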
11659 /* Allow non-strict aligned functions inlining into strict
11660 aligned ones. */
11661 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11662 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11663 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11664 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11665 return false;
11667 bool always_inline = lookup_attribute ("always_inline",
11668 DECL_ATTRIBUTES (callee));
11670 /* If the architectural features match up and the callee is always_inline
11671 then the other attributes don't matter. */
11672 if (always_inline)
11673 return true;
11675 if (caller_opts->x_aarch64_cmodel_var
11676 != callee_opts->x_aarch64_cmodel_var)
11677 return false;
11679 if (caller_opts->x_aarch64_tls_dialect
11680 != callee_opts->x_aarch64_tls_dialect)
11681 return false;
11683 /* Honour explicit requests to workaround errata. */
11684 if (!aarch64_tribools_ok_for_inlining_p (
11685 caller_opts->x_aarch64_fix_a53_err835769,
11686 callee_opts->x_aarch64_fix_a53_err835769,
11687 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11688 return false;
11690 if (!aarch64_tribools_ok_for_inlining_p (
11691 caller_opts->x_aarch64_fix_a53_err843419,
11692 callee_opts->x_aarch64_fix_a53_err843419,
11693 2, TARGET_FIX_ERR_A53_843419))
11694 return false;
11696 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11697 caller and callee and they don't match up, reject inlining. */

11698 if (!aarch64_tribools_ok_for_inlining_p (
11699 caller_opts->x_flag_omit_leaf_frame_pointer,
11700 callee_opts->x_flag_omit_leaf_frame_pointer,
11701 2, 1))
11702 return false;
11704 /* If the callee has specific tuning overrides, respect them. */
11705 if (callee_opts->x_aarch64_override_tune_string != NULL
11706 && caller_opts->x_aarch64_override_tune_string == NULL)
11707 return false;
11709 /* If the user specified tuning override strings for the
11710 caller and callee and they don't match up, reject inlining.
11711 We just do a string compare here, we don't analyze the meaning
11712 of the string, as it would be too costly for little gain. */
11713 if (callee_opts->x_aarch64_override_tune_string
11714 && caller_opts->x_aarch64_override_tune_string
11715 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11716 caller_opts->x_aarch64_override_tune_string) != 0))
11717 return false;
11719 return true;
11722 /* Return true if SYMBOL_REF X binds locally. */
11724 static bool
11725 aarch64_symbol_binds_local_p (const_rtx x)
11727 return (SYMBOL_REF_DECL (x)
11728 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11729 : SYMBOL_REF_LOCAL_P (x));
11732 /* Return true if SYMBOL_REF X is thread-local. */
11733 static bool
11734 aarch64_tls_symbol_p (rtx x)
11736 if (! TARGET_HAVE_TLS)
11737 return false;
11739 if (GET_CODE (x) != SYMBOL_REF)
11740 return false;
11742 return SYMBOL_REF_TLS_MODEL (x) != 0;
11745 /* Classify a TLS symbol into one of the TLS kinds. */
11746 enum aarch64_symbol_type
11747 aarch64_classify_tls_symbol (rtx x)
11749 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11751 switch (tls_kind)
11753 case TLS_MODEL_GLOBAL_DYNAMIC:
11754 case TLS_MODEL_LOCAL_DYNAMIC:
11755 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11757 case TLS_MODEL_INITIAL_EXEC:
11758 switch (aarch64_cmodel)
11760 case AARCH64_CMODEL_TINY:
11761 case AARCH64_CMODEL_TINY_PIC:
11762 return SYMBOL_TINY_TLSIE;
11763 default:
11764 return SYMBOL_SMALL_TLSIE;
11767 case TLS_MODEL_LOCAL_EXEC:
11768 if (aarch64_tls_size == 12)
11769 return SYMBOL_TLSLE12;
11770 else if (aarch64_tls_size == 24)
11771 return SYMBOL_TLSLE24;
11772 else if (aarch64_tls_size == 32)
11773 return SYMBOL_TLSLE32;
11774 else if (aarch64_tls_size == 48)
11775 return SYMBOL_TLSLE48;
11776 else
11777 gcc_unreachable ();
11779 case TLS_MODEL_EMULATED:
11780 case TLS_MODEL_NONE:
11781 return SYMBOL_FORCE_TO_MEM;
11783 default:
11784 gcc_unreachable ();
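/* Illustrative mapping (see the switch above): with -mtls-size=24 a
   local-exec symbol is classified as SYMBOL_TLSLE24, while an
   initial-exec symbol becomes SYMBOL_SMALL_TLSIE unless the tiny code
   model is in use, in which case it becomes SYMBOL_TINY_TLSIE.  */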
11788 /* Return the correct method for accessing X + OFFSET, where X is either
11789 a SYMBOL_REF or LABEL_REF. */
11791 enum aarch64_symbol_type
11792 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11794 if (GET_CODE (x) == LABEL_REF)
11796 switch (aarch64_cmodel)
11798 case AARCH64_CMODEL_LARGE:
11799 return SYMBOL_FORCE_TO_MEM;
11801 case AARCH64_CMODEL_TINY_PIC:
11802 case AARCH64_CMODEL_TINY:
11803 return SYMBOL_TINY_ABSOLUTE;
11805 case AARCH64_CMODEL_SMALL_SPIC:
11806 case AARCH64_CMODEL_SMALL_PIC:
11807 case AARCH64_CMODEL_SMALL:
11808 return SYMBOL_SMALL_ABSOLUTE;
11810 default:
11811 gcc_unreachable ();
11815 if (GET_CODE (x) == SYMBOL_REF)
11817 if (aarch64_tls_symbol_p (x))
11818 return aarch64_classify_tls_symbol (x);
11820 switch (aarch64_cmodel)
11822 case AARCH64_CMODEL_TINY:
11823 /* When we retrieve symbol + offset address, we have to make sure
11824 the offset does not cause overflow of the final address. But
11825 we have no way of knowing the address of symbol at compile time
11826 so we can't accurately say if the distance between the PC and
11827 symbol + offset is outside the addressable range of +/-1M in the
11828 TINY code model. So we rely on images not being larger than 1M,
11829 cap the offset at 1M, and require anything beyond that to be
11830 loaded using an alternative mechanism. Furthermore if the
11831 symbol is a weak reference to something that isn't known to
11832 resolve to a symbol in this module, then force to memory. */
11833 if ((SYMBOL_REF_WEAK (x)
11834 && !aarch64_symbol_binds_local_p (x))
11835 || !IN_RANGE (offset, -1048575, 1048575))
11836 return SYMBOL_FORCE_TO_MEM;
11837 return SYMBOL_TINY_ABSOLUTE;
11839 case AARCH64_CMODEL_SMALL:
11840 /* Same reasoning as the tiny code model, but the offset cap here is
11841 4G. */
11842 if ((SYMBOL_REF_WEAK (x)
11843 && !aarch64_symbol_binds_local_p (x))
11844 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11845 HOST_WIDE_INT_C (4294967264)))
11846 return SYMBOL_FORCE_TO_MEM;
11847 return SYMBOL_SMALL_ABSOLUTE;
11849 case AARCH64_CMODEL_TINY_PIC:
11850 if (!aarch64_symbol_binds_local_p (x))
11851 return SYMBOL_TINY_GOT;
11852 return SYMBOL_TINY_ABSOLUTE;
11854 case AARCH64_CMODEL_SMALL_SPIC:
11855 case AARCH64_CMODEL_SMALL_PIC:
11856 if (!aarch64_symbol_binds_local_p (x))
11857 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11858 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11859 return SYMBOL_SMALL_ABSOLUTE;
11861 case AARCH64_CMODEL_LARGE:
11862 /* This is alright even in PIC code as the constant
11863 pool reference is always PC relative and within
11864 the same translation unit. */
11865 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11866 return SYMBOL_SMALL_ABSOLUTE;
11867 else
11868 return SYMBOL_FORCE_TO_MEM;
11870 default:
11871 gcc_unreachable ();
11875 /* By default push everything into the constant pool. */
11876 return SYMBOL_FORCE_TO_MEM;
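/* Illustrative example: under the default small code model, an access
   to a locally bound "sym + 16" is classified as SYMBOL_SMALL_ABSOLUTE
   and can be materialized with an adrp/add pair, whereas an offset
   outside the roughly +/-4G window above, or a weak symbol that may
   stay undefined, is forced into the literal pool via
   SYMBOL_FORCE_TO_MEM.  */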
11879 bool
11880 aarch64_constant_address_p (rtx x)
11882 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11885 bool
11886 aarch64_legitimate_pic_operand_p (rtx x)
11888 if (GET_CODE (x) == SYMBOL_REF
11889 || (GET_CODE (x) == CONST
11890 && GET_CODE (XEXP (x, 0)) == PLUS
11891 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11892 return false;
11894 return true;
11897 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11898 that should be rematerialized rather than spilled. */
11900 static bool
11901 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11903 /* Support CSE and rematerialization of common constants. */
11904 if (CONST_INT_P (x)
11905 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11906 || GET_CODE (x) == CONST_VECTOR)
11907 return true;
11909 /* Do not allow vector struct mode constants for Advanced SIMD.
11910 We could support 0 and -1 easily, but they need support in
11911 aarch64-simd.md. */
11912 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11913 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11914 return false;
11916 /* Only accept variable-length vector constants if they can be
11917 handled directly.
11919 ??? It would be possible to handle rematerialization of other
11920 constants via secondary reloads. */
11921 if (vec_flags & VEC_ANY_SVE)
11922 return aarch64_simd_valid_immediate (x, NULL);
11924 if (GET_CODE (x) == HIGH)
11925 x = XEXP (x, 0);
11927 /* Accept polynomial constants that can be calculated by using the
11928 destination of a move as the sole temporary. Constants that
11929 require a second temporary cannot be rematerialized (they can't be
11930 forced to memory and also aren't legitimate constants). */
11931 poly_int64 offset;
11932 if (poly_int_rtx_p (x, &offset))
11933 return aarch64_offset_temporaries (false, offset) <= 1;
11935 /* If an offset is being added to something else, we need to allow the
11936 base to be moved into the destination register, meaning that there
11937 are no free temporaries for the offset. */
11938 x = strip_offset (x, &offset);
11939 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11940 return false;
11942 /* Do not allow const (plus (anchor_symbol, const_int)). */
11943 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11944 return false;
11946 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11947 so spilling them is better than rematerialization. */
11948 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11949 return true;
11951 /* Label references are always constant. */
11952 if (GET_CODE (x) == LABEL_REF)
11953 return true;
11955 return false;
11959 aarch64_load_tp (rtx target)
11961 if (!target
11962 || GET_MODE (target) != Pmode
11963 || !register_operand (target, Pmode))
11964 target = gen_reg_rtx (Pmode);
11966 /* Can return in any reg. */
11967 emit_insn (gen_aarch64_load_tp_hard (target));
11968 return target;
11971 /* On AAPCS systems, this is the "struct __va_list". */
11972 static GTY(()) tree va_list_type;
11974 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11975 Return the type to use as __builtin_va_list.
11977 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11979 struct __va_list
11981 void *__stack;
11982 void *__gr_top;
11983 void *__vr_top;
11984 int __gr_offs;
11985 int __vr_offs;
11986 }; */
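/* Illustrative layout: on LP64 the structure above occupies 32 bytes
   (three 8-byte pointers followed by two 4-byte offsets) with 8-byte
   alignment; an ILP32 configuration would use 4-byte pointers
   instead.  */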
11988 static tree
11989 aarch64_build_builtin_va_list (void)
11991 tree va_list_name;
11992 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11994 /* Create the type. */
11995 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11996 /* Give it the required name. */
11997 va_list_name = build_decl (BUILTINS_LOCATION,
11998 TYPE_DECL,
11999 get_identifier ("__va_list"),
12000 va_list_type);
12001 DECL_ARTIFICIAL (va_list_name) = 1;
12002 TYPE_NAME (va_list_type) = va_list_name;
12003 TYPE_STUB_DECL (va_list_type) = va_list_name;
12005 /* Create the fields. */
12006 f_stack = build_decl (BUILTINS_LOCATION,
12007 FIELD_DECL, get_identifier ("__stack"),
12008 ptr_type_node);
12009 f_grtop = build_decl (BUILTINS_LOCATION,
12010 FIELD_DECL, get_identifier ("__gr_top"),
12011 ptr_type_node);
12012 f_vrtop = build_decl (BUILTINS_LOCATION,
12013 FIELD_DECL, get_identifier ("__vr_top"),
12014 ptr_type_node);
12015 f_groff = build_decl (BUILTINS_LOCATION,
12016 FIELD_DECL, get_identifier ("__gr_offs"),
12017 integer_type_node);
12018 f_vroff = build_decl (BUILTINS_LOCATION,
12019 FIELD_DECL, get_identifier ("__vr_offs"),
12020 integer_type_node);
12022 /* Tell tree-stdarg pass about our internal offset fields.
12023 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12024 purposes, to identify whether the code is updating the va_list internal
12025 offset fields in an irregular way. */
12026 va_list_gpr_counter_field = f_groff;
12027 va_list_fpr_counter_field = f_vroff;
12029 DECL_ARTIFICIAL (f_stack) = 1;
12030 DECL_ARTIFICIAL (f_grtop) = 1;
12031 DECL_ARTIFICIAL (f_vrtop) = 1;
12032 DECL_ARTIFICIAL (f_groff) = 1;
12033 DECL_ARTIFICIAL (f_vroff) = 1;
12035 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12036 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12037 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12038 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12039 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12041 TYPE_FIELDS (va_list_type) = f_stack;
12042 DECL_CHAIN (f_stack) = f_grtop;
12043 DECL_CHAIN (f_grtop) = f_vrtop;
12044 DECL_CHAIN (f_vrtop) = f_groff;
12045 DECL_CHAIN (f_groff) = f_vroff;
12047 /* Compute its layout. */
12048 layout_type (va_list_type);
12050 return va_list_type;
12053 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12054 static void
12055 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12057 const CUMULATIVE_ARGS *cum;
12058 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12059 tree stack, grtop, vrtop, groff, vroff;
12060 tree t;
12061 int gr_save_area_size = cfun->va_list_gpr_size;
12062 int vr_save_area_size = cfun->va_list_fpr_size;
12063 int vr_offset;
12065 cum = &crtl->args.info;
12066 if (cfun->va_list_gpr_size)
12067 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12068 cfun->va_list_gpr_size);
12069 if (cfun->va_list_fpr_size)
12070 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12071 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12073 if (!TARGET_FLOAT)
12075 gcc_assert (cum->aapcs_nvrn == 0);
12076 vr_save_area_size = 0;
12079 f_stack = TYPE_FIELDS (va_list_type_node);
12080 f_grtop = DECL_CHAIN (f_stack);
12081 f_vrtop = DECL_CHAIN (f_grtop);
12082 f_groff = DECL_CHAIN (f_vrtop);
12083 f_vroff = DECL_CHAIN (f_groff);
12085 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12086 NULL_TREE);
12087 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12088 NULL_TREE);
12089 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12090 NULL_TREE);
12091 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12092 NULL_TREE);
12093 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12094 NULL_TREE);
12096 /* Emit code to initialize STACK, which points to the next varargs stack
12097 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12098 by named arguments. STACK is 8-byte aligned. */
12099 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12100 if (cum->aapcs_stack_size > 0)
12101 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12102 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12103 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12105 /* Emit code to initialize GRTOP, the top of the GR save area.
12106 virtual_incoming_args_rtx should have been 16 byte aligned. */
12107 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12108 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12109 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12111 /* Emit code to initialize VRTOP, the top of the VR save area.
12112 This address is gr_save_area_bytes below GRTOP, rounded
12113 down to the next 16-byte boundary. */
12114 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12115 vr_offset = ROUND_UP (gr_save_area_size,
12116 STACK_BOUNDARY / BITS_PER_UNIT);
12118 if (vr_offset)
12119 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12120 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12121 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12123 /* Emit code to initialize GROFF, the offset from GRTOP of the
12124 next GPR argument. */
12125 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12126 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12127 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12129 /* Likewise emit code to initialize VROFF, the offset from FTOP
12130 of the next VR argument. */
12131 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12132 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12133 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
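/* Worked example (illustrative): for "void f (int a, double b, ...)"
   the named arguments consume one GPR and one FPR, so the code above
   records a 7 * 8 = 56 byte GR save area and a 7 * 16 = 112 byte VR
   save area: __gr_offs = -56, __vr_offs = -112, __gr_top points at
   the start of the incoming stack arguments and __vr_top sits
   ROUND_UP (56, 16) = 64 bytes below it.  Smaller va_list_gpr_size or
   va_list_fpr_size values from the stdarg pass shrink the areas
   accordingly.  */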
12136 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12138 static tree
12139 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12140 gimple_seq *post_p ATTRIBUTE_UNUSED)
12142 tree addr;
12143 bool indirect_p;
12144 bool is_ha; /* is HFA or HVA. */
12145 bool dw_align; /* double-word align. */
12146 machine_mode ag_mode = VOIDmode;
12147 int nregs;
12148 machine_mode mode;
12150 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12151 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12152 HOST_WIDE_INT size, rsize, adjust, align;
12153 tree t, u, cond1, cond2;
12155 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12156 if (indirect_p)
12157 type = build_pointer_type (type);
12159 mode = TYPE_MODE (type);
12161 f_stack = TYPE_FIELDS (va_list_type_node);
12162 f_grtop = DECL_CHAIN (f_stack);
12163 f_vrtop = DECL_CHAIN (f_grtop);
12164 f_groff = DECL_CHAIN (f_vrtop);
12165 f_vroff = DECL_CHAIN (f_groff);
12167 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12168 f_stack, NULL_TREE);
12169 size = int_size_in_bytes (type);
12170 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12172 dw_align = false;
12173 adjust = 0;
12174 if (aarch64_vfp_is_call_or_return_candidate (mode,
12175 type,
12176 &ag_mode,
12177 &nregs,
12178 &is_ha))
12180 /* No frontends can create types with variable-sized modes, so we
12181 shouldn't be asked to pass or return them. */
12182 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12184 /* TYPE passed in fp/simd registers. */
12185 if (!TARGET_FLOAT)
12186 aarch64_err_no_fpadvsimd (mode, "varargs");
12188 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12189 unshare_expr (valist), f_vrtop, NULL_TREE);
12190 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12191 unshare_expr (valist), f_vroff, NULL_TREE);
12193 rsize = nregs * UNITS_PER_VREG;
12195 if (is_ha)
12197 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12198 adjust = UNITS_PER_VREG - ag_size;
12200 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12201 && size < UNITS_PER_VREG)
12203 adjust = UNITS_PER_VREG - size;
12206 else
12208 /* TYPE passed in general registers. */
12209 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12210 unshare_expr (valist), f_grtop, NULL_TREE);
12211 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12212 unshare_expr (valist), f_groff, NULL_TREE);
12213 rsize = ROUND_UP (size, UNITS_PER_WORD);
12214 nregs = rsize / UNITS_PER_WORD;
12216 if (align > 8)
12217 dw_align = true;
12219 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12220 && size < UNITS_PER_WORD)
12222 adjust = UNITS_PER_WORD - size;
12226 /* Get a local temporary for the field value. */
12227 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12229 /* Emit code to branch if off >= 0. */
12230 t = build2 (GE_EXPR, boolean_type_node, off,
12231 build_int_cst (TREE_TYPE (off), 0));
12232 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12234 if (dw_align)
12236 /* Emit: offs = (offs + 15) & -16. */
12237 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12238 build_int_cst (TREE_TYPE (off), 15));
12239 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12240 build_int_cst (TREE_TYPE (off), -16));
12241 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12243 else
12244 roundup = NULL;
12246 /* Update ap.__[g|v]r_offs */
12247 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12248 build_int_cst (TREE_TYPE (off), rsize));
12249 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12251 /* String up. */
12252 if (roundup)
12253 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12255 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12256 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12257 build_int_cst (TREE_TYPE (f_off), 0));
12258 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12260 /* String up: make sure the assignment happens before the use. */
12261 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12262 COND_EXPR_ELSE (cond1) = t;
12264 /* Prepare the trees handling the argument that is passed on the stack;
12265 the top level node will store in ON_STACK. */
12266 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12267 if (align > 8)
12269 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12270 t = fold_convert (intDI_type_node, arg);
12271 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12272 build_int_cst (TREE_TYPE (t), 15));
12273 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12274 build_int_cst (TREE_TYPE (t), -16));
12275 t = fold_convert (TREE_TYPE (arg), t);
12276 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12278 else
12279 roundup = NULL;
12280 /* Advance ap.__stack */
12281 t = fold_convert (intDI_type_node, arg);
12282 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12283 build_int_cst (TREE_TYPE (t), size + 7));
12284 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12285 build_int_cst (TREE_TYPE (t), -8));
12286 t = fold_convert (TREE_TYPE (arg), t);
12287 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12288 /* String up roundup and advance. */
12289 if (roundup)
12290 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12291 /* String up with arg */
12292 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12293 /* Big-endianness related address adjustment. */
12294 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12295 && size < UNITS_PER_WORD)
12297 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12298 size_int (UNITS_PER_WORD - size));
12299 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12302 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12303 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12305 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12306 t = off;
12307 if (adjust)
12308 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12309 build_int_cst (TREE_TYPE (off), adjust));
12311 t = fold_convert (sizetype, t);
12312 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12314 if (is_ha)
12316 /* type ha; // treat as "struct {ftype field[n];}"
12317 ... [computing offs]
12318 for (i = 0; i <nregs; ++i, offs += 16)
12319 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12320 return ha; */
12321 int i;
12322 tree tmp_ha, field_t, field_ptr_t;
12324 /* Declare a local variable. */
12325 tmp_ha = create_tmp_var_raw (type, "ha");
12326 gimple_add_tmp_var (tmp_ha);
12328 /* Establish the base type. */
12329 switch (ag_mode)
12331 case E_SFmode:
12332 field_t = float_type_node;
12333 field_ptr_t = float_ptr_type_node;
12334 break;
12335 case E_DFmode:
12336 field_t = double_type_node;
12337 field_ptr_t = double_ptr_type_node;
12338 break;
12339 case E_TFmode:
12340 field_t = long_double_type_node;
12341 field_ptr_t = long_double_ptr_type_node;
12342 break;
12343 case E_HFmode:
12344 field_t = aarch64_fp16_type_node;
12345 field_ptr_t = aarch64_fp16_ptr_type_node;
12346 break;
12347 case E_V2SImode:
12348 case E_V4SImode:
12350 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12351 field_t = build_vector_type_for_mode (innertype, ag_mode);
12352 field_ptr_t = build_pointer_type (field_t);
12354 break;
12355 default:
12356 gcc_assert (0);
12359 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12360 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12361 addr = t;
12362 t = fold_convert (field_ptr_t, addr);
12363 t = build2 (MODIFY_EXPR, field_t,
12364 build1 (INDIRECT_REF, field_t, tmp_ha),
12365 build1 (INDIRECT_REF, field_t, t));
12367 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12368 for (i = 1; i < nregs; ++i)
12370 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12371 u = fold_convert (field_ptr_t, addr);
12372 u = build2 (MODIFY_EXPR, field_t,
12373 build2 (MEM_REF, field_t, tmp_ha,
12374 build_int_cst (field_ptr_t,
12375 (i *
12376 int_size_in_bytes (field_t)))),
12377 build1 (INDIRECT_REF, field_t, u));
12378 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12381 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12382 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12385 COND_EXPR_ELSE (cond2) = t;
12386 addr = fold_convert (build_pointer_type (type), cond1);
12387 addr = build_va_arg_indirect_ref (addr);
12389 if (indirect_p)
12390 addr = build_va_arg_indirect_ref (addr);
12392 return addr;
12395 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12397 static void
12398 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12399 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12400 int no_rtl)
12402 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12403 CUMULATIVE_ARGS local_cum;
12404 int gr_saved = cfun->va_list_gpr_size;
12405 int vr_saved = cfun->va_list_fpr_size;
12407 /* The caller has advanced CUM up to, but not beyond, the last named
12408 argument. Advance a local copy of CUM past the last "real" named
12409 argument, to find out how many registers are left over. */
12410 local_cum = *cum;
12411 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12413 /* Find out how many registers we need to save.
12414 Honor the tree-stdarg analysis results. */
12415 if (cfun->va_list_gpr_size)
12416 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12417 cfun->va_list_gpr_size / UNITS_PER_WORD);
12418 if (cfun->va_list_fpr_size)
12419 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12420 cfun->va_list_fpr_size / UNITS_PER_VREG);
12422 if (!TARGET_FLOAT)
12424 gcc_assert (local_cum.aapcs_nvrn == 0);
12425 vr_saved = 0;
12428 if (!no_rtl)
12430 if (gr_saved > 0)
12432 rtx ptr, mem;
12434 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12435 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12436 - gr_saved * UNITS_PER_WORD);
12437 mem = gen_frame_mem (BLKmode, ptr);
12438 set_mem_alias_set (mem, get_varargs_alias_set ());
12440 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12441 mem, gr_saved);
12443 if (vr_saved > 0)
12445 /* We can't use move_block_from_reg, because it will use
12446 the wrong mode, storing D regs only. */
12447 machine_mode mode = TImode;
12448 int off, i, vr_start;
12450 /* Set OFF to the offset from virtual_incoming_args_rtx of
12451 the first vector register. The VR save area lies below
12452 the GR one, and is aligned to 16 bytes. */
12453 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12454 STACK_BOUNDARY / BITS_PER_UNIT);
12455 off -= vr_saved * UNITS_PER_VREG;
12457 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12458 for (i = 0; i < vr_saved; ++i)
12460 rtx ptr, mem;
12462 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12463 mem = gen_frame_mem (mode, ptr);
12464 set_mem_alias_set (mem, get_varargs_alias_set ());
12465 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12466 off += UNITS_PER_VREG;
12471 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12472 any complication of having crtl->args.pretend_args_size changed. */
12473 cfun->machine->frame.saved_varargs_size
12474 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12475 STACK_BOUNDARY / BITS_PER_UNIT)
12476 + vr_saved * UNITS_PER_VREG);
12479 static void
12480 aarch64_conditional_register_usage (void)
12482 int i;
12483 if (!TARGET_FLOAT)
12485 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12487 fixed_regs[i] = 1;
12488 call_used_regs[i] = 1;
12491 if (!TARGET_SVE)
12492 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12494 fixed_regs[i] = 1;
12495 call_used_regs[i] = 1;
12499 /* Walk down the type tree of TYPE counting consecutive base elements.
12500 If *MODEP is VOIDmode, then set it to the first valid floating point
12501 type. If a non-floating point type is found, or if a floating point
12502 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12503 otherwise return the count in the sub-tree. */
12504 static int
12505 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12507 machine_mode mode;
12508 HOST_WIDE_INT size;
12510 switch (TREE_CODE (type))
12512 case REAL_TYPE:
12513 mode = TYPE_MODE (type);
12514 if (mode != DFmode && mode != SFmode
12515 && mode != TFmode && mode != HFmode)
12516 return -1;
12518 if (*modep == VOIDmode)
12519 *modep = mode;
12521 if (*modep == mode)
12522 return 1;
12524 break;
12526 case COMPLEX_TYPE:
12527 mode = TYPE_MODE (TREE_TYPE (type));
12528 if (mode != DFmode && mode != SFmode
12529 && mode != TFmode && mode != HFmode)
12530 return -1;
12532 if (*modep == VOIDmode)
12533 *modep = mode;
12535 if (*modep == mode)
12536 return 2;
12538 break;
12540 case VECTOR_TYPE:
12541 /* Use V2SImode and V4SImode as representatives of all 64-bit
12542 and 128-bit vector types. */
12543 size = int_size_in_bytes (type);
12544 switch (size)
12546 case 8:
12547 mode = V2SImode;
12548 break;
12549 case 16:
12550 mode = V4SImode;
12551 break;
12552 default:
12553 return -1;
12556 if (*modep == VOIDmode)
12557 *modep = mode;
12559 /* Vector modes are considered to be opaque: two vectors are
12560 equivalent for the purposes of being homogeneous aggregates
12561 if they are the same size. */
12562 if (*modep == mode)
12563 return 1;
12565 break;
12567 case ARRAY_TYPE:
12569 int count;
12570 tree index = TYPE_DOMAIN (type);
12572 /* Can't handle incomplete types nor sizes that are not
12573 fixed. */
12574 if (!COMPLETE_TYPE_P (type)
12575 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12576 return -1;
12578 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12579 if (count == -1
12580 || !index
12581 || !TYPE_MAX_VALUE (index)
12582 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12583 || !TYPE_MIN_VALUE (index)
12584 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12585 || count < 0)
12586 return -1;
12588 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12589 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12591 /* There must be no padding. */
12592 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12593 count * GET_MODE_BITSIZE (*modep)))
12594 return -1;
12596 return count;
12599 case RECORD_TYPE:
12601 int count = 0;
12602 int sub_count;
12603 tree field;
12605 /* Can't handle incomplete types nor sizes that are not
12606 fixed. */
12607 if (!COMPLETE_TYPE_P (type)
12608 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12609 return -1;
12611 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12613 if (TREE_CODE (field) != FIELD_DECL)
12614 continue;
12616 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12617 if (sub_count < 0)
12618 return -1;
12619 count += sub_count;
12622 /* There must be no padding. */
12623 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12624 count * GET_MODE_BITSIZE (*modep)))
12625 return -1;
12627 return count;
12630 case UNION_TYPE:
12631 case QUAL_UNION_TYPE:
12633 /* These aren't very interesting except in a degenerate case. */
12634 int count = 0;
12635 int sub_count;
12636 tree field;
12638 /* Can't handle incomplete types nor sizes that are not
12639 fixed. */
12640 if (!COMPLETE_TYPE_P (type)
12641 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12642 return -1;
12644 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12646 if (TREE_CODE (field) != FIELD_DECL)
12647 continue;
12649 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12650 if (sub_count < 0)
12651 return -1;
12652 count = count > sub_count ? count : sub_count;
12655 /* There must be no padding. */
12656 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12657 count * GET_MODE_BITSIZE (*modep)))
12658 return -1;
12660 return count;
12663 default:
12664 break;
12667 return -1;
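/* Illustrative examples: "struct { double x, y; }" yields 2 with
   *MODEP == DFmode and so is a homogeneous aggregate candidate,
   whereas "struct { float f; double d; }" mixes element modes and
   returns -1, as does "struct { float f; int i; }" because of the
   integer field.  */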
12670 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12671 type as described in AAPCS64 \S 4.1.2.
12673 See the comment above aarch64_composite_type_p for the notes on MODE. */
12675 static bool
12676 aarch64_short_vector_p (const_tree type,
12677 machine_mode mode)
12679 poly_int64 size = -1;
12681 if (type && TREE_CODE (type) == VECTOR_TYPE)
12682 size = int_size_in_bytes (type);
12683 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12684 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12685 size = GET_MODE_SIZE (mode);
12687 return known_eq (size, 8) || known_eq (size, 16);
12690 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12691 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12692 array types. The C99 floating-point complex types are also considered
12693 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12694 types, which are GCC extensions and out of the scope of AAPCS64, are
12695 treated as composite types here as well.
12697 Note that MODE itself is not sufficient in determining whether a type
12698 is such a composite type or not. This is because
12699 stor-layout.c:compute_record_mode may have already changed the MODE
12700 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12701 structure with only one field may have its MODE set to the mode of the
12702 field. Also an integer mode whose size matches the size of the
12703 RECORD_TYPE type may be used to substitute the original mode
12704 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12705 solely relied on. */
12707 static bool
12708 aarch64_composite_type_p (const_tree type,
12709 machine_mode mode)
12711 if (aarch64_short_vector_p (type, mode))
12712 return false;
12714 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12715 return true;
12717 if (mode == BLKmode
12718 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12719 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12720 return true;
12722 return false;
12725 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12726 shall be passed or returned in SIMD/FP register(s) (provided these
12727 parameter passing registers are available).
12729 Upon successful return, *COUNT returns the number of needed registers,
12730 *BASE_MODE returns the mode of the individual register and, when IS_HA
12731 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12732 floating-point aggregate or a homogeneous short-vector aggregate. */
12734 static bool
12735 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12736 const_tree type,
12737 machine_mode *base_mode,
12738 int *count,
12739 bool *is_ha)
12741 machine_mode new_mode = VOIDmode;
12742 bool composite_p = aarch64_composite_type_p (type, mode);
12744 if (is_ha != NULL) *is_ha = false;
12746 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12747 || aarch64_short_vector_p (type, mode))
12749 *count = 1;
12750 new_mode = mode;
12752 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12754 if (is_ha != NULL) *is_ha = true;
12755 *count = 2;
12756 new_mode = GET_MODE_INNER (mode);
12758 else if (type && composite_p)
12760 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12762 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12764 if (is_ha != NULL) *is_ha = true;
12765 *count = ag_count;
12767 else
12768 return false;
12770 else
12771 return false;
12773 *base_mode = new_mode;
12774 return true;
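/* Illustrative cases: "_Complex double" is reported with *COUNT == 2,
   *BASE_MODE == DFmode and *IS_HA set, and is therefore passed in two
   consecutive FP registers, while a plain "double" gives *COUNT == 1
   with *IS_HA left false.  */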
12777 /* Implement TARGET_STRUCT_VALUE_RTX. */
12779 static rtx
12780 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12781 int incoming ATTRIBUTE_UNUSED)
12783 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12786 /* Implements target hook vector_mode_supported_p. */
12787 static bool
12788 aarch64_vector_mode_supported_p (machine_mode mode)
12790 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12791 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12794 /* Return the appropriate SIMD container mode
12795 for MODE within a vector of WIDTH bits. */
12796 static machine_mode
12797 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12799 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12800 switch (mode)
12802 case E_DFmode:
12803 return VNx2DFmode;
12804 case E_SFmode:
12805 return VNx4SFmode;
12806 case E_HFmode:
12807 return VNx8HFmode;
12808 case E_DImode:
12809 return VNx2DImode;
12810 case E_SImode:
12811 return VNx4SImode;
12812 case E_HImode:
12813 return VNx8HImode;
12814 case E_QImode:
12815 return VNx16QImode;
12816 default:
12817 return word_mode;
12820 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12821 if (TARGET_SIMD)
12823 if (known_eq (width, 128))
12824 switch (mode)
12826 case E_DFmode:
12827 return V2DFmode;
12828 case E_SFmode:
12829 return V4SFmode;
12830 case E_HFmode:
12831 return V8HFmode;
12832 case E_SImode:
12833 return V4SImode;
12834 case E_HImode:
12835 return V8HImode;
12836 case E_QImode:
12837 return V16QImode;
12838 case E_DImode:
12839 return V2DImode;
12840 default:
12841 break;
12843 else
12844 switch (mode)
12846 case E_SFmode:
12847 return V2SFmode;
12848 case E_HFmode:
12849 return V4HFmode;
12850 case E_SImode:
12851 return V2SImode;
12852 case E_HImode:
12853 return V4HImode;
12854 case E_QImode:
12855 return V8QImode;
12856 default:
12857 break;
12860 return word_mode;
12863 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12864 static machine_mode
12865 aarch64_preferred_simd_mode (scalar_mode mode)
12867 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12868 return aarch64_simd_container_mode (mode, bits);
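/* Illustrative example: a loop over "float" elements is vectorized in
   V4SFmode (128-bit Advanced SIMD) by default, but in VNx4SFmode when
   SVE is enabled, since BITS_PER_SVE_VECTOR is preferred above.  */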
12871 /* Return a list of possible vector sizes for the vectorizer
12872 to iterate over. */
12873 static void
12874 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12876 if (TARGET_SVE)
12877 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12878 sizes->safe_push (16);
12879 sizes->safe_push (8);
12882 /* Implement TARGET_MANGLE_TYPE. */
12884 static const char *
12885 aarch64_mangle_type (const_tree type)
12887 /* The AArch64 ABI documents say that "__va_list" has to be
12888 mangled as if it is in the "std" namespace.  */
12889 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12890 return "St9__va_list";
12892 /* Half-precision float. */
12893 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12894 return "Dh";
12896 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12897 builtin types. */
12898 if (TYPE_NAME (type) != NULL)
12899 return aarch64_mangle_builtin_type (type);
12901 /* Use the default mangling. */
12902 return NULL;
12905 /* Find the first rtx_insn before insn that will generate an assembly
12906 instruction. */
12908 static rtx_insn *
12909 aarch64_prev_real_insn (rtx_insn *insn)
12911 if (!insn)
12912 return NULL;
12916 insn = prev_real_insn (insn);
12918 while (insn && recog_memoized (insn) < 0);
12920 return insn;
12923 static bool
12924 is_madd_op (enum attr_type t1)
12926 unsigned int i;
12927 /* A number of these may be AArch32 only. */
12928 enum attr_type mlatypes[] = {
12929 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12930 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12931 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12934 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12936 if (t1 == mlatypes[i])
12937 return true;
12940 return false;
12943 /* Check if there is a register dependency between a load and the insn
12944 for which we hold recog_data. */
12946 static bool
12947 dep_between_memop_and_curr (rtx memop)
12949 rtx load_reg;
12950 int opno;
12952 gcc_assert (GET_CODE (memop) == SET);
12954 if (!REG_P (SET_DEST (memop)))
12955 return false;
12957 load_reg = SET_DEST (memop);
12958 for (opno = 1; opno < recog_data.n_operands; opno++)
12960 rtx operand = recog_data.operand[opno];
12961 if (REG_P (operand)
12962 && reg_overlap_mentioned_p (load_reg, operand))
12963 return true;
12966 return false;
12970 /* When working around the Cortex-A53 erratum 835769,
12971 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12972 instruction and has a preceding memory instruction such that a NOP
12973 should be inserted between them. */
12975 bool
12976 aarch64_madd_needs_nop (rtx_insn* insn)
12978 enum attr_type attr_type;
12979 rtx_insn *prev;
12980 rtx body;
12982 if (!TARGET_FIX_ERR_A53_835769)
12983 return false;
12985 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12986 return false;
12988 attr_type = get_attr_type (insn);
12989 if (!is_madd_op (attr_type))
12990 return false;
12992 prev = aarch64_prev_real_insn (insn);
12993 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12994 Restore recog state to INSN to avoid state corruption. */
12995 extract_constrain_insn_cached (insn);
12997 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
12998 return false;
13000 body = single_set (prev);
13002 /* If the previous insn is a memory op and there is no dependency between
13003 it and the DImode madd, emit a NOP between them. If body is NULL then we
13004 have a complex memory operation, probably a load/store pair.
13005 Be conservative for now and emit a NOP. */
13006 if (GET_MODE (recog_data.operand[0]) == DImode
13007 && (!body || !dep_between_memop_and_curr (body)))
13008 return true;
13010 return false;
13015 /* Implement FINAL_PRESCAN_INSN. */
13017 void
13018 aarch64_final_prescan_insn (rtx_insn *insn)
13020 if (aarch64_madd_needs_nop (insn))
13021 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13025 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13026 instruction. */
13028 bool
13029 aarch64_sve_index_immediate_p (rtx base_or_step)
13031 return (CONST_INT_P (base_or_step)
13032 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13035 /* Return true if X is a valid immediate for the SVE ADD and SUB
13036 instructions. Negate X first if NEGATE_P is true. */
13038 bool
13039 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13041 rtx elt;
13043 if (!const_vec_duplicate_p (x, &elt)
13044 || !CONST_INT_P (elt))
13045 return false;
13047 HOST_WIDE_INT val = INTVAL (elt);
13048 if (negate_p)
13049 val = -val;
13050 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13052 if (val & 0xff)
13053 return IN_RANGE (val, 0, 0xff);
13054 return IN_RANGE (val, 0, 0xff00);
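/* Illustrative values: a duplicated 0x23 or 0x2300 is accepted (a
   plain byte, or a byte that the optional LSL #8 form of the
   instruction can supply), whereas 0x123 is rejected because it has
   nonzero bits in both bytes.  */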
13057 /* Return true if X is a valid immediate operand for an SVE logical
13058 instruction such as AND. */
13060 bool
13061 aarch64_sve_bitmask_immediate_p (rtx x)
13063 rtx elt;
13065 return (const_vec_duplicate_p (x, &elt)
13066 && CONST_INT_P (elt)
13067 && aarch64_bitmask_imm (INTVAL (elt),
13068 GET_MODE_INNER (GET_MODE (x))));
13071 /* Return true if X is a valid immediate for the SVE DUP and CPY
13072 instructions. */
13074 bool
13075 aarch64_sve_dup_immediate_p (rtx x)
13077 rtx elt;
13079 if (!const_vec_duplicate_p (x, &elt)
13080 || !CONST_INT_P (elt))
13081 return false;
13083 HOST_WIDE_INT val = INTVAL (elt);
13084 if (val & 0xff)
13085 return IN_RANGE (val, -0x80, 0x7f);
13086 return IN_RANGE (val, -0x8000, 0x7f00);
13089 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13090 SIGNED_P says whether the operand is signed rather than unsigned. */
13092 bool
13093 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13095 rtx elt;
13097 return (const_vec_duplicate_p (x, &elt)
13098 && CONST_INT_P (elt)
13099 && (signed_p
13100 ? IN_RANGE (INTVAL (elt), -16, 15)
13101 : IN_RANGE (INTVAL (elt), 0, 127)));
13104 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13105 instruction. Negate X first if NEGATE_P is true. */
13107 bool
13108 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13110 rtx elt;
13111 REAL_VALUE_TYPE r;
13113 if (!const_vec_duplicate_p (x, &elt)
13114 || GET_CODE (elt) != CONST_DOUBLE)
13115 return false;
13117 r = *CONST_DOUBLE_REAL_VALUE (elt);
13119 if (negate_p)
13120 r = real_value_negate (&r);
13122 if (real_equal (&r, &dconst1))
13123 return true;
13124 if (real_equal (&r, &dconsthalf))
13125 return true;
13126 return false;
13129 /* Return true if X is a valid immediate operand for an SVE FMUL
13130 instruction. */
13132 bool
13133 aarch64_sve_float_mul_immediate_p (rtx x)
13135 rtx elt;
13137 /* GCC will never generate a multiply with an immediate of 2, so there is no
13138 point testing for it (even though it is a valid constant). */
13139 return (const_vec_duplicate_p (x, &elt)
13140 && GET_CODE (elt) == CONST_DOUBLE
13141 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13144 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13145 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13146 is nonnull, use it to describe valid immediates. */
13147 static bool
13148 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13149 simd_immediate_info *info,
13150 enum simd_immediate_check which,
13151 simd_immediate_info::insn_type insn)
13153 /* Try a 4-byte immediate with LSL. */
13154 for (unsigned int shift = 0; shift < 32; shift += 8)
13155 if ((val32 & (0xff << shift)) == val32)
13157 if (info)
13158 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13159 simd_immediate_info::LSL, shift);
13160 return true;
13163 /* Try a 2-byte immediate with LSL. */
13164 unsigned int imm16 = val32 & 0xffff;
13165 if (imm16 == (val32 >> 16))
13166 for (unsigned int shift = 0; shift < 16; shift += 8)
13167 if ((imm16 & (0xff << shift)) == imm16)
13169 if (info)
13170 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13171 simd_immediate_info::LSL, shift);
13172 return true;
13175 /* Try a 4-byte immediate with MSL, except for cases that MVN
13176 can handle. */
13177 if (which == AARCH64_CHECK_MOV)
13178 for (unsigned int shift = 8; shift < 24; shift += 8)
13180 unsigned int low = (1 << shift) - 1;
13181 if (((val32 & (0xff << shift)) | low) == val32)
13183 if (info)
13184 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13185 simd_immediate_info::MSL, shift);
13186 return true;
13190 return false;
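/* Illustrative cases: VAL32 == 0x00ab0000 is matched by the 4-byte
   LSL loop with shift 16 (a MOVI/MVNI with "lsl #16"), while
   VAL32 == 0x0000abff is only reachable through the MSL form, since
   all bits below the shifted byte must be ones ("msl #8").  */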
13193 /* Return true if replicating VAL64 is a valid immediate for the
13194 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13195 use it to describe valid immediates. */
13196 static bool
13197 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13198 simd_immediate_info *info,
13199 enum simd_immediate_check which)
13201 unsigned int val32 = val64 & 0xffffffff;
13202 unsigned int val16 = val64 & 0xffff;
13203 unsigned int val8 = val64 & 0xff;
13205 if (val32 == (val64 >> 32))
13207 if ((which & AARCH64_CHECK_ORR) != 0
13208 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13209 simd_immediate_info::MOV))
13210 return true;
13212 if ((which & AARCH64_CHECK_BIC) != 0
13213 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13214 simd_immediate_info::MVN))
13215 return true;
13217 /* Try using a replicated byte. */
13218 if (which == AARCH64_CHECK_MOV
13219 && val16 == (val32 >> 16)
13220 && val8 == (val16 >> 8))
13222 if (info)
13223 *info = simd_immediate_info (QImode, val8);
13224 return true;
13228 /* Try using a bit-to-bytemask. */
13229 if (which == AARCH64_CHECK_MOV)
13231 unsigned int i;
13232 for (i = 0; i < 64; i += 8)
13234 unsigned char byte = (val64 >> i) & 0xff;
13235 if (byte != 0 && byte != 0xff)
13236 break;
13238 if (i == 64)
13240 if (info)
13241 *info = simd_immediate_info (DImode, val64);
13242 return true;
13245 return false;
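/* Illustrative values: the bit-to-bytemask test above accepts
   VAL64 == 0xff00ffff00ff00ff, where every byte is either 0x00 or
   0xff, and the replicated-byte test accepts 0x2a2a2a2a2a2a2a2a as
   the QImode immediate 0x2a.  */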
13248 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13249 instruction. If INFO is nonnull, use it to describe valid immediates. */
13251 static bool
13252 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13253 simd_immediate_info *info)
13255 scalar_int_mode mode = DImode;
13256 unsigned int val32 = val64 & 0xffffffff;
13257 if (val32 == (val64 >> 32))
13259 mode = SImode;
13260 unsigned int val16 = val32 & 0xffff;
13261 if (val16 == (val32 >> 16))
13263 mode = HImode;
13264 unsigned int val8 = val16 & 0xff;
13265 if (val8 == (val16 >> 8))
13266 mode = QImode;
13269 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13270 if (IN_RANGE (val, -0x80, 0x7f))
13272 /* DUP with no shift. */
13273 if (info)
13274 *info = simd_immediate_info (mode, val);
13275 return true;
13277 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13279 /* DUP with LSL #8. */
13280 if (info)
13281 *info = simd_immediate_info (mode, val);
13282 return true;
13284 if (aarch64_bitmask_imm (val64, mode))
13286 /* DUPM. */
13287 if (info)
13288 *info = simd_immediate_info (mode, val);
13289 return true;
13291 return false;
13294 /* Return true if OP is a valid SIMD immediate for the operation
13295 described by WHICH. If INFO is nonnull, use it to describe valid
13296 immediates. */
13297 bool
13298 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13299 enum simd_immediate_check which)
13301 machine_mode mode = GET_MODE (op);
13302 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13303 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13304 return false;
13306 scalar_mode elt_mode = GET_MODE_INNER (mode);
13307 rtx base, step;
13308 unsigned int n_elts;
13309 if (GET_CODE (op) == CONST_VECTOR
13310 && CONST_VECTOR_DUPLICATE_P (op))
13311 n_elts = CONST_VECTOR_NPATTERNS (op);
13312 else if ((vec_flags & VEC_SVE_DATA)
13313 && const_vec_series_p (op, &base, &step))
13315 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13316 if (!aarch64_sve_index_immediate_p (base)
13317 || !aarch64_sve_index_immediate_p (step))
13318 return false;
13320 if (info)
13321 *info = simd_immediate_info (elt_mode, base, step);
13322 return true;
13324 else if (GET_CODE (op) == CONST_VECTOR
13325 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13326 /* N_ELTS set above. */;
13327 else
13328 return false;
13330 /* Handle PFALSE and PTRUE. */
13331 if (vec_flags & VEC_SVE_PRED)
13332 return (op == CONST0_RTX (mode)
13333 || op == CONSTM1_RTX (mode));
13335 scalar_float_mode elt_float_mode;
13336 if (n_elts == 1
13337 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13339 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13340 if (aarch64_float_const_zero_rtx_p (elt)
13341 || aarch64_float_const_representable_p (elt))
13343 if (info)
13344 *info = simd_immediate_info (elt_float_mode, elt);
13345 return true;
13349 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13350 if (elt_size > 8)
13351 return false;
13353 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13355 /* Expand the vector constant out into a byte vector, with the least
13356 significant byte of the register first. */
13357 auto_vec<unsigned char, 16> bytes;
13358 bytes.reserve (n_elts * elt_size);
13359 for (unsigned int i = 0; i < n_elts; i++)
13361 /* The vector is provided in gcc endian-neutral fashion.
13362 For aarch64_be Advanced SIMD, it must be laid out in the vector
13363 register in reverse order. */
13364 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13365 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13367 if (elt_mode != elt_int_mode)
13368 elt = gen_lowpart (elt_int_mode, elt);
13370 if (!CONST_INT_P (elt))
13371 return false;
13373 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13374 for (unsigned int byte = 0; byte < elt_size; byte++)
13376 bytes.quick_push (elt_val & 0xff);
13377 elt_val >>= BITS_PER_UNIT;
13381 /* The immediate must repeat every eight bytes. */
13382 unsigned int nbytes = bytes.length ();
13383 for (unsigned i = 8; i < nbytes; ++i)
13384 if (bytes[i] != bytes[i - 8])
13385 return false;
13387 /* Get the repeating 8-byte value as an integer. No endian correction
13388 is needed here because bytes is already in lsb-first order. */
13389 unsigned HOST_WIDE_INT val64 = 0;
13390 for (unsigned int i = 0; i < 8; i++)
13391 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13392 << (i * BITS_PER_UNIT));
13394 if (vec_flags & VEC_SVE_DATA)
13395 return aarch64_sve_valid_immediate (val64, info);
13396 else
13397 return aarch64_advsimd_valid_immediate (val64, info, which);
13400 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13401 has a step in the range of INDEX. Return the index expression if so,
13402 otherwise return null. */
13404 aarch64_check_zero_based_sve_index_immediate (rtx x)
13406 rtx base, step;
13407 if (const_vec_series_p (x, &base, &step)
13408 && base == const0_rtx
13409 && aarch64_sve_index_immediate_p (step))
13410 return step;
13411 return NULL_RTX;
13414 /* Check if immediate shift constants are within range. */
13415 bool
13416 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13418 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13419 if (left)
13420 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13421 else
13422 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13425 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13426 operation of width WIDTH at bit position POS. */
13429 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13431 gcc_assert (CONST_INT_P (width));
13432 gcc_assert (CONST_INT_P (pos));
13434 unsigned HOST_WIDE_INT mask
13435 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13436 return GEN_INT (mask << UINTVAL (pos));
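/* For example, WIDTH == 4 and POS == 8 produce the CONST_INT 0xf00,
   selecting bits 8 to 11 of the source operand.  */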
13439 bool
13440 aarch64_mov_operand_p (rtx x, machine_mode mode)
13442 if (GET_CODE (x) == HIGH
13443 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13444 return true;
13446 if (CONST_INT_P (x))
13447 return true;
13449 if (VECTOR_MODE_P (GET_MODE (x)))
13450 return aarch64_simd_valid_immediate (x, NULL);
13452 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13453 return true;
13455 if (aarch64_sve_cnt_immediate_p (x))
13456 return true;
13458 return aarch64_classify_symbolic_expression (x)
13459 == SYMBOL_TINY_ABSOLUTE;
13462 /* Return a constant vector with each element set to the const_int VAL. */
13464 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13466 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13467 return gen_const_vec_duplicate (mode, c);
13470 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13472 bool
13473 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13475 machine_mode vmode;
13477 vmode = aarch64_simd_container_mode (mode, 64);
13478 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13479 return aarch64_simd_valid_immediate (op_v, NULL);
13482 /* Construct and return a PARALLEL RTX vector with elements numbering the
13483 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13484 the vector - from the perspective of the architecture. This does not
13485 line up with GCC's perspective on lane numbers, so we end up with
13486 different masks depending on our target endian-ness. The diagram
13487 below may help. We must draw the distinction when building masks
13488 which select one half of the vector. An instruction selecting
13489 architectural low-lanes for a big-endian target must be described using
13490 a mask selecting GCC high-lanes.
13492 Big-Endian Little-Endian
13494 GCC 0 1 2 3 3 2 1 0
13495 | x | x | x | x | | x | x | x | x |
13496 Architecture 3 2 1 0 3 2 1 0
13498 Low Mask: { 2, 3 } { 0, 1 }
13499 High Mask: { 0, 1 } { 2, 3 }
13501 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13504 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13506 rtvec v = rtvec_alloc (nunits / 2);
13507 int high_base = nunits / 2;
13508 int low_base = 0;
13509 int base;
13510 rtx t1;
13511 int i;
13513 if (BYTES_BIG_ENDIAN)
13514 base = high ? low_base : high_base;
13515 else
13516 base = high ? high_base : low_base;
13518 for (i = 0; i < nunits / 2; i++)
13519 RTVEC_ELT (v, i) = GEN_INT (base + i);
13521 t1 = gen_rtx_PARALLEL (mode, v);
13522 return t1;
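/* For example, with V4SImode and HIGH == true this returns the
   PARALLEL [2 3] on little-endian but [0 1] on big-endian, matching
   the mask table in the comment above.  */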
13525 /* Check OP for validity as a PARALLEL RTX vector with elements
13526 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13527 from the perspective of the architecture. See the diagram above
13528 aarch64_simd_vect_par_cnst_half for more details. */
13530 bool
13531 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13532 bool high)
13534 int nelts;
13535 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13536 return false;
13538 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13539 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13540 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13541 int i = 0;
13543 if (count_op != count_ideal)
13544 return false;
13546 for (i = 0; i < count_ideal; i++)
13548 rtx elt_op = XVECEXP (op, 0, i);
13549 rtx elt_ideal = XVECEXP (ideal, 0, i);
13551 if (!CONST_INT_P (elt_op)
13552 || INTVAL (elt_ideal) != INTVAL (elt_op))
13553 return false;
13555 return true;
13558 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13559 HIGH (exclusive). */
13560 void
13561 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13562 const_tree exp)
13564 HOST_WIDE_INT lane;
13565 gcc_assert (CONST_INT_P (operand));
13566 lane = INTVAL (operand);
13568 if (lane < low || lane >= high)
13570 if (exp)
13571 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13572 else
13573 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13577 /* Perform endian correction on lane number N, which indexes a vector
13578 of mode MODE, and return the result as an SImode rtx. */
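/* Added note: assuming ENDIAN_LANE_N maps N to NUNITS - 1 - N on big-endian
   targets, lane 1 of a 4-element vector becomes 2 on big-endian and stays 1
   on little-endian.  */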
13581 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13583 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13586 /* Return TRUE if OP is a valid vector addressing mode. */
13588 bool
13589 aarch64_simd_mem_operand_p (rtx op)
13591 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13592 || REG_P (XEXP (op, 0)));
13595 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13597 bool
13598 aarch64_sve_ld1r_operand_p (rtx op)
13600 struct aarch64_address_info addr;
13601 scalar_mode mode;
13603 return (MEM_P (op)
13604 && is_a <scalar_mode> (GET_MODE (op), &mode)
13605 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13606 && addr.type == ADDRESS_REG_IMM
13607 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13610 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13611 The conditions for STR are the same. */
13612 bool
13613 aarch64_sve_ldr_operand_p (rtx op)
13615 struct aarch64_address_info addr;
13617 return (MEM_P (op)
13618 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13619 false, ADDR_QUERY_ANY)
13620 && addr.type == ADDRESS_REG_IMM);
13623 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13624 We need to be able to access the individual pieces, so the range
13625 is different from LD[234] and ST[234]. */
13626 bool
13627 aarch64_sve_struct_memory_operand_p (rtx op)
13629 if (!MEM_P (op))
13630 return false;
13632 machine_mode mode = GET_MODE (op);
13633 struct aarch64_address_info addr;
13634 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13635 ADDR_QUERY_ANY)
13636 || addr.type != ADDRESS_REG_IMM)
13637 return false;
13639 poly_int64 first = addr.const_offset;
13640 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13641 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13642 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13645 /* Emit a register copy from operand to operand, taking care not to
13646 early-clobber source registers in the process.
13648 COUNT is the number of components into which the copy needs to be
13649 decomposed. */
13650 void
13651 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13652 unsigned int count)
13654 unsigned int i;
13655 int rdest = REGNO (operands[0]);
13656 int rsrc = REGNO (operands[1]);
13658 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13659 || rdest < rsrc)
13660 for (i = 0; i < count; i++)
13661 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13662 gen_rtx_REG (mode, rsrc + i));
13663 else
13664 for (i = 0; i < count; i++)
13665 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13666 gen_rtx_REG (mode, rsrc + count - i - 1));
13669 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13670 one of the VSTRUCT modes: OI, CI, or XI. */
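/* Illustrative values (added, assuming UNITS_PER_VREG is 16 bytes for
   Advanced SIMD): OImode is 32 bytes, giving 2 * 4 = 8 bytes of move
   instructions; CImode gives 12 and XImode gives 16.  */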
13672 aarch64_simd_attr_length_rglist (machine_mode mode)
13674 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13675 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13678 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13679 alignment of a vector to 128 bits. SVE predicates have an alignment of
13680 16 bits. */
13681 static HOST_WIDE_INT
13682 aarch64_simd_vector_alignment (const_tree type)
13684 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13685 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13686 be set for non-predicate vectors of booleans. Modes are the most
13687 direct way we have of identifying real SVE predicate types. */
13688 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13689 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13690 return MIN (align, 128);
13693 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13694 static HOST_WIDE_INT
13695 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13697 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13699 /* If the length of the vector is fixed, try to align to that length,
13700 otherwise don't try to align at all. */
13701 HOST_WIDE_INT result;
13702 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13703 result = TYPE_ALIGN (TREE_TYPE (type));
13704 return result;
13706 return TYPE_ALIGN (type);
13709 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13710 static bool
13711 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13713 if (is_packed)
13714 return false;
13716 /* For fixed-length vectors, check that the vectorizer will aim for
13717 full-vector alignment. This isn't true for generic GCC vectors
13718 that are wider than the ABI maximum of 128 bits. */
13719 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13720 && (wi::to_widest (TYPE_SIZE (type))
13721 != aarch64_vectorize_preferred_vector_alignment (type)))
13722 return false;
13724 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13725 return true;
13728 /* Return true if the vector misalignment factor is supported by the
13729 target. */
13730 static bool
13731 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13732 const_tree type, int misalignment,
13733 bool is_packed)
13735 if (TARGET_SIMD && STRICT_ALIGNMENT)
13738 /* Return false if the movmisalign pattern is not supported for this mode. */
13738 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13739 return false;
13741 /* Misalignment factor is unknown at compile time. */
13742 if (misalignment == -1)
13743 return false;
13745 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13746 is_packed);
13749 /* If VALS is a vector constant that can be loaded into a register
13750 using DUP, generate instructions to do so and return an RTX to
13751 assign to the register. Otherwise return NULL_RTX. */
13752 static rtx
13753 aarch64_simd_dup_constant (rtx vals)
13755 machine_mode mode = GET_MODE (vals);
13756 machine_mode inner_mode = GET_MODE_INNER (mode);
13757 rtx x;
13759 if (!const_vec_duplicate_p (vals, &x))
13760 return NULL_RTX;
13762 /* We can load this constant by using DUP and a constant in a
13763 single ARM register. This will be cheaper than a vector
13764 load. */
13765 x = copy_to_mode_reg (inner_mode, x);
13766 return gen_vec_duplicate (mode, x);
13770 /* Generate code to load VALS, which is a PARALLEL containing only
13771 constants (for vec_init) or CONST_VECTOR, efficiently into a
13772 register. Returns an RTX to copy into the register, or NULL_RTX
13773 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13774 static rtx
13775 aarch64_simd_make_constant (rtx vals)
13777 machine_mode mode = GET_MODE (vals);
13778 rtx const_dup;
13779 rtx const_vec = NULL_RTX;
13780 int n_const = 0;
13781 int i;
13783 if (GET_CODE (vals) == CONST_VECTOR)
13784 const_vec = vals;
13785 else if (GET_CODE (vals) == PARALLEL)
13787 /* A CONST_VECTOR must contain only CONST_INTs and
13788 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13789 Only store valid constants in a CONST_VECTOR. */
13790 int n_elts = XVECLEN (vals, 0);
13791 for (i = 0; i < n_elts; ++i)
13793 rtx x = XVECEXP (vals, 0, i);
13794 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13795 n_const++;
13797 if (n_const == n_elts)
13798 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13800 else
13801 gcc_unreachable ();
13803 if (const_vec != NULL_RTX
13804 && aarch64_simd_valid_immediate (const_vec, NULL))
13805 /* Load using MOVI/MVNI. */
13806 return const_vec;
13807 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13808 /* Loaded using DUP. */
13809 return const_dup;
13810 else if (const_vec != NULL_RTX)
13811 /* Load from constant pool. We cannot take advantage of single-cycle
13812 LD1 because we need a PC-relative addressing mode. */
13813 return const_vec;
13814 else
13815 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13816 We cannot construct an initializer. */
13817 return NULL_RTX;
13820 /* Expand a vector initialisation sequence, such that TARGET is
13821 initialised to contain VALS. */
13823 void
13824 aarch64_expand_vector_init (rtx target, rtx vals)
13826 machine_mode mode = GET_MODE (target);
13827 scalar_mode inner_mode = GET_MODE_INNER (mode);
13828 /* The number of vector elements. */
13829 int n_elts = XVECLEN (vals, 0);
13830 /* The number of vector elements which are not constant. */
13831 int n_var = 0;
13832 rtx any_const = NULL_RTX;
13833 /* The first element of vals. */
13834 rtx v0 = XVECEXP (vals, 0, 0);
13835 bool all_same = true;
13837 /* Count the number of variable elements to initialise. */
13838 for (int i = 0; i < n_elts; ++i)
13840 rtx x = XVECEXP (vals, 0, i);
13841 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13842 ++n_var;
13843 else
13844 any_const = x;
13846 all_same &= rtx_equal_p (x, v0);
13849 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13850 how best to handle this. */
13851 if (n_var == 0)
13853 rtx constant = aarch64_simd_make_constant (vals);
13854 if (constant != NULL_RTX)
13856 emit_move_insn (target, constant);
13857 return;
13861 /* Splat a single non-constant element if we can. */
13862 if (all_same)
13864 rtx x = copy_to_mode_reg (inner_mode, v0);
13865 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13866 return;
13869 enum insn_code icode = optab_handler (vec_set_optab, mode);
13870 gcc_assert (icode != CODE_FOR_nothing);
13872 /* If there are only variable elements, try to optimize
13873 the insertion using dup for the most common element
13874 followed by insertions. */
13876 /* The algorithm will fill matches[*][0] with the earliest matching element,
13877 and matches[X][1] with the count of duplicate elements (if X is the
13878 earliest element which has duplicates). */
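/* Illustrative trace (added): for VALS = {x, y, x, x} the loop below
   produces matches = {{0,3}, {1,1}, {0,0}, {0,0}}, so x is the most common
   element; it is broadcast with DUP and only lane 1 is then overwritten
   with y.  */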
13880 if (n_var == n_elts && n_elts <= 16)
13882 int matches[16][2] = {0};
13883 for (int i = 0; i < n_elts; i++)
13885 for (int j = 0; j <= i; j++)
13887 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13889 matches[i][0] = j;
13890 matches[j][1]++;
13891 break;
13895 int maxelement = 0;
13896 int maxv = 0;
13897 for (int i = 0; i < n_elts; i++)
13898 if (matches[i][1] > maxv)
13900 maxelement = i;
13901 maxv = matches[i][1];
13904 /* Create a duplicate of the most common element. */
13905 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13906 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13908 /* Insert the rest. */
13909 for (int i = 0; i < n_elts; i++)
13911 rtx x = XVECEXP (vals, 0, i);
13912 if (matches[i][0] == maxelement)
13913 continue;
13914 x = copy_to_mode_reg (inner_mode, x);
13915 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13917 return;
13920 /* Initialise a vector which is part-variable. We want to first try
13921 to build those lanes which are constant in the most efficient way we
13922 can. */
13923 if (n_var != n_elts)
13925 rtx copy = copy_rtx (vals);
13927 /* Load constant part of vector. We really don't care what goes into the
13928 parts we will overwrite, but we're more likely to be able to load the
13929 constant efficiently if it has fewer, larger, repeating parts
13930 (see aarch64_simd_valid_immediate). */
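/* Added note: for N_ELTS == 4 and a variable element at lane 1, the loop
   below inspects lanes 1^2 = 3 and then 1^1 = 0 of the partially-updated
   copy, using the first constant it finds (or ANY_CONST as a fallback) as
   a placeholder value.  */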
13931 for (int i = 0; i < n_elts; i++)
13933 rtx x = XVECEXP (vals, 0, i);
13934 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13935 continue;
13936 rtx subst = any_const;
13937 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13939 /* Look in the copied vector, as more elements are const. */
13940 rtx test = XVECEXP (copy, 0, i ^ bit);
13941 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13943 subst = test;
13944 break;
13947 XVECEXP (copy, 0, i) = subst;
13949 aarch64_expand_vector_init (target, copy);
13952 /* Insert the variable lanes directly. */
13953 for (int i = 0; i < n_elts; i++)
13955 rtx x = XVECEXP (vals, 0, i);
13956 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13957 continue;
13958 x = copy_to_mode_reg (inner_mode, x);
13959 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13963 static unsigned HOST_WIDE_INT
13964 aarch64_shift_truncation_mask (machine_mode mode)
13966 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
13967 return 0;
13968 return GET_MODE_UNIT_BITSIZE (mode) - 1;
13971 /* Select a format to encode pointers in exception handling data. */
13973 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
13975 int type;
13976 switch (aarch64_cmodel)
13978 case AARCH64_CMODEL_TINY:
13979 case AARCH64_CMODEL_TINY_PIC:
13980 case AARCH64_CMODEL_SMALL:
13981 case AARCH64_CMODEL_SMALL_PIC:
13982 case AARCH64_CMODEL_SMALL_SPIC:
13983 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13984 for everything. */
13985 type = DW_EH_PE_sdata4;
13986 break;
13987 default:
13988 /* No assumptions here. 8-byte relocs required. */
13989 type = DW_EH_PE_sdata8;
13990 break;
13992 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
13995 /* The last .arch and .tune assembly strings that we printed. */
13996 static std::string aarch64_last_printed_arch_string;
13997 static std::string aarch64_last_printed_tune_string;
13999 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14000 by the function fndecl. */
14002 void
14003 aarch64_declare_function_name (FILE *stream, const char* name,
14004 tree fndecl)
14006 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14008 struct cl_target_option *targ_options;
14009 if (target_parts)
14010 targ_options = TREE_TARGET_OPTION (target_parts);
14011 else
14012 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14013 gcc_assert (targ_options);
14015 const struct processor *this_arch
14016 = aarch64_get_arch (targ_options->x_explicit_arch);
14018 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14019 std::string extension
14020 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14021 this_arch->flags);
14022 /* Only update the assembler .arch string if it is distinct from the last
14023 such string we printed. */
14024 std::string to_print = this_arch->name + extension;
14025 if (to_print != aarch64_last_printed_arch_string)
14027 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14028 aarch64_last_printed_arch_string = to_print;
14031 /* Print the cpu name we're tuning for in the comments; it might be
14032 useful to readers of the generated asm. Do it only when it changes
14033 from function to function and verbose assembly is requested. */
14034 const struct processor *this_tune
14035 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14037 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14039 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14040 this_tune->name);
14041 aarch64_last_printed_tune_string = this_tune->name;
14044 /* Don't forget the type directive for ELF. */
14045 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14046 ASM_OUTPUT_LABEL (stream, name);
14049 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14051 static void
14052 aarch64_start_file (void)
14054 struct cl_target_option *default_options
14055 = TREE_TARGET_OPTION (target_option_default_node);
14057 const struct processor *default_arch
14058 = aarch64_get_arch (default_options->x_explicit_arch);
14059 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14060 std::string extension
14061 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14062 default_arch->flags);
14064 aarch64_last_printed_arch_string = default_arch->name + extension;
14065 aarch64_last_printed_tune_string = "";
14066 asm_fprintf (asm_out_file, "\t.arch %s\n",
14067 aarch64_last_printed_arch_string.c_str ());
14069 default_file_start ();
14072 /* Emit load exclusive. */
14074 static void
14075 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14076 rtx mem, rtx model_rtx)
14078 rtx (*gen) (rtx, rtx, rtx);
14080 switch (mode)
14082 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14083 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14084 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14085 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14086 default:
14087 gcc_unreachable ();
14090 emit_insn (gen (rval, mem, model_rtx));
14093 /* Emit store exclusive. */
14095 static void
14096 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14097 rtx rval, rtx mem, rtx model_rtx)
14099 rtx (*gen) (rtx, rtx, rtx, rtx);
14101 switch (mode)
14103 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14104 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14105 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14106 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14107 default:
14108 gcc_unreachable ();
14111 emit_insn (gen (bval, rval, mem, model_rtx));
14114 /* Emit jump instruction INSN and mark it as unlikely to be taken. */
14116 static void
14117 aarch64_emit_unlikely_jump (rtx insn)
14119 rtx_insn *jump = emit_jump_insn (insn);
14120 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14123 /* Expand a compare and swap pattern. */
14125 void
14126 aarch64_expand_compare_and_swap (rtx operands[])
14128 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14129 machine_mode mode, cmp_mode;
14130 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14131 int idx;
14132 gen_cas_fn gen;
14133 const gen_cas_fn split_cas[] =
14135 gen_aarch64_compare_and_swapqi,
14136 gen_aarch64_compare_and_swaphi,
14137 gen_aarch64_compare_and_swapsi,
14138 gen_aarch64_compare_and_swapdi
14140 const gen_cas_fn atomic_cas[] =
14142 gen_aarch64_compare_and_swapqi_lse,
14143 gen_aarch64_compare_and_swaphi_lse,
14144 gen_aarch64_compare_and_swapsi_lse,
14145 gen_aarch64_compare_and_swapdi_lse
14148 bval = operands[0];
14149 rval = operands[1];
14150 mem = operands[2];
14151 oldval = operands[3];
14152 newval = operands[4];
14153 is_weak = operands[5];
14154 mod_s = operands[6];
14155 mod_f = operands[7];
14156 mode = GET_MODE (mem);
14157 cmp_mode = mode;
14159 /* Normally the succ memory model must be stronger than fail, but in the
14160 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14161 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14163 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14164 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14165 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14167 switch (mode)
14169 case E_QImode:
14170 case E_HImode:
14171 /* For short modes, we're going to perform the comparison in SImode,
14172 so do the zero-extension now. */
14173 cmp_mode = SImode;
14174 rval = gen_reg_rtx (SImode);
14175 oldval = convert_modes (SImode, mode, oldval, true);
14176 /* Fall through. */
14178 case E_SImode:
14179 case E_DImode:
14180 /* Force the value into a register if needed. */
14181 if (!aarch64_plus_operand (oldval, mode))
14182 oldval = force_reg (cmp_mode, oldval);
14183 break;
14185 default:
14186 gcc_unreachable ();
14189 switch (mode)
14191 case E_QImode: idx = 0; break;
14192 case E_HImode: idx = 1; break;
14193 case E_SImode: idx = 2; break;
14194 case E_DImode: idx = 3; break;
14195 default:
14196 gcc_unreachable ();
14198 if (TARGET_LSE)
14199 gen = atomic_cas[idx];
14200 else
14201 gen = split_cas[idx];
14203 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14205 if (mode == QImode || mode == HImode)
14206 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14208 x = gen_rtx_REG (CCmode, CC_REGNUM);
14209 x = gen_rtx_EQ (SImode, x, const0_rtx);
14210 emit_insn (gen_rtx_SET (bval, x));
14213 /* Test whether the target supports using an atomic load-operate instruction
14214 for operation CODE. Returns FALSE if the operation isn't supported by the
14215 architecture. */
14219 bool
14220 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14222 if (!TARGET_LSE)
14223 return false;
14225 switch (code)
14227 case SET:
14228 case AND:
14229 case IOR:
14230 case XOR:
14231 case MINUS:
14232 case PLUS:
14233 return true;
14234 default:
14235 return false;
14239 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
14240 sequence implementing an atomic operation. */
14242 static void
14243 aarch64_emit_post_barrier (enum memmodel model)
14245 const enum memmodel base_model = memmodel_base (model);
14247 if (is_mm_sync (model)
14248 && (base_model == MEMMODEL_ACQUIRE
14249 || base_model == MEMMODEL_ACQ_REL
14250 || base_model == MEMMODEL_SEQ_CST))
14252 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14256 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14257 for the data in memory. EXPECTED is the value expected to be in memory.
14258 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14259 is the memory ordering to use. */
14261 void
14262 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14263 rtx expected, rtx desired,
14264 rtx model)
14266 rtx (*gen) (rtx, rtx, rtx, rtx);
14267 machine_mode mode;
14269 mode = GET_MODE (mem);
14271 switch (mode)
14273 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14274 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14275 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14276 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14277 default:
14278 gcc_unreachable ();
14281 /* Move the expected value into the CAS destination register. */
14282 emit_insn (gen_rtx_SET (rval, expected));
14284 /* Emit the CAS. */
14285 emit_insn (gen (rval, mem, desired, model));
14287 /* Compare the expected value with the value loaded by the CAS, to establish
14288 whether the swap was made. */
14289 aarch64_gen_compare_reg (EQ, rval, expected);
14292 /* Split a compare and swap pattern. */
14294 void
14295 aarch64_split_compare_and_swap (rtx operands[])
14297 rtx rval, mem, oldval, newval, scratch;
14298 machine_mode mode;
14299 bool is_weak;
14300 rtx_code_label *label1, *label2;
14301 rtx x, cond;
14302 enum memmodel model;
14303 rtx model_rtx;
14305 rval = operands[0];
14306 mem = operands[1];
14307 oldval = operands[2];
14308 newval = operands[3];
14309 is_weak = (operands[4] != const0_rtx);
14310 model_rtx = operands[5];
14311 scratch = operands[7];
14312 mode = GET_MODE (mem);
14313 model = memmodel_from_int (INTVAL (model_rtx));
14315 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14316 loop:
14317 .label1:
14318 LD[A]XR rval, [mem]
14319 CBNZ rval, .label2
14320 ST[L]XR scratch, newval, [mem]
14321 CBNZ scratch, .label1
14322 .label2:
14323 CMP rval, 0. */
14324 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14326 label1 = NULL;
14327 if (!is_weak)
14329 label1 = gen_label_rtx ();
14330 emit_label (label1);
14332 label2 = gen_label_rtx ();
14334 /* The initial load can be relaxed for a __sync operation since a final
14335 barrier will be emitted to stop code hoisting. */
14336 if (is_mm_sync (model))
14337 aarch64_emit_load_exclusive (mode, rval, mem,
14338 GEN_INT (MEMMODEL_RELAXED));
14339 else
14340 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14342 if (strong_zero_p)
14344 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14345 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14346 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14347 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14349 else
14351 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14352 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14353 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14354 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14355 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14358 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14360 if (!is_weak)
14362 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14363 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14364 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14365 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14367 else
14369 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14370 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14371 emit_insn (gen_rtx_SET (cond, x));
14374 emit_label (label2);
14375 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14376 to set the condition flags. If this is not used it will be removed by
14377 later passes. */
14378 if (strong_zero_p)
14380 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14381 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14382 emit_insn (gen_rtx_SET (cond, x));
14384 /* Emit any final barrier needed for a __sync operation. */
14385 if (is_mm_sync (model))
14386 aarch64_emit_post_barrier (model);
14389 /* Emit a BIC instruction. */
14391 static void
14392 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14394 rtx shift_rtx = GEN_INT (shift);
14395 rtx (*gen) (rtx, rtx, rtx, rtx);
14397 switch (mode)
14399 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14400 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14401 default:
14402 gcc_unreachable ();
14405 emit_insn (gen (dst, s2, shift_rtx, s1));
14408 /* Emit an atomic swap. */
14410 static void
14411 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14412 rtx mem, rtx model)
14414 rtx (*gen) (rtx, rtx, rtx, rtx);
14416 switch (mode)
14418 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14419 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14420 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14421 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14422 default:
14423 gcc_unreachable ();
14426 emit_insn (gen (dst, mem, value, model));
14429 /* Operations supported by aarch64_emit_atomic_load_op. */
14431 enum aarch64_atomic_load_op_code
14433 AARCH64_LDOP_PLUS, /* A + B */
14434 AARCH64_LDOP_XOR, /* A ^ B */
14435 AARCH64_LDOP_OR, /* A | B */
14436 AARCH64_LDOP_BIC /* A & ~B */
14439 /* Emit an atomic load-operate. */
14441 static void
14442 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14443 machine_mode mode, rtx dst, rtx src,
14444 rtx mem, rtx model)
14446 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14447 const aarch64_atomic_load_op_fn plus[] =
14449 gen_aarch64_atomic_loadaddqi,
14450 gen_aarch64_atomic_loadaddhi,
14451 gen_aarch64_atomic_loadaddsi,
14452 gen_aarch64_atomic_loadadddi
14454 const aarch64_atomic_load_op_fn eor[] =
14456 gen_aarch64_atomic_loadeorqi,
14457 gen_aarch64_atomic_loadeorhi,
14458 gen_aarch64_atomic_loadeorsi,
14459 gen_aarch64_atomic_loadeordi
14461 const aarch64_atomic_load_op_fn ior[] =
14463 gen_aarch64_atomic_loadsetqi,
14464 gen_aarch64_atomic_loadsethi,
14465 gen_aarch64_atomic_loadsetsi,
14466 gen_aarch64_atomic_loadsetdi
14468 const aarch64_atomic_load_op_fn bic[] =
14470 gen_aarch64_atomic_loadclrqi,
14471 gen_aarch64_atomic_loadclrhi,
14472 gen_aarch64_atomic_loadclrsi,
14473 gen_aarch64_atomic_loadclrdi
14475 aarch64_atomic_load_op_fn gen;
14476 int idx = 0;
14478 switch (mode)
14480 case E_QImode: idx = 0; break;
14481 case E_HImode: idx = 1; break;
14482 case E_SImode: idx = 2; break;
14483 case E_DImode: idx = 3; break;
14484 default:
14485 gcc_unreachable ();
14488 switch (code)
14490 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14491 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14492 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14493 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14494 default:
14495 gcc_unreachable ();
14498 emit_insn (gen (dst, mem, src, model));
14501 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14502 location to store the data read from memory. OUT_RESULT is the location to
14503 store the result of the operation. MEM is the memory location to read and
14504 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14505 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14506 be NULL. */
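/* Added summary (based on the code below): SET is handled by an atomic swap;
   PLUS and MINUS use the load-add form (negating the value for MINUS);
   IOR uses load-set, XOR uses load-eor, and AND is implemented as a
   load-clear of the complemented value.  */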
14508 void
14509 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14510 rtx mem, rtx value, rtx model_rtx)
14512 machine_mode mode = GET_MODE (mem);
14513 machine_mode wmode = (mode == DImode ? DImode : SImode);
14514 const bool short_mode = (mode < SImode);
14515 aarch64_atomic_load_op_code ldop_code;
14516 rtx src;
14517 rtx x;
14519 if (out_data)
14520 out_data = gen_lowpart (mode, out_data);
14522 if (out_result)
14523 out_result = gen_lowpart (mode, out_result);
14525 /* Make sure the value is in a register, putting it into a destination
14526 register if it needs to be manipulated. */
14527 if (!register_operand (value, mode)
14528 || code == AND || code == MINUS)
14530 src = out_result ? out_result : out_data;
14531 emit_move_insn (src, gen_lowpart (mode, value));
14533 else
14534 src = value;
14535 gcc_assert (register_operand (src, mode));
14537 /* Preprocess the data for the operation as necessary. If the operation is
14538 a SET then emit a swap instruction and finish. */
14539 switch (code)
14541 case SET:
14542 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14543 return;
14545 case MINUS:
14546 /* Negate the value and treat it as a PLUS. */
14548 rtx neg_src;
14550 /* Resize the value if necessary. */
14551 if (short_mode)
14552 src = gen_lowpart (wmode, src);
14554 neg_src = gen_rtx_NEG (wmode, src);
14555 emit_insn (gen_rtx_SET (src, neg_src));
14557 if (short_mode)
14558 src = gen_lowpart (mode, src);
14560 /* Fall-through. */
14561 case PLUS:
14562 ldop_code = AARCH64_LDOP_PLUS;
14563 break;
14565 case IOR:
14566 ldop_code = AARCH64_LDOP_OR;
14567 break;
14569 case XOR:
14570 ldop_code = AARCH64_LDOP_XOR;
14571 break;
14573 case AND:
14575 rtx not_src;
14577 /* Resize the value if necessary. */
14578 if (short_mode)
14579 src = gen_lowpart (wmode, src);
14581 not_src = gen_rtx_NOT (wmode, src);
14582 emit_insn (gen_rtx_SET (src, not_src));
14584 if (short_mode)
14585 src = gen_lowpart (mode, src);
14587 ldop_code = AARCH64_LDOP_BIC;
14588 break;
14590 default:
14591 /* The operation can't be done with atomic instructions. */
14592 gcc_unreachable ();
14595 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14597 /* If necessary, calculate the data in memory after the update by redoing the
14598 operation from values in registers. */
14599 if (!out_result)
14600 return;
14602 if (short_mode)
14604 src = gen_lowpart (wmode, src);
14605 out_data = gen_lowpart (wmode, out_data);
14606 out_result = gen_lowpart (wmode, out_result);
14609 x = NULL_RTX;
14611 switch (code)
14613 case MINUS:
14614 case PLUS:
14615 x = gen_rtx_PLUS (wmode, out_data, src);
14616 break;
14617 case IOR:
14618 x = gen_rtx_IOR (wmode, out_data, src);
14619 break;
14620 case XOR:
14621 x = gen_rtx_XOR (wmode, out_data, src);
14622 break;
14623 case AND:
14624 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14625 return;
14626 default:
14627 gcc_unreachable ();
14630 emit_set_insn (out_result, x);
14632 return;
14635 /* Split an atomic operation. */
14637 void
14638 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14639 rtx value, rtx model_rtx, rtx cond)
14641 machine_mode mode = GET_MODE (mem);
14642 machine_mode wmode = (mode == DImode ? DImode : SImode);
14643 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14644 const bool is_sync = is_mm_sync (model);
14645 rtx_code_label *label;
14646 rtx x;
14648 /* Split the atomic operation into a sequence. */
14649 label = gen_label_rtx ();
14650 emit_label (label);
14652 if (new_out)
14653 new_out = gen_lowpart (wmode, new_out);
14654 if (old_out)
14655 old_out = gen_lowpart (wmode, old_out);
14656 else
14657 old_out = new_out;
14658 value = simplify_gen_subreg (wmode, value, mode, 0);
14660 /* The initial load can be relaxed for a __sync operation since a final
14661 barrier will be emitted to stop code hoisting. */
14662 if (is_sync)
14663 aarch64_emit_load_exclusive (mode, old_out, mem,
14664 GEN_INT (MEMMODEL_RELAXED));
14665 else
14666 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14668 switch (code)
14670 case SET:
14671 new_out = value;
14672 break;
14674 case NOT:
14675 x = gen_rtx_AND (wmode, old_out, value);
14676 emit_insn (gen_rtx_SET (new_out, x));
14677 x = gen_rtx_NOT (wmode, new_out);
14678 emit_insn (gen_rtx_SET (new_out, x));
14679 break;
14681 case MINUS:
14682 if (CONST_INT_P (value))
14684 value = GEN_INT (-INTVAL (value));
14685 code = PLUS;
14687 /* Fall through. */
14689 default:
14690 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14691 emit_insn (gen_rtx_SET (new_out, x));
14692 break;
14695 aarch64_emit_store_exclusive (mode, cond, mem,
14696 gen_lowpart (mode, new_out), model_rtx);
14698 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14699 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14700 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14701 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14703 /* Emit any final barrier needed for a __sync operation. */
14704 if (is_sync)
14705 aarch64_emit_post_barrier (model);
14708 static void
14709 aarch64_init_libfuncs (void)
14711 /* Half-precision float operations. The compiler handles all operations
14712 with NULL libfuncs by converting to SFmode. */
14714 /* Conversions. */
14715 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14716 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14718 /* Arithmetic. */
14719 set_optab_libfunc (add_optab, HFmode, NULL);
14720 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14721 set_optab_libfunc (smul_optab, HFmode, NULL);
14722 set_optab_libfunc (neg_optab, HFmode, NULL);
14723 set_optab_libfunc (sub_optab, HFmode, NULL);
14725 /* Comparisons. */
14726 set_optab_libfunc (eq_optab, HFmode, NULL);
14727 set_optab_libfunc (ne_optab, HFmode, NULL);
14728 set_optab_libfunc (lt_optab, HFmode, NULL);
14729 set_optab_libfunc (le_optab, HFmode, NULL);
14730 set_optab_libfunc (ge_optab, HFmode, NULL);
14731 set_optab_libfunc (gt_optab, HFmode, NULL);
14732 set_optab_libfunc (unord_optab, HFmode, NULL);
14735 /* Target hook for c_mode_for_suffix. */
14736 static machine_mode
14737 aarch64_c_mode_for_suffix (char suffix)
14739 if (suffix == 'q')
14740 return TFmode;
14742 return VOIDmode;
14745 /* We can only represent floating point constants which will fit in
14746 "quarter-precision" values. These values are characterised by
14747 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14750 (-1)^s * (n/16) * 2^r
14752 Where:
14753 's' is the sign bit.
14754 'n' is an integer in the range 16 <= n <= 31.
14755 'r' is an integer in the range -3 <= r <= 4. */
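/* Worked examples (added for illustration): 1.0 = (16/16) * 2^0 and
   2.5 = (20/16) * 2^1 are representable, whereas 0.0 is not, since n is
   at least 16.  */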
14757 /* Return true iff X can be represented by a quarter-precision
14758 floating point immediate operand X. Note, we cannot represent 0.0. */
14759 bool
14760 aarch64_float_const_representable_p (rtx x)
14762 /* This represents our current view of how many bits
14763 make up the mantissa. */
14764 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14765 int exponent;
14766 unsigned HOST_WIDE_INT mantissa, mask;
14767 REAL_VALUE_TYPE r, m;
14768 bool fail;
14770 if (!CONST_DOUBLE_P (x))
14771 return false;
14773 /* We don't support HFmode constants yet. */
14774 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14775 return false;
14777 r = *CONST_DOUBLE_REAL_VALUE (x);
14779 /* We cannot represent infinities, NaNs or +/-zero. We won't
14780 know if we have +zero until we analyse the mantissa, but we
14781 can reject the other invalid values. */
14782 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14783 || REAL_VALUE_MINUS_ZERO (r))
14784 return false;
14786 /* Extract exponent. */
14787 r = real_value_abs (&r);
14788 exponent = REAL_EXP (&r);
14790 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14791 highest (sign) bit, with a fixed binary point at bit point_pos.
14792 m1 holds the low part of the mantissa, m2 the high part.
14793 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14794 bits for the mantissa, this can fail (low bits will be lost). */
14795 real_ldexp (&m, &r, point_pos - exponent);
14796 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14798 /* If the low part of the mantissa has bits set we cannot represent
14799 the value. */
14800 if (w.ulow () != 0)
14801 return false;
14802 /* We have rejected the lower HOST_WIDE_INT, so update our
14803 understanding of how many bits lie in the mantissa and
14804 look only at the high HOST_WIDE_INT. */
14805 mantissa = w.elt (1);
14806 point_pos -= HOST_BITS_PER_WIDE_INT;
14808 /* We can only represent values with a mantissa of the form 1.xxxx. */
14809 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14810 if ((mantissa & mask) != 0)
14811 return false;
14813 /* Having filtered unrepresentable values, we may now remove all
14814 but the highest 5 bits. */
14815 mantissa >>= point_pos - 5;
14817 /* We cannot represent the value 0.0, so reject it. This is handled
14818 elsewhere. */
14819 if (mantissa == 0)
14820 return false;
14822 /* Then, as bit 4 is always set, we can mask it off, leaving
14823 the mantissa in the range [0, 15]. */
14824 mantissa &= ~(1 << 4);
14825 gcc_assert (mantissa <= 15);
14827 /* GCC internally does not use IEEE754-like encoding (where normalized
14828 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14829 Our mantissa values are shifted 4 places to the left relative to
14830 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14831 by 5 places to correct for GCC's representation. */
14832 exponent = 5 - exponent;
14834 return (exponent >= 0 && exponent <= 7);
14837 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14838 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14839 output MOVI/MVNI, ORR or BIC immediate. */
14840 char*
14841 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14842 enum simd_immediate_check which)
14844 bool is_valid;
14845 static char templ[40];
14846 const char *mnemonic;
14847 const char *shift_op;
14848 unsigned int lane_count = 0;
14849 char element_char;
14851 struct simd_immediate_info info;
14853 /* This will return true to show const_vector is legal for use as either
13854 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14855 It will also update INFO to show how the immediate should be generated.
14856 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14857 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14858 gcc_assert (is_valid);
14860 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14861 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14863 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14865 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14866 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14867 move immediate path. */
14868 if (aarch64_float_const_zero_rtx_p (info.value))
14869 info.value = GEN_INT (0);
14870 else
14872 const unsigned int buf_size = 20;
14873 char float_buf[buf_size] = {'\0'};
14874 real_to_decimal_for_mode (float_buf,
14875 CONST_DOUBLE_REAL_VALUE (info.value),
14876 buf_size, buf_size, 1, info.elt_mode);
14878 if (lane_count == 1)
14879 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14880 else
14881 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14882 lane_count, element_char, float_buf);
14883 return templ;
14887 gcc_assert (CONST_INT_P (info.value));
14889 if (which == AARCH64_CHECK_MOV)
14891 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14892 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14893 if (lane_count == 1)
14894 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14895 mnemonic, UINTVAL (info.value));
14896 else if (info.shift)
14897 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14898 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14899 element_char, UINTVAL (info.value), shift_op, info.shift);
14900 else
14901 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14902 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14903 element_char, UINTVAL (info.value));
14905 else
14907 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14908 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14909 if (info.shift)
14910 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14911 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14912 element_char, UINTVAL (info.value), "lsl", info.shift);
14913 else
14914 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14915 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14916 element_char, UINTVAL (info.value));
14918 return templ;
14921 char*
14922 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14925 /* If a floating point number was passed and we desire to use it in an
14926 integer mode, do the conversion to integer. */
14927 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14929 unsigned HOST_WIDE_INT ival;
14930 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14931 gcc_unreachable ();
14932 immediate = gen_int_mode (ival, mode);
14935 machine_mode vmode;
14936 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
14937 a 128-bit vector mode. */
14938 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14940 vmode = aarch64_simd_container_mode (mode, width);
14941 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14942 return aarch64_output_simd_mov_immediate (v_op, width);
14945 /* Return the output string to use for moving immediate CONST_VECTOR
14946 into an SVE register. */
14948 char *
14949 aarch64_output_sve_mov_immediate (rtx const_vector)
14951 static char templ[40];
14952 struct simd_immediate_info info;
14953 char element_char;
14955 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14956 gcc_assert (is_valid);
14958 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14960 if (info.step)
14962 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14963 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14964 element_char, INTVAL (info.value), INTVAL (info.step));
14965 return templ;
14968 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14970 if (aarch64_float_const_zero_rtx_p (info.value))
14971 info.value = GEN_INT (0);
14972 else
14974 const int buf_size = 20;
14975 char float_buf[buf_size] = {};
14976 real_to_decimal_for_mode (float_buf,
14977 CONST_DOUBLE_REAL_VALUE (info.value),
14978 buf_size, buf_size, 1, info.elt_mode);
14980 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14981 element_char, float_buf);
14982 return templ;
14986 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14987 element_char, INTVAL (info.value));
14988 return templ;
14991 /* Return the asm format for a PTRUE instruction whose destination has
14992 mode MODE. SUFFIX is the element size suffix. */
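/* Added examples of the returned templates: with SUFFIX 'b' and a constant
   16 elements the result is "ptrue\t%0.b, vl16"; for variable-length modes
   it is "ptrue\t%0.b, all".  */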
14994 char *
14995 aarch64_output_ptrue (machine_mode mode, char suffix)
14997 unsigned int nunits;
14998 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
14999 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15000 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15001 else
15002 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15003 return buf;
15006 /* Split operands into moves from op[1] + op[2] into op[0]. */
15008 void
15009 aarch64_split_combinev16qi (rtx operands[3])
15011 unsigned int dest = REGNO (operands[0]);
15012 unsigned int src1 = REGNO (operands[1]);
15013 unsigned int src2 = REGNO (operands[2]);
15014 machine_mode halfmode = GET_MODE (operands[1]);
15015 unsigned int halfregs = REG_NREGS (operands[1]);
15016 rtx destlo, desthi;
15018 gcc_assert (halfmode == V16QImode);
15020 if (src1 == dest && src2 == dest + halfregs)
15022 /* No-op move. Can't split to nothing; emit something. */
15023 emit_note (NOTE_INSN_DELETED);
15024 return;
15027 /* Preserve register attributes for variable tracking. */
15028 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15029 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15030 GET_MODE_SIZE (halfmode));
15032 /* Special case of reversed high/low parts. */
15033 if (reg_overlap_mentioned_p (operands[2], destlo)
15034 && reg_overlap_mentioned_p (operands[1], desthi))
15036 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15037 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15038 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15040 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15042 /* Try to avoid unnecessary moves if part of the result
15043 is in the right place already. */
15044 if (src1 != dest)
15045 emit_move_insn (destlo, operands[1]);
15046 if (src2 != dest + halfregs)
15047 emit_move_insn (desthi, operands[2]);
15049 else
15051 if (src2 != dest + halfregs)
15052 emit_move_insn (desthi, operands[2]);
15053 if (src1 != dest)
15054 emit_move_insn (destlo, operands[1]);
15058 /* vec_perm support. */
15060 struct expand_vec_perm_d
15062 rtx target, op0, op1;
15063 vec_perm_indices perm;
15064 machine_mode vmode;
15065 unsigned int vec_flags;
15066 bool one_vector_p;
15067 bool testing_p;
15070 /* Generate a variable permutation. */
15072 static void
15073 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15075 machine_mode vmode = GET_MODE (target);
15076 bool one_vector_p = rtx_equal_p (op0, op1);
15078 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15079 gcc_checking_assert (GET_MODE (op0) == vmode);
15080 gcc_checking_assert (GET_MODE (op1) == vmode);
15081 gcc_checking_assert (GET_MODE (sel) == vmode);
15082 gcc_checking_assert (TARGET_SIMD);
15084 if (one_vector_p)
15086 if (vmode == V8QImode)
15088 /* Expand the argument to a V16QI mode by duplicating it. */
15089 rtx pair = gen_reg_rtx (V16QImode);
15090 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15091 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15093 else
15095 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15098 else
15100 rtx pair;
15102 if (vmode == V8QImode)
15104 pair = gen_reg_rtx (V16QImode);
15105 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15106 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15108 else
15110 pair = gen_reg_rtx (OImode);
15111 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15112 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15117 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15118 NELT is the number of elements in the vector. */
15120 void
15121 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15122 unsigned int nelt)
15124 machine_mode vmode = GET_MODE (target);
15125 bool one_vector_p = rtx_equal_p (op0, op1);
15126 rtx mask;
15128 /* The TBL instruction does not use a modulo index, so we must take care
15129 of that ourselves. */
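/* Added example: for two distinct V16QI inputs, NELT is 16 and the indices
   are ANDed with 31, so an out-of-range index such as 33 wraps to 1 and
   selects byte 1 of the concatenated 32-byte table.  */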
15130 mask = aarch64_simd_gen_const_vector_dup (vmode,
15131 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15132 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15134 /* For big-endian, we also need to reverse the index within the vector
15135 (but not which vector). */
15136 if (BYTES_BIG_ENDIAN)
15138 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15139 if (!one_vector_p)
15140 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15141 sel = expand_simple_binop (vmode, XOR, sel, mask,
15142 NULL, 0, OPTAB_LIB_WIDEN);
15144 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15147 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15149 static void
15150 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15152 emit_insn (gen_rtx_SET (target,
15153 gen_rtx_UNSPEC (GET_MODE (target),
15154 gen_rtvec (2, op0, op1), code)));
15157 /* Expand an SVE vec_perm with the given operands. */
15159 void
15160 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15162 machine_mode data_mode = GET_MODE (target);
15163 machine_mode sel_mode = GET_MODE (sel);
15164 /* Enforced by the pattern condition. */
15165 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15167 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15168 size of the two value vectors, i.e. the upper bits of the indices
15169 are effectively ignored. SVE TBL instead produces 0 for any
15170 out-of-range indices, so we need to modulo all the vec_perm indices
15171 to ensure they are all in range. */
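/* Added example: with NUNITS == 4, a selector element of 5 is out of range
   for the first TBL (which therefore yields 0), while the second TBL uses
   5 - 4 = 1 and reads lane 1 of OP1; ORing the two results gives the
   expected value.  */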
15172 rtx sel_reg = force_reg (sel_mode, sel);
15174 /* Check if the sel only references the first values vector. */
15175 if (GET_CODE (sel) == CONST_VECTOR
15176 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15178 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15179 return;
15182 /* Check if the two values vectors are the same. */
15183 if (rtx_equal_p (op0, op1))
15185 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15186 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15187 NULL, 0, OPTAB_DIRECT);
15188 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15189 return;
15192 /* Run TBL on each value vector and combine the results. */
15194 rtx res0 = gen_reg_rtx (data_mode);
15195 rtx res1 = gen_reg_rtx (data_mode);
15196 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15197 if (GET_CODE (sel) != CONST_VECTOR
15198 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15200 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15201 2 * nunits - 1);
15202 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15203 NULL, 0, OPTAB_DIRECT);
15205 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15206 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15207 NULL, 0, OPTAB_DIRECT);
15208 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15209 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15210 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15211 else
15212 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15215 /* Recognize patterns suitable for the TRN instructions. */
15216 static bool
15217 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15219 HOST_WIDE_INT odd;
15220 poly_uint64 nelt = d->perm.length ();
15221 rtx out, in0, in1, x;
15222 machine_mode vmode = d->vmode;
15224 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15225 return false;
15227 /* Note that these are little-endian tests.
15228 We correct for big-endian later. */
15229 if (!d->perm[0].is_constant (&odd)
15230 || (odd != 0 && odd != 1)
15231 || !d->perm.series_p (0, 2, odd, 2)
15232 || !d->perm.series_p (1, 2, nelt + odd, 2))
15233 return false;
15235 /* Success! */
15236 if (d->testing_p)
15237 return true;
15239 in0 = d->op0;
15240 in1 = d->op1;
15241 /* We don't need a big-endian lane correction for SVE; see the comment
15242 at the head of aarch64-sve.md for details. */
15243 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15245 x = in0, in0 = in1, in1 = x;
15246 odd = !odd;
15248 out = d->target;
15250 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15251 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15252 return true;
15255 /* Recognize patterns suitable for the UZP instructions. */
15256 static bool
15257 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15259 HOST_WIDE_INT odd;
15260 rtx out, in0, in1, x;
15261 machine_mode vmode = d->vmode;
15263 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15264 return false;
15266 /* Note that these are little-endian tests.
15267 We correct for big-endian later. */
15268 if (!d->perm[0].is_constant (&odd)
15269 || (odd != 0 && odd != 1)
15270 || !d->perm.series_p (0, 1, odd, 2))
15271 return false;
15273 /* Success! */
15274 if (d->testing_p)
15275 return true;
15277 in0 = d->op0;
15278 in1 = d->op1;
15279 /* We don't need a big-endian lane correction for SVE; see the comment
15280 at the head of aarch64-sve.md for details. */
15281 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15283 x = in0, in0 = in1, in1 = x;
15284 odd = !odd;
15286 out = d->target;
15288 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15289 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15290 return true;
15293 /* Recognize patterns suitable for the ZIP instructions. */
15294 static bool
15295 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15297 unsigned int high;
15298 poly_uint64 nelt = d->perm.length ();
15299 rtx out, in0, in1, x;
15300 machine_mode vmode = d->vmode;
15302 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15303 return false;
15305 /* Note that these are little-endian tests.
15306 We correct for big-endian later. */
15307 poly_uint64 first = d->perm[0];
15308 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15309 || !d->perm.series_p (0, 2, first, 1)
15310 || !d->perm.series_p (1, 2, first + nelt, 1))
15311 return false;
15312 high = maybe_ne (first, 0U);
15314 /* Success! */
15315 if (d->testing_p)
15316 return true;
15318 in0 = d->op0;
15319 in1 = d->op1;
15320 /* We don't need a big-endian lane correction for SVE; see the comment
15321 at the head of aarch64-sve.md for details. */
15322 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15324 x = in0, in0 = in1, in1 = x;
15325 high = !high;
15327 out = d->target;
15329 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15330 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15331 return true;
15334 /* Recognize patterns for the EXT insn. */
15336 static bool
15337 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15339 HOST_WIDE_INT location;
15340 rtx offset;
15342 /* The first element always refers to the first vector.
15343 Check if the extracted indices are increasing by one. */
15344 if (d->vec_flags == VEC_SVE_PRED
15345 || !d->perm[0].is_constant (&location)
15346 || !d->perm.series_p (0, 1, location, 1))
15347 return false;
15349 /* Success! */
15350 if (d->testing_p)
15351 return true;
15353 /* The case where (location == 0) is a no-op for both big- and little-endian,
15354 and is removed by the mid-end at optimization levels -O1 and higher.
15356 We don't need a big-endian lane correction for SVE; see the comment
15357 at the head of aarch64-sve.md for details. */
15358 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15360 /* After setup, we want the high elements of the first vector (stored
15361 at the LSB end of the register), and the low elements of the second
15362 vector (stored at the MSB end of the register). So swap. */
15363 std::swap (d->op0, d->op1);
15364 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15365 to_constant () is safe since this is restricted to Advanced SIMD
15366 vectors. */
15367 location = d->perm.length ().to_constant () - location;
15370 offset = GEN_INT (location);
15371 emit_set_insn (d->target,
15372 gen_rtx_UNSPEC (d->vmode,
15373 gen_rtvec (3, d->op0, d->op1, offset),
15374 UNSPEC_EXT));
15375 return true;
15378 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15379 within each 64-bit, 32-bit or 16-bit granule. */
15381 static bool
15382 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15384 HOST_WIDE_INT diff;
15385 unsigned int i, size, unspec;
15386 machine_mode pred_mode;
15388 if (d->vec_flags == VEC_SVE_PRED
15389 || !d->one_vector_p
15390 || !d->perm[0].is_constant (&diff))
15391 return false;
15393 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15394 if (size == 8)
15396 unspec = UNSPEC_REV64;
15397 pred_mode = VNx2BImode;
15399 else if (size == 4)
15401 unspec = UNSPEC_REV32;
15402 pred_mode = VNx4BImode;
15404 else if (size == 2)
15406 unspec = UNSPEC_REV16;
15407 pred_mode = VNx8BImode;
15409 else
15410 return false;
15412 unsigned int step = diff + 1;
15413 for (i = 0; i < step; ++i)
15414 if (!d->perm.series_p (i, step, diff - i, step))
15415 return false;
15417 /* Success! */
15418 if (d->testing_p)
15419 return true;
15421 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15422 if (d->vec_flags == VEC_SVE_DATA)
15424 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15425 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15426 UNSPEC_MERGE_PTRUE);
15428 emit_set_insn (d->target, src);
15429 return true;
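/* Illustrative sketch (annotation, not part of this file): swapping the two
   SImode elements inside each 64-bit half gives diff == 1 and size == 8,
   i.e. the REV64 case above.  This is expected to become a single REV64 on
   Advanced SIMD; the names are made up for the example.  */
typedef int v4si_rev_sketch __attribute__ ((vector_size (16)));

v4si_rev_sketch
rev64_sketch (v4si_rev_sketch a)
{
  return __builtin_shuffle (a, (v4si_rev_sketch) { 1, 0, 3, 2 });
}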
15432 /* Recognize patterns for the REV insn, which reverses elements within
15433 a full vector. */
15435 static bool
15436 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15438 poly_uint64 nelt = d->perm.length ();
15440 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15441 return false;
15443 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15444 return false;
15446 /* Success! */
15447 if (d->testing_p)
15448 return true;
15450 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15451 emit_set_insn (d->target, src);
15452 return true;
15455 static bool
15456 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15458 rtx out = d->target;
15459 rtx in0;
15460 HOST_WIDE_INT elt;
15461 machine_mode vmode = d->vmode;
15462 rtx lane;
15464 if (d->vec_flags == VEC_SVE_PRED
15465 || d->perm.encoding ().encoded_nelts () != 1
15466 || !d->perm[0].is_constant (&elt))
15467 return false;
15469 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15470 return false;
15472 /* Success! */
15473 if (d->testing_p)
15474 return true;
15476 /* The generic preparation in aarch64_expand_vec_perm_const_1
15477 swaps the operand order and the permute indices if it finds
15478 d->perm[0] to be in the second operand. Thus, we can always
15479 use d->op0 and need not do any extra arithmetic to get the
15480 correct lane number. */
15481 in0 = d->op0;
15482 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15484 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15485 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15486 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15487 return true;
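/* Illustrative sketch (annotation, not part of this file): a shuffle whose
   encoding repeats a single constant index is the case handled above and
   is expected to become a lane DUP (e.g. DUP Vd.4S, Vn.S[2]); the names are
   made up for the example and the exact output depends on options.  */
typedef int v4si_dup_sketch __attribute__ ((vector_size (16)));

v4si_dup_sketch
dup_lane2_sketch (v4si_dup_sketch a)
{
  return __builtin_shuffle (a, (v4si_dup_sketch) { 2, 2, 2, 2 });
}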
15490 static bool
15491 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15493 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15494 machine_mode vmode = d->vmode;
15496 /* Make sure that the indices are constant. */
15497 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15498 for (unsigned int i = 0; i < encoded_nelts; ++i)
15499 if (!d->perm[i].is_constant ())
15500 return false;
15502 if (d->testing_p)
15503 return true;
15505 /* Generic code will try constant permutation twice. Once with the
15506 original mode and again with the elements lowered to QImode.
15507 So wait and don't do the selector expansion ourselves. */
15508 if (vmode != V8QImode && vmode != V16QImode)
15509 return false;
15511 /* to_constant is safe since this routine is specific to Advanced SIMD
15512 vectors. */
15513 unsigned int nelt = d->perm.length ().to_constant ();
15514 for (unsigned int i = 0; i < nelt; ++i)
15515 /* If big-endian and two vectors we end up with a weird mixed-endian
15516 mode on NEON. Reverse the index within each word but not the word
15517 itself. to_constant is safe because we checked is_constant above. */
15518 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15519 ? d->perm[i].to_constant () ^ (nelt - 1)
15520 : d->perm[i].to_constant ());
15522 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15523 sel = force_reg (vmode, sel);
15525 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15526 return true;
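/* Illustrative sketch (annotation, not part of this file): a byte
   permutation with no ZIP/UZP/TRN/EXT/REV/DUP structure is expected to fall
   through to this TBL path for V8QI/V16QI and be emitted as a table lookup;
   the names are made up and the exact sequence depends on options.  */
typedef unsigned char v8qi_tbl_sketch __attribute__ ((vector_size (8)));

v8qi_tbl_sketch
scramble_sketch (v8qi_tbl_sketch a, v8qi_tbl_sketch b)
{
  return __builtin_shuffle (a, b,
			    (v8qi_tbl_sketch) { 0, 11, 2, 9, 4, 15, 6, 13 });
}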
15529 /* Try to implement D using an SVE TBL instruction. */
15531 static bool
15532 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15534 unsigned HOST_WIDE_INT nelt;
15536 /* Permuting two variable-length vectors could overflow the
15537 index range. */
15538 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15539 return false;
15541 if (d->testing_p)
15542 return true;
15544 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15545 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15546 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15547 return true;
15550 static bool
15551 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15553 /* The pattern matching functions above are written to look for a small
15554 number to begin the sequence (0, 1, N/2). If we begin with an index
15555 from the second operand, we can swap the operands. */
15556 poly_int64 nelt = d->perm.length ();
15557 if (known_ge (d->perm[0], nelt))
15559 d->perm.rotate_inputs (1);
15560 std::swap (d->op0, d->op1);
15563 if ((d->vec_flags == VEC_ADVSIMD
15564 || d->vec_flags == VEC_SVE_DATA
15565 || d->vec_flags == VEC_SVE_PRED)
15566 && known_gt (nelt, 1))
15568 if (aarch64_evpc_rev_local (d))
15569 return true;
15570 else if (aarch64_evpc_rev_global (d))
15571 return true;
15572 else if (aarch64_evpc_ext (d))
15573 return true;
15574 else if (aarch64_evpc_dup (d))
15575 return true;
15576 else if (aarch64_evpc_zip (d))
15577 return true;
15578 else if (aarch64_evpc_uzp (d))
15579 return true;
15580 else if (aarch64_evpc_trn (d))
15581 return true;
15582 if (d->vec_flags == VEC_SVE_DATA)
15583 return aarch64_evpc_sve_tbl (d);
15585 else if (d->vec_flags == VEC_ADVSIMD)
15585 return aarch64_evpc_tbl (d);
15587 return false;
15590 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15592 static bool
15593 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15594 rtx op1, const vec_perm_indices &sel)
15596 struct expand_vec_perm_d d;
15598 /* Check whether the mask can be applied to a single vector. */
15599 if (op0 && rtx_equal_p (op0, op1))
15600 d.one_vector_p = true;
15601 else if (sel.all_from_input_p (0))
15603 d.one_vector_p = true;
15604 op1 = op0;
15606 else if (sel.all_from_input_p (1))
15608 d.one_vector_p = true;
15609 op0 = op1;
15611 else
15612 d.one_vector_p = false;
15614 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15615 sel.nelts_per_input ());
15616 d.vmode = vmode;
15617 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15618 d.target = target;
15619 d.op0 = op0;
15620 d.op1 = op1;
15621 d.testing_p = !target;
15623 if (!d.testing_p)
15624 return aarch64_expand_vec_perm_const_1 (&d);
15626 rtx_insn *last = get_last_insn ();
15627 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15628 gcc_assert (last == get_last_insn ());
15630 return ret;
15633 /* Generate a byte permute mask for a register of mode MODE,
15634 which has NUNITS units. */
15636 rtx
15637 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15639 /* We have to reverse each vector because we don't have
15640 a permuted load that can reverse-load according to ABI rules. */
15641 rtx mask;
15642 rtvec v = rtvec_alloc (16);
15643 unsigned int i, j;
15644 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15646 gcc_assert (BYTES_BIG_ENDIAN);
15647 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15649 for (i = 0; i < nunits; i++)
15650 for (j = 0; j < usize; j++)
15651 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15652 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15653 return force_reg (V16QImode, mask);
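/* Worked example (annotation, not part of the original source): for V8HImode
   (NUNITS == 8, unit size 2) the loop above builds the byte selector
   { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 },
   i.e. the bytes within each element are reversed while the element order is
   left unchanged.  */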
15656 /* Return true if X is a valid second operand for the SVE instruction
15657 that implements integer comparison OP_CODE. */
15659 static bool
15660 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15662 if (register_operand (x, VOIDmode))
15663 return true;
15665 switch (op_code)
15667 case LTU:
15668 case LEU:
15669 case GEU:
15670 case GTU:
15671 return aarch64_sve_cmp_immediate_p (x, false);
15672 case LT:
15673 case LE:
15674 case GE:
15675 case GT:
15676 case NE:
15677 case EQ:
15678 return aarch64_sve_cmp_immediate_p (x, true);
15679 default:
15680 gcc_unreachable ();
15684 /* Return the UNSPEC_COND_* code for comparison CODE. */
15686 static unsigned int
15687 aarch64_unspec_cond_code (rtx_code code)
15689 switch (code)
15691 case NE:
15692 return UNSPEC_COND_NE;
15693 case EQ:
15694 return UNSPEC_COND_EQ;
15695 case LT:
15696 return UNSPEC_COND_LT;
15697 case GT:
15698 return UNSPEC_COND_GT;
15699 case LE:
15700 return UNSPEC_COND_LE;
15701 case GE:
15702 return UNSPEC_COND_GE;
15703 case LTU:
15704 return UNSPEC_COND_LO;
15705 case GTU:
15706 return UNSPEC_COND_HI;
15707 case LEU:
15708 return UNSPEC_COND_LS;
15709 case GEU:
15710 return UNSPEC_COND_HS;
15711 case UNORDERED:
15712 return UNSPEC_COND_UO;
15713 default:
15714 gcc_unreachable ();
15718 /* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
15719 where <X> is the operation associated with comparison CODE. */
15721 static rtx
15722 aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
15723 rtx pred, rtx op0, rtx op1)
15725 rtvec vec = gen_rtvec (3, pred, op0, op1);
15726 return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
15729 /* Expand an SVE integer comparison:
15731 TARGET = CODE (OP0, OP1). */
15733 void
15734 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15736 machine_mode pred_mode = GET_MODE (target);
15737 machine_mode data_mode = GET_MODE (op0);
15739 if (!aarch64_sve_cmp_operand_p (code, op1))
15740 op1 = force_reg (data_mode, op1);
15742 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15743 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
15744 emit_insn (gen_set_clobber_cc (target, unspec));
15747 /* Emit an instruction:
15749 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15751 where <X> is the operation associated with comparison CODE. */
15753 static void
15754 aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
15755 rtx pred, rtx op0, rtx op1)
15757 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
15758 emit_set_insn (target, unspec);
15761 /* Emit:
15763 (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
15764 (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
15765 (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))
15767 where <Xi> is the operation associated with comparison CODEi. */
15769 static void
15770 aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
15771 machine_mode pred_mode, rtx ptrue,
15772 rtx op0, rtx op1)
15774 rtx tmp1 = gen_reg_rtx (pred_mode);
15775 aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
15776 rtx tmp2 = gen_reg_rtx (pred_mode);
15777 aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
15778 emit_set_insn (target, gen_rtx_AND (pred_mode,
15779 gen_rtx_IOR (pred_mode, tmp1, tmp2),
15780 ptrue));
15783 /* If CAN_INVERT_P, emit an instruction:
15785 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15787 where <X> is the operation associated with comparison CODE. Otherwise
15788 emit:
15790 (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15791 (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))
15793 where the second instruction sets TARGET to the inverse of TMP. */
15795 static void
15796 aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
15797 machine_mode pred_mode, rtx ptrue, rtx pred,
15798 rtx op0, rtx op1, bool can_invert_p)
15800 if (can_invert_p)
15801 aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
15802 else
15804 rtx tmp = gen_reg_rtx (pred_mode);
15805 aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
15806 emit_set_insn (target, gen_rtx_AND (pred_mode,
15807 gen_rtx_NOT (pred_mode, tmp),
15808 ptrue));
15812 /* Expand an SVE floating-point comparison:
15814 TARGET = CODE (OP0, OP1)
15816 If CAN_INVERT_P is true, the caller can also handle inverted results;
15817 return true if the result is in fact inverted. */
15819 bool
15820 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15821 rtx op0, rtx op1, bool can_invert_p)
15823 machine_mode pred_mode = GET_MODE (target);
15824 machine_mode data_mode = GET_MODE (op0);
15826 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15827 switch (code)
15829 case UNORDERED:
15830 /* UNORDERED has no immediate form. */
15831 op1 = force_reg (data_mode, op1);
15832 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15833 return false;
15835 case LT:
15836 case LE:
15837 case GT:
15838 case GE:
15839 case EQ:
15840 case NE:
15841 /* There is native support for the comparison. */
15842 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15843 return false;
15845 case ORDERED:
15846 /* There is native support for the inverse comparison. */
15847 op1 = force_reg (data_mode, op1);
15848 aarch64_emit_inverted_unspec_cond (target, UNORDERED,
15849 pred_mode, ptrue, ptrue, op0, op1,
15850 can_invert_p);
15851 return can_invert_p;
15853 case LTGT:
15854 /* This is a trapping operation (LT or GT). */
15855 aarch64_emit_unspec_cond_or (target, LT, GT, pred_mode, ptrue, op0, op1);
15856 return false;
15858 case UNEQ:
15859 if (!flag_trapping_math)
15861 /* This would trap for signaling NaNs. */
15862 op1 = force_reg (data_mode, op1);
15863 aarch64_emit_unspec_cond_or (target, UNORDERED, EQ,
15864 pred_mode, ptrue, op0, op1);
15865 return false;
15867 /* fall through */
15869 case UNLT:
15870 case UNLE:
15871 case UNGT:
15872 case UNGE:
15874 rtx ordered = ptrue;
15875 if (flag_trapping_math)
15877 /* Only compare the elements that are known to be ordered. */
15878 ordered = gen_reg_rtx (pred_mode);
15879 op1 = force_reg (data_mode, op1);
15880 aarch64_emit_inverted_unspec_cond (ordered, UNORDERED, pred_mode,
15881 ptrue, ptrue, op0, op1, false);
15883 if (code == UNEQ)
15884 code = NE;
15885 else
15886 code = reverse_condition_maybe_unordered (code);
15887 aarch64_emit_inverted_unspec_cond (target, code, pred_mode, ptrue,
15888 ordered, op0, op1, can_invert_p);
15889 return can_invert_p;
15892 default:
15893 gcc_unreachable ();
15897 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15898 of the data being selected and CMP_MODE is the mode of the values being
15899 compared. */
15901 void
15902 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15903 rtx *ops)
15905 machine_mode pred_mode
15906 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15907 GET_MODE_SIZE (cmp_mode)).require ();
15908 rtx pred = gen_reg_rtx (pred_mode);
15909 if (FLOAT_MODE_P (cmp_mode))
15911 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15912 ops[4], ops[5], true))
15913 std::swap (ops[1], ops[2]);
15915 else
15916 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15918 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15919 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15922 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15923 true. However due to issues with register allocation it is preferable
15924 to avoid tying integer scalar and FP scalar modes. Executing integer
15925 operations in general registers is better than treating them as scalar
15926 vector operations. This reduces latency and avoids redundant int<->FP
15927 moves. So tie modes if they are either the same class, or vector modes
15928 with other vector modes, vector structs or any scalar mode. */
15930 static bool
15931 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15933 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15934 return true;
15936 /* We specifically want to allow elements of "structure" modes to
15937 be tieable to the structure. This more general condition allows
15938 other rarer situations too. The reason we don't extend this to
15939 predicate modes is that there are no predicate structure modes
15940 nor any specific instructions for extracting part of a predicate
15941 register. */
15942 if (aarch64_vector_data_mode_p (mode1)
15943 && aarch64_vector_data_mode_p (mode2))
15944 return true;
15946 /* Also allow any scalar modes with vectors. */
15947 if (aarch64_vector_mode_supported_p (mode1)
15948 || aarch64_vector_mode_supported_p (mode2))
15949 return true;
15951 return false;
15954 /* Return a new RTX holding the result of moving POINTER forward by
15955 AMOUNT bytes. */
15957 static rtx
15958 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15960 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15962 return adjust_automodify_address (pointer, GET_MODE (pointer),
15963 next, amount);
15966 /* Return a new RTX holding the result of moving POINTER forward by the
15967 size of the mode it points to. */
15969 static rtx
15970 aarch64_progress_pointer (rtx pointer)
15972 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15975 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15976 MODE bytes. */
15978 static void
15979 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15980 machine_mode mode)
15982 rtx reg = gen_reg_rtx (mode);
15984 /* "Cast" the pointers to the correct mode. */
15985 *src = adjust_address (*src, mode, 0);
15986 *dst = adjust_address (*dst, mode, 0);
15987 /* Emit the memcpy. */
15988 emit_move_insn (reg, *src);
15989 emit_move_insn (*dst, reg);
15990 /* Move the pointers forward. */
15991 *src = aarch64_progress_pointer (*src);
15992 *dst = aarch64_progress_pointer (*dst);
15995 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15996 we succeed, otherwise return false. */
15998 bool
15999 aarch64_expand_movmem (rtx *operands)
16001 unsigned int n;
16002 rtx dst = operands[0];
16003 rtx src = operands[1];
16004 rtx base;
16005 bool speed_p = !optimize_function_for_size_p (cfun);
16007 /* When optimizing for size, give a better estimate of the length of a
16008 memcpy call, but use the default otherwise. */
16009 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16011 /* We can't do anything smart if the amount to copy is not constant. */
16012 if (!CONST_INT_P (operands[2]))
16013 return false;
16015 n = UINTVAL (operands[2]);
16017 /* Try to keep the number of instructions low. For cases below 16 bytes we
16018 need to make at most two moves. For cases above 16 bytes it will be one
16019 move for each 16 byte chunk, then at most two additional moves. */
16020 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16021 return false;
16023 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16024 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16026 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16027 src = adjust_automodify_address (src, VOIDmode, base, 0);
16029 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16030 1-byte chunk. */
16031 if (n < 4)
16033 if (n >= 2)
16035 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16036 n -= 2;
16039 if (n == 1)
16040 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16042 return true;
16045 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16046 4-byte chunk, partially overlapping with the previously copied chunk. */
16047 if (n < 8)
16049 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16050 n -= 4;
16051 if (n > 0)
16053 int move = n - 4;
16055 src = aarch64_move_pointer (src, move);
16056 dst = aarch64_move_pointer (dst, move);
16057 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16059 return true;
16062 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16063 them, then (if applicable) an 8-byte chunk. */
16064 while (n >= 8)
16066 if (n / 16)
16068 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16069 n -= 16;
16071 else
16073 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16074 n -= 8;
16078 /* Finish the final bytes of the copy. We can always do this in one
16079 instruction. We either copy the exact amount we need, or partially
16080 overlap with the previous chunk we copied and copy 8 bytes. */
16081 if (n == 0)
16082 return true;
16083 else if (n == 1)
16084 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16085 else if (n == 2)
16086 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16087 else if (n == 4)
16088 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16089 else
16091 if (n == 3)
16093 src = aarch64_move_pointer (src, -1);
16094 dst = aarch64_move_pointer (dst, -1);
16095 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16097 else
16099 int move = n - 8;
16101 src = aarch64_move_pointer (src, move);
16102 dst = aarch64_move_pointer (dst, move);
16103 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16107 return true;
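/* Illustrative sketch (annotation, not part of this file): a constant-size
   copy like the one below is a candidate for this inline expansion.  For
   n == 15 the code above emits one 8-byte (DImode) copy followed by a second
   8-byte copy that overlaps the first by one byte, rather than calling
   memcpy; the exact code depends on options.  The name is made up.  */
void
copy15_sketch (char *dst, const char *src)
{
  __builtin_memcpy (dst, src, 15);
}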
16110 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16111 SImode stores. Handle the case when the constant has identical
16112 bottom and top halves. This is beneficial when the two stores can be
16113 merged into an STP and we avoid synthesising potentially expensive
16114 immediates twice. Return true if such a split is possible. */
16116 bool
16117 aarch64_split_dimode_const_store (rtx dst, rtx src)
16119 rtx lo = gen_lowpart (SImode, src);
16120 rtx hi = gen_highpart_mode (SImode, DImode, src);
16122 bool size_p = optimize_function_for_size_p (cfun);
16124 if (!rtx_equal_p (lo, hi))
16125 return false;
16127 unsigned int orig_cost
16128 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16129 unsigned int lo_cost
16130 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16132 /* We want to transform:
16133 MOV x1, 49370
16134 MOVK x1, 0x140, lsl 16
16135 MOVK x1, 0xc0da, lsl 32
16136 MOVK x1, 0x140, lsl 48
16137 STR x1, [x0]
16138 into:
16139 MOV w1, 49370
16140 MOVK w1, 0x140, lsl 16
16141 STP w1, w1, [x0]
16142 So we want to perform this only when we save two instructions
16143 or more. When optimizing for size, however, accept any code size
16144 savings we can. */
16145 if (size_p && orig_cost <= lo_cost)
16146 return false;
16148 if (!size_p
16149 && (orig_cost <= lo_cost + 1))
16150 return false;
16152 rtx mem_lo = adjust_address (dst, SImode, 0);
16153 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16154 return false;
16156 rtx tmp_reg = gen_reg_rtx (SImode);
16157 aarch64_expand_mov_immediate (tmp_reg, lo);
16158 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16159 /* Don't emit an explicit store pair as this may not always be profitable.
16160 Let the sched-fusion logic decide whether to merge them. */
16161 emit_move_insn (mem_lo, tmp_reg);
16162 emit_move_insn (mem_hi, tmp_reg);
16164 return true;
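/* Illustrative sketch (annotation, not part of this file): a DImode store of
   a constant whose 32-bit halves are equal, matching the MOV/MOVK/STP
   example in the comment above (0xc0da == 49370).  Whether the split and the
   STP actually happen depends on costs and options; the name is made up.  */
void
store_repeated_halves_sketch (unsigned long long *p)
{
  *p = 0x0140c0da0140c0daULL;	/* Low half == high half.  */
}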
16167 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16169 static unsigned HOST_WIDE_INT
16170 aarch64_asan_shadow_offset (void)
16172 return (HOST_WIDE_INT_1 << 36);
16175 static rtx
16176 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16177 int code, tree treeop0, tree treeop1)
16179 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16180 rtx op0, op1;
16181 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16182 insn_code icode;
16183 struct expand_operand ops[4];
16185 start_sequence ();
16186 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16188 op_mode = GET_MODE (op0);
16189 if (op_mode == VOIDmode)
16190 op_mode = GET_MODE (op1);
16192 switch (op_mode)
16194 case E_QImode:
16195 case E_HImode:
16196 case E_SImode:
16197 cmp_mode = SImode;
16198 icode = CODE_FOR_cmpsi;
16199 break;
16201 case E_DImode:
16202 cmp_mode = DImode;
16203 icode = CODE_FOR_cmpdi;
16204 break;
16206 case E_SFmode:
16207 cmp_mode = SFmode;
16208 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16209 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16210 break;
16212 case E_DFmode:
16213 cmp_mode = DFmode;
16214 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16215 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16216 break;
16218 default:
16219 end_sequence ();
16220 return NULL_RTX;
16223 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16224 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16225 if (!op0 || !op1)
16227 end_sequence ();
16228 return NULL_RTX;
16230 *prep_seq = get_insns ();
16231 end_sequence ();
16233 create_fixed_operand (&ops[0], op0);
16234 create_fixed_operand (&ops[1], op1);
16236 start_sequence ();
16237 if (!maybe_expand_insn (icode, 2, ops))
16239 end_sequence ();
16240 return NULL_RTX;
16242 *gen_seq = get_insns ();
16243 end_sequence ();
16245 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16246 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16249 static rtx
16250 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16251 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16253 rtx op0, op1, target;
16254 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16255 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16256 insn_code icode;
16257 struct expand_operand ops[6];
16258 int aarch64_cond;
16260 push_to_sequence (*prep_seq);
16261 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16263 op_mode = GET_MODE (op0);
16264 if (op_mode == VOIDmode)
16265 op_mode = GET_MODE (op1);
16267 switch (op_mode)
16269 case E_QImode:
16270 case E_HImode:
16271 case E_SImode:
16272 cmp_mode = SImode;
16273 icode = CODE_FOR_ccmpsi;
16274 break;
16276 case E_DImode:
16277 cmp_mode = DImode;
16278 icode = CODE_FOR_ccmpdi;
16279 break;
16281 case E_SFmode:
16282 cmp_mode = SFmode;
16283 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16284 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16285 break;
16287 case E_DFmode:
16288 cmp_mode = DFmode;
16289 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16290 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16291 break;
16293 default:
16294 end_sequence ();
16295 return NULL_RTX;
16298 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16299 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16300 if (!op0 || !op1)
16302 end_sequence ();
16303 return NULL_RTX;
16305 *prep_seq = get_insns ();
16306 end_sequence ();
16308 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16309 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16311 if (bit_code != AND)
16313 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16314 GET_MODE (XEXP (prev, 0))),
16315 VOIDmode, XEXP (prev, 0), const0_rtx);
16316 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16319 create_fixed_operand (&ops[0], XEXP (prev, 0));
16320 create_fixed_operand (&ops[1], target);
16321 create_fixed_operand (&ops[2], op0);
16322 create_fixed_operand (&ops[3], op1);
16323 create_fixed_operand (&ops[4], prev);
16324 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16326 push_to_sequence (*gen_seq);
16327 if (!maybe_expand_insn (icode, 6, ops))
16329 end_sequence ();
16330 return NULL_RTX;
16333 *gen_seq = get_insns ();
16334 end_sequence ();
16336 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
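/* Illustrative sketch (annotation, not part of this file): a chained
   comparison like the one below is the kind of expression the generic ccmp
   expansion hands to aarch64_gen_ccmp_first/aarch64_gen_ccmp_next, typically
   producing CMP followed by CCMP and CSET; the exact code depends on
   optimization options and the name is made up.  */
int
in_range_sketch (int x, int lo, int hi)
{
  return x >= lo && x <= hi;
}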
16339 #undef TARGET_GEN_CCMP_FIRST
16340 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16342 #undef TARGET_GEN_CCMP_NEXT
16343 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16345 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16346 instruction fusion of some sort. */
16348 static bool
16349 aarch64_macro_fusion_p (void)
16351 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16355 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16356 should be kept together during scheduling. */
16358 static bool
16359 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16361 rtx set_dest;
16362 rtx prev_set = single_set (prev);
16363 rtx curr_set = single_set (curr);
16364 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16365 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16367 if (!aarch64_macro_fusion_p ())
16368 return false;
16370 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16372 /* We are trying to match:
16373 prev (mov) == (set (reg r0) (const_int imm16))
16374 curr (movk) == (set (zero_extract (reg r0)
16375 (const_int 16)
16376 (const_int 16))
16377 (const_int imm16_1)) */
16379 set_dest = SET_DEST (curr_set);
16381 if (GET_CODE (set_dest) == ZERO_EXTRACT
16382 && CONST_INT_P (SET_SRC (curr_set))
16383 && CONST_INT_P (SET_SRC (prev_set))
16384 && CONST_INT_P (XEXP (set_dest, 2))
16385 && INTVAL (XEXP (set_dest, 2)) == 16
16386 && REG_P (XEXP (set_dest, 0))
16387 && REG_P (SET_DEST (prev_set))
16388 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16390 return true;
16394 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16397 /* We're trying to match:
16398 prev (adrp) == (set (reg r1)
16399 (high (symbol_ref ("SYM"))))
16400 curr (add) == (set (reg r0)
16401 (lo_sum (reg r1)
16402 (symbol_ref ("SYM"))))
16403 Note that r0 need not necessarily be the same as r1, especially
16404 during pre-regalloc scheduling. */
16406 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16407 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16409 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16410 && REG_P (XEXP (SET_SRC (curr_set), 0))
16411 && REGNO (XEXP (SET_SRC (curr_set), 0))
16412 == REGNO (SET_DEST (prev_set))
16413 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16414 XEXP (SET_SRC (curr_set), 1)))
16415 return true;
16419 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16422 /* We're trying to match:
16423 prev (movk) == (set (zero_extract (reg r0)
16424 (const_int 16)
16425 (const_int 32))
16426 (const_int imm16_1))
16427 curr (movk) == (set (zero_extract (reg r0)
16428 (const_int 16)
16429 (const_int 48))
16430 (const_int imm16_2)) */
16432 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16433 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16434 && REG_P (XEXP (SET_DEST (prev_set), 0))
16435 && REG_P (XEXP (SET_DEST (curr_set), 0))
16436 && REGNO (XEXP (SET_DEST (prev_set), 0))
16437 == REGNO (XEXP (SET_DEST (curr_set), 0))
16438 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16439 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16440 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16441 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16442 && CONST_INT_P (SET_SRC (prev_set))
16443 && CONST_INT_P (SET_SRC (curr_set)))
16444 return true;
16447 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16449 /* We're trying to match:
16450 prev (adrp) == (set (reg r0)
16451 (high (symbol_ref ("SYM"))))
16452 curr (ldr) == (set (reg r1)
16453 (mem (lo_sum (reg r0)
16454 (symbol_ref ("SYM")))))
16456 curr (ldr) == (set (reg r1)
16457 (zero_extend (mem
16458 (lo_sum (reg r0)
16459 (symbol_ref ("SYM")))))) */
16460 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16461 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16463 rtx curr_src = SET_SRC (curr_set);
16465 if (GET_CODE (curr_src) == ZERO_EXTEND)
16466 curr_src = XEXP (curr_src, 0);
16468 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16469 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16470 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16471 == REGNO (SET_DEST (prev_set))
16472 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16473 XEXP (SET_SRC (prev_set), 0)))
16474 return true;
16478 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16479 && aarch_crypto_can_dual_issue (prev, curr))
16480 return true;
16482 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16483 && any_condjump_p (curr))
16485 enum attr_type prev_type = get_attr_type (prev);
16487 unsigned int condreg1, condreg2;
16488 rtx cc_reg_1;
16489 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16490 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16492 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16493 && prev
16494 && modified_in_p (cc_reg_1, prev))
16496 /* FIXME: this misses some instructions that ThunderX considers simple
16497 arithmetic. Simple shifts are missed here. */
16498 if (prev_type == TYPE_ALUS_SREG
16499 || prev_type == TYPE_ALUS_IMM
16500 || prev_type == TYPE_LOGICS_REG
16501 || prev_type == TYPE_LOGICS_IMM)
16502 return true;
16506 if (prev_set
16507 && curr_set
16508 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16509 && any_condjump_p (curr))
16511 /* We're trying to match:
16512 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16513 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16514 (const_int 0))
16515 (label_ref ("SYM"))
16516 (pc)) */
16517 if (SET_DEST (curr_set) == (pc_rtx)
16518 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16519 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16520 && REG_P (SET_DEST (prev_set))
16521 && REGNO (SET_DEST (prev_set))
16522 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16524 /* Fuse ALU operations followed by conditional branch instruction. */
16525 switch (get_attr_type (prev))
16527 case TYPE_ALU_IMM:
16528 case TYPE_ALU_SREG:
16529 case TYPE_ADC_REG:
16530 case TYPE_ADC_IMM:
16531 case TYPE_ADCS_REG:
16532 case TYPE_ADCS_IMM:
16533 case TYPE_LOGIC_REG:
16534 case TYPE_LOGIC_IMM:
16535 case TYPE_CSEL:
16536 case TYPE_ADR:
16537 case TYPE_MOV_IMM:
16538 case TYPE_SHIFT_REG:
16539 case TYPE_SHIFT_IMM:
16540 case TYPE_BFM:
16541 case TYPE_RBIT:
16542 case TYPE_REV:
16543 case TYPE_EXTEND:
16544 return true;
16546 default:;
16551 return false;
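/* Illustrative sketch (annotation, not part of this file): materializing a
   wide integer constant is normally done with a MOV followed by several
   MOVKs, which is the pattern the MOV_MOVK and MOVK_MOVK fusion checks above
   try to keep adjacent during scheduling; the exact sequence depends on the
   constant and on options, and the name is made up.  */
unsigned long long
wide_constant_sketch (void)
{
  return 0x123456789abcdef0ULL;
}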
16554 /* Return true iff the instruction fusion described by OP is enabled. */
16556 bool
16557 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16559 return (aarch64_tune_params.fusible_ops & op) != 0;
16562 /* If MEM is in the form of [base+offset], extract the two parts
16563 of address and set to BASE and OFFSET, otherwise return false
16564 after clearing BASE and OFFSET. */
16566 bool
16567 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16569 rtx addr;
16571 gcc_assert (MEM_P (mem));
16573 addr = XEXP (mem, 0);
16575 if (REG_P (addr))
16577 *base = addr;
16578 *offset = const0_rtx;
16579 return true;
16582 if (GET_CODE (addr) == PLUS
16583 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16585 *base = XEXP (addr, 0);
16586 *offset = XEXP (addr, 1);
16587 return true;
16590 *base = NULL_RTX;
16591 *offset = NULL_RTX;
16593 return false;
16596 /* Types for scheduling fusion. */
16597 enum sched_fusion_type
16599 SCHED_FUSION_NONE = 0,
16600 SCHED_FUSION_LD_SIGN_EXTEND,
16601 SCHED_FUSION_LD_ZERO_EXTEND,
16602 SCHED_FUSION_LD,
16603 SCHED_FUSION_ST,
16604 SCHED_FUSION_NUM
16607 /* If INSN is a load or store of address in the form of [base+offset],
16608 extract the two parts and store them in BASE and OFFSET. Return the
16609 scheduling fusion type of this INSN. */
16611 static enum sched_fusion_type
16612 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16614 rtx x, dest, src;
16615 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16617 gcc_assert (INSN_P (insn));
16618 x = PATTERN (insn);
16619 if (GET_CODE (x) != SET)
16620 return SCHED_FUSION_NONE;
16622 src = SET_SRC (x);
16623 dest = SET_DEST (x);
16625 machine_mode dest_mode = GET_MODE (dest);
16627 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16628 return SCHED_FUSION_NONE;
16630 if (GET_CODE (src) == SIGN_EXTEND)
16632 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16633 src = XEXP (src, 0);
16634 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16635 return SCHED_FUSION_NONE;
16637 else if (GET_CODE (src) == ZERO_EXTEND)
16639 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16640 src = XEXP (src, 0);
16641 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16642 return SCHED_FUSION_NONE;
16645 if (GET_CODE (src) == MEM && REG_P (dest))
16646 extract_base_offset_in_addr (src, base, offset);
16647 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16649 fusion = SCHED_FUSION_ST;
16650 extract_base_offset_in_addr (dest, base, offset);
16652 else
16653 return SCHED_FUSION_NONE;
16655 if (*base == NULL_RTX || *offset == NULL_RTX)
16656 fusion = SCHED_FUSION_NONE;
16658 return fusion;
16661 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16663 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16664 and PRI are only calculated for these instructions. For other instructions,
16665 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16666 types of instruction fusion can be added by returning different priorities.
16668 It's important that irrelevant instructions get the largest FUSION_PRI. */
16670 static void
16671 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16672 int *fusion_pri, int *pri)
16674 int tmp, off_val;
16675 rtx base, offset;
16676 enum sched_fusion_type fusion;
16678 gcc_assert (INSN_P (insn));
16680 tmp = max_pri - 1;
16681 fusion = fusion_load_store (insn, &base, &offset);
16682 if (fusion == SCHED_FUSION_NONE)
16684 *pri = tmp;
16685 *fusion_pri = tmp;
16686 return;
16689 /* Set FUSION_PRI according to fusion type and base register. */
16690 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16692 /* Calculate PRI. */
16693 tmp /= 2;
16695 /* INSN with smaller offset goes first. */
16696 off_val = (int)(INTVAL (offset));
16697 if (off_val >= 0)
16698 tmp -= (off_val & 0xfffff);
16699 else
16700 tmp += ((- off_val) & 0xfffff);
16702 *pri = tmp;
16703 return;
16706 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16707 Adjust priority of sha1h instructions so they are scheduled before
16708 other SHA1 instructions. */
16710 static int
16711 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16713 rtx x = PATTERN (insn);
16715 if (GET_CODE (x) == SET)
16717 x = SET_SRC (x);
16719 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16720 return priority + 10;
16723 return priority;
16726 /* Given OPERANDS of consecutive load/store, check if we can merge
16727 them into ldp/stp. LOAD is true if they are load instructions.
16728 MODE is the mode of memory operands. */
16730 bool
16731 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16732 machine_mode mode)
16734 HOST_WIDE_INT offval_1, offval_2, msize;
16735 enum reg_class rclass_1, rclass_2;
16736 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16738 if (load)
16740 mem_1 = operands[1];
16741 mem_2 = operands[3];
16742 reg_1 = operands[0];
16743 reg_2 = operands[2];
16744 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16745 if (REGNO (reg_1) == REGNO (reg_2))
16746 return false;
16748 else
16750 mem_1 = operands[0];
16751 mem_2 = operands[2];
16752 reg_1 = operands[1];
16753 reg_2 = operands[3];
16756 /* The mems cannot be volatile. */
16757 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16758 return false;
16760 /* If we have SImode and slow unaligned ldp,
16761 check that the alignment is at least 8 bytes. */
16762 if (mode == SImode
16763 && (aarch64_tune_params.extra_tuning_flags
16764 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16765 && !optimize_size
16766 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16767 return false;
16769 /* Check if the addresses are in the form of [base+offset]. */
16770 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16771 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16772 return false;
16773 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16774 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16775 return false;
16777 /* Check if the bases are same. */
16778 if (!rtx_equal_p (base_1, base_2))
16779 return false;
16781 offval_1 = INTVAL (offset_1);
16782 offval_2 = INTVAL (offset_2);
16783 /* We should only be trying this for fixed-sized modes. There is no
16784 SVE LDP/STP instruction. */
16785 msize = GET_MODE_SIZE (mode).to_constant ();
16786 /* Check if the offsets are consecutive. */
16787 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16788 return false;
16790 /* Check if the addresses are clobbered by load. */
16791 if (load)
16793 if (reg_mentioned_p (reg_1, mem_1))
16794 return false;
16796 /* In increasing order, the last load can clobber the address. */
16797 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16798 return false;
16801 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16802 rclass_1 = FP_REGS;
16803 else
16804 rclass_1 = GENERAL_REGS;
16806 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16807 rclass_2 = FP_REGS;
16808 else
16809 rclass_2 = GENERAL_REGS;
16811 /* Check if the registers are of same class. */
16812 if (rclass_1 != rclass_2)
16813 return false;
16815 return true;
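/* Illustrative sketch (annotation, not part of this file): two loads from
   consecutive offsets off the same base, as below, are the kind of pair the
   check above accepts; the ldp/stp peepholes that use it can then typically
   merge them into a single LDP.  The name is made up and the final code
   depends on options.  */
long
sum_pair_sketch (long *p)
{
  return p[0] + p[1];
}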
16818 /* Given OPERANDS of consecutive load/store, check if we can merge
16819 them into ldp/stp by adjusting the offset. LOAD is true if they
16820 are load instructions. MODE is the mode of memory operands.
16822 Given the consecutive stores below:
16824 str w1, [xb, 0x100]
16825 str w1, [xb, 0x104]
16826 str w1, [xb, 0x108]
16827 str w1, [xb, 0x10c]
16829 Though the offsets are out of the range supported by stp, we can
16830 still pair them after adjusting the offset, like:
16832 add scratch, xb, 0x100
16833 stp w1, w1, [scratch]
16834 stp w1, w1, [scratch, 0x8]
16836 The peephole patterns detecting this opportunity should guarantee
16837 the scratch register is available. */
16839 bool
16840 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16841 scalar_mode mode)
16843 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16844 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16845 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16846 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16848 if (load)
16850 reg_1 = operands[0];
16851 mem_1 = operands[1];
16852 reg_2 = operands[2];
16853 mem_2 = operands[3];
16854 reg_3 = operands[4];
16855 mem_3 = operands[5];
16856 reg_4 = operands[6];
16857 mem_4 = operands[7];
16858 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16859 && REG_P (reg_3) && REG_P (reg_4));
16860 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16861 return false;
16863 else
16865 mem_1 = operands[0];
16866 reg_1 = operands[1];
16867 mem_2 = operands[2];
16868 reg_2 = operands[3];
16869 mem_3 = operands[4];
16870 reg_3 = operands[5];
16871 mem_4 = operands[6];
16872 reg_4 = operands[7];
16874 /* Skip if the memory operand is by itself valid for ldp/stp. */
16875 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16876 return false;
16878 /* The mems cannot be volatile. */
16879 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16880 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
16881 return false;
16883 /* Check if the addresses are in the form of [base+offset]. */
16884 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16885 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16886 return false;
16887 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16888 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16889 return false;
16890 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16891 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16892 return false;
16893 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16894 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16895 return false;
16897 /* Check if the bases are same. */
16898 if (!rtx_equal_p (base_1, base_2)
16899 || !rtx_equal_p (base_2, base_3)
16900 || !rtx_equal_p (base_3, base_4))
16901 return false;
16903 offval_1 = INTVAL (offset_1);
16904 offval_2 = INTVAL (offset_2);
16905 offval_3 = INTVAL (offset_3);
16906 offval_4 = INTVAL (offset_4);
16907 msize = GET_MODE_SIZE (mode);
16908 /* Check if the offsets are consecutive. */
16909 if ((offval_1 != (offval_2 + msize)
16910 || offval_1 != (offval_3 + msize * 2)
16911 || offval_1 != (offval_4 + msize * 3))
16912 && (offval_4 != (offval_3 + msize)
16913 || offval_4 != (offval_2 + msize * 2)
16914 || offval_4 != (offval_1 + msize * 3)))
16915 return false;
16917 /* Check if the addresses are clobbered by load. */
16918 if (load)
16920 if (reg_mentioned_p (reg_1, mem_1)
16921 || reg_mentioned_p (reg_2, mem_2)
16922 || reg_mentioned_p (reg_3, mem_3))
16923 return false;
16925 /* In increasing order, the last load can clobber the address. */
16926 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
16927 return false;
16930 /* If we have SImode and slow unaligned ldp,
16931 check that the alignment is at least 8 bytes. */
16932 if (mode == SImode
16933 && (aarch64_tune_params.extra_tuning_flags
16934 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16935 && !optimize_size
16936 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16937 return false;
16939 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16940 rclass_1 = FP_REGS;
16941 else
16942 rclass_1 = GENERAL_REGS;
16944 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16945 rclass_2 = FP_REGS;
16946 else
16947 rclass_2 = GENERAL_REGS;
16949 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
16950 rclass_3 = FP_REGS;
16951 else
16952 rclass_3 = GENERAL_REGS;
16954 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
16955 rclass_4 = FP_REGS;
16956 else
16957 rclass_4 = GENERAL_REGS;
16959 /* Check if the registers are of same class. */
16960 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
16961 return false;
16963 return true;
16966 /* Given OPERANDS of consecutive load/store, this function pairs them
16967 into ldp/stp after adjusting the offset. It depends on the fact
16968 that addresses of load/store instructions are in increasing order.
16969 MODE is the mode of memory operands. CODE is the rtl operator
16970 which should be applied to all memory operands, it's SIGN_EXTEND,
16971 ZERO_EXTEND or UNKNOWN. */
16973 bool
16974 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
16975 scalar_mode mode, RTX_CODE code)
16977 rtx base, offset, t1, t2;
16978 rtx mem_1, mem_2, mem_3, mem_4;
16979 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
16981 if (load)
16983 mem_1 = operands[1];
16984 mem_2 = operands[3];
16985 mem_3 = operands[5];
16986 mem_4 = operands[7];
16988 else
16990 mem_1 = operands[0];
16991 mem_2 = operands[2];
16992 mem_3 = operands[4];
16993 mem_4 = operands[6];
16994 gcc_assert (code == UNKNOWN);
16997 extract_base_offset_in_addr (mem_1, &base, &offset);
16998 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
17000 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
17001 msize = GET_MODE_SIZE (mode);
17002 stp_off_limit = msize * 0x40;
17003 off_val = INTVAL (offset);
17004 abs_off = (off_val < 0) ? -off_val : off_val;
17005 new_off = abs_off % stp_off_limit;
17006 adj_off = abs_off - new_off;
17008 /* Further adjust to make sure all offsets are OK. */
17009 if ((new_off + msize * 2) >= stp_off_limit)
17011 adj_off += stp_off_limit;
17012 new_off -= stp_off_limit;
17015 /* Make sure the adjustment can be done with ADD/SUB instructions. */
17016 if (adj_off >= 0x1000)
17017 return false;
17019 if (off_val < 0)
17021 adj_off = -adj_off;
17022 new_off = -new_off;
17025 /* Create new memory references. */
17026 mem_1 = change_address (mem_1, VOIDmode,
17027 plus_constant (DImode, operands[8], new_off));
17029 /* Check if the adjusted address is OK for ldp/stp. */
17030 if (!aarch64_mem_pair_operand (mem_1, mode))
17031 return false;
17033 msize = GET_MODE_SIZE (mode);
17034 mem_2 = change_address (mem_2, VOIDmode,
17035 plus_constant (DImode,
17036 operands[8],
17037 new_off + msize));
17038 mem_3 = change_address (mem_3, VOIDmode,
17039 plus_constant (DImode,
17040 operands[8],
17041 new_off + msize * 2));
17042 mem_4 = change_address (mem_4, VOIDmode,
17043 plus_constant (DImode,
17044 operands[8],
17045 new_off + msize * 3));
17047 if (code == ZERO_EXTEND)
17049 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17050 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17051 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17052 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17054 else if (code == SIGN_EXTEND)
17056 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17057 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17058 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17059 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17062 if (load)
17064 operands[1] = mem_1;
17065 operands[3] = mem_2;
17066 operands[5] = mem_3;
17067 operands[7] = mem_4;
17069 else
17071 operands[0] = mem_1;
17072 operands[2] = mem_2;
17073 operands[4] = mem_3;
17074 operands[6] = mem_4;
17077 /* Emit adjusting instruction. */
17078 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
17079 /* Emit ldp/stp instructions. */
17080 t1 = gen_rtx_SET (operands[0], operands[1]);
17081 t2 = gen_rtx_SET (operands[2], operands[3]);
17082 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17083 t1 = gen_rtx_SET (operands[4], operands[5]);
17084 t2 = gen_rtx_SET (operands[6], operands[7]);
17085 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17086 return true;
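/* Worked example (annotation, not part of the original source): for the
   SImode "str w1, [xb, 0x100]" sequence shown before
   aarch64_operands_adjust_ok_for_ldpstp, msize == 4 gives
   stp_off_limit == 0x100, so off_val == 0x100 yields new_off == 0 and
   adj_off == 0x100: one ADD of 0x100 to the base register and pair offsets
   0 and 8, matching the "add scratch, xb, 0x100" expansion in that
   comment.  */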
17089 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17090 it isn't worth branching around empty masked ops (including masked
17091 stores). */
17093 static bool
17094 aarch64_empty_mask_is_expensive (unsigned)
17096 return false;
17099 /* Return 1 if pseudo register should be created and used to hold
17100 GOT address for PIC code. */
17102 bool
17103 aarch64_use_pseudo_pic_reg (void)
17105 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17108 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17110 static int
17111 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17113 switch (XINT (x, 1))
17115 case UNSPEC_GOTSMALLPIC:
17116 case UNSPEC_GOTSMALLPIC28K:
17117 case UNSPEC_GOTTINYPIC:
17118 return 0;
17119 default:
17120 break;
17123 return default_unspec_may_trap_p (x, flags);
17127 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17128 return the log2 of that value. Otherwise return -1. */
17130 int
17131 aarch64_fpconst_pow_of_2 (rtx x)
17133 const REAL_VALUE_TYPE *r;
17135 if (!CONST_DOUBLE_P (x))
17136 return -1;
17138 r = CONST_DOUBLE_REAL_VALUE (x);
17140 if (REAL_VALUE_NEGATIVE (*r)
17141 || REAL_VALUE_ISNAN (*r)
17142 || REAL_VALUE_ISINF (*r)
17143 || !real_isinteger (r, DFmode))
17144 return -1;
17146 return exact_log2 (real_to_integer (r));
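/* Worked examples (annotation, not part of the original source):
   aarch64_fpconst_pow_of_2 returns 3 for 8.0 and 0 for 1.0, but -1 for 0.5,
   -4.0, 3.0, NaN and infinity.  */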
17149 /* If X is a vector of equal CONST_DOUBLE values and that value is
17150 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17152 int
17153 aarch64_vec_fpconst_pow_of_2 (rtx x)
17155 int nelts;
17156 if (GET_CODE (x) != CONST_VECTOR
17157 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17158 return -1;
17160 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17161 return -1;
17163 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17164 if (firstval <= 0)
17165 return -1;
17167 for (int i = 1; i < nelts; i++)
17168 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17169 return -1;
17171 return firstval;
17174 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17175 to float.
17177 __fp16 always promotes through this hook.
17178 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17179 through the generic excess precision logic rather than here. */
17181 static tree
17182 aarch64_promoted_type (const_tree t)
17184 if (SCALAR_FLOAT_TYPE_P (t)
17185 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17186 return float_type_node;
17188 return NULL_TREE;
17191 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17193 static bool
17194 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17195 optimization_type opt_type)
17197 switch (op)
17199 case rsqrt_optab:
17200 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17202 default:
17203 return true;
17207 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17209 static unsigned int
17210 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17211 int *offset)
17213 /* Polynomial invariant 1 == (VG / 2) - 1. */
17214 gcc_assert (i == 1);
17215 *factor = 2;
17216 *offset = 1;
17217 return AARCH64_DWARF_VG;
17220 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17221 if MODE is HFmode, and punt to the generic implementation otherwise. */
17223 static bool
17224 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17226 return (mode == HFmode
17227 ? true
17228 : default_libgcc_floating_mode_supported_p (mode));
17231 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17232 if MODE is HFmode, and punt to the generic implementation otherwise. */
17234 static bool
17235 aarch64_scalar_mode_supported_p (scalar_mode mode)
17237 return (mode == HFmode
17238 ? true
17239 : default_scalar_mode_supported_p (mode));
17242 /* Set the value of FLT_EVAL_METHOD.
17243 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17245 0: evaluate all operations and constants, whose semantic type has at
17246 most the range and precision of type float, to the range and
17247 precision of float; evaluate all other operations and constants to
17248 the range and precision of the semantic type;
17250 N, where _FloatN is a supported interchange floating type
17251 evaluate all operations and constants, whose semantic type has at
17252 most the range and precision of _FloatN type, to the range and
17253 precision of the _FloatN type; evaluate all other operations and
17254 constants to the range and precision of the semantic type;
17256 If we have the ARMv8.2-A extensions then we support _Float16 in native
17257 precision, so we should set this to 16. Otherwise, we support the type,
17258 but want to evaluate expressions in float precision, so set this to
17259 0. */
17261 static enum flt_eval_method
17262 aarch64_excess_precision (enum excess_precision_type type)
17264 switch (type)
17266 case EXCESS_PRECISION_TYPE_FAST:
17267 case EXCESS_PRECISION_TYPE_STANDARD:
17268 /* We can calculate either in 16-bit range and precision or
17269 32-bit range and precision. Make that decision based on whether
17270 we have native support for the ARMv8.2-A 16-bit floating-point
17271 instructions or not. */
17272 return (TARGET_FP_F16INST
17273 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17274 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17275 case EXCESS_PRECISION_TYPE_IMPLICIT:
17276 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17277 default:
17278 gcc_unreachable ();
17280 return FLT_EVAL_METHOD_UNPREDICTABLE;
17283 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17284 scheduled for speculative execution. Reject the long-running division
17285 and square-root instructions. */
17287 static bool
17288 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17290 switch (get_attr_type (insn))
17292 case TYPE_SDIV:
17293 case TYPE_UDIV:
17294 case TYPE_FDIVS:
17295 case TYPE_FDIVD:
17296 case TYPE_FSQRTS:
17297 case TYPE_FSQRTD:
17298 case TYPE_NEON_FP_SQRT_S:
17299 case TYPE_NEON_FP_SQRT_D:
17300 case TYPE_NEON_FP_SQRT_S_Q:
17301 case TYPE_NEON_FP_SQRT_D_Q:
17302 case TYPE_NEON_FP_DIV_S:
17303 case TYPE_NEON_FP_DIV_D:
17304 case TYPE_NEON_FP_DIV_S_Q:
17305 case TYPE_NEON_FP_DIV_D_Q:
17306 return false;
17307 default:
17308 return true;

/* Implement TARGET_COMPUTE_PRESSURE_CLASSES.  */

static int
aarch64_compute_pressure_classes (reg_class *classes)
{
  int i = 0;
  classes[i++] = GENERAL_REGS;
  classes[i++] = FP_REGS;
  /* PR_REGS isn't a useful pressure class because many predicate pseudo
     registers need to go in PR_LO_REGS at some point during their
     lifetime.  Splitting it into two halves has the effect of making
     all predicates count against PR_LO_REGS, so that we try whenever
     possible to restrict the number of live predicates to 8.  This
     greatly reduces the amount of spilling in certain loops.  */
  classes[i++] = PR_LO_REGS;
  classes[i++] = PR_HI_REGS;
  return i;
}

/* Implement TARGET_CAN_CHANGE_MODE_CLASS.  */

static bool
aarch64_can_change_mode_class (machine_mode from,
			       machine_mode to, reg_class_t)
{
  if (BYTES_BIG_ENDIAN)
    {
      bool from_sve_p = aarch64_sve_data_mode_p (from);
      bool to_sve_p = aarch64_sve_data_mode_p (to);

      /* Don't allow changes between SVE data modes and non-SVE modes.
	 See the comment at the head of aarch64-sve.md for details.  */
      if (from_sve_p != to_sve_p)
	return false;

      /* Don't allow changes in element size: lane 0 of the new vector
	 would not then be lane 0 of the old vector.  See the comment
	 above aarch64_maybe_expand_sve_subreg_move for a more detailed
	 description.

	 In the worst case, this forces a register to be spilled in
	 one mode and reloaded in the other, which handles the
	 endianness correctly.  */
      if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
	return false;
    }
  return true;
}
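
/* As a concrete illustration of the element-size restriction above
   (a sketch; the modes are chosen for exposition only): on big-endian,
   a mode change such as

       (subreg:VNx8HI (reg:VNx4SI ...) 0)

   halves the element size from 4 bytes to 2, so the value that lane 0
   of the VNx8HI view must hold is not what lane 0 of the VNx4SI value
   contains.  Returning false here instead forces the register to be
   spilled in one mode and reloaded in the other, which produces the
   correct lanes, as noted in the comment above.  */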

/* Implement TARGET_SELECT_EARLY_REMAT_MODES.  */

static void
aarch64_select_early_remat_modes (sbitmap modes)
{
  /* SVE values are not normally live across a call, so it should be
     worth doing early rematerialization even in VL-specific mode.  */
  for (int i = 0; i < NUM_MACHINE_MODES; ++i)
    {
      machine_mode mode = (machine_mode) i;
      unsigned int vec_flags = aarch64_classify_vector_mode (mode);
      if (vec_flags & VEC_ANY_SVE)
	bitmap_set_bit (modes, i);
    }
}

/* Target-specific selftests.  */

#if CHECKING_P

namespace selftest {

/* Selftest for the RTL loader.
   Verify that the RTL loader copes with a dump from
   print_rtx_function.  This is essentially just a test that class
   function_reader can handle a real dump, but it also verifies
   that lookup_reg_by_dump_name correctly handles hard regs.
   The presence of hard reg names in the dump means that the test is
   target-specific, hence it is in this file.  */

static void
aarch64_test_loading_full_dump ()
{
  rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));

  ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));

  rtx_insn *insn_1 = get_insn_by_uid (1);
  ASSERT_EQ (NOTE, GET_CODE (insn_1));

  rtx_insn *insn_15 = get_insn_by_uid (15);
  ASSERT_EQ (INSN, GET_CODE (insn_15));
  ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));

  /* Verify crtl->return_rtx.  */
  ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
  ASSERT_EQ (0, REGNO (crtl->return_rtx));
  ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
}

/* Run all target-specific selftests.  */

static void
aarch64_run_selftests (void)
{
  aarch64_test_loading_full_dump ();
}

} // namespace selftest

#endif /* #if CHECKING_P */

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_C_EXCESS_PRECISION
#define TARGET_C_EXCESS_PRECISION aarch64_excess_precision

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_ARG_PADDING
#define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding

#undef TARGET_GET_RAW_RESULT_MODE
#define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
#undef TARGET_GET_RAW_ARG_MODE
#define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
#define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
  aarch64_legitimize_address_displacement

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
#define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
  aarch64_libgcc_floating_mode_supported_p

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE
#define TARGET_ARRAY_MODE aarch64_array_mode

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
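
/* A sketch of how these limits relate to the addressing modes (illustrative
   assembly only, not something this file emits directly): once an anchor
   base has been materialised, e.g.

       adrp    x0, anchor
       add     x0, x0, :lo12:anchor
       ldrb    w1, [x0, #4095]   // unsigned 12-bit byte offset, 0..4095
       ldur    w2, [x0, #-256]   // signed 9-bit unscaled offset

   the unsigned scaled immediate form gives 0..4095 for byte-sized accesses
   (wider accesses allow larger scaled offsets, which we cannot assume
   without knowing the access size), and the signed unscaled form reaches
   down to -256, which is presumably where TARGET_MIN_ANCHOR_OFFSET above
   comes from.  */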

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
#undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
#define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
  aarch64_empty_mask_is_expensive

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#undef TARGET_SELECT_EARLY_REMAT_MODES
#define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"