[AArch64] Add SVE support
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
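/* Illustrative examples (editorial sketch, not part of the original source):
   typical assembly forms for each class, assuming standard AArch64 syntax:

     ADDRESS_REG_IMM    ldr  x0, [x1, #16]
     ADDRESS_REG_WB     ldr  x0, [x1, #16]!     or   ldr  x0, [x1], #16
     ADDRESS_REG_REG    ldr  x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr  x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr  x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr  x0, [x1, #:lo12:foo]
     ADDRESS_SYMBOLIC   ldr  x0, .Lliteral_pool_entry  */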
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
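/* Illustrative usage (editorial note, not part of the original source),
   based on the constructors defined below: an Advanced SIMD V4SI constant
   { 3, 3, 3, 3 } could be summarized as simd_immediate_info (SImode, 3),
   while an SVE "index" series { 0, 1, 2, ... } could be summarized as
   simd_immediate_info (SImode, const0_rtx, const1_rtx).  */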
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Support for command line parsing of boolean flags in the tuning
224 structures. */
225 struct aarch64_flag_desc
227 const char* name;
228 unsigned int flag;
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
235 { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL },
238 { NULL, AARCH64_FUSE_NOTHING }
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
245 { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL },
248 { NULL, AARCH64_EXTRA_TUNE_NONE }
251 /* Tuning parameters. */
253 static const struct cpu_addrcost_table generic_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
288 1, /* hi */
289 0, /* si */
290 0, /* di */
291 1, /* ti */
293 1, /* pre_modify */
294 0, /* post_modify */
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
298 0, /* imm_offset */
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
304 1, /* hi */
305 1, /* si */
306 1, /* di */
307 2, /* ti */
309 0, /* pre_modify */
310 0, /* post_modify */
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
314 0, /* imm_offset */
317 static const struct cpu_regmove_cost generic_regmove_cost =
319 1, /* GP2GP */
320 /* Avoid the use of slow int<->fp moves for spilling by setting
321 their cost higher than memmov_cost. */
322 5, /* GP2FP */
323 5, /* FP2GP */
324 2 /* FP2FP */
327 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of slow int<->fp moves for spilling by setting
331 their cost higher than memmov_cost. */
332 5, /* GP2FP */
333 5, /* FP2GP */
334 2 /* FP2FP */
337 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost (actual, 4 and 9). */
352 9, /* GP2FP */
353 9, /* FP2GP */
354 1 /* FP2FP */
357 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 2, /* GP2GP */
360 2, /* GP2FP */
361 6, /* FP2GP */
362 4 /* FP2FP */
365 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 8, /* GP2FP */
371 8, /* FP2GP */
372 2 /* FP2FP */
375 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 2, /* GP2GP */
378 /* Avoid the use of int<->fp moves for spilling. */
379 6, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of int<->fp moves for spilling. */
388 8, /* GP2FP */
389 8, /* FP2GP */
390 4 /* FP2FP */
393 /* Generic costs for vector insn classes. */
394 static const struct cpu_vector_cost generic_vector_cost =
396 1, /* scalar_int_stmt_cost */
397 1, /* scalar_fp_stmt_cost */
398 1, /* scalar_load_cost */
399 1, /* scalar_store_cost */
400 1, /* vec_int_stmt_cost */
401 1, /* vec_fp_stmt_cost */
402 2, /* vec_permute_cost */
403 1, /* vec_to_scalar_cost */
404 1, /* scalar_to_vec_cost */
405 1, /* vec_align_load_cost */
406 1, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 3, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 /* ThunderX costs for vector insn classes. */
414 static const struct cpu_vector_cost thunderx_vector_cost =
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 3, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 4, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 4, /* vec_permute_cost */
423 2, /* vec_to_scalar_cost */
424 2, /* scalar_to_vec_cost */
425 3, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 5, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 3 /* cond_not_taken_branch_cost */
433 /* Generic costs for vector insn classes. */
434 static const struct cpu_vector_cost cortexa57_vector_cost =
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 4, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 2, /* vec_int_stmt_cost */
441 2, /* vec_fp_stmt_cost */
442 3, /* vec_permute_cost */
443 8, /* vec_to_scalar_cost */
444 8, /* scalar_to_vec_cost */
445 4, /* vec_align_load_cost */
446 4, /* vec_unalign_load_cost */
447 1, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 1, /* cond_taken_branch_cost */
450 1 /* cond_not_taken_branch_cost */
453 static const struct cpu_vector_cost exynosm1_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 5, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 3, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 3, /* vec_to_scalar_cost */
463 3, /* scalar_to_vec_cost */
464 5, /* vec_align_load_cost */
465 5, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* Generic costs for vector insn classes. */
473 static const struct cpu_vector_cost xgene1_vector_cost =
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 2, /* vec_int_stmt_cost */
480 2, /* vec_fp_stmt_cost */
481 2, /* vec_permute_cost */
482 4, /* vec_to_scalar_cost */
483 4, /* scalar_to_vec_cost */
484 10, /* vec_align_load_cost */
485 10, /* vec_unalign_load_cost */
486 2, /* vec_unalign_store_cost */
487 2, /* vec_store_cost */
488 2, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Costs for vector insn classes for Vulcan. */
493 static const struct cpu_vector_cost thunderx2t99_vector_cost =
495 1, /* scalar_int_stmt_cost */
496 6, /* scalar_fp_stmt_cost */
497 4, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 5, /* vec_int_stmt_cost */
500 6, /* vec_fp_stmt_cost */
501 3, /* vec_permute_cost */
502 6, /* vec_to_scalar_cost */
503 5, /* scalar_to_vec_cost */
504 8, /* vec_align_load_cost */
505 8, /* vec_unalign_load_cost */
506 4, /* vec_unalign_store_cost */
507 4, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Generic costs for branch instructions. */
513 static const struct cpu_branch_cost generic_branch_cost =
515 1, /* Predictable. */
516 3 /* Unpredictable. */
519 /* Generic approximation modes. */
520 static const cpu_approx_modes generic_approx_modes =
522 AARCH64_APPROX_NONE, /* division */
523 AARCH64_APPROX_NONE, /* sqrt */
524 AARCH64_APPROX_NONE /* recip_sqrt */
527 /* Approximation modes for Exynos M1. */
528 static const cpu_approx_modes exynosm1_approx_modes =
530 AARCH64_APPROX_NONE, /* division */
531 AARCH64_APPROX_ALL, /* sqrt */
532 AARCH64_APPROX_ALL /* recip_sqrt */
535 /* Approximation modes for X-Gene 1. */
536 static const cpu_approx_modes xgene1_approx_modes =
538 AARCH64_APPROX_NONE, /* division */
539 AARCH64_APPROX_NONE, /* sqrt */
540 AARCH64_APPROX_ALL /* recip_sqrt */
543 /* Generic prefetch settings (which disable prefetch). */
544 static const cpu_prefetch_tune generic_prefetch_tune =
546 0, /* num_slots */
547 -1, /* l1_cache_size */
548 -1, /* l1_cache_line_size */
549 -1, /* l2_cache_size */
550 -1 /* default_opt_level */
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 -1 /* default_opt_level */
562 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
564 4, /* num_slots */
565 32, /* l1_cache_size */
566 64, /* l1_cache_line_size */
567 1024, /* l2_cache_size */
568 -1 /* default_opt_level */
571 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
573 8, /* num_slots */
574 32, /* l1_cache_size */
575 128, /* l1_cache_line_size */
576 16*1024, /* l2_cache_size */
577 3 /* default_opt_level */
580 static const cpu_prefetch_tune thunderx_prefetch_tune =
582 8, /* num_slots */
583 32, /* l1_cache_size */
584 128, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
591 8, /* num_slots */
592 32, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 256, /* l2_cache_size */
595 -1 /* default_opt_level */
598 static const struct tune_params generic_tunings =
600 &cortexa57_extra_costs,
601 &generic_addrcost_table,
602 &generic_regmove_cost,
603 &generic_vector_cost,
604 &generic_branch_cost,
605 &generic_approx_modes,
606 4, /* memmov_cost */
607 2, /* issue_rate */
608 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
609 8, /* function_align. */
610 4, /* jump_align. */
611 8, /* loop_align. */
612 2, /* int_reassoc_width. */
613 4, /* fp_reassoc_width. */
614 1, /* vec_reassoc_width. */
615 2, /* min_div_recip_mul_sf. */
616 2, /* min_div_recip_mul_df. */
617 0, /* max_case_values. */
618 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
619 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
620 &generic_prefetch_tune
623 static const struct tune_params cortexa35_tunings =
625 &cortexa53_extra_costs,
626 &generic_addrcost_table,
627 &cortexa53_regmove_cost,
628 &generic_vector_cost,
629 &generic_branch_cost,
630 &generic_approx_modes,
631 4, /* memmov_cost */
632 1, /* issue_rate */
633 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
634 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
635 16, /* function_align. */
636 4, /* jump_align. */
637 8, /* loop_align. */
638 2, /* int_reassoc_width. */
639 4, /* fp_reassoc_width. */
640 1, /* vec_reassoc_width. */
641 2, /* min_div_recip_mul_sf. */
642 2, /* min_div_recip_mul_df. */
643 0, /* max_case_values. */
644 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
645 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
646 &generic_prefetch_tune
649 static const struct tune_params cortexa53_tunings =
651 &cortexa53_extra_costs,
652 &generic_addrcost_table,
653 &cortexa53_regmove_cost,
654 &generic_vector_cost,
655 &generic_branch_cost,
656 &generic_approx_modes,
657 4, /* memmov_cost */
658 2, /* issue_rate */
659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
660 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
661 16, /* function_align. */
662 4, /* jump_align. */
663 8, /* loop_align. */
664 2, /* int_reassoc_width. */
665 4, /* fp_reassoc_width. */
666 1, /* vec_reassoc_width. */
667 2, /* min_div_recip_mul_sf. */
668 2, /* min_div_recip_mul_df. */
669 0, /* max_case_values. */
670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
671 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
672 &generic_prefetch_tune
675 static const struct tune_params cortexa57_tunings =
677 &cortexa57_extra_costs,
678 &generic_addrcost_table,
679 &cortexa57_regmove_cost,
680 &cortexa57_vector_cost,
681 &generic_branch_cost,
682 &generic_approx_modes,
683 4, /* memmov_cost */
684 3, /* issue_rate */
685 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
686 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
687 16, /* function_align. */
688 4, /* jump_align. */
689 8, /* loop_align. */
690 2, /* int_reassoc_width. */
691 4, /* fp_reassoc_width. */
692 1, /* vec_reassoc_width. */
693 2, /* min_div_recip_mul_sf. */
694 2, /* min_div_recip_mul_df. */
695 0, /* max_case_values. */
696 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
697 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
698 &generic_prefetch_tune
701 static const struct tune_params cortexa72_tunings =
703 &cortexa57_extra_costs,
704 &generic_addrcost_table,
705 &cortexa57_regmove_cost,
706 &cortexa57_vector_cost,
707 &generic_branch_cost,
708 &generic_approx_modes,
709 4, /* memmov_cost */
710 3, /* issue_rate */
711 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
712 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
713 16, /* function_align. */
714 4, /* jump_align. */
715 8, /* loop_align. */
716 2, /* int_reassoc_width. */
717 4, /* fp_reassoc_width. */
718 1, /* vec_reassoc_width. */
719 2, /* min_div_recip_mul_sf. */
720 2, /* min_div_recip_mul_df. */
721 0, /* max_case_values. */
722 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
723 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
724 &generic_prefetch_tune
727 static const struct tune_params cortexa73_tunings =
729 &cortexa57_extra_costs,
730 &generic_addrcost_table,
731 &cortexa57_regmove_cost,
732 &cortexa57_vector_cost,
733 &generic_branch_cost,
734 &generic_approx_modes,
735 4, /* memmov_cost. */
736 2, /* issue_rate. */
737 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
738 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
739 16, /* function_align. */
740 4, /* jump_align. */
741 8, /* loop_align. */
742 2, /* int_reassoc_width. */
743 4, /* fp_reassoc_width. */
744 1, /* vec_reassoc_width. */
745 2, /* min_div_recip_mul_sf. */
746 2, /* min_div_recip_mul_df. */
747 0, /* max_case_values. */
748 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
749 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
750 &generic_prefetch_tune
755 static const struct tune_params exynosm1_tunings =
757 &exynosm1_extra_costs,
758 &exynosm1_addrcost_table,
759 &exynosm1_regmove_cost,
760 &exynosm1_vector_cost,
761 &generic_branch_cost,
762 &exynosm1_approx_modes,
763 4, /* memmov_cost */
764 3, /* issue_rate */
765 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
766 4, /* function_align. */
767 4, /* jump_align. */
768 4, /* loop_align. */
769 2, /* int_reassoc_width. */
770 4, /* fp_reassoc_width. */
771 1, /* vec_reassoc_width. */
772 2, /* min_div_recip_mul_sf. */
773 2, /* min_div_recip_mul_df. */
774 48, /* max_case_values. */
775 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
776 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
777 &exynosm1_prefetch_tune
780 static const struct tune_params thunderxt88_tunings =
782 &thunderx_extra_costs,
783 &generic_addrcost_table,
784 &thunderx_regmove_cost,
785 &thunderx_vector_cost,
786 &generic_branch_cost,
787 &generic_approx_modes,
788 6, /* memmov_cost */
789 2, /* issue_rate */
790 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
791 8, /* function_align. */
792 8, /* jump_align. */
793 8, /* loop_align. */
794 2, /* int_reassoc_width. */
795 4, /* fp_reassoc_width. */
796 1, /* vec_reassoc_width. */
797 2, /* min_div_recip_mul_sf. */
798 2, /* min_div_recip_mul_df. */
799 0, /* max_case_values. */
800 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
801 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
802 &thunderxt88_prefetch_tune
805 static const struct tune_params thunderx_tunings =
807 &thunderx_extra_costs,
808 &generic_addrcost_table,
809 &thunderx_regmove_cost,
810 &thunderx_vector_cost,
811 &generic_branch_cost,
812 &generic_approx_modes,
813 6, /* memmov_cost */
814 2, /* issue_rate */
815 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
816 8, /* function_align. */
817 8, /* jump_align. */
818 8, /* loop_align. */
819 2, /* int_reassoc_width. */
820 4, /* fp_reassoc_width. */
821 1, /* vec_reassoc_width. */
822 2, /* min_div_recip_mul_sf. */
823 2, /* min_div_recip_mul_df. */
824 0, /* max_case_values. */
825 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
826 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
827 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
828 &thunderx_prefetch_tune
831 static const struct tune_params xgene1_tunings =
833 &xgene1_extra_costs,
834 &xgene1_addrcost_table,
835 &xgene1_regmove_cost,
836 &xgene1_vector_cost,
837 &generic_branch_cost,
838 &xgene1_approx_modes,
839 6, /* memmov_cost */
840 4, /* issue_rate */
841 AARCH64_FUSE_NOTHING, /* fusible_ops */
842 16, /* function_align. */
843 8, /* jump_align. */
844 16, /* loop_align. */
845 2, /* int_reassoc_width. */
846 4, /* fp_reassoc_width. */
847 1, /* vec_reassoc_width. */
848 2, /* min_div_recip_mul_sf. */
849 2, /* min_div_recip_mul_df. */
850 0, /* max_case_values. */
851 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
853 &generic_prefetch_tune
856 static const struct tune_params qdf24xx_tunings =
858 &qdf24xx_extra_costs,
859 &generic_addrcost_table,
860 &qdf24xx_regmove_cost,
861 &generic_vector_cost,
862 &generic_branch_cost,
863 &generic_approx_modes,
864 4, /* memmov_cost */
865 4, /* issue_rate */
866 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
867        | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
868 16, /* function_align. */
869 8, /* jump_align. */
870 16, /* loop_align. */
871 2, /* int_reassoc_width. */
872 4, /* fp_reassoc_width. */
873 1, /* vec_reassoc_width. */
874 2, /* min_div_recip_mul_sf. */
875 2, /* min_div_recip_mul_df. */
876 0, /* max_case_values. */
877 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
878 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
879 &qdf24xx_prefetch_tune
882 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
883 for now. */
884 static const struct tune_params saphira_tunings =
886 &generic_extra_costs,
887 &generic_addrcost_table,
888 &generic_regmove_cost,
889 &generic_vector_cost,
890 &generic_branch_cost,
891 &generic_approx_modes,
892 4, /* memmov_cost */
893 4, /* issue_rate */
894 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
895        | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
896 16, /* function_align. */
897 8, /* jump_align. */
898 16, /* loop_align. */
899 2, /* int_reassoc_width. */
900 4, /* fp_reassoc_width. */
901 1, /* vec_reassoc_width. */
902 2, /* min_div_recip_mul_sf. */
903 2, /* min_div_recip_mul_df. */
904 0, /* max_case_values. */
905 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
906 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
907 &generic_prefetch_tune
910 static const struct tune_params thunderx2t99_tunings =
912 &thunderx2t99_extra_costs,
913 &thunderx2t99_addrcost_table,
914 &thunderx2t99_regmove_cost,
915 &thunderx2t99_vector_cost,
916 &generic_branch_cost,
917 &generic_approx_modes,
918 4, /* memmov_cost. */
919 4, /* issue_rate. */
920 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
921 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
922 16, /* function_align. */
923 8, /* jump_align. */
924 16, /* loop_align. */
925 3, /* int_reassoc_width. */
926 2, /* fp_reassoc_width. */
927 2, /* vec_reassoc_width. */
928 2, /* min_div_recip_mul_sf. */
929 2, /* min_div_recip_mul_df. */
930 0, /* max_case_values. */
931 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
932 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
933 &thunderx2t99_prefetch_tune
936 /* Support for fine-grained override of the tuning structures. */
937 struct aarch64_tuning_override_function
939 const char* name;
940 void (*parse_override)(const char*, struct tune_params*);
943 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
944 static void aarch64_parse_tune_string (const char*, struct tune_params*);
946 static const struct aarch64_tuning_override_function
947 aarch64_tuning_override_functions[] =
949 { "fuse", aarch64_parse_fuse_string },
950 { "tune", aarch64_parse_tune_string },
951 { NULL, NULL }
954 /* A processor implementing AArch64. */
955 struct processor
957 const char *const name;
958 enum aarch64_processor ident;
959 enum aarch64_processor sched_core;
960 enum aarch64_arch arch;
961 unsigned architecture_version;
962 const unsigned long flags;
963 const struct tune_params *const tune;
966 /* Architectures implementing AArch64. */
967 static const struct processor all_architectures[] =
969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
970 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
971 #include "aarch64-arches.def"
972 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
975 /* Processor cores implementing AArch64. */
976 static const struct processor all_cores[] =
978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
979 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
980 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
981 FLAGS, &COSTS##_tunings},
982 #include "aarch64-cores.def"
983 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
984 AARCH64_FL_FOR_ARCH8, &generic_tunings},
985 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
989 /* Target specification. These are populated by the -march, -mtune, -mcpu
990 handling code or by target attributes. */
991 static const struct processor *selected_arch;
992 static const struct processor *selected_cpu;
993 static const struct processor *selected_tune;
995 /* The current tuning set. */
996 struct tune_params aarch64_tune_params = generic_tunings;
998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1000 /* An ISA extension in the co-processor and main instruction set space. */
1001 struct aarch64_option_extension
1003 const char *const name;
1004 const unsigned long flags_on;
1005 const unsigned long flags_off;
1008 typedef enum aarch64_cond_code
1010 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1011 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1012 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1014 aarch64_cc;
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1018 /* The condition codes of the processor, and the inverse function. */
1019 static const char * const aarch64_condition_codes[] =
1021 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1025 /* Generate code to enable conditional branches in functions over 1 MiB. */
1026 const char *
1027 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1028 const char * branch_format)
1030 rtx_code_label * tmp_label = gen_label_rtx ();
1031 char label_buf[256];
1032 char buffer[128];
1033 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1034 CODE_LABEL_NUMBER (tmp_label));
1035 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1036 rtx dest_label = operands[pos_label];
1037 operands[pos_label] = tmp_label;
1039 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1040 output_asm_insn (buffer, operands);
1042 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1043 operands[pos_label] = dest_label;
1044 output_asm_insn (buffer, operands);
1045 return "";
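/* Editorial sketch of the emitted sequence (illustrative only): when the
   target of a conditional branch is out of the short-form range (+/-1 MiB
   for B.cond and CB(N)Z, +/-32 KiB for TB(N)Z), the caller typically passes
   the inverted condition in BRANCH_FORMAT and this routine emits roughly:

       tbnz  x3, #2, .Lfb7     ; BRANCH_FORMAT + generated local label
       b     .Lfar_target      ; unconditional branch, +/-128 MiB range
     .Lfb7:                                                              */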
1048 void
1049 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1051 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1052 if (TARGET_GENERAL_REGS_ONLY)
1053 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1054 else
1055 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1061 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1063 irrespectively of its cost results in bad allocations with many redundant
1064 int<->FP moves which are expensive on various cores.
1065 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1067 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1068 Otherwise set the allocno class depending on the mode.
1069 The result of this is that it is no longer inefficient to have a higher
1070 memory move cost than the register move cost.
1073 static reg_class_t
1074 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1075 reg_class_t best_class)
1077 machine_mode mode;
1079 if (allocno_class != ALL_REGS)
1080 return allocno_class;
1082 if (best_class != ALL_REGS)
1083 return best_class;
1085 mode = PSEUDO_REGNO_MODE (regno);
1086 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
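/* For example (editorial note): under this hook a DFmode or V4SImode
   pseudo whose allocno class and best class are both ALL_REGS is given
   FP_REGS, while a DImode pseudo in the same situation is given
   GENERAL_REGS.  */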
1089 static unsigned int
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1092 if (GET_MODE_UNIT_SIZE (mode) == 4)
1093 return aarch64_tune_params.min_div_recip_mul_sf;
1094 return aarch64_tune_params.min_div_recip_mul_df;
1097 static int
1098 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1099 machine_mode mode)
1101 if (VECTOR_MODE_P (mode))
1102 return aarch64_tune_params.vec_reassoc_width;
1103 if (INTEGRAL_MODE_P (mode))
1104 return aarch64_tune_params.int_reassoc_width;
1105 if (FLOAT_MODE_P (mode))
1106 return aarch64_tune_params.fp_reassoc_width;
1107 return 1;
1110 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1111 unsigned
1112 aarch64_dbx_register_number (unsigned regno)
1114 if (GP_REGNUM_P (regno))
1115 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1116 else if (regno == SP_REGNUM)
1117 return AARCH64_DWARF_SP;
1118 else if (FP_REGNUM_P (regno))
1119 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1120 else if (PR_REGNUM_P (regno))
1121 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1122 else if (regno == VG_REGNUM)
1123 return AARCH64_DWARF_VG;
1125 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1126 equivalent DWARF register. */
1127 return DWARF_FRAME_REGISTERS;
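/* For example (editorial note): x5 maps to AARCH64_DWARF_R0 + 5,
   v3 to AARCH64_DWARF_V0 + 3 and p7 to AARCH64_DWARF_P0 + 7, while a
   register with no DWARF equivalent (such as the condition flags)
   yields DWARF_FRAME_REGISTERS.  */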
1130 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1131 static bool
1132 aarch64_advsimd_struct_mode_p (machine_mode mode)
1134 return (TARGET_SIMD
1135 && (mode == OImode || mode == CImode || mode == XImode));
1138 /* Return true if MODE is an SVE predicate mode. */
1139 static bool
1140 aarch64_sve_pred_mode_p (machine_mode mode)
1142 return (TARGET_SVE
1143 && (mode == VNx16BImode
1144 || mode == VNx8BImode
1145 || mode == VNx4BImode
1146 || mode == VNx2BImode));
1149 /* Three mutually-exclusive flags describing a vector or predicate type. */
1150 const unsigned int VEC_ADVSIMD = 1;
1151 const unsigned int VEC_SVE_DATA = 2;
1152 const unsigned int VEC_SVE_PRED = 4;
1153 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1154 a structure of 2, 3 or 4 vectors. */
1155 const unsigned int VEC_STRUCT = 8;
1156 /* Useful combinations of the above. */
1157 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1158 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1160 /* Return a set of flags describing the vector properties of mode MODE.
1161 Ignore modes that are not supported by the current target. */
1162 static unsigned int
1163 aarch64_classify_vector_mode (machine_mode mode)
1165 if (aarch64_advsimd_struct_mode_p (mode))
1166 return VEC_ADVSIMD | VEC_STRUCT;
1168 if (aarch64_sve_pred_mode_p (mode))
1169 return VEC_SVE_PRED;
1171 scalar_mode inner = GET_MODE_INNER (mode);
1172 if (VECTOR_MODE_P (mode)
1173 && (inner == QImode
1174 || inner == HImode
1175 || inner == HFmode
1176 || inner == SImode
1177 || inner == SFmode
1178 || inner == DImode
1179 || inner == DFmode))
1181 if (TARGET_SVE
1182 && known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1183 return VEC_SVE_DATA;
1185 /* This includes V1DF but not V1DI (which doesn't exist). */
1186 if (TARGET_SIMD
1187 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1188 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1189 return VEC_ADVSIMD;
1192 return 0;
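/* For example (editorial note): V16QImode and V2DFmode classify as
   VEC_ADVSIMD, OImode (a pair of Advanced SIMD Q registers) as
   VEC_ADVSIMD | VEC_STRUCT, VNx4SImode as VEC_SVE_DATA and VNx4BImode
   as VEC_SVE_PRED (when SVE is enabled), while scalar modes and
   unsupported vector modes return 0.  */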
1195 /* Return true if MODE is any of the data vector modes, including
1196 structure modes. */
1197 static bool
1198 aarch64_vector_data_mode_p (machine_mode mode)
1200 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1203 /* Return true if MODE is an SVE data vector mode; either a single vector
1204 or a structure of vectors. */
1205 static bool
1206 aarch64_sve_data_mode_p (machine_mode mode)
1208 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1211 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1212 static bool
1213 aarch64_array_mode_supported_p (machine_mode mode,
1214 unsigned HOST_WIDE_INT nelems)
1216 if (TARGET_SIMD
1217 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1218 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1219 && (nelems >= 2 && nelems <= 4))
1220 return true;
1222 return false;
1225 /* Return the SVE predicate mode to use for elements that have
1226 ELEM_NBYTES bytes, if such a mode exists. */
1228 opt_machine_mode
1229 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1231 if (TARGET_SVE)
1233 if (elem_nbytes == 1)
1234 return VNx16BImode;
1235 if (elem_nbytes == 2)
1236 return VNx8BImode;
1237 if (elem_nbytes == 4)
1238 return VNx4BImode;
1239 if (elem_nbytes == 8)
1240 return VNx2BImode;
1242 return opt_machine_mode ();
1245 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1247 static opt_machine_mode
1248 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1250 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1252 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1253 machine_mode pred_mode;
1254 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1255 return pred_mode;
1258 return default_get_mask_mode (nunits, nbytes);
1261 /* Implement TARGET_HARD_REGNO_NREGS. */
1263 static unsigned int
1264 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1266 /* ??? Logically we should only need to provide a value when
1267 HARD_REGNO_MODE_OK says that the combination is valid,
1268 but at the moment we need to handle all modes. Just ignore
1269 any runtime parts for registers that can't store them. */
1270 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1271 switch (aarch64_regno_regclass (regno))
1273 case FP_REGS:
1274 case FP_LO_REGS:
1275 if (aarch64_sve_data_mode_p (mode))
1276 return exact_div (GET_MODE_SIZE (mode),
1277 BYTES_PER_SVE_VECTOR).to_constant ();
1278 return CEIL (lowest_size, UNITS_PER_VREG);
1279 case PR_REGS:
1280 case PR_LO_REGS:
1281 case PR_HI_REGS:
1282 return 1;
1283 default:
1284 return CEIL (lowest_size, UNITS_PER_WORD);
1286 gcc_unreachable ();
1289 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1291 static bool
1292 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1294 if (GET_MODE_CLASS (mode) == MODE_CC)
1295 return regno == CC_REGNUM;
1297 if (regno == VG_REGNUM)
1298 /* This must have the same size as _Unwind_Word. */
1299 return mode == DImode;
1301 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1302 if (vec_flags & VEC_SVE_PRED)
1303 return PR_REGNUM_P (regno);
1305 if (PR_REGNUM_P (regno))
1306 return 0;
1308 if (regno == SP_REGNUM)
1309 /* The purpose of comparing with ptr_mode is to support the
1310 global register variable associated with the stack pointer
1311 register via the syntax of asm ("wsp") in ILP32. */
1312 return mode == Pmode || mode == ptr_mode;
1314 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1315 return mode == Pmode;
1317 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1318 return true;
1320 if (FP_REGNUM_P (regno))
1322 if (vec_flags & VEC_STRUCT)
1323 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1324 else
1325 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1328 return false;
1331 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1332 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1333 clobbers the top 64 bits when restoring the bottom 64 bits. */
1335 static bool
1336 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1338 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1341 /* Implement REGMODE_NATURAL_SIZE. */
1342 poly_uint64
1343 aarch64_regmode_natural_size (machine_mode mode)
1345 /* The natural size for SVE data modes is one SVE data vector,
1346 and similarly for predicates. We can't independently modify
1347 anything smaller than that. */
1348 /* ??? For now, only do this for variable-width SVE registers.
1349 Doing it for constant-sized registers breaks lower-subreg.c. */
1350 /* ??? And once that's fixed, we should probably have similar
1351 code for Advanced SIMD. */
1352 if (!aarch64_sve_vg.is_constant ())
1354 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1355 if (vec_flags & VEC_SVE_PRED)
1356 return BYTES_PER_SVE_PRED;
1357 if (vec_flags & VEC_SVE_DATA)
1358 return BYTES_PER_SVE_VECTOR;
1360 return UNITS_PER_WORD;
1363 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1364 machine_mode
1365 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1366 machine_mode mode)
1368 /* The predicate mode determines which bits are significant and
1369 which are "don't care". Decreasing the number of lanes would
1370 lose data while increasing the number of lanes would make bits
1371 unnecessarily significant. */
1372 if (PR_REGNUM_P (regno))
1373 return mode;
1374 if (known_ge (GET_MODE_SIZE (mode), 4))
1375 return mode;
1376 else
1377 return SImode;
1380 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1381 that strcpy from constants will be faster. */
1383 static HOST_WIDE_INT
1384 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1386 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1387 return MAX (align, BITS_PER_WORD);
1388 return align;
1391 /* Return true if calls to DECL should be treated as
1392 long-calls (ie called via a register). */
1393 static bool
1394 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1396 return false;
1399 /* Return true if calls to symbol-ref SYM should be treated as
1400 long-calls (ie called via a register). */
1401 bool
1402 aarch64_is_long_call_p (rtx sym)
1404 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1407 /* Return true if calls to symbol-ref SYM should not go through
1408 plt stubs. */
1410 bool
1411 aarch64_is_noplt_call_p (rtx sym)
1413 const_tree decl = SYMBOL_REF_DECL (sym);
1415 if (flag_pic
1416 && decl
1417 && (!flag_plt
1418 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1419 && !targetm.binds_local_p (decl))
1420 return true;
1422 return false;
1425 /* Return true if the offsets to a zero/sign-extract operation
1426 represent an expression that matches an extend operation. The
1427    operands represent the parameters from
1429 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1430 bool
1431 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1432 rtx extract_imm)
1434 HOST_WIDE_INT mult_val, extract_val;
1436 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1437 return false;
1439 mult_val = INTVAL (mult_imm);
1440 extract_val = INTVAL (extract_imm);
1442 if (extract_val > 8
1443 && extract_val < GET_MODE_BITSIZE (mode)
1444 && exact_log2 (extract_val & ~7) > 0
1445 && (extract_val & 7) <= 4
1446 && mult_val == (1 << (extract_val & 7)))
1447 return true;
1449 return false;
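/* Editorial example (illustrative, assuming the usual encoding of these
   operands): with MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34,
   the low three bits of EXTRACT_IMM give the shift amount (2) and the
   remaining bits give the source width (32), and MULT_IMM == 1 << 2, so
   the extract is equivalent to a 32-to-64-bit extend followed by a left
   shift of 2, as in extended-register forms like "add x0, x1, w2, sxtw #2".  */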
1452 /* Emit an insn that's a simple single-set. Both the operands must be
1453 known to be valid. */
1454 inline static rtx_insn *
1455 emit_set_insn (rtx x, rtx y)
1457 return emit_insn (gen_rtx_SET (x, y));
1460 /* X and Y are two things to compare using CODE. Emit the compare insn and
1461 return the rtx for register 0 in the proper mode. */
1462 rtx
1463 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1465 machine_mode mode = SELECT_CC_MODE (code, x, y);
1466 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1468 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1469 return cc_reg;
1472 /* Build the SYMBOL_REF for __tls_get_addr. */
1474 static GTY(()) rtx tls_get_addr_libfunc;
1476 rtx
1477 aarch64_tls_get_addr (void)
1479 if (!tls_get_addr_libfunc)
1480 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1481 return tls_get_addr_libfunc;
1484 /* Return the TLS model to use for ADDR. */
1486 static enum tls_model
1487 tls_symbolic_operand_type (rtx addr)
1489 enum tls_model tls_kind = TLS_MODEL_NONE;
1490 if (GET_CODE (addr) == CONST)
1492 poly_int64 addend;
1493 rtx sym = strip_offset (addr, &addend);
1494 if (GET_CODE (sym) == SYMBOL_REF)
1495 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1497 else if (GET_CODE (addr) == SYMBOL_REF)
1498 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1500 return tls_kind;
1503 /* We allow LO_SUM in our legitimate addresses so that combine can
1504    take care of combining addresses where necessary, but for generation
1505    purposes we generate the address
1506 as :
1507 RTL Absolute
1508 tmp = hi (symbol_ref); adrp x1, foo
1509 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1512 PIC TLS
1513 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1514 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1515 bl __tls_get_addr
1518 Load TLS symbol, depending on TLS mechanism and TLS access model.
1520 Global Dynamic - Traditional TLS:
1521 adrp tmp, :tlsgd:imm
1522 add dest, tmp, #:tlsgd_lo12:imm
1523 bl __tls_get_addr
1525 Global Dynamic - TLS Descriptors:
1526 adrp dest, :tlsdesc:imm
1527 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1528 add dest, dest, #:tlsdesc_lo12:imm
1529 blr tmp
1530 mrs tp, tpidr_el0
1531 add dest, dest, tp
1533 Initial Exec:
1534 mrs tp, tpidr_el0
1535 adrp tmp, :gottprel:imm
1536 ldr dest, [tmp, #:gottprel_lo12:imm]
1537 add dest, dest, tp
1539 Local Exec:
1540 mrs tp, tpidr_el0
1541 add t0, tp, #:tprel_hi12:imm, lsl #12
1542 add t0, t0, #:tprel_lo12_nc:imm
1545 static void
1546 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1547 enum aarch64_symbol_type type)
1549 switch (type)
1551 case SYMBOL_SMALL_ABSOLUTE:
1553 /* In ILP32, the mode of dest can be either SImode or DImode. */
1554 rtx tmp_reg = dest;
1555 machine_mode mode = GET_MODE (dest);
1557 gcc_assert (mode == Pmode || mode == ptr_mode);
1559 if (can_create_pseudo_p ())
1560 tmp_reg = gen_reg_rtx (mode);
1562 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1563 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1564 return;
1567 case SYMBOL_TINY_ABSOLUTE:
1568 emit_insn (gen_rtx_SET (dest, imm));
1569 return;
1571 case SYMBOL_SMALL_GOT_28K:
1573 machine_mode mode = GET_MODE (dest);
1574 rtx gp_rtx = pic_offset_table_rtx;
1575 rtx insn;
1576 rtx mem;
1578 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1579         here before rtl expansion.  Tree IVOPTS will generate rtl patterns to
1580         decide rtx costs, in which case pic_offset_table_rtx is not
1581         initialized.  In that case there is no need to generate the first adrp
1582         instruction, as the final cost for global variable access is
1583         one instruction.  */
1584 if (gp_rtx != NULL)
1586           /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1587              use the page base as the GOT base, the first page may be wasted;
1588              in the worst case there is only 28K of space for the GOT).
1590              The generated instruction sequence for accessing a global variable is:
1593 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1595              Only one instruction is needed.  But we must initialize
1596              pic_offset_table_rtx properly.  We generate an initialization insn for
1597              every global access, and allow CSE to remove all redundant copies.
1599              The final instruction sequence will look like the following
1600              for multiple global variable accesses.
1602 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1604 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1605 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1606 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1607 ... */
1609 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1610 crtl->uses_pic_offset_table = 1;
1611 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1613 if (mode != GET_MODE (gp_rtx))
1614 gp_rtx = gen_lowpart (mode, gp_rtx);
1618 if (mode == ptr_mode)
1620 if (mode == DImode)
1621 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1622 else
1623 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1625 mem = XVECEXP (SET_SRC (insn), 0, 0);
1627 else
1629 gcc_assert (mode == Pmode);
1631 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1632 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1635         /* The operand is expected to be a MEM.  Whenever the related insn
1636            pattern changes, the code above that computes MEM should be
1637            updated.  */
1638 gcc_assert (GET_CODE (mem) == MEM);
1639 MEM_READONLY_P (mem) = 1;
1640 MEM_NOTRAP_P (mem) = 1;
1641 emit_insn (insn);
1642 return;
1645 case SYMBOL_SMALL_GOT_4G:
1647 /* In ILP32, the mode of dest can be either SImode or DImode,
1648 while the got entry is always of SImode size. The mode of
1649 dest depends on how dest is used: if dest is assigned to a
1650 pointer (e.g. in the memory), it has SImode; it may have
1651         DImode if dest is dereferenced to access the memory.
1652 This is why we have to handle three different ldr_got_small
1653 patterns here (two patterns for ILP32). */
1655 rtx insn;
1656 rtx mem;
1657 rtx tmp_reg = dest;
1658 machine_mode mode = GET_MODE (dest);
1660 if (can_create_pseudo_p ())
1661 tmp_reg = gen_reg_rtx (mode);
1663 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1664 if (mode == ptr_mode)
1666 if (mode == DImode)
1667 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1668 else
1669 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1671 mem = XVECEXP (SET_SRC (insn), 0, 0);
1673 else
1675 gcc_assert (mode == Pmode);
1677 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1678 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1681 gcc_assert (GET_CODE (mem) == MEM);
1682 MEM_READONLY_P (mem) = 1;
1683 MEM_NOTRAP_P (mem) = 1;
1684 emit_insn (insn);
1685 return;
1688 case SYMBOL_SMALL_TLSGD:
1690 rtx_insn *insns;
1691 machine_mode mode = GET_MODE (dest);
1692 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1694 start_sequence ();
1695 if (TARGET_ILP32)
1696 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1697 else
1698 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1699 insns = get_insns ();
1700 end_sequence ();
1702 RTL_CONST_CALL_P (insns) = 1;
1703 emit_libcall_block (insns, dest, result, imm);
1704 return;
1707 case SYMBOL_SMALL_TLSDESC:
1709 machine_mode mode = GET_MODE (dest);
1710 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1711 rtx tp;
1713 gcc_assert (mode == Pmode || mode == ptr_mode);
1715 /* In ILP32, the got entry is always of SImode size. Unlike
1716 small GOT, the dest is fixed at reg 0. */
1717 if (TARGET_ILP32)
1718 emit_insn (gen_tlsdesc_small_si (imm));
1719 else
1720 emit_insn (gen_tlsdesc_small_di (imm));
1721 tp = aarch64_load_tp (NULL);
1723 if (mode != Pmode)
1724 tp = gen_lowpart (mode, tp);
1726 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1727 if (REG_P (dest))
1728 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1729 return;
1732 case SYMBOL_SMALL_TLSIE:
1734 /* In ILP32, the mode of dest can be either SImode or DImode,
1735 while the got entry is always of SImode size. The mode of
1736 dest depends on how dest is used: if dest is assigned to a
1737 pointer (e.g. in the memory), it has SImode; it may have
1738            DImode if dest is dereferenced to access the memory.
1739 This is why we have to handle three different tlsie_small
1740 patterns here (two patterns for ILP32). */
1741 machine_mode mode = GET_MODE (dest);
1742 rtx tmp_reg = gen_reg_rtx (mode);
1743 rtx tp = aarch64_load_tp (NULL);
1745 if (mode == ptr_mode)
1747 if (mode == DImode)
1748 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1749 else
1751 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1752 tp = gen_lowpart (mode, tp);
1755 else
1757 gcc_assert (mode == Pmode);
1758 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1761 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1762 if (REG_P (dest))
1763 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1764 return;
1767 case SYMBOL_TLSLE12:
1768 case SYMBOL_TLSLE24:
1769 case SYMBOL_TLSLE32:
1770 case SYMBOL_TLSLE48:
1772 machine_mode mode = GET_MODE (dest);
1773 rtx tp = aarch64_load_tp (NULL);
1775 if (mode != Pmode)
1776 tp = gen_lowpart (mode, tp);
1778 switch (type)
1780 case SYMBOL_TLSLE12:
1781 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1782 (dest, tp, imm));
1783 break;
1784 case SYMBOL_TLSLE24:
1785 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1786 (dest, tp, imm));
1787 break;
1788 case SYMBOL_TLSLE32:
1789 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1790 (dest, imm));
1791 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1792 (dest, dest, tp));
1793 break;
1794 case SYMBOL_TLSLE48:
1795 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1796 (dest, imm));
1797 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1798 (dest, dest, tp));
1799 break;
1800 default:
1801 gcc_unreachable ();
1804 if (REG_P (dest))
1805 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1806 return;
1809 case SYMBOL_TINY_GOT:
1810 emit_insn (gen_ldr_got_tiny (dest, imm));
1811 return;
1813 case SYMBOL_TINY_TLSIE:
1815 machine_mode mode = GET_MODE (dest);
1816 rtx tp = aarch64_load_tp (NULL);
1818 if (mode == ptr_mode)
1820 if (mode == DImode)
1821 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1822 else
1824 tp = gen_lowpart (mode, tp);
1825 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1828 else
1830 gcc_assert (mode == Pmode);
1831 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1834 if (REG_P (dest))
1835 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1836 return;
1839 default:
1840 gcc_unreachable ();
1844 /* Emit a move from SRC to DEST. Assume that the move expanders can
1845 handle all moves if !can_create_pseudo_p (). The distinction is
1846 important because, unlike emit_move_insn, the move expanders know
1847 how to force Pmode objects into the constant pool even when the
1848 constant pool address is not itself legitimate. */
1849 static rtx
1850 aarch64_emit_move (rtx dest, rtx src)
1852 return (can_create_pseudo_p ()
1853 ? emit_move_insn (dest, src)
1854 : emit_move_insn_1 (dest, src));
1857 /* Split a 128-bit move operation into two 64-bit move operations,
1858 taking care to handle partial overlap of register to register
1859 copies. Special cases are needed when moving between GP regs and
1860 FP regs. SRC can be a register, constant or memory; DST a register
1861 or memory. If either operand is memory it must not have any side
1862 effects. */
1863 void
1864 aarch64_split_128bit_move (rtx dst, rtx src)
1866 rtx dst_lo, dst_hi;
1867 rtx src_lo, src_hi;
1869 machine_mode mode = GET_MODE (dst);
1871 gcc_assert (mode == TImode || mode == TFmode);
1872 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1873 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1875 if (REG_P (dst) && REG_P (src))
1877 int src_regno = REGNO (src);
1878 int dst_regno = REGNO (dst);
1880 /* Handle FP <-> GP regs. */
1881 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1883 src_lo = gen_lowpart (word_mode, src);
1884 src_hi = gen_highpart (word_mode, src);
1886 if (mode == TImode)
1888 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1889 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1891 else
1893 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1894 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1896 return;
1898 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1900 dst_lo = gen_lowpart (word_mode, dst);
1901 dst_hi = gen_highpart (word_mode, dst);
1903 if (mode == TImode)
1905 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1906 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1908 else
1910 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1911 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1913 return;
1917 dst_lo = gen_lowpart (word_mode, dst);
1918 dst_hi = gen_highpart (word_mode, dst);
1919 src_lo = gen_lowpart (word_mode, src);
1920 src_hi = gen_highpart_mode (word_mode, mode, src);
1922 /* At most one pairing may overlap. */
1923 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1925 aarch64_emit_move (dst_hi, src_hi);
1926 aarch64_emit_move (dst_lo, src_lo);
1928 else
1930 aarch64_emit_move (dst_lo, src_lo);
1931 aarch64_emit_move (dst_hi, src_hi);
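/* Editorial example (illustrative): splitting a TImode copy whose source
   is the register pair {x0,x1} and whose destination is {x1,x2}: dst_lo
   (x1) overlaps src_hi (x1), so the high halves are copied first
   (x2 <- x1) and then the low halves (x1 <- x0); without the overlap the
   low halves are copied first.  */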
1935 bool
1936 aarch64_split_128bit_move_p (rtx dst, rtx src)
1938 return (! REG_P (src)
1939 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1942 /* Split a complex SIMD combine. */
1944 void
1945 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1947 machine_mode src_mode = GET_MODE (src1);
1948 machine_mode dst_mode = GET_MODE (dst);
1950 gcc_assert (VECTOR_MODE_P (dst_mode));
1951 gcc_assert (register_operand (dst, dst_mode)
1952 && register_operand (src1, src_mode)
1953 && register_operand (src2, src_mode));
1955 rtx (*gen) (rtx, rtx, rtx);
1957 switch (src_mode)
1959 case E_V8QImode:
1960 gen = gen_aarch64_simd_combinev8qi;
1961 break;
1962 case E_V4HImode:
1963 gen = gen_aarch64_simd_combinev4hi;
1964 break;
1965 case E_V2SImode:
1966 gen = gen_aarch64_simd_combinev2si;
1967 break;
1968 case E_V4HFmode:
1969 gen = gen_aarch64_simd_combinev4hf;
1970 break;
1971 case E_V2SFmode:
1972 gen = gen_aarch64_simd_combinev2sf;
1973 break;
1974 case E_DImode:
1975 gen = gen_aarch64_simd_combinedi;
1976 break;
1977 case E_DFmode:
1978 gen = gen_aarch64_simd_combinedf;
1979 break;
1980 default:
1981 gcc_unreachable ();
1984 emit_insn (gen (dst, src1, src2));
1985 return;
1988 /* Split a complex SIMD move. */
1990 void
1991 aarch64_split_simd_move (rtx dst, rtx src)
1993 machine_mode src_mode = GET_MODE (src);
1994 machine_mode dst_mode = GET_MODE (dst);
1996 gcc_assert (VECTOR_MODE_P (dst_mode));
1998 if (REG_P (dst) && REG_P (src))
2000 rtx (*gen) (rtx, rtx);
2002 gcc_assert (VECTOR_MODE_P (src_mode));
2004 switch (src_mode)
2006 case E_V16QImode:
2007 gen = gen_aarch64_split_simd_movv16qi;
2008 break;
2009 case E_V8HImode:
2010 gen = gen_aarch64_split_simd_movv8hi;
2011 break;
2012 case E_V4SImode:
2013 gen = gen_aarch64_split_simd_movv4si;
2014 break;
2015 case E_V2DImode:
2016 gen = gen_aarch64_split_simd_movv2di;
2017 break;
2018 case E_V8HFmode:
2019 gen = gen_aarch64_split_simd_movv8hf;
2020 break;
2021 case E_V4SFmode:
2022 gen = gen_aarch64_split_simd_movv4sf;
2023 break;
2024 case E_V2DFmode:
2025 gen = gen_aarch64_split_simd_movv2df;
2026 break;
2027 default:
2028 gcc_unreachable ();
2031 emit_insn (gen (dst, src));
2032 return;
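/* Return true if constant X, interpreted in mode XMODE, is equal to the
   zero-extension of constant Y from mode YMODE. */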
2036 bool
2037 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2038 machine_mode ymode, rtx y)
2040 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2041 gcc_assert (r != NULL);
2042 return rtx_equal_p (x, r);
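/* Return a register that holds VALUE of mode MODE, using X as the
   register if new pseudo registers cannot be created. */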
2046 static rtx
2047 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2049 if (can_create_pseudo_p ())
2050 return force_reg (mode, value);
2051 else
2053 gcc_assert (x);
2054 aarch64_emit_move (x, value);
2055 return x;
2059 /* Return true if we can move VALUE into a register using a single
2060 CNT[BHWD] instruction. */
2062 static bool
2063 aarch64_sve_cnt_immediate_p (poly_int64 value)
2065 HOST_WIDE_INT factor = value.coeffs[0];
2066 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2067 return (value.coeffs[1] == factor
2068 && IN_RANGE (factor, 2, 16 * 16)
2069 && (factor & 1) == 0
2070 && factor <= 16 * (factor & -factor));
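/* For example, poly_int64 (6, 6) can be loaded with "cntd x0, all, mul #3"
   (three times the number of 64-bit elements), whereas (2, 3) is rejected
   because its two coefficients differ and (34, 34) is rejected because
   the multiplier would have to be 17. */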
2073 /* Likewise for rtx X. */
2075 bool
2076 aarch64_sve_cnt_immediate_p (rtx x)
2078 poly_int64 value;
2079 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2082 /* Return the asm string for an instruction with a CNT-like vector size
2083 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2084 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2085 first part of the operands template (the part that comes before the
2086 vector size itself). FACTOR is the number of quadwords.
2087 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2088 If it is zero, we can use any element size. */
2090 static char *
2091 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2092 unsigned int factor,
2093 unsigned int nelts_per_vq)
2095 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2097 if (nelts_per_vq == 0)
2098 /* There is some overlap in the ranges of the four CNT instructions.
2099 Here we always use the smallest possible element size, so that the
2100 multiplier is 1 wherever possible. */
2101 nelts_per_vq = factor & -factor;
2102 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2103 gcc_assert (IN_RANGE (shift, 1, 4));
2104 char suffix = "dwhb"[shift - 1];
2106 factor >>= shift;
2107 unsigned int written;
2108 if (factor == 1)
2109 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2110 prefix, suffix, operands);
2111 else
2112 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2113 prefix, suffix, operands, factor);
2114 gcc_assert (written < sizeof (buffer));
2115 return buffer;
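/* For example, with PREFIX "cnt" and OPERANDS "%x0", a FACTOR of 16 and
   NELTS_PER_VQ of 16 produce "cntb\t%x0", while a FACTOR of 32 produces
   "cntb\t%x0, all, mul #2". */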
2118 /* Return the asm string for an instruction with a CNT-like vector size
2119 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2120 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2121 first part of the operands template (the part that comes before the
2122 vector size itself). X is the value of the vector size operand,
2123 as a polynomial integer rtx. */
2125 char *
2126 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2127 rtx x)
2129 poly_int64 value = rtx_to_poly_int64 (x);
2130 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2131 return aarch64_output_sve_cnt_immediate (prefix, operands,
2132 value.coeffs[1], 0);
2135 /* Return true if we can add VALUE to a register using a single ADDVL
2136 or ADDPL instruction. */
2138 static bool
2139 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2141 HOST_WIDE_INT factor = value.coeffs[0];
2142 if (factor == 0 || value.coeffs[1] != factor)
2143 return false;
2144 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2145 and a value of 16 is one vector width. */
2146 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2147 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2150 /* Likewise for rtx X. */
2152 bool
2153 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2155 poly_int64 value;
2156 return (poly_int_rtx_p (x, &value)
2157 && aarch64_sve_addvl_addpl_immediate_p (value));
2160 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2161 and storing the result in operand 0. */
2163 char *
2164 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2166 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2167 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2168 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2170 /* Use INC or DEC if possible. */
2171 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2173 if (aarch64_sve_cnt_immediate_p (offset_value))
2174 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2175 offset_value.coeffs[1], 0);
2176 if (aarch64_sve_cnt_immediate_p (-offset_value))
2177 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2178 -offset_value.coeffs[1], 0);
2181 int factor = offset_value.coeffs[1];
2182 if ((factor & 15) == 0)
2183 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2184 else
2185 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2186 return buffer;
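/* For example, when DEST and BASE are distinct, an OFFSET of (16, 16)
   gives "addvl\t%x0, %x1, #1" and (2, 2) gives "addpl\t%x0, %x1, #1";
   when they are the same GP register, (16, 16) is emitted as "incb\t%x0"
   instead. */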
2189 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2190 instruction. If it is, store the number of elements in each vector
2191 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2192 factor in *FACTOR_OUT (if nonnull). */
2194 bool
2195 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2196 unsigned int *nelts_per_vq_out)
2198 rtx elt;
2199 poly_int64 value;
2201 if (!const_vec_duplicate_p (x, &elt)
2202 || !poly_int_rtx_p (elt, &value))
2203 return false;
2205 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2206 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2207 /* There's no vector INCB. */
2208 return false;
2210 HOST_WIDE_INT factor = value.coeffs[0];
2211 if (value.coeffs[1] != factor)
2212 return false;
2214 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2215 if ((factor % nelts_per_vq) != 0
2216 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2217 return false;
2219 if (factor_out)
2220 *factor_out = factor;
2221 if (nelts_per_vq_out)
2222 *nelts_per_vq_out = nelts_per_vq;
2223 return true;
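/* For example, a VNx4SImode vector in which every element is (4, 4) has
   NELTS_PER_VQ 4 and FACTOR 4 (an INCW with an implicit multiplier of 1),
   while (64, 64) in the same mode uses the maximum multiplier of 16. */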
2226 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2227 instruction. */
2229 bool
2230 aarch64_sve_inc_dec_immediate_p (rtx x)
2232 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2235 /* Return the asm template for an SVE vector INC or DEC instruction.
2236 OPERANDS gives the operands before the vector count and X is the
2237 value of the vector count operand itself. */
2239 char *
2240 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2242 int factor;
2243 unsigned int nelts_per_vq;
2244 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2245 gcc_unreachable ();
2246 if (factor < 0)
2247 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2248 nelts_per_vq);
2249 else
2250 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2251 nelts_per_vq);
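/* Emit the instructions needed to move the integer constant IMM into
   register DEST of mode MODE and return the number of instructions used.
   If GENERATE is false, only count the instructions without emitting
   them. */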
2254 static int
2255 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2256 scalar_int_mode mode)
2258 int i;
2259 unsigned HOST_WIDE_INT val, val2, mask;
2260 int one_match, zero_match;
2261 int num_insns;
2263 val = INTVAL (imm);
2265 if (aarch64_move_imm (val, mode))
2267 if (generate)
2268 emit_insn (gen_rtx_SET (dest, imm));
2269 return 1;
2272 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2273 (with XXXX non-zero). In that case check to see if the move can be done in
2274 a smaller mode. */
2275 val2 = val & 0xffffffff;
2276 if (mode == DImode
2277 && aarch64_move_imm (val2, SImode)
2278 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2280 if (generate)
2281 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2283 /* Check if we have to emit a second instruction by checking to see
2284 if any of the upper 32 bits of the original DI mode value is set. */
2285 if (val == val2)
2286 return 1;
2288 i = (val >> 48) ? 48 : 32;
2290 if (generate)
2291 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2292 GEN_INT ((val >> i) & 0xffff)));
2294 return 2;
2297 if ((val >> 32) == 0 || mode == SImode)
2299 if (generate)
2301 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2302 if (mode == SImode)
2303 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2304 GEN_INT ((val >> 16) & 0xffff)));
2305 else
2306 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2307 GEN_INT ((val >> 16) & 0xffff)));
2309 return 2;
2312 /* Remaining cases are all for DImode. */
2314 mask = 0xffff;
2315 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2316 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2317 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2318 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2320 if (zero_match != 2 && one_match != 2)
2322 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2323 For a 64-bit bitmask try whether changing 16 bits to all ones or
2324 zeroes creates a valid bitmask. To check any repeated bitmask,
2325 try using 16 bits from the other 32-bit half of val. */
2327 for (i = 0; i < 64; i += 16, mask <<= 16)
2329 val2 = val & ~mask;
2330 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2331 break;
2332 val2 = val | mask;
2333 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2334 break;
2335 val2 = val2 & ~mask;
2336 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2337 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2338 break;
2340 if (i != 64)
2342 if (generate)
2344 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2345 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2346 GEN_INT ((val >> i) & 0xffff)));
2348 return 2;
2352 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2353 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2354 otherwise skip zero bits. */
2356 num_insns = 1;
2357 mask = 0xffff;
2358 val2 = one_match > zero_match ? ~val : val;
2359 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2361 if (generate)
2362 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2363 ? (val | ~(mask << i))
2364 : (val & (mask << i)))));
2365 for (i += 16; i < 64; i += 16)
2367 if ((val2 & (mask << i)) == 0)
2368 continue;
2369 if (generate)
2370 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2371 GEN_INT ((val >> i) & 0xffff)));
2372 num_insns ++;
2375 return num_insns;
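/* For example, 0x1234000000000000 is a single MOVZ, 0x12345678 takes a
   MOV of 0x5678 followed by a MOVK of 0x1234 at bit position 16, and an
   arbitrary 64-bit constant needs at most a MOV or MOVN plus three
   MOVKs. */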
2378 /* Return the number of temporary registers that aarch64_add_offset_1
2379 would need to add OFFSET to a register. */
2381 static unsigned int
2382 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2384 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2387 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2388 a non-polynomial OFFSET. MODE is the mode of the addition.
2389 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2390 be set and CFA adjustments added to the generated instructions.
2392 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2393 temporary if register allocation is already complete. This temporary
2394 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2395 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2396 the immediate again.
2398 Since this function may be used to adjust the stack pointer, we must
2399 ensure that it cannot cause transient stack deallocation (for example
2400 by first incrementing SP and then decrementing when adjusting by a
2401 large immediate). */
2403 static void
2404 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2405 rtx src, HOST_WIDE_INT offset, rtx temp1,
2406 bool frame_related_p, bool emit_move_imm)
2408 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2409 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2411 HOST_WIDE_INT moffset = abs_hwi (offset);
2412 rtx_insn *insn;
2414 if (!moffset)
2416 if (!rtx_equal_p (dest, src))
2418 insn = emit_insn (gen_rtx_SET (dest, src));
2419 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2421 return;
2424 /* Single instruction adjustment. */
2425 if (aarch64_uimm12_shift (moffset))
2427 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2428 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2429 return;
2432 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2433 and either:
2435 a) the offset cannot be loaded by a 16-bit move or
2436 b) there is no spare register into which we can move it. */
2437 if (moffset < 0x1000000
2438 && ((!temp1 && !can_create_pseudo_p ())
2439 || !aarch64_move_imm (moffset, mode)))
2441 HOST_WIDE_INT low_off = moffset & 0xfff;
2443 low_off = offset < 0 ? -low_off : low_off;
2444 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2445 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2446 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2447 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2448 return;
2451 /* Emit a move immediate if required and an addition/subtraction. */
2452 if (emit_move_imm)
2454 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2455 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2457 insn = emit_insn (offset < 0
2458 ? gen_sub3_insn (dest, src, temp1)
2459 : gen_add3_insn (dest, src, temp1));
2460 if (frame_related_p)
2462 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2463 rtx adj = plus_constant (mode, src, offset);
2464 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
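/* For example, an adjustment of 0x123456 is not a valid 12-bit shifted
   immediate, so it is emitted as ADD #0x456 followed by ADD #0x123000;
   both additions have the same sign, so the stack pointer never moves
   beyond its final value. */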
2468 /* Return the number of temporary registers that aarch64_add_offset
2469 would need to move OFFSET into a register or add OFFSET to a register;
2470 ADD_P is true if we want the latter rather than the former. */
2472 static unsigned int
2473 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2475 /* This follows the same structure as aarch64_add_offset. */
2476 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2477 return 0;
2479 unsigned int count = 0;
2480 HOST_WIDE_INT factor = offset.coeffs[1];
2481 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2482 poly_int64 poly_offset (factor, factor);
2483 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2484 /* Need one register for the ADDVL/ADDPL result. */
2485 count += 1;
2486 else if (factor != 0)
2488 factor = abs (factor);
2489 if (factor > 16 * (factor & -factor))
2490 /* Need one register for the CNT result and one for the multiplication
2491 factor. If necessary, the second temporary can be reused for the
2492 constant part of the offset. */
2493 return 2;
2494 /* Need one register for the CNT result (which might then
2495 be shifted). */
2496 count += 1;
2498 return count + aarch64_add_offset_1_temporaries (constant);
2501 /* If X can be represented as a poly_int64, return the number
2502 of temporaries that are required to add it to a register.
2503 Return -1 otherwise. */
2505 int
2506 aarch64_add_offset_temporaries (rtx x)
2508 poly_int64 offset;
2509 if (!poly_int_rtx_p (x, &offset))
2510 return -1;
2511 return aarch64_offset_temporaries (true, offset);
2514 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2515 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2516 be set and CFA adjustments added to the generated instructions.
2518 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2519 temporary if register allocation is already complete. This temporary
2520 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2521 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2522 false to avoid emitting the immediate again.
2524 TEMP2, if nonnull, is a second temporary register that doesn't
2525 overlap either DEST or REG.
2527 Since this function may be used to adjust the stack pointer, we must
2528 ensure that it cannot cause transient stack deallocation (for example
2529 by first incrementing SP and then decrementing when adjusting by a
2530 large immediate). */
2532 static void
2533 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2534 poly_int64 offset, rtx temp1, rtx temp2,
2535 bool frame_related_p, bool emit_move_imm = true)
2537 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2538 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2539 gcc_assert (temp1 == NULL_RTX
2540 || !frame_related_p
2541 || !reg_overlap_mentioned_p (temp1, dest));
2542 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2544 /* Try using ADDVL or ADDPL to add the whole value. */
2545 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2547 rtx offset_rtx = gen_int_mode (offset, mode);
2548 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2549 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2550 return;
2553 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2554 SVE vector register, over and above the minimum size of 128 bits.
2555 This is equivalent to half the value returned by CNTD with a
2556 vector shape of ALL. */
2557 HOST_WIDE_INT factor = offset.coeffs[1];
2558 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2560 /* Try using ADDVL or ADDPL to add the VG-based part. */
2561 poly_int64 poly_offset (factor, factor);
2562 if (src != const0_rtx
2563 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2565 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2566 if (frame_related_p)
2568 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2569 RTX_FRAME_RELATED_P (insn) = true;
2570 src = dest;
2572 else
2574 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2575 src = aarch64_force_temporary (mode, temp1, addr);
2576 temp1 = temp2;
2577 temp2 = NULL_RTX;
2580 /* Otherwise use a CNT-based sequence. */
2581 else if (factor != 0)
2583 /* Use a subtraction if we have a negative factor. */
2584 rtx_code code = PLUS;
2585 if (factor < 0)
2587 factor = -factor;
2588 code = MINUS;
2591 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2592 into the multiplication. */
2593 rtx val;
2594 int shift = 0;
2595 if (factor & 1)
2596 /* Use a right shift by 1. */
2597 shift = -1;
2598 else
2599 factor /= 2;
2600 HOST_WIDE_INT low_bit = factor & -factor;
2601 if (factor <= 16 * low_bit)
2603 if (factor > 16 * 8)
2605 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2606 the value with the minimum multiplier and shift it into
2607 position. */
2608 int extra_shift = exact_log2 (low_bit);
2609 shift += extra_shift;
2610 factor >>= extra_shift;
2612 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2614 else
2616 /* Use CNTD, then multiply it by FACTOR. */
2617 val = gen_int_mode (poly_int64 (2, 2), mode);
2618 val = aarch64_force_temporary (mode, temp1, val);
2620 /* Go back to using a negative multiplication factor if we have
2621 no register from which to subtract. */
2622 if (code == MINUS && src == const0_rtx)
2624 factor = -factor;
2625 code = PLUS;
2627 rtx coeff1 = gen_int_mode (factor, mode);
2628 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2629 val = gen_rtx_MULT (mode, val, coeff1);
2632 if (shift > 0)
2634 /* Multiply by 1 << SHIFT. */
2635 val = aarch64_force_temporary (mode, temp1, val);
2636 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2638 else if (shift == -1)
2640 /* Divide by 2. */
2641 val = aarch64_force_temporary (mode, temp1, val);
2642 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2645 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2646 if (src != const0_rtx)
2648 val = aarch64_force_temporary (mode, temp1, val);
2649 val = gen_rtx_fmt_ee (code, mode, src, val);
2651 else if (code == MINUS)
2653 val = aarch64_force_temporary (mode, temp1, val);
2654 val = gen_rtx_NEG (mode, val);
2657 if (constant == 0 || frame_related_p)
2659 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2660 if (frame_related_p)
2662 RTX_FRAME_RELATED_P (insn) = true;
2663 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2664 gen_rtx_SET (dest, plus_constant (Pmode, src,
2665 poly_offset)));
2667 src = dest;
2668 if (constant == 0)
2669 return;
2671 else
2673 src = aarch64_force_temporary (mode, temp1, val);
2674 temp1 = temp2;
2675 temp2 = NULL_RTX;
2678 emit_move_imm = true;
2681 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2682 frame_related_p, emit_move_imm);
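/* For example, when SRC is a register, setting DEST to SRC plus one full
   vector plus 32 bytes (an OFFSET of (48, 16)) is emitted as an ADDVL of
   #1 followed by an ADD of #32. */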
2685 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2686 than a poly_int64. */
2688 void
2689 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2690 rtx offset_rtx, rtx temp1, rtx temp2)
2692 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2693 temp1, temp2, false);
2696 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2697 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2698 if TEMP1 already contains abs (DELTA). */
2700 static inline void
2701 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2703 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2704 temp1, temp2, true, emit_move_imm);
2707 /* Subtract DELTA from the stack pointer, marking the instructions
2708 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2709 if nonnull. */
2711 static inline void
2712 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2714 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2715 temp1, temp2, frame_related_p);
2718 /* Set DEST to (vec_series BASE STEP). */
2720 static void
2721 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2723 machine_mode mode = GET_MODE (dest);
2724 scalar_mode inner = GET_MODE_INNER (mode);
2726 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2727 if (!aarch64_sve_index_immediate_p (base))
2728 base = force_reg (inner, base);
2729 if (!aarch64_sve_index_immediate_p (step))
2730 step = force_reg (inner, step);
2732 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
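/* For example, a series with base 0 and step 1 in VNx4SImode becomes
   "index z0.s, #0, #1"; bases or steps outside [-16, 15] are first
   forced into scalar registers and the register form of INDEX is used
   instead. */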
2735 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2736 integer of mode SRC_MODE. Return true on success. */
2738 static bool
2739 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2740 rtx src)
2742 /* If the constant is smaller than 128 bits, we can do the move
2743 using a vector of SRC_MODEs. */
2744 if (src_mode != TImode)
2746 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2747 GET_MODE_SIZE (src_mode));
2748 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2749 emit_move_insn (gen_lowpart (dup_mode, dest),
2750 gen_const_vec_duplicate (dup_mode, src));
2751 return true;
2754 /* The bytes are loaded in little-endian order, so do a byteswap on
2755 big-endian targets. */
2756 if (BYTES_BIG_ENDIAN)
2758 src = simplify_unary_operation (BSWAP, src_mode, src, src_mode);
2759 if (!src)
2760 return false;
2763 /* Use LD1RQ to load the 128 bits from memory. */
2764 src = force_const_mem (src_mode, src);
2765 if (!src)
2766 return false;
2768 /* Make sure that the address is legitimate. */
2769 if (!aarch64_sve_ld1r_operand_p (src))
2771 rtx addr = force_reg (Pmode, XEXP (src, 0));
2772 src = replace_equiv_address (src, addr);
2775 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2776 emit_insn (gen_sve_ld1rq (gen_lowpart (VNx16QImode, dest), ptrue, src));
2777 return true;
2780 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2781 isn't a simple duplicate or series. */
2783 static void
2784 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2786 machine_mode mode = GET_MODE (src);
2787 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2788 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2789 gcc_assert (npatterns > 1);
2791 if (nelts_per_pattern == 1)
2793 /* The constant is a repeating sequence of at least two elements,
2794 where the repeating elements occupy no more than 128 bits.
2795 Get an integer representation of the replicated value. */
2796 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2797 gcc_assert (int_bits <= 128);
2799 scalar_int_mode int_mode = int_mode_for_size (int_bits, 0).require ();
2800 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2801 if (int_value
2802 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2803 return;
2806 /* Expand each pattern individually. */
2807 rtx_vector_builder builder;
2808 auto_vec<rtx, 16> vectors (npatterns);
2809 for (unsigned int i = 0; i < npatterns; ++i)
2811 builder.new_vector (mode, 1, nelts_per_pattern);
2812 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2813 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2814 vectors.quick_push (force_reg (mode, builder.build ()));
2817 /* Use permutes to interleave the separate vectors. */
2818 while (npatterns > 1)
2820 npatterns /= 2;
2821 for (unsigned int i = 0; i < npatterns; ++i)
2823 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2824 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2825 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2826 vectors[i] = tmp;
2829 gcc_assert (vectors[0] == dest);
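/* For example, with four patterns P0..P3 the loop above first forms
   ZIP1 (P0, P2) and ZIP1 (P1, P3) and then zips those two results
   together, so that the final vector interleaves all four patterns. */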
2832 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2833 is a pattern that can be used to set DEST to a replicated scalar
2834 element. */
2836 void
2837 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2838 rtx (*gen_vec_duplicate) (rtx, rtx))
2840 machine_mode mode = GET_MODE (dest);
2842 /* Check on what type of symbol it is. */
2843 scalar_int_mode int_mode;
2844 if ((GET_CODE (imm) == SYMBOL_REF
2845 || GET_CODE (imm) == LABEL_REF
2846 || GET_CODE (imm) == CONST
2847 || GET_CODE (imm) == CONST_POLY_INT)
2848 && is_a <scalar_int_mode> (mode, &int_mode))
2850 rtx mem;
2851 poly_int64 offset;
2852 HOST_WIDE_INT const_offset;
2853 enum aarch64_symbol_type sty;
2855 /* If we have (const (plus symbol offset)), separate out the offset
2856 before we start classifying the symbol. */
2857 rtx base = strip_offset (imm, &offset);
2859 /* We must always add an offset involving VL separately, rather than
2860 folding it into the relocation. */
2861 if (!offset.is_constant (&const_offset))
2863 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2864 emit_insn (gen_rtx_SET (dest, imm));
2865 else
2867 /* Do arithmetic on 32-bit values if the result is smaller
2868 than that. */
2869 if (partial_subreg_p (int_mode, SImode))
2871 /* It is invalid to do symbol calculations in modes
2872 narrower than SImode. */
2873 gcc_assert (base == const0_rtx);
2874 dest = gen_lowpart (SImode, dest);
2875 int_mode = SImode;
2877 if (base != const0_rtx)
2879 base = aarch64_force_temporary (int_mode, dest, base);
2880 aarch64_add_offset (int_mode, dest, base, offset,
2881 NULL_RTX, NULL_RTX, false);
2883 else
2884 aarch64_add_offset (int_mode, dest, base, offset,
2885 dest, NULL_RTX, false);
2887 return;
2890 sty = aarch64_classify_symbol (base, const_offset);
2891 switch (sty)
2893 case SYMBOL_FORCE_TO_MEM:
2894 if (const_offset != 0
2895 && targetm.cannot_force_const_mem (int_mode, imm))
2897 gcc_assert (can_create_pseudo_p ());
2898 base = aarch64_force_temporary (int_mode, dest, base);
2899 aarch64_add_offset (int_mode, dest, base, const_offset,
2900 NULL_RTX, NULL_RTX, false);
2901 return;
2904 mem = force_const_mem (ptr_mode, imm);
2905 gcc_assert (mem);
2907 /* If we aren't generating PC relative literals, then
2908 we need to expand the literal pool access carefully.
2909 This is something that needs to be done in a number
2910 of places, so could well live as a separate function. */
2911 if (!aarch64_pcrelative_literal_loads)
2913 gcc_assert (can_create_pseudo_p ());
2914 base = gen_reg_rtx (ptr_mode);
2915 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2916 if (ptr_mode != Pmode)
2917 base = convert_memory_address (Pmode, base);
2918 mem = gen_rtx_MEM (ptr_mode, base);
2921 if (int_mode != ptr_mode)
2922 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2924 emit_insn (gen_rtx_SET (dest, mem));
2926 return;
2928 case SYMBOL_SMALL_TLSGD:
2929 case SYMBOL_SMALL_TLSDESC:
2930 case SYMBOL_SMALL_TLSIE:
2931 case SYMBOL_SMALL_GOT_28K:
2932 case SYMBOL_SMALL_GOT_4G:
2933 case SYMBOL_TINY_GOT:
2934 case SYMBOL_TINY_TLSIE:
2935 if (const_offset != 0)
2937 gcc_assert (can_create_pseudo_p ());
2938 base = aarch64_force_temporary (int_mode, dest, base);
2939 aarch64_add_offset (int_mode, dest, base, const_offset,
2940 NULL_RTX, NULL_RTX, false);
2941 return;
2943 /* FALLTHRU */
2945 case SYMBOL_SMALL_ABSOLUTE:
2946 case SYMBOL_TINY_ABSOLUTE:
2947 case SYMBOL_TLSLE12:
2948 case SYMBOL_TLSLE24:
2949 case SYMBOL_TLSLE32:
2950 case SYMBOL_TLSLE48:
2951 aarch64_load_symref_appropriately (dest, imm, sty);
2952 return;
2954 default:
2955 gcc_unreachable ();
2959 if (!CONST_INT_P (imm))
2961 rtx base, step, value;
2962 if (GET_CODE (imm) == HIGH
2963 || aarch64_simd_valid_immediate (imm, NULL))
2964 emit_insn (gen_rtx_SET (dest, imm));
2965 else if (const_vec_series_p (imm, &base, &step))
2966 aarch64_expand_vec_series (dest, base, step);
2967 else if (const_vec_duplicate_p (imm, &value))
2969 /* If the constant is out of range of an SVE vector move,
2970 load it from memory if we can, otherwise move it into
2971 a register and use a DUP. */
2972 scalar_mode inner_mode = GET_MODE_INNER (mode);
2973 rtx op = force_const_mem (inner_mode, value);
2974 if (!op)
2975 op = force_reg (inner_mode, value);
2976 else if (!aarch64_sve_ld1r_operand_p (op))
2978 rtx addr = force_reg (Pmode, XEXP (op, 0));
2979 op = replace_equiv_address (op, addr);
2981 emit_insn (gen_vec_duplicate (dest, op));
2983 else if (GET_CODE (imm) == CONST_VECTOR
2984 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
2985 aarch64_expand_sve_const_vector (dest, imm);
2986 else
2988 rtx mem = force_const_mem (mode, imm);
2989 gcc_assert (mem);
2990 emit_move_insn (dest, mem);
2993 return;
2996 aarch64_internal_mov_immediate (dest, imm, true,
2997 as_a <scalar_int_mode> (mode));
3000 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3001 that is known to contain PTRUE. */
3003 void
3004 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3006 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3007 gen_rtvec (2, pred, src),
3008 UNSPEC_MERGE_PTRUE)));
3011 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3012 operand is in memory. In this case we need to use the predicated LD1
3013 and ST1 instead of LDR and STR, both for correctness on big-endian
3014 targets and because LD1 and ST1 support a wider range of addressing modes.
3015 PRED_MODE is the mode of the predicate.
3017 See the comment at the head of aarch64-sve.md for details about the
3018 big-endian handling. */
3020 void
3021 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3023 machine_mode mode = GET_MODE (dest);
3024 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3025 if (!register_operand (src, mode)
3026 && !register_operand (dest, mode))
3028 rtx tmp = gen_reg_rtx (mode);
3029 if (MEM_P (src))
3030 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3031 else
3032 emit_move_insn (tmp, src);
3033 src = tmp;
3035 aarch64_emit_sve_pred_move (dest, ptrue, src);
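/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL. */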
3038 static bool
3039 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3040 tree exp ATTRIBUTE_UNUSED)
3042 /* Currently, always true. */
3043 return true;
3046 /* Implement TARGET_PASS_BY_REFERENCE. */
3048 static bool
3049 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3050 machine_mode mode,
3051 const_tree type,
3052 bool named ATTRIBUTE_UNUSED)
3054 HOST_WIDE_INT size;
3055 machine_mode dummymode;
3056 int nregs;
3058 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3059 if (mode == BLKmode && type)
3060 size = int_size_in_bytes (type);
3061 else
3062 /* No frontends can create types with variable-sized modes, so we
3063 shouldn't be asked to pass or return them. */
3064 size = GET_MODE_SIZE (mode).to_constant ();
3066 /* Aggregates are passed by reference based on their size. */
3067 if (type && AGGREGATE_TYPE_P (type))
3069 size = int_size_in_bytes (type);
3072 /* Variable sized arguments are always passed by reference. */
3073 if (size < 0)
3074 return true;
3076 /* Can this be a candidate to be passed in fp/simd register(s)? */
3077 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3078 &dummymode, &nregs,
3079 NULL))
3080 return false;
3082 /* Arguments which are variable sized or larger than 2 registers are
3083 passed by reference unless they are a homogeneous floating-point
3084 aggregate. */
3085 return size > 2 * UNITS_PER_WORD;
3088 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3089 static bool
3090 aarch64_return_in_msb (const_tree valtype)
3092 machine_mode dummy_mode;
3093 int dummy_int;
3095 /* Never happens in little-endian mode. */
3096 if (!BYTES_BIG_ENDIAN)
3097 return false;
3099 /* Only composite types no larger than 16 bytes can potentially
3100 be returned in registers. */
3101 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3102 || int_size_in_bytes (valtype) <= 0
3103 || int_size_in_bytes (valtype) > 16)
3104 return false;
3106 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3107 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3108 is always passed/returned in the least significant bits of fp/simd
3109 register(s). */
3110 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3111 &dummy_mode, &dummy_int, NULL))
3112 return false;
3114 return true;
3117 /* Implement TARGET_FUNCTION_VALUE.
3118 Define how to find the value returned by a function. */
3120 static rtx
3121 aarch64_function_value (const_tree type, const_tree func,
3122 bool outgoing ATTRIBUTE_UNUSED)
3124 machine_mode mode;
3125 int unsignedp;
3126 int count;
3127 machine_mode ag_mode;
3129 mode = TYPE_MODE (type);
3130 if (INTEGRAL_TYPE_P (type))
3131 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3133 if (aarch64_return_in_msb (type))
3135 HOST_WIDE_INT size = int_size_in_bytes (type);
3137 if (size % UNITS_PER_WORD != 0)
3139 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3140 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3144 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3145 &ag_mode, &count, NULL))
3147 if (!aarch64_composite_type_p (type, mode))
3149 gcc_assert (count == 1 && mode == ag_mode);
3150 return gen_rtx_REG (mode, V0_REGNUM);
3152 else
3154 int i;
3155 rtx par;
3157 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3158 for (i = 0; i < count; i++)
3160 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3161 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3162 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3163 XVECEXP (par, 0, i) = tmp;
3165 return par;
3168 else
3169 return gen_rtx_REG (mode, R0_REGNUM);
3172 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3173 Return true if REGNO is the number of a hard register in which the value
3174 of a called function may come back. */
3176 static bool
3177 aarch64_function_value_regno_p (const unsigned int regno)
3179 /* A maximum of 16 bytes can be returned in the general registers. Examples
3180 of 16-byte return values are: 128-bit integers and 16-byte small
3181 structures (excluding homogeneous floating-point aggregates). */
3182 if (regno == R0_REGNUM || regno == R1_REGNUM)
3183 return true;
3185 /* Up to four fp/simd registers can return a function value, e.g. a
3186 homogeneous floating-point aggregate having four members. */
3187 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3188 return TARGET_FLOAT;
3190 return false;
3193 /* Implement TARGET_RETURN_IN_MEMORY.
3195 If the type T of the result of a function is such that
3196 void func (T arg)
3197 would require that arg be passed as a value in a register (or set of
3198 registers) according to the parameter passing rules, then the result
3199 is returned in the same registers as would be used for such an
3200 argument. */
3202 static bool
3203 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3205 HOST_WIDE_INT size;
3206 machine_mode ag_mode;
3207 int count;
3209 if (!AGGREGATE_TYPE_P (type)
3210 && TREE_CODE (type) != COMPLEX_TYPE
3211 && TREE_CODE (type) != VECTOR_TYPE)
3212 /* Simple scalar types are always returned in registers. */
3213 return false;
3215 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3216 type,
3217 &ag_mode,
3218 &count,
3219 NULL))
3220 return false;
3222 /* Types larger than 2 registers are returned in memory. */
3223 size = int_size_in_bytes (type);
3224 return (size < 0 || size > 2 * UNITS_PER_WORD);
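/* Return true if an argument with mode MODE and type TYPE is a candidate
   for passing in SIMD/FP registers, storing the number of registers
   needed in *NREGS and recording the element mode in the cumulative
   argument state. */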
3227 static bool
3228 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3229 const_tree type, int *nregs)
3231 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3232 return aarch64_vfp_is_call_or_return_candidate (mode,
3233 type,
3234 &pcum->aapcs_vfp_rmode,
3235 nregs,
3236 NULL);
3239 /* Given MODE and TYPE of a function argument, return the alignment in
3240 bits. The idea is to suppress any stronger alignment requested by
3241 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3242 This is a helper function for local use only. */
3244 static unsigned int
3245 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3247 if (!type)
3248 return GET_MODE_ALIGNMENT (mode);
3250 if (integer_zerop (TYPE_SIZE (type)))
3251 return 0;
3253 gcc_assert (TYPE_MODE (type) == mode);
3255 if (!AGGREGATE_TYPE_P (type))
3256 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3258 if (TREE_CODE (type) == ARRAY_TYPE)
3259 return TYPE_ALIGN (TREE_TYPE (type));
3261 unsigned int alignment = 0;
3262 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3263 if (TREE_CODE (field) == FIELD_DECL)
3264 alignment = std::max (alignment, DECL_ALIGN (field));
3266 return alignment;
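/* For example, a structure with two "int" fields is given 32-bit
   alignment here even if the structure type itself carries a larger
   "aligned" attribute, since only the alignment of the fields is
   considered. */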
3269 /* Layout a function argument according to the AAPCS64 rules. The rule
3270 numbers refer to the rule numbers in the AAPCS64. */
3272 static void
3273 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3274 const_tree type,
3275 bool named ATTRIBUTE_UNUSED)
3277 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3278 int ncrn, nvrn, nregs;
3279 bool allocate_ncrn, allocate_nvrn;
3280 HOST_WIDE_INT size;
3282 /* We need to do this once per argument. */
3283 if (pcum->aapcs_arg_processed)
3284 return;
3286 pcum->aapcs_arg_processed = true;
3288 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3289 if (type)
3290 size = int_size_in_bytes (type);
3291 else
3292 /* No frontends can create types with variable-sized modes, so we
3293 shouldn't be asked to pass or return them. */
3294 size = GET_MODE_SIZE (mode).to_constant ();
3295 size = ROUND_UP (size, UNITS_PER_WORD);
3297 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3298 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3299 mode,
3300 type,
3301 &nregs);
3303 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3304 The following code thus handles passing by SIMD/FP registers first. */
3306 nvrn = pcum->aapcs_nvrn;
3308 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3309 and homogeneous short-vector aggregates (HVA). */
3310 if (allocate_nvrn)
3312 if (!TARGET_FLOAT)
3313 aarch64_err_no_fpadvsimd (mode, "argument");
3315 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3317 pcum->aapcs_nextnvrn = nvrn + nregs;
3318 if (!aarch64_composite_type_p (type, mode))
3320 gcc_assert (nregs == 1);
3321 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3323 else
3325 rtx par;
3326 int i;
3327 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3328 for (i = 0; i < nregs; i++)
3330 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3331 V0_REGNUM + nvrn + i);
3332 rtx offset = gen_int_mode
3333 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3334 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3335 XVECEXP (par, 0, i) = tmp;
3337 pcum->aapcs_reg = par;
3339 return;
3341 else
3343 /* C.3 NSRN is set to 8. */
3344 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3345 goto on_stack;
3349 ncrn = pcum->aapcs_ncrn;
3350 nregs = size / UNITS_PER_WORD;
3352 /* C6 - C9, though the sign and zero extension semantics are
3353 handled elsewhere. This is the case where the argument fits
3354 entirely in general registers. */
3355 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3358 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3360 /* C.8 if the argument has an alignment of 16 then the NGRN is
3361 rounded up to the next even number. */
3362 if (nregs == 2
3363 && ncrn % 2
3364 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3365 comparison is there because for > 16 * BITS_PER_UNIT
3366 alignment nregs should be > 2 and therefore it should be
3367 passed by reference rather than value. */
3368 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3370 ++ncrn;
3371 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3374 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3375 A reg is still generated for it, but the caller should be smart
3376 enough not to use it. */
3377 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3378 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3379 else
3381 rtx par;
3382 int i;
3384 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3385 for (i = 0; i < nregs; i++)
3387 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3388 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3389 GEN_INT (i * UNITS_PER_WORD));
3390 XVECEXP (par, 0, i) = tmp;
3392 pcum->aapcs_reg = par;
3395 pcum->aapcs_nextncrn = ncrn + nregs;
3396 return;
3399 /* C.11 */
3400 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3402 /* The argument is passed on the stack; record the needed number of words for
3403 this argument and align the total size if necessary. */
3404 on_stack:
3405 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3407 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3408 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3409 16 / UNITS_PER_WORD);
3410 return;
3413 /* Implement TARGET_FUNCTION_ARG. */
3415 static rtx
3416 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3417 const_tree type, bool named)
3419 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3420 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3422 if (mode == VOIDmode)
3423 return NULL_RTX;
3425 aarch64_layout_arg (pcum_v, mode, type, named);
3426 return pcum->aapcs_reg;
3429 void
3430 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3431 const_tree fntype ATTRIBUTE_UNUSED,
3432 rtx libname ATTRIBUTE_UNUSED,
3433 const_tree fndecl ATTRIBUTE_UNUSED,
3434 unsigned n_named ATTRIBUTE_UNUSED)
3436 pcum->aapcs_ncrn = 0;
3437 pcum->aapcs_nvrn = 0;
3438 pcum->aapcs_nextncrn = 0;
3439 pcum->aapcs_nextnvrn = 0;
3440 pcum->pcs_variant = ARM_PCS_AAPCS64;
3441 pcum->aapcs_reg = NULL_RTX;
3442 pcum->aapcs_arg_processed = false;
3443 pcum->aapcs_stack_words = 0;
3444 pcum->aapcs_stack_size = 0;
3446 if (!TARGET_FLOAT
3447 && fndecl && TREE_PUBLIC (fndecl)
3448 && fntype && fntype != error_mark_node)
3450 const_tree type = TREE_TYPE (fntype);
3451 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3452 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3453 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3454 &mode, &nregs, NULL))
3455 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3457 return;
3460 static void
3461 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3462 machine_mode mode,
3463 const_tree type,
3464 bool named)
3466 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3467 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3469 aarch64_layout_arg (pcum_v, mode, type, named);
3470 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3471 != (pcum->aapcs_stack_words != 0));
3472 pcum->aapcs_arg_processed = false;
3473 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3474 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3475 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3476 pcum->aapcs_stack_words = 0;
3477 pcum->aapcs_reg = NULL_RTX;
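/* Return true if REGNO is a register used for passing arguments, i.e.
   one of R0-R7 or V0-V7. */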
3481 bool
3482 aarch64_function_arg_regno_p (unsigned regno)
3484 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3485 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3488 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3489 PARM_BOUNDARY bits of alignment, but will be given anything up
3490 to STACK_BOUNDARY bits if the type requires it. This makes sure
3491 that both before and after the layout of each argument, the Next
3492 Stacked Argument Address (NSAA) will have a minimum alignment of
3493 8 bytes. */
3495 static unsigned int
3496 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3498 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3499 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
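/* On AArch64 PARM_BOUNDARY is 64 and STACK_BOUNDARY is 128, so every
   stacked argument receives at least 8-byte and at most 16-byte
   alignment. */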
3502 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3504 static fixed_size_mode
3505 aarch64_get_reg_raw_mode (int regno)
3507 if (TARGET_SVE && FP_REGNUM_P (regno))
3508 /* Don't use the SVE part of the register for __builtin_apply and
3509 __builtin_return. The SVE registers aren't used by the normal PCS,
3510 so using them there would be a waste of time. The PCS extensions
3511 for SVE types are fundamentally incompatible with the
3512 __builtin_return/__builtin_apply interface. */
3513 return as_a <fixed_size_mode> (V16QImode);
3514 return default_get_reg_raw_mode (regno);
3517 /* Implement TARGET_FUNCTION_ARG_PADDING.
3519 Small aggregate types are placed in the lowest memory address.
3521 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3523 static pad_direction
3524 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3526 /* On little-endian targets, the least significant byte of every stack
3527 argument is passed at the lowest byte address of the stack slot. */
3528 if (!BYTES_BIG_ENDIAN)
3529 return PAD_UPWARD;
3531 /* Otherwise, integral, floating-point and pointer types are padded downward:
3532 the least significant byte of a stack argument is passed at the highest
3533 byte address of the stack slot. */
3534 if (type
3535 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3536 || POINTER_TYPE_P (type))
3537 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3538 return PAD_DOWNWARD;
3540 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3541 return PAD_UPWARD;
3544 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3546 It specifies padding for the last (possibly the only)
3547 element of a block move between registers and memory. Assuming
3548 the block is in memory, padding upward means that the last
3549 element is padded after its most significant byte, while with
3550 downward padding the last element is padded on its least
3551 significant byte side.
3553 Small aggregates and small complex types are always padded
3554 upwards.
3556 We don't need to worry about homogeneous floating-point or
3557 short-vector aggregates; their move is not affected by the
3558 padding direction determined here. Regardless of endianness,
3559 each element of such an aggregate is put in the least
3560 significant bits of a fp/simd register.
3562 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3563 register has useful data, and return the opposite if the most
3564 significant byte does. */
3566 bool
3567 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3568 bool first ATTRIBUTE_UNUSED)
3571 /* Small composite types are always padded upward. */
3572 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3574 HOST_WIDE_INT size;
3575 if (type)
3576 size = int_size_in_bytes (type);
3577 else
3578 /* No frontends can create types with variable-sized modes, so we
3579 shouldn't be asked to pass or return them. */
3580 size = GET_MODE_SIZE (mode).to_constant ();
3581 if (size < 2 * UNITS_PER_WORD)
3582 return true;
3585 /* Otherwise, use the default padding. */
3586 return !BYTES_BIG_ENDIAN;
3589 static scalar_int_mode
3590 aarch64_libgcc_cmp_return_mode (void)
3592 return SImode;
3595 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3597 /* We use the 12-bit shifted immediate arithmetic instructions so values
3598 must be multiple of (1 << 12), i.e. 4096. */
3599 #define ARITH_FACTOR 4096
3601 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3602 #error Cannot use simple address calculation for stack probing
3603 #endif
3605 /* The pair of scratch registers used for stack probing. */
3606 #define PROBE_STACK_FIRST_REG 9
3607 #define PROBE_STACK_SECOND_REG 10
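/* That is, the caller-saved temporary registers x9 and x10. */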
3609 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3610 inclusive. These are offsets from the current stack pointer. */
3612 static void
3613 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3615 HOST_WIDE_INT size;
3616 if (!poly_size.is_constant (&size))
3618 sorry ("stack probes for SVE frames");
3619 return;
3622 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3624 /* See the same assertion on PROBE_INTERVAL above. */
3625 gcc_assert ((first % ARITH_FACTOR) == 0);
3627 /* See if we have a constant small number of probes to generate. If so,
3628 that's the easy case. */
3629 if (size <= PROBE_INTERVAL)
3631 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3633 emit_set_insn (reg1,
3634 plus_constant (Pmode,
3635 stack_pointer_rtx, -(first + base)));
3636 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3639 /* The run-time loop is made up of 8 insns in the generic case while the
3640 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3641 else if (size <= 4 * PROBE_INTERVAL)
3643 HOST_WIDE_INT i, rem;
3645 emit_set_insn (reg1,
3646 plus_constant (Pmode,
3647 stack_pointer_rtx,
3648 -(first + PROBE_INTERVAL)));
3649 emit_stack_probe (reg1);
3651 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3652 it exceeds SIZE. If only two probes are needed, this will not
3653 generate any code. Then probe at FIRST + SIZE. */
3654 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3656 emit_set_insn (reg1,
3657 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3658 emit_stack_probe (reg1);
3661 rem = size - (i - PROBE_INTERVAL);
3662 if (rem > 256)
3664 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3666 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3667 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3669 else
3670 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3673 /* Otherwise, do the same as above, but in a loop. Note that we must be
3674 extra careful with variables wrapping around because we might be at
3675 the very top (or the very bottom) of the address space and we have
3676 to be able to handle this case properly; in particular, we use an
3677 equality test for the loop condition. */
3678 else
3680 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3682 /* Step 1: round SIZE to the previous multiple of the interval. */
3684 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3687 /* Step 2: compute initial and final value of the loop counter. */
3689 /* TEST_ADDR = SP + FIRST. */
3690 emit_set_insn (reg1,
3691 plus_constant (Pmode, stack_pointer_rtx, -first));
3693 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3694 HOST_WIDE_INT adjustment = - (first + rounded_size);
3695 if (! aarch64_uimm12_shift (adjustment))
3697 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3698 true, Pmode);
3699 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3701 else
3703 emit_set_insn (reg2,
3704 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3707 /* Step 3: the loop
3711 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3712 probe at TEST_ADDR
3714 while (TEST_ADDR != LAST_ADDR)
3716 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3717 until it is equal to ROUNDED_SIZE. */
3719 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3722 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3723 that SIZE is equal to ROUNDED_SIZE. */
3725 if (size != rounded_size)
3727 HOST_WIDE_INT rem = size - rounded_size;
3729 if (rem > 256)
3731 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3733 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3734 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3736 else
3737 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3741 /* Make sure nothing is scheduled before we are done. */
3742 emit_insn (gen_blockage ());
3745 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3746 absolute addresses. */
3748 const char *
3749 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3751 static int labelno = 0;
3752 char loop_lab[32];
3753 rtx xops[2];
3755 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3757 /* Loop. */
3758 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3760 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3761 xops[0] = reg1;
3762 xops[1] = GEN_INT (PROBE_INTERVAL);
3763 output_asm_insn ("sub\t%0, %0, %1", xops);
3765 /* Probe at TEST_ADDR. */
3766 output_asm_insn ("str\txzr, [%0]", xops);
3768 /* Test if TEST_ADDR == LAST_ADDR. */
3769 xops[1] = reg2;
3770 output_asm_insn ("cmp\t%0, %1", xops);
3772 /* Branch. */
3773 fputs ("\tb.ne\t", asm_out_file);
3774 assemble_name_raw (asm_out_file, loop_lab);
3775 fputc ('\n', asm_out_file);
3777 return "";
3780 /* Mark the registers that need to be saved by the callee and calculate
3781 the size of the callee-saved registers area and frame record (both FP
3782 and LR may be omitted). */
3783 static void
3784 aarch64_layout_frame (void)
3786 HOST_WIDE_INT offset = 0;
3787 int regno, last_fp_reg = INVALID_REGNUM;
3789 if (reload_completed && cfun->machine->frame.laid_out)
3790 return;
3792 /* Force a frame chain for EH returns so the return address is at FP+8. */
3793 cfun->machine->frame.emit_frame_chain
3794 = frame_pointer_needed || crtl->calls_eh_return;
3796 /* Emit a frame chain if the frame pointer is enabled.
3797 If -momit-leaf-frame-pointer is used, do not use a frame chain
3798 in leaf functions which do not use LR. */
3799 if (flag_omit_frame_pointer == 2
3800 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3801 && !df_regs_ever_live_p (LR_REGNUM)))
3802 cfun->machine->frame.emit_frame_chain = true;
3804 #define SLOT_NOT_REQUIRED (-2)
3805 #define SLOT_REQUIRED (-1)
3807 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3808 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3810 /* First mark all the registers that really need to be saved... */
3811 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3812 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3814 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3815 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3817 /* ... that includes the eh data registers (if needed)... */
3818 if (crtl->calls_eh_return)
3819 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3820 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3821 = SLOT_REQUIRED;
3823 /* ... and any callee saved register that dataflow says is live. */
3824 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3825 if (df_regs_ever_live_p (regno)
3826 && (regno == R30_REGNUM
3827 || !call_used_regs[regno]))
3828 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3830 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3831 if (df_regs_ever_live_p (regno)
3832 && !call_used_regs[regno])
3834 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3835 last_fp_reg = regno;
3838 if (cfun->machine->frame.emit_frame_chain)
3840 /* FP and LR are placed in the linkage record. */
3841 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
3842 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
3843 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
3844 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
3845 offset = 2 * UNITS_PER_WORD;
3848 /* Now assign stack slots for them. */
3849 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3850 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
3852 cfun->machine->frame.reg_offset[regno] = offset;
3853 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
3854 cfun->machine->frame.wb_candidate1 = regno;
3855 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
3856 cfun->machine->frame.wb_candidate2 = regno;
3857 offset += UNITS_PER_WORD;
3860 HOST_WIDE_INT max_int_offset = offset;
3861 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
3862 bool has_align_gap = offset != max_int_offset;
3864 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3865 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
3867 /* If there is an alignment gap between integer and fp callee-saves,
3868 allocate the last fp register to it if possible. */
3869 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
3871 cfun->machine->frame.reg_offset[regno] = max_int_offset;
3872 break;
3875 cfun->machine->frame.reg_offset[regno] = offset;
3876 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
3877 cfun->machine->frame.wb_candidate1 = regno;
3878 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
3879 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
3880 cfun->machine->frame.wb_candidate2 = regno;
3881 offset += UNITS_PER_WORD;
3884 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
3886 cfun->machine->frame.saved_regs_size = offset;
3888 HOST_WIDE_INT varargs_and_saved_regs_size
3889 = offset + cfun->machine->frame.saved_varargs_size;
3891 cfun->machine->frame.hard_fp_offset
3892 = aligned_upper_bound (varargs_and_saved_regs_size
3893 + get_frame_size (),
3894 STACK_BOUNDARY / BITS_PER_UNIT);
3896 /* Both these values are already aligned. */
3897 gcc_assert (multiple_p (crtl->outgoing_args_size,
3898 STACK_BOUNDARY / BITS_PER_UNIT));
3899 cfun->machine->frame.frame_size
3900 = (cfun->machine->frame.hard_fp_offset
3901 + crtl->outgoing_args_size);
3903 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
3905 cfun->machine->frame.initial_adjust = 0;
3906 cfun->machine->frame.final_adjust = 0;
3907 cfun->machine->frame.callee_adjust = 0;
3908 cfun->machine->frame.callee_offset = 0;
3910 HOST_WIDE_INT max_push_offset = 0;
3911 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
3912 max_push_offset = 512;
3913 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
3914 max_push_offset = 256;
3916 HOST_WIDE_INT const_size, const_fp_offset;
3917 if (cfun->machine->frame.frame_size.is_constant (&const_size)
3918 && const_size < max_push_offset
3919 && known_eq (crtl->outgoing_args_size, 0))
3921 /* Simple, small frame with no outgoing arguments:
3922 stp reg1, reg2, [sp, -frame_size]!
3923 stp reg3, reg4, [sp, 16] */
3924 cfun->machine->frame.callee_adjust = const_size;
3926 else if (known_lt (crtl->outgoing_args_size
3927 + cfun->machine->frame.saved_regs_size, 512)
3928 && !(cfun->calls_alloca
3929 && known_lt (cfun->machine->frame.hard_fp_offset,
3930 max_push_offset)))
3932 /* Frame with small outgoing arguments:
3933 sub sp, sp, frame_size
3934 stp reg1, reg2, [sp, outgoing_args_size]
3935 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3936 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3937 cfun->machine->frame.callee_offset
3938 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3940 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
3941 && const_fp_offset < max_push_offset)
3943 /* Frame with large outgoing arguments but a small local area:
3944 stp reg1, reg2, [sp, -hard_fp_offset]!
3945 stp reg3, reg4, [sp, 16]
3946 sub sp, sp, outgoing_args_size */
3947 cfun->machine->frame.callee_adjust = const_fp_offset;
3948 cfun->machine->frame.final_adjust
3949 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3951 else
3953 /* Frame with large local area and outgoing arguments using frame pointer:
3954 sub sp, sp, hard_fp_offset
3955 stp x29, x30, [sp, 0]
3956 add x29, sp, 0
3957 stp reg3, reg4, [sp, 16]
3958 sub sp, sp, outgoing_args_size */
3959 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3960 cfun->machine->frame.final_adjust
3961 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3964 cfun->machine->frame.laid_out = true;
3967 /* Return true if the register REGNO is saved on entry to
3968 the current function. */
3970 static bool
3971 aarch64_register_saved_on_entry (int regno)
3973 return cfun->machine->frame.reg_offset[regno] >= 0;
3976 /* Return the next register, from REGNO up to LIMIT, that the callee
3977    needs to save.  */
3979 static unsigned
3980 aarch64_next_callee_save (unsigned regno, unsigned limit)
3982 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3983 regno ++;
3984 return regno;
3987 /* Push the register number REGNO of mode MODE to the stack with write-back
3988 adjusting the stack by ADJUSTMENT. */
3990 static void
3991 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3992 HOST_WIDE_INT adjustment)
3994 rtx base_rtx = stack_pointer_rtx;
3995 rtx insn, reg, mem;
3997 reg = gen_rtx_REG (mode, regno);
3998 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3999 plus_constant (Pmode, base_rtx, -adjustment));
4000 mem = gen_frame_mem (mode, mem);
4002 insn = emit_move_insn (mem, reg);
4003 RTX_FRAME_RELATED_P (insn) = 1;
4006 /* Generate and return an instruction to store the pair of registers
4007 REG and REG2 of mode MODE to location BASE with write-back adjusting
4008 the stack location BASE by ADJUSTMENT. */
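/* For example, with MODE == DImode, BASE == the stack pointer and
   ADJUSTMENT == 32, this corresponds to an instruction such as
   "stp x19, x20, [sp, -32]!".  */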
4010 static rtx
4011 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4012 HOST_WIDE_INT adjustment)
4014 switch (mode)
4016 case E_DImode:
4017 return gen_storewb_pairdi_di (base, base, reg, reg2,
4018 GEN_INT (-adjustment),
4019 GEN_INT (UNITS_PER_WORD - adjustment));
4020 case E_DFmode:
4021 return gen_storewb_pairdf_di (base, base, reg, reg2,
4022 GEN_INT (-adjustment),
4023 GEN_INT (UNITS_PER_WORD - adjustment));
4024 default:
4025 gcc_unreachable ();
4029 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4030 stack pointer by ADJUSTMENT. */
4032 static void
4033 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4035 rtx_insn *insn;
4036 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4038 if (regno2 == INVALID_REGNUM)
4039 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4041 rtx reg1 = gen_rtx_REG (mode, regno1);
4042 rtx reg2 = gen_rtx_REG (mode, regno2);
4044 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4045 reg2, adjustment));
4046 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4047 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4048 RTX_FRAME_RELATED_P (insn) = 1;
4051 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4052 adjusting it by ADJUSTMENT afterwards. */
4054 static rtx
4055 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4056 HOST_WIDE_INT adjustment)
4058 switch (mode)
4060 case E_DImode:
4061 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4062 GEN_INT (UNITS_PER_WORD));
4063 case E_DFmode:
4064 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4065 GEN_INT (UNITS_PER_WORD));
4066 default:
4067 gcc_unreachable ();
4071 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4072 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4073 into CFI_OPS. */
4075 static void
4076 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4077 rtx *cfi_ops)
4079 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4080 rtx reg1 = gen_rtx_REG (mode, regno1);
4082 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4084 if (regno2 == INVALID_REGNUM)
4086 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4087 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4088 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4090 else
4092 rtx reg2 = gen_rtx_REG (mode, regno2);
4093 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4094 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4095 reg2, adjustment));
4099 /* Generate and return a store pair instruction of mode MODE to store
4100 register REG1 to MEM1 and register REG2 to MEM2. */
4102 static rtx
4103 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4104 rtx reg2)
4106 switch (mode)
4108 case E_DImode:
4109 return gen_store_pairdi (mem1, reg1, mem2, reg2);
4111 case E_DFmode:
4112 return gen_store_pairdf (mem1, reg1, mem2, reg2);
4114 default:
4115 gcc_unreachable ();
4119 /* Generate and return a load pair instruction of mode MODE to load register
4120 REG1 from MEM1 and register REG2 from MEM2. */
4122 static rtx
4123 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4124 rtx mem2)
4126 switch (mode)
4128 case E_DImode:
4129 return gen_load_pairdi (reg1, mem1, reg2, mem2);
4131 case E_DFmode:
4132 return gen_load_pairdf (reg1, mem1, reg2, mem2);
4134 default:
4135 gcc_unreachable ();
4139 /* Return TRUE if return address signing should be enabled for the current
4140 function, otherwise return FALSE. */
4142 bool
4143 aarch64_return_address_signing_enabled (void)
4145 /* This function should only be called after the frame is laid out.  */
4146 gcc_assert (cfun->machine->frame.laid_out);
4148 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4149    if its LR is pushed onto the stack.  */
4150 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4151 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4152 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4155 /* Emit code to save the callee-saved registers of mode MODE from register
4156    number START up to and including LIMIT to the stack, starting at offset
4157    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.  */
4159 static void
4160 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4161 unsigned start, unsigned limit, bool skip_wb)
4163 rtx_insn *insn;
4164 unsigned regno;
4165 unsigned regno2;
4167 for (regno = aarch64_next_callee_save (start, limit);
4168 regno <= limit;
4169 regno = aarch64_next_callee_save (regno + 1, limit))
4171 rtx reg, mem;
4172 poly_int64 offset;
4174 if (skip_wb
4175 && (regno == cfun->machine->frame.wb_candidate1
4176 || regno == cfun->machine->frame.wb_candidate2))
4177 continue;
4179 if (cfun->machine->reg_is_wrapped_separately[regno])
4180 continue;
4182 reg = gen_rtx_REG (mode, regno);
4183 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4184 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4185 offset));
4187 regno2 = aarch64_next_callee_save (regno + 1, limit);
4189 if (regno2 <= limit
4190 && !cfun->machine->reg_is_wrapped_separately[regno2]
4191 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4192 == cfun->machine->frame.reg_offset[regno2]))
4195 rtx reg2 = gen_rtx_REG (mode, regno2);
4196 rtx mem2;
4198 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4199 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4200 offset));
4201 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4202 reg2));
4204 /* The first part of a frame-related parallel insn is
4205 always assumed to be relevant to the frame
4206 calculations; subsequent parts, are only
4207 frame-related if explicitly marked. */
4208 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4209 regno = regno2;
4211 else
4212 insn = emit_move_insn (mem, reg);
4214 RTX_FRAME_RELATED_P (insn) = 1;
4218 /* Emit code to restore the callee-saved registers of mode MODE from register
4219 number START up to and including LIMIT. Restore from the stack offset
4220 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4221 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4223 static void
4224 aarch64_restore_callee_saves (machine_mode mode,
4225 poly_int64 start_offset, unsigned start,
4226 unsigned limit, bool skip_wb, rtx *cfi_ops)
4228 rtx base_rtx = stack_pointer_rtx;
4229 unsigned regno;
4230 unsigned regno2;
4231 poly_int64 offset;
4233 for (regno = aarch64_next_callee_save (start, limit);
4234 regno <= limit;
4235 regno = aarch64_next_callee_save (regno + 1, limit))
4237 if (cfun->machine->reg_is_wrapped_separately[regno])
4238 continue;
4240 rtx reg, mem;
4242 if (skip_wb
4243 && (regno == cfun->machine->frame.wb_candidate1
4244 || regno == cfun->machine->frame.wb_candidate2))
4245 continue;
4247 reg = gen_rtx_REG (mode, regno);
4248 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4249 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4251 regno2 = aarch64_next_callee_save (regno + 1, limit);
4253 if (regno2 <= limit
4254 && !cfun->machine->reg_is_wrapped_separately[regno2]
4255 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4256 == cfun->machine->frame.reg_offset[regno2]))
4258 rtx reg2 = gen_rtx_REG (mode, regno2);
4259 rtx mem2;
4261 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4262 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4263 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4265 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4266 regno = regno2;
4268 else
4269 emit_move_insn (reg, mem);
4270 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4274 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4275 of MODE. */
4277 static inline bool
4278 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4280 HOST_WIDE_INT multiple;
4281 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4282 && IN_RANGE (multiple, -8, 7));
4285 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4286 of MODE. */
4288 static inline bool
4289 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4291 HOST_WIDE_INT multiple;
4292 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4293 && IN_RANGE (multiple, 0, 63));
4296 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4297 of MODE. */
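/* For example, for DImode this accepts byte offsets that are multiples of 8
   in the range [-512, 504], matching the LDP/STP immediate range.  */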
4299 bool
4300 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4302 HOST_WIDE_INT multiple;
4303 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4304 && IN_RANGE (multiple, -64, 63));
4307 /* Return true if OFFSET is a signed 9-bit value. */
4309 static inline bool
4310 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4311 poly_int64 offset)
4313 HOST_WIDE_INT const_offset;
4314 return (offset.is_constant (&const_offset)
4315 && IN_RANGE (const_offset, -256, 255));
4318 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4319 of MODE. */
4321 static inline bool
4322 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4324 HOST_WIDE_INT multiple;
4325 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4326 && IN_RANGE (multiple, -256, 255));
4329 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4330 of MODE. */
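/* For example, for DImode this accepts byte offsets that are multiples of 8
   in the range [0, 32760], matching the LDR/STR unsigned immediate range.  */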
4332 static inline bool
4333 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4335 HOST_WIDE_INT multiple;
4336 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4337 && IN_RANGE (multiple, 0, 4095));
4340 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4342 static sbitmap
4343 aarch64_get_separate_components (void)
4345 aarch64_layout_frame ();
4347 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4348 bitmap_clear (components);
4350 /* The registers we need saved to the frame. */
4351 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4352 if (aarch64_register_saved_on_entry (regno))
4354 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4355 if (!frame_pointer_needed)
4356 offset += cfun->machine->frame.frame_size
4357 - cfun->machine->frame.hard_fp_offset;
4358 /* Check that we can access the stack slot of the register with one
4359 direct load with no adjustments needed. */
4360 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4361 bitmap_set_bit (components, regno);
4364 /* Don't mess with the hard frame pointer. */
4365 if (frame_pointer_needed)
4366 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4368 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4369 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4370 /* If aarch64_layout_frame has chosen registers to store/restore with
4371 writeback don't interfere with them to avoid having to output explicit
4372 stack adjustment instructions. */
4373 if (reg2 != INVALID_REGNUM)
4374 bitmap_clear_bit (components, reg2);
4375 if (reg1 != INVALID_REGNUM)
4376 bitmap_clear_bit (components, reg1);
4378 bitmap_clear_bit (components, LR_REGNUM);
4379 bitmap_clear_bit (components, SP_REGNUM);
4381 return components;
4384 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4386 static sbitmap
4387 aarch64_components_for_bb (basic_block bb)
4389 bitmap in = DF_LIVE_IN (bb);
4390 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4391 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4393 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4394 bitmap_clear (components);
4396 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4397 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4398 if ((!call_used_regs[regno])
4399 && (bitmap_bit_p (in, regno)
4400 || bitmap_bit_p (gen, regno)
4401 || bitmap_bit_p (kill, regno)))
4402 bitmap_set_bit (components, regno);
4404 return components;
4407 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4408 Nothing to do for aarch64. */
4410 static void
4411 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4415 /* Return the next set bit in BMP from START onwards. Return the total number
4416 of bits in BMP if no set bit is found at or after START. */
4418 static unsigned int
4419 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4421 unsigned int nbits = SBITMAP_SIZE (bmp);
4422 if (start == nbits)
4423 return start;
4425 gcc_assert (start < nbits);
4426 for (unsigned int i = start; i < nbits; i++)
4427 if (bitmap_bit_p (bmp, i))
4428 return i;
4430 return nbits;
4433 /* Do the work for aarch64_emit_prologue_components and
4434 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4435 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4436 for these components or the epilogue sequence. That is, it determines
4437 whether we should emit stores or loads and what kind of CFA notes to attach
4438 to the insns. Otherwise the logic for the two sequences is very
4439 similar. */
4441 static void
4442 aarch64_process_components (sbitmap components, bool prologue_p)
4444 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4445 ? HARD_FRAME_POINTER_REGNUM
4446 : STACK_POINTER_REGNUM);
4448 unsigned last_regno = SBITMAP_SIZE (components);
4449 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4450 rtx_insn *insn = NULL;
4452 while (regno != last_regno)
4454 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4455 so DFmode for the vector registers is enough. */
4456 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4457 rtx reg = gen_rtx_REG (mode, regno);
4458 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4459 if (!frame_pointer_needed)
4460 offset += cfun->machine->frame.frame_size
4461 - cfun->machine->frame.hard_fp_offset;
4462 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4463 rtx mem = gen_frame_mem (mode, addr);
4465 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4466 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4467 /* No more registers to handle after REGNO.
4468 Emit a single save/restore and exit. */
4469 if (regno2 == last_regno)
4471 insn = emit_insn (set);
4472 RTX_FRAME_RELATED_P (insn) = 1;
4473 if (prologue_p)
4474 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4475 else
4476 add_reg_note (insn, REG_CFA_RESTORE, reg);
4477 break;
4480 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4481 /* The next register is not of the same class or its offset is not
4482 mergeable with the current one into a pair. */
4483 if (!satisfies_constraint_Ump (mem)
4484 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4485 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4486 GET_MODE_SIZE (mode)))
4488 insn = emit_insn (set);
4489 RTX_FRAME_RELATED_P (insn) = 1;
4490 if (prologue_p)
4491 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4492 else
4493 add_reg_note (insn, REG_CFA_RESTORE, reg);
4495 regno = regno2;
4496 continue;
4499 /* REGNO2 can be saved/restored in a pair with REGNO. */
4500 rtx reg2 = gen_rtx_REG (mode, regno2);
4501 if (!frame_pointer_needed)
4502 offset2 += cfun->machine->frame.frame_size
4503 - cfun->machine->frame.hard_fp_offset;
4504 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4505 rtx mem2 = gen_frame_mem (mode, addr2);
4506 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4507 : gen_rtx_SET (reg2, mem2);
4509 if (prologue_p)
4510 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4511 else
4512 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4514 RTX_FRAME_RELATED_P (insn) = 1;
4515 if (prologue_p)
4517 add_reg_note (insn, REG_CFA_OFFSET, set);
4518 add_reg_note (insn, REG_CFA_OFFSET, set2);
4520 else
4522 add_reg_note (insn, REG_CFA_RESTORE, reg);
4523 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4526 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4530 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4532 static void
4533 aarch64_emit_prologue_components (sbitmap components)
4535 aarch64_process_components (components, true);
4538 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4540 static void
4541 aarch64_emit_epilogue_components (sbitmap components)
4543 aarch64_process_components (components, false);
4546 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4548 static void
4549 aarch64_set_handled_components (sbitmap components)
4551 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4552 if (bitmap_bit_p (components, regno))
4553 cfun->machine->reg_is_wrapped_separately[regno] = true;
4556 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4557 is saved at BASE + OFFSET. */
4559 static void
4560 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4561 rtx base, poly_int64 offset)
4563 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4564 add_reg_note (insn, REG_CFA_EXPRESSION,
4565 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4568 /* AArch64 stack frames generated by this compiler look like:
4570 +-------------------------------+
4572 | incoming stack arguments |
4574 +-------------------------------+
4575 | | <-- incoming stack pointer (aligned)
4576 | callee-allocated save area |
4577 | for register varargs |
4579 +-------------------------------+
4580 | local variables | <-- frame_pointer_rtx
4582 +-------------------------------+
4583 | padding0 | \
4584 +-------------------------------+ |
4585 | callee-saved registers | | frame.saved_regs_size
4586 +-------------------------------+ |
4587 | LR' | |
4588 +-------------------------------+ |
4589 | FP' | / <- hard_frame_pointer_rtx (aligned)
4590 +-------------------------------+
4591 | dynamic allocation |
4592 +-------------------------------+
4593 | padding |
4594 +-------------------------------+
4595 | outgoing stack arguments | <-- arg_pointer
4597 +-------------------------------+
4598 | | <-- stack_pointer_rtx (aligned)
4600 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4601 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4602 unchanged. */
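/* aarch64_layout_frame records how the prologue reaches this layout in
   cfun->machine->frame: initial_adjust is an explicit first decrement of the
   stack pointer, callee_adjust is a decrement folded into the first
   callee-save push with writeback, callee_offset is the offset of the
   callee-save area above the stack pointer at the point the saves are
   emitted, and final_adjust allocates the outgoing argument area.  */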
4604 /* Generate the prologue instructions for entry into a function.
4605 Establish the stack frame by decreasing the stack pointer with a
4606 properly calculated size and, if necessary, create a frame record
4607 filled with the values of LR and previous frame pointer. The
4608 current FP is also set up if it is in use. */
4610 void
4611 aarch64_expand_prologue (void)
4613 aarch64_layout_frame ();
4615 poly_int64 frame_size = cfun->machine->frame.frame_size;
4616 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4617 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4618 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4619 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4620 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4621 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4622 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4623 rtx_insn *insn;
4625 /* Sign return address for functions. */
4626 if (aarch64_return_address_signing_enabled ())
4628 insn = emit_insn (gen_pacisp ());
4629 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4630 RTX_FRAME_RELATED_P (insn) = 1;
4633 if (flag_stack_usage_info)
4634 current_function_static_stack_size = constant_lower_bound (frame_size);
4636 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4638 if (crtl->is_leaf && !cfun->calls_alloca)
4640 if (maybe_gt (frame_size, PROBE_INTERVAL)
4641 && maybe_gt (frame_size, get_stack_check_protect ()))
4642 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4643 (frame_size
4644 - get_stack_check_protect ()));
4646 else if (maybe_gt (frame_size, 0))
4647 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4650 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4651 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4653 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4655 if (callee_adjust != 0)
4656 aarch64_push_regs (reg1, reg2, callee_adjust);
4658 if (emit_frame_chain)
4660 poly_int64 reg_offset = callee_adjust;
4661 if (callee_adjust == 0)
4663 reg1 = R29_REGNUM;
4664 reg2 = R30_REGNUM;
4665 reg_offset = callee_offset;
4666 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4668 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4669 stack_pointer_rtx, callee_offset,
4670 ip1_rtx, ip0_rtx, frame_pointer_needed);
4671 if (frame_pointer_needed && !frame_size.is_constant ())
4673 /* Variable-sized frames need to describe the save slot
4674 address using DW_CFA_expression rather than DW_CFA_offset.
4675 This means that, without taking further action, the
4676 locations of the registers that we've already saved would
4677 remain based on the stack pointer even after we redefine
4678 the CFA based on the frame pointer. We therefore need new
4679 DW_CFA_expressions to re-express the save slots with addresses
4680 based on the frame pointer. */
4681 rtx_insn *insn = get_last_insn ();
4682 gcc_assert (RTX_FRAME_RELATED_P (insn));
4684 /* Add an explicit CFA definition if this was previously
4685 implicit. */
4686 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4688 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4689 callee_offset);
4690 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4691 gen_rtx_SET (hard_frame_pointer_rtx, src));
4694 /* Change the save slot expressions for the registers that
4695 we've already saved. */
4696 reg_offset -= callee_offset;
4697 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4698 reg_offset + UNITS_PER_WORD);
4699 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4700 reg_offset);
4702 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4705 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4706 callee_adjust != 0 || emit_frame_chain);
4707 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4708 callee_adjust != 0 || emit_frame_chain);
4709 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4712 /* Return TRUE if we can use a simple_return insn.
4714    This function checks whether the callee-saved stack is empty, which
4715    means that no restore actions are needed.  The pro_and_epilogue pass
4716    uses this to check whether the shrink-wrapping optimization is feasible.  */
4718 bool
4719 aarch64_use_return_insn_p (void)
4721 if (!reload_completed)
4722 return false;
4724 if (crtl->profile)
4725 return false;
4727 aarch64_layout_frame ();
4729 return known_eq (cfun->machine->frame.frame_size, 0);
4732 /* Generate the epilogue instructions for returning from a function.
4733    This is almost exactly the reverse of the prologue sequence, except
4734 that we need to insert barriers to avoid scheduling loads that read
4735 from a deallocated stack, and we optimize the unwind records by
4736 emitting them all together if possible. */
4737 void
4738 aarch64_expand_epilogue (bool for_sibcall)
4740 aarch64_layout_frame ();
4742 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4743 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4744 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4745 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4746 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4747 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4748 rtx cfi_ops = NULL;
4749 rtx_insn *insn;
4750 /* A stack clash protection prologue may not have left IP0_REGNUM or
4751 IP1_REGNUM in a usable state. The same is true for allocations
4752 with an SVE component, since we then need both temporary registers
4753 for each allocation. */
4754 bool can_inherit_p = (initial_adjust.is_constant ()
4755 && final_adjust.is_constant ()
4756 && !flag_stack_clash_protection);
4758 /* We need to add memory barrier to prevent read from deallocated stack. */
4759 bool need_barrier_p
4760 = maybe_ne (get_frame_size ()
4761 + cfun->machine->frame.saved_varargs_size, 0);
4763 /* Emit a barrier to prevent loads from a deallocated stack. */
4764 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4765 || cfun->calls_alloca
4766 || crtl->calls_eh_return)
4768 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4769 need_barrier_p = false;
4772 /* Restore the stack pointer from the frame pointer if it may not
4773 be the same as the stack pointer. */
4774 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4775 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4776 if (frame_pointer_needed
4777 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4778 /* If writeback is used when restoring callee-saves, the CFA
4779 is restored on the instruction doing the writeback. */
4780 aarch64_add_offset (Pmode, stack_pointer_rtx,
4781 hard_frame_pointer_rtx, -callee_offset,
4782 ip1_rtx, ip0_rtx, callee_adjust == 0);
4783 else
4784 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4785 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4787 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4788 callee_adjust != 0, &cfi_ops);
4789 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4790 callee_adjust != 0, &cfi_ops);
4792 if (need_barrier_p)
4793 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4795 if (callee_adjust != 0)
4796 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4798 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4800 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4801 insn = get_last_insn ();
4802 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4803 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4804 RTX_FRAME_RELATED_P (insn) = 1;
4805 cfi_ops = NULL;
4808 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4809 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4811 if (cfi_ops)
4813 /* Emit delayed restores and reset the CFA to be SP. */
4814 insn = get_last_insn ();
4815 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
4816 REG_NOTES (insn) = cfi_ops;
4817 RTX_FRAME_RELATED_P (insn) = 1;
4820 /* We prefer to emit the combined return/authenticate instruction RETAA;
4821    however, there are three cases in which we must instead emit an explicit
4822 authentication instruction.
4824 1) Sibcalls don't return in a normal way, so if we're about to call one
4825 we must authenticate.
4827 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
4828 generating code for !TARGET_ARMV8_3 we can't use it and must
4829 explicitly authenticate.
4831 3) On an eh_return path we make extra stack adjustments to update the
4832 canonical frame address to be the exception handler's CFA. We want
4833      to authenticate using the CFA of the function which calls eh_return.  */
4835 if (aarch64_return_address_signing_enabled ()
4836 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
4838 insn = emit_insn (gen_autisp ());
4839 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4840 RTX_FRAME_RELATED_P (insn) = 1;
4843 /* Stack adjustment for exception handler. */
4844 if (crtl->calls_eh_return)
4846 /* We need to unwind the stack by the offset computed by
4847 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
4848 to be SP; letting the CFA move during this adjustment
4849 is just as correct as retaining the CFA from the body
4850 of the function. Therefore, do nothing special. */
4851 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
4854 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
4855 if (!for_sibcall)
4856 emit_jump_insn (ret_rtx);
4859 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
4860 normally or return to a previous frame after unwinding.
4862 An EH return uses a single shared return sequence. The epilogue is
4863 exactly like a normal epilogue except that it has an extra input
4864 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
4865 that must be applied after the frame has been destroyed. An extra label
4866 is inserted before the epilogue which initializes this register to zero,
4867 and this is the entry point for a normal return.
4869 An actual EH return updates the return address, initializes the stack
4870 adjustment and jumps directly into the epilogue (bypassing the zeroing
4871 of the adjustment). Since the return address is typically saved on the
4872 stack when a function makes a call, the saved LR must be updated outside
4873 the epilogue.
4875 This poses problems as the store is generated well before the epilogue,
4876 so the offset of LR is not known yet. Also optimizations will remove the
4877 store as it appears dead, even after the epilogue is generated (as the
4878 base or offset for loading LR is different in many cases).
4880 To avoid these problems this implementation forces the frame pointer
4881 in eh_return functions so that the location of LR is fixed and known early.
4882 It also marks the store volatile, so no optimization is permitted to
4883 remove the store. */
4885 aarch64_eh_return_handler_rtx (void)
4887 rtx tmp = gen_frame_mem (Pmode,
4888 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
4890 /* Mark the store volatile, so no optimization is permitted to remove it. */
4891 MEM_VOLATILE_P (tmp) = true;
4892 return tmp;
4895 /* Output code to add DELTA to the first argument, and then jump
4896 to FUNCTION. Used for C++ multiple inheritance. */
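/* If VCALL_OFFSET is nonzero, the thunk also loads *(*this + VCALL_OFFSET)
   (where *this is read after DELTA has been added) and adds that value to
   the this pointer before tail-calling FUNCTION.  */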
4897 static void
4898 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
4899 HOST_WIDE_INT delta,
4900 HOST_WIDE_INT vcall_offset,
4901 tree function)
4903 /* The this pointer is always in x0. Note that this differs from
4904    Arm, where the this pointer may be bumped to r1 if r0 is required
4905 to return a pointer to an aggregate. On AArch64 a result value
4906 pointer will be in x8. */
4907 int this_regno = R0_REGNUM;
4908 rtx this_rtx, temp0, temp1, addr, funexp;
4909 rtx_insn *insn;
4911 reload_completed = 1;
4912 emit_note (NOTE_INSN_PROLOGUE_END);
4914 this_rtx = gen_rtx_REG (Pmode, this_regno);
4915 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
4916 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
4918 if (vcall_offset == 0)
4919 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
4920 else
4922 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
4924 addr = this_rtx;
4925 if (delta != 0)
4927 if (delta >= -256 && delta < 256)
4928 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
4929 plus_constant (Pmode, this_rtx, delta));
4930 else
4931 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
4932 temp1, temp0, false);
4935 if (Pmode == ptr_mode)
4936 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
4937 else
4938 aarch64_emit_move (temp0,
4939 gen_rtx_ZERO_EXTEND (Pmode,
4940 gen_rtx_MEM (ptr_mode, addr)));
4942 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
4943 addr = plus_constant (Pmode, temp0, vcall_offset);
4944 else
4946 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
4947 Pmode);
4948 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
4951 if (Pmode == ptr_mode)
4952 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
4953 else
4954 aarch64_emit_move (temp1,
4955 gen_rtx_SIGN_EXTEND (Pmode,
4956 gen_rtx_MEM (ptr_mode, addr)));
4958 emit_insn (gen_add2_insn (this_rtx, temp1));
4961 /* Generate a tail call to the target function. */
4962 if (!TREE_USED (function))
4964 assemble_external (function);
4965 TREE_USED (function) = 1;
4967 funexp = XEXP (DECL_RTL (function), 0);
4968 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
4969 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
4970 SIBLING_CALL_P (insn) = 1;
4972 insn = get_insns ();
4973 shorten_branches (insn);
4974 final_start_function (insn, file, 1);
4975 final (insn, file, 1);
4976 final_end_function ();
4978 /* Stop pretending to be a post-reload pass. */
4979 reload_completed = 0;
4982 static bool
4983 aarch64_tls_referenced_p (rtx x)
4985 if (!TARGET_HAVE_TLS)
4986 return false;
4987 subrtx_iterator::array_type array;
4988 FOR_EACH_SUBRTX (iter, array, x, ALL)
4990 const_rtx x = *iter;
4991 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
4992 return true;
4993 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
4994 TLS offsets, not real symbol references. */
4995 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4996 iter.skip_subrtxes ();
4998 return false;
5002 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5003 a left shift of 0 or 12 bits. */
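/* For example, 0xabc and 0xabc000 satisfy this test, while 0xabc00 (which
   would need a shift of 8) does not.  */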
5004 bool
5005 aarch64_uimm12_shift (HOST_WIDE_INT val)
5007 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5008 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5013 /* Return true if val is an immediate that can be loaded into a
5014 register by a MOVZ instruction. */
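/* For example, 0xf000 and 0x12340000 can be loaded with a single MOVZ (with
   shifts of 0 and 16 respectively), whereas 0x12345678 cannot.  */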
5015 static bool
5016 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5018 if (GET_MODE_SIZE (mode) > 4)
5020 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5021 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5022 return 1;
5024 else
5026 /* Ignore sign extension. */
5027 val &= (HOST_WIDE_INT) 0xffffffff;
5029 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5030 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5033 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5034 64-bit (DImode) integer. */
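/* For example, replicating the HImode value 0x00ff gives
   0x00ff00ff00ff00ff.  */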
5036 static unsigned HOST_WIDE_INT
5037 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5039 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5040 while (size < 64)
5042 val &= (HOST_WIDE_INT_1U << size) - 1;
5043 val |= val << size;
5044 size *= 2;
5046 return val;
5049 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5051 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5053 0x0000000100000001ull,
5054 0x0001000100010001ull,
5055 0x0101010101010101ull,
5056 0x1111111111111111ull,
5057 0x5555555555555555ull,
5061 /* Return true if val is a valid bitmask immediate. */
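/* A valid bitmask immediate is a run of contiguous ones, possibly rotated,
   repeated across the register in 2-, 4-, 8-, 16-, 32- or 64-bit elements;
   all-zeros and all-ones are rejected.  For example, 0x0ff00ff00ff00ff0 is
   valid, while 0x0000000000012345 is not.  */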
5063 bool
5064 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5066 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5067 int bits;
5069 /* Check for a single sequence of one bits and return quickly if so.
5070 The special cases of all ones and all zeroes returns false. */
5071 val = aarch64_replicate_bitmask_imm (val_in, mode);
5072 tmp = val + (val & -val);
5074 if (tmp == (tmp & -tmp))
5075 return (val + 1) > 1;
5077 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5078 if (mode == SImode)
5079 val = (val << 32) | (val & 0xffffffff);
5081 /* Invert if the immediate doesn't start with a zero bit - this means we
5082 only need to search for sequences of one bits. */
5083 if (val & 1)
5084 val = ~val;
5086 /* Find the first set bit and set tmp to val with the first sequence of one
5087 bits removed. Return success if there is a single sequence of ones. */
5088 first_one = val & -val;
5089 tmp = val & (val + first_one);
5091 if (tmp == 0)
5092 return true;
5094 /* Find the next set bit and compute the difference in bit position. */
5095 next_one = tmp & -tmp;
5096 bits = clz_hwi (first_one) - clz_hwi (next_one);
5097 mask = val ^ tmp;
5099 /* Check the bit position difference is a power of 2, and that the first
5100 sequence of one bits fits within 'bits' bits. */
5101 if ((mask >> bits) != 0 || bits != (bits & -bits))
5102 return false;
5104 /* Check the sequence of one bits is repeated 64/bits times. */
5105 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5108 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5109    Assumed precondition: VAL_IN is not zero.  */
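/* For example, for VAL_IN == 0x00ff0f00 the result is 0x00ffff00.  */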
5111 unsigned HOST_WIDE_INT
5112 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5114 int lowest_bit_set = ctz_hwi (val_in);
5115 int highest_bit_set = floor_log2 (val_in);
5116 gcc_assert (val_in != 0);
5118 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5119 (HOST_WIDE_INT_1U << lowest_bit_set));
5122 /* Create a constant in which all bits outside the range from the lowest set
5123    bit to the highest set bit of VAL_IN are set to 1.  */
5125 unsigned HOST_WIDE_INT
5126 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5128 return val_in | ~aarch64_and_split_imm1 (val_in);
5131 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
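/* Such a constant can instead be applied with two AND instructions: VAL_IN
   equals aarch64_and_split_imm1 (VAL_IN) & aarch64_and_split_imm2 (VAL_IN),
   where the first mask is a contiguous run of ones by construction and the
   check below verifies that the second is itself a bitmask immediate.  */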
5133 bool
5134 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5136 scalar_int_mode int_mode;
5137 if (!is_a <scalar_int_mode> (mode, &int_mode))
5138 return false;
5140 if (aarch64_bitmask_imm (val_in, int_mode))
5141 return false;
5143 if (aarch64_move_imm (val_in, int_mode))
5144 return false;
5146 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5148 return aarch64_bitmask_imm (imm2, int_mode);
5151 /* Return true if val is an immediate that can be loaded into a
5152 register in a single instruction. */
5153 bool
5154 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5156 scalar_int_mode int_mode;
5157 if (!is_a <scalar_int_mode> (mode, &int_mode))
5158 return false;
5160 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5161 return 1;
5162 return aarch64_bitmask_imm (val, int_mode);
5165 static bool
5166 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5168 rtx base, offset;
5170 if (GET_CODE (x) == HIGH)
5171 return true;
5173 /* There's no way to calculate VL-based values using relocations. */
5174 subrtx_iterator::array_type array;
5175 FOR_EACH_SUBRTX (iter, array, x, ALL)
5176 if (GET_CODE (*iter) == CONST_POLY_INT)
5177 return true;
5179 split_const (x, &base, &offset);
5180 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5182 if (aarch64_classify_symbol (base, INTVAL (offset))
5183 != SYMBOL_FORCE_TO_MEM)
5184 return true;
5185 else
5186 /* Avoid generating a 64-bit relocation in ILP32; leave
5187 to aarch64_expand_mov_immediate to handle it properly. */
5188 return mode != ptr_mode;
5191 return aarch64_tls_referenced_p (x);
5194 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5195 The expansion for a table switch is quite expensive due to the number
5196    of instructions, the table lookup and the hard-to-predict indirect jump.
5197    When optimizing for speed with -O3 enabled, use the per-core tuning if
5198 set, otherwise use tables for > 16 cases as a tradeoff between size and
5199 performance. When optimizing for size, use the default setting. */
5201 static unsigned int
5202 aarch64_case_values_threshold (void)
5204 /* Use the specified limit for the number of cases before using jump
5205 tables at higher optimization levels. */
5206 if (optimize > 2
5207 && selected_cpu->tune->max_case_values != 0)
5208 return selected_cpu->tune->max_case_values;
5209 else
5210 return optimize_size ? default_case_values_threshold () : 17;
5213 /* Return true if register REGNO is a valid index register.
5214 STRICT_P is true if REG_OK_STRICT is in effect. */
5216 bool
5217 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5219 if (!HARD_REGISTER_NUM_P (regno))
5221 if (!strict_p)
5222 return true;
5224 if (!reg_renumber)
5225 return false;
5227 regno = reg_renumber[regno];
5229 return GP_REGNUM_P (regno);
5232 /* Return true if register REGNO is a valid base register.
5233 STRICT_P is true if REG_OK_STRICT is in effect. */
5235 bool
5236 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5238 if (!HARD_REGISTER_NUM_P (regno))
5240 if (!strict_p)
5241 return true;
5243 if (!reg_renumber)
5244 return false;
5246 regno = reg_renumber[regno];
5249 /* The fake registers will be eliminated to either the stack or
5250 hard frame pointer, both of which are usually valid base registers.
5251 Reload deals with the cases where the eliminated form isn't valid. */
5252 return (GP_REGNUM_P (regno)
5253 || regno == SP_REGNUM
5254 || regno == FRAME_POINTER_REGNUM
5255 || regno == ARG_POINTER_REGNUM);
5258 /* Return true if X is a valid base register.
5259 STRICT_P is true if REG_OK_STRICT is in effect. */
5261 static bool
5262 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5264 if (!strict_p
5265 && GET_CODE (x) == SUBREG
5266 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5267 x = SUBREG_REG (x);
5269 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5272 /* Return true if address offset is a valid index. If it is, fill in INFO
5273 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
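/* The forms accepted here correspond to addresses such as [x0, x1],
   [x0, x1, lsl #3], [x0, w1, sxtw] and [x0, w1, uxtw #2].  */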
5275 static bool
5276 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5277 machine_mode mode, bool strict_p)
5279 enum aarch64_address_type type;
5280 rtx index;
5281 int shift;
5283 /* (reg:P) */
5284 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5285 && GET_MODE (x) == Pmode)
5287 type = ADDRESS_REG_REG;
5288 index = x;
5289 shift = 0;
5291 /* (sign_extend:DI (reg:SI)) */
5292 else if ((GET_CODE (x) == SIGN_EXTEND
5293 || GET_CODE (x) == ZERO_EXTEND)
5294 && GET_MODE (x) == DImode
5295 && GET_MODE (XEXP (x, 0)) == SImode)
5297 type = (GET_CODE (x) == SIGN_EXTEND)
5298 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5299 index = XEXP (x, 0);
5300 shift = 0;
5302 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5303 else if (GET_CODE (x) == MULT
5304 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5305 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5306 && GET_MODE (XEXP (x, 0)) == DImode
5307 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5308 && CONST_INT_P (XEXP (x, 1)))
5310 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5311 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5312 index = XEXP (XEXP (x, 0), 0);
5313 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5315 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5316 else if (GET_CODE (x) == ASHIFT
5317 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5318 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5319 && GET_MODE (XEXP (x, 0)) == DImode
5320 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5321 && CONST_INT_P (XEXP (x, 1)))
5323 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5324 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5325 index = XEXP (XEXP (x, 0), 0);
5326 shift = INTVAL (XEXP (x, 1));
5328 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5329 else if ((GET_CODE (x) == SIGN_EXTRACT
5330 || GET_CODE (x) == ZERO_EXTRACT)
5331 && GET_MODE (x) == DImode
5332 && GET_CODE (XEXP (x, 0)) == MULT
5333 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5334 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5336 type = (GET_CODE (x) == SIGN_EXTRACT)
5337 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5338 index = XEXP (XEXP (x, 0), 0);
5339 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5340 if (INTVAL (XEXP (x, 1)) != 32 + shift
5341 || INTVAL (XEXP (x, 2)) != 0)
5342 shift = -1;
5344 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5345 (const_int 0xffffffff<<shift)) */
5346 else if (GET_CODE (x) == AND
5347 && GET_MODE (x) == DImode
5348 && GET_CODE (XEXP (x, 0)) == MULT
5349 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5350 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5351 && CONST_INT_P (XEXP (x, 1)))
5353 type = ADDRESS_REG_UXTW;
5354 index = XEXP (XEXP (x, 0), 0);
5355 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5356 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5357 shift = -1;
5359 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5360 else if ((GET_CODE (x) == SIGN_EXTRACT
5361 || GET_CODE (x) == ZERO_EXTRACT)
5362 && GET_MODE (x) == DImode
5363 && GET_CODE (XEXP (x, 0)) == ASHIFT
5364 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5365 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5367 type = (GET_CODE (x) == SIGN_EXTRACT)
5368 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5369 index = XEXP (XEXP (x, 0), 0);
5370 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5371 if (INTVAL (XEXP (x, 1)) != 32 + shift
5372 || INTVAL (XEXP (x, 2)) != 0)
5373 shift = -1;
5375 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5376 (const_int 0xffffffff<<shift)) */
5377 else if (GET_CODE (x) == AND
5378 && GET_MODE (x) == DImode
5379 && GET_CODE (XEXP (x, 0)) == ASHIFT
5380 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5381 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5382 && CONST_INT_P (XEXP (x, 1)))
5384 type = ADDRESS_REG_UXTW;
5385 index = XEXP (XEXP (x, 0), 0);
5386 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5387 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5388 shift = -1;
5390 /* (mult:P (reg:P) (const_int scale)) */
5391 else if (GET_CODE (x) == MULT
5392 && GET_MODE (x) == Pmode
5393 && GET_MODE (XEXP (x, 0)) == Pmode
5394 && CONST_INT_P (XEXP (x, 1)))
5396 type = ADDRESS_REG_REG;
5397 index = XEXP (x, 0);
5398 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5400 /* (ashift:P (reg:P) (const_int shift)) */
5401 else if (GET_CODE (x) == ASHIFT
5402 && GET_MODE (x) == Pmode
5403 && GET_MODE (XEXP (x, 0)) == Pmode
5404 && CONST_INT_P (XEXP (x, 1)))
5406 type = ADDRESS_REG_REG;
5407 index = XEXP (x, 0);
5408 shift = INTVAL (XEXP (x, 1));
5410 else
5411 return false;
5413 if (!strict_p
5414 && GET_CODE (index) == SUBREG
5415 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5416 index = SUBREG_REG (index);
5418 if (aarch64_sve_data_mode_p (mode))
5420 if (type != ADDRESS_REG_REG
5421 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5422 return false;
5424 else
5426 if (shift != 0
5427 && !(IN_RANGE (shift, 1, 3)
5428 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5429 return false;
5432 if (REG_P (index)
5433 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5435 info->type = type;
5436 info->offset = index;
5437 info->shift = shift;
5438 return true;
5441 return false;
5444 /* Return true if MODE is one of the modes for which we
5445 support LDP/STP operations. */
5447 static bool
5448 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5450 return mode == SImode || mode == DImode
5451 || mode == SFmode || mode == DFmode
5452 || (aarch64_vector_mode_supported_p (mode)
5453 && known_eq (GET_MODE_SIZE (mode), 8));
5456 /* Return true if REGNO is a virtual pointer register, or an eliminable
5457 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5458 include stack_pointer or hard_frame_pointer. */
5459 static bool
5460 virt_or_elim_regno_p (unsigned regno)
5462 return ((regno >= FIRST_VIRTUAL_REGISTER
5463 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5464 || regno == FRAME_POINTER_REGNUM
5465 || regno == ARG_POINTER_REGNUM);
5468 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5469 If it is, fill in INFO appropriately. STRICT_P is true if
5470 REG_OK_STRICT is in effect. */
5472 static bool
5473 aarch64_classify_address (struct aarch64_address_info *info,
5474 rtx x, machine_mode mode, bool strict_p,
5475 aarch64_addr_query_type type = ADDR_QUERY_M)
5477 enum rtx_code code = GET_CODE (x);
5478 rtx op0, op1;
5479 poly_int64 offset;
5481 HOST_WIDE_INT const_size;
5483 /* On BE, we use load/store pair for all large int mode load/stores.
5484 TI/TFmode may also use a load/store pair. */
5485 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5486 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5487 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5488 || mode == TImode
5489 || mode == TFmode
5490 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5492 bool allow_reg_index_p = (!load_store_pair_p
5493 && (known_lt (GET_MODE_SIZE (mode), 16)
5494 || vec_flags == VEC_ADVSIMD
5495 || vec_flags == VEC_SVE_DATA));
5497 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5498 [Rn, #offset, MUL VL]. */
5499 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5500 && (code != REG && code != PLUS))
5501 return false;
5503 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5504 REG addressing. */
5505 if (advsimd_struct_p
5506 && !BYTES_BIG_ENDIAN
5507 && (code != POST_INC && code != REG))
5508 return false;
5510 gcc_checking_assert (GET_MODE (x) == VOIDmode
5511 || SCALAR_INT_MODE_P (GET_MODE (x)));
5513 switch (code)
5515 case REG:
5516 case SUBREG:
5517 info->type = ADDRESS_REG_IMM;
5518 info->base = x;
5519 info->offset = const0_rtx;
5520 info->const_offset = 0;
5521 return aarch64_base_register_rtx_p (x, strict_p);
5523 case PLUS:
5524 op0 = XEXP (x, 0);
5525 op1 = XEXP (x, 1);
5527 if (! strict_p
5528 && REG_P (op0)
5529 && virt_or_elim_regno_p (REGNO (op0))
5530 && poly_int_rtx_p (op1, &offset))
5532 info->type = ADDRESS_REG_IMM;
5533 info->base = op0;
5534 info->offset = op1;
5535 info->const_offset = offset;
5537 return true;
5540 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5541 && aarch64_base_register_rtx_p (op0, strict_p)
5542 && poly_int_rtx_p (op1, &offset))
5544 info->type = ADDRESS_REG_IMM;
5545 info->base = op0;
5546 info->offset = op1;
5547 info->const_offset = offset;
5549 /* TImode and TFmode values are allowed in both pairs of X
5550 registers and individual Q registers. The available
5551 address modes are:
5552 X,X: 7-bit signed scaled offset
5553 Q: 9-bit signed offset
5554 We conservatively require an offset representable in either mode.
5555 When performing the check for pairs of X registers i.e. LDP/STP
5556 pass down DImode since that is the natural size of the LDP/STP
5557 instruction memory accesses. */
5558 if (mode == TImode || mode == TFmode)
5559 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5560 && (offset_9bit_signed_unscaled_p (mode, offset)
5561 || offset_12bit_unsigned_scaled_p (mode, offset)));
5563 	  /* A 7-bit offset check because OImode will emit an ldp/stp
5564 instruction (only big endian will get here).
5565 For ldp/stp instructions, the offset is scaled for the size of a
5566 single element of the pair. */
5567 if (mode == OImode)
5568 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5570 /* Three 9/12-bit offset checks because CImode will emit three
5571 ldr/str instructions (only big endian will get here). */
5572 if (mode == CImode)
5573 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5574 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5575 || offset_12bit_unsigned_scaled_p (V16QImode,
5576 offset + 32)));
5578 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5579 instructions (only big endian will get here). */
5580 if (mode == XImode)
5581 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5582 && aarch64_offset_7bit_signed_scaled_p (TImode,
5583 offset + 32));
5585 /* Make "m" use the LD1 offset range for SVE data modes, so
5586 that pre-RTL optimizers like ivopts plan for that range
5587 instead of the wider LDR/STR range. */
5588 if (vec_flags == VEC_SVE_DATA)
5589 return (type == ADDR_QUERY_M
5590 ? offset_4bit_signed_scaled_p (mode, offset)
5591 : offset_9bit_signed_scaled_p (mode, offset));
5593 if (vec_flags == VEC_SVE_PRED)
5594 return offset_9bit_signed_scaled_p (mode, offset);
5596 if (load_store_pair_p)
5597 return ((known_eq (GET_MODE_SIZE (mode), 4)
5598 || known_eq (GET_MODE_SIZE (mode), 8))
5599 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5600 else
5601 return (offset_9bit_signed_unscaled_p (mode, offset)
5602 || offset_12bit_unsigned_scaled_p (mode, offset));
5605 if (allow_reg_index_p)
5607 /* Look for base + (scaled/extended) index register. */
5608 if (aarch64_base_register_rtx_p (op0, strict_p)
5609 && aarch64_classify_index (info, op1, mode, strict_p))
5611 info->base = op0;
5612 return true;
5614 if (aarch64_base_register_rtx_p (op1, strict_p)
5615 && aarch64_classify_index (info, op0, mode, strict_p))
5617 info->base = op1;
5618 return true;
5622 return false;
5624 case POST_INC:
5625 case POST_DEC:
5626 case PRE_INC:
5627 case PRE_DEC:
5628 info->type = ADDRESS_REG_WB;
5629 info->base = XEXP (x, 0);
5630 info->offset = NULL_RTX;
5631 return aarch64_base_register_rtx_p (info->base, strict_p);
5633 case POST_MODIFY:
5634 case PRE_MODIFY:
5635 info->type = ADDRESS_REG_WB;
5636 info->base = XEXP (x, 0);
5637 if (GET_CODE (XEXP (x, 1)) == PLUS
5638 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5639 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5640 && aarch64_base_register_rtx_p (info->base, strict_p))
5642 info->offset = XEXP (XEXP (x, 1), 1);
5643 info->const_offset = offset;
5645 /* TImode and TFmode values are allowed in both pairs of X
5646 registers and individual Q registers. The available
5647 address modes are:
5648 X,X: 7-bit signed scaled offset
5649 Q: 9-bit signed offset
5650 We conservatively require an offset representable in either mode.
5652 if (mode == TImode || mode == TFmode)
5653 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5654 && offset_9bit_signed_unscaled_p (mode, offset));
5656 if (load_store_pair_p)
5657 return ((known_eq (GET_MODE_SIZE (mode), 4)
5658 || known_eq (GET_MODE_SIZE (mode), 8))
5659 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5660 else
5661 return offset_9bit_signed_unscaled_p (mode, offset);
5663 return false;
5665 case CONST:
5666 case SYMBOL_REF:
5667 case LABEL_REF:
5668 /* Load literal: a pc-relative constant pool entry. Only supported
5669 for SImode or larger. */
5670 info->type = ADDRESS_SYMBOLIC;
5672 if (!load_store_pair_p
5673 && GET_MODE_SIZE (mode).is_constant (&const_size)
5674 && const_size >= 4)
5676 rtx sym, addend;
5678 split_const (x, &sym, &addend);
5679 return ((GET_CODE (sym) == LABEL_REF
5680 || (GET_CODE (sym) == SYMBOL_REF
5681 && CONSTANT_POOL_ADDRESS_P (sym)
5682 && aarch64_pcrelative_literal_loads)));
5684 return false;
5686 case LO_SUM:
5687 info->type = ADDRESS_LO_SUM;
5688 info->base = XEXP (x, 0);
5689 info->offset = XEXP (x, 1);
5690 if (allow_reg_index_p
5691 && aarch64_base_register_rtx_p (info->base, strict_p))
5693 rtx sym, offs;
5694 split_const (info->offset, &sym, &offs);
5695 if (GET_CODE (sym) == SYMBOL_REF
5696 && (aarch64_classify_symbol (sym, INTVAL (offs))
5697 == SYMBOL_SMALL_ABSOLUTE))
5699 /* The symbol and offset must be aligned to the access size. */
5700 unsigned int align;
5702 if (CONSTANT_POOL_ADDRESS_P (sym))
5703 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5704 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5706 tree exp = SYMBOL_REF_DECL (sym);
5707 align = TYPE_ALIGN (TREE_TYPE (exp));
5708 align = aarch64_constant_alignment (exp, align);
5710 else if (SYMBOL_REF_DECL (sym))
5711 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5712 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5713 && SYMBOL_REF_BLOCK (sym) != NULL)
5714 align = SYMBOL_REF_BLOCK (sym)->alignment;
5715 else
5716 align = BITS_PER_UNIT;
5718 poly_int64 ref_size = GET_MODE_SIZE (mode);
5719 if (known_eq (ref_size, 0))
5720 ref_size = GET_MODE_SIZE (DImode);
5722 return (multiple_p (INTVAL (offs), ref_size)
5723 && multiple_p (align / BITS_PER_UNIT, ref_size));
5726 return false;
5728 default:
5729 return false;
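/* Illustrative examples (a non-exhaustive sketch; register numbers and the
   symbol name are arbitrary) of RTL addresses accepted above and the
   classification they receive:

     (reg:DI x0)                                    -> ADDRESS_REG_IMM, "[x0]"
     (plus:DI (reg:DI x0) (const_int 16))           -> ADDRESS_REG_IMM, "[x0, 16]"
     (plus:DI (reg:DI x0)
              (mult:DI (reg:DI x1) (const_int 8)))  -> ADDRESS_REG_REG for a
                                                       DImode access, "[x0, x1, lsl 3]"
     (post_inc:DI (reg:DI x0))                      -> ADDRESS_REG_WB
     (lo_sum:DI (reg:DI x0) (symbol_ref "var"))     -> ADDRESS_LO_SUM,
                                                       "[x0, #:lo12:var]"

   The printed forms are those produced by aarch64_print_address_internal
   further down in this file.  */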
5733 /* Return true if the address X is valid for a PRFM instruction.
5734 STRICT_P is true if we should do strict checking with
5735 aarch64_classify_address. */
5737 bool
5738 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5740 struct aarch64_address_info addr;
5742 /* PRFM accepts the same addresses as DImode... */
5743 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5744 if (!res)
5745 return false;
5747 /* ... except writeback forms. */
5748 return addr.type != ADDRESS_REG_WB;
5751 bool
5752 aarch64_symbolic_address_p (rtx x)
5754 rtx offset;
5756 split_const (x, &x, &offset);
5757 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5760 /* Classify the base of symbolic expression X. */
5762 enum aarch64_symbol_type
5763 aarch64_classify_symbolic_expression (rtx x)
5765 rtx offset;
5767 split_const (x, &x, &offset);
5768 return aarch64_classify_symbol (x, INTVAL (offset));
5772 /* Return TRUE if X is a legitimate address for accessing memory in
5773 mode MODE. */
5774 static bool
5775 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5777 struct aarch64_address_info addr;
5779 return aarch64_classify_address (&addr, x, mode, strict_p);
5782 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5783 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5784 bool
5785 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5786 aarch64_addr_query_type type)
5788 struct aarch64_address_info addr;
5790 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5793 /* Split an out-of-range address displacement into a base and offset.
5794 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise,
5795 to increase opportunities for sharing the base address between accesses
5796 of different sizes. Unaligned accesses use the signed 9-bit range;
5797 TImode/TFmode use the intersection of the signed scaled 7-bit and signed 9-bit offset ranges. */
5798 static bool
5799 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
5801 HOST_WIDE_INT size;
5802 if (GET_MODE_SIZE (mode).is_constant (&size))
5804 HOST_WIDE_INT offset = INTVAL (*disp);
5805 HOST_WIDE_INT base;
5807 if (mode == TImode || mode == TFmode)
5808 base = (offset + 0x100) & ~0x1f8;
5809 else if ((offset & (size - 1)) != 0)
5810 base = (offset + 0x100) & ~0x1ff;
5811 else
5812 base = offset & ~(size < 4 ? 0xfff : 0x3ffc);
5814 *off = GEN_INT (base);
5815 *disp = GEN_INT (offset - base);
5816 return true;
5818 return false;
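/* A worked example of the split above (a sketch; the concrete offset is
   arbitrary): for a 4-byte aligned access at offset 0x4010, the offset is a
   multiple of the access size, so the mask is 0x3ffc and we get

     *OFF  = 0x4010 & ~0x3ffc = 0x4000
     *DISP = 0x4010 - 0x4000  = 0x10

   The residual 0x10 fits the scaled unsigned 12-bit range, so the access can
   use "[Xn, 16]" once the anchor 0x4000 has been added to the base.  */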
5821 /* Return the binary representation of floating point constant VALUE in INTVAL.
5822 If the value cannot be converted, return false without setting INTVAL.
5823 The conversion is done in the mode of VALUE. */
5824 bool
5825 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
5828 /* We make a general exception for 0. */
5829 if (aarch64_float_const_zero_rtx_p (value))
5831 *intval = 0;
5832 return true;
5835 scalar_float_mode mode;
5836 if (GET_CODE (value) != CONST_DOUBLE
5837 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
5838 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
5839 /* Only support up to DF mode. */
5840 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
5841 return false;
5843 unsigned HOST_WIDE_INT ival = 0;
5845 long res[2];
5846 real_to_target (res,
5847 CONST_DOUBLE_REAL_VALUE (value),
5848 REAL_MODE_FORMAT (mode));
5850 if (mode == DFmode)
5852 int order = BYTES_BIG_ENDIAN ? 1 : 0;
5853 ival = zext_hwi (res[order], 32);
5854 ival |= (zext_hwi (res[1 - order], 32) << 32);
5856 else
5857 ival = zext_hwi (res[0], 32);
5859 *intval = ival;
5860 return true;
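/* For example (a sketch of the expected results, based on the IEEE
   encodings): a DFmode CONST_DOUBLE holding 1.0 yields
   *INTVAL == 0x3ff0000000000000, and an SFmode CONST_DOUBLE holding 1.0
   yields *INTVAL == 0x3f800000.  The zero special case above returns 0
   regardless of mode.  */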
5863 /* Return TRUE if rtx X is an immediate constant that can be moved using a
5864 single MOV(+MOVK) followed by an FMOV. */
5865 bool
5866 aarch64_float_const_rtx_p (rtx x)
5868 machine_mode mode = GET_MODE (x);
5869 if (mode == VOIDmode)
5870 return false;
5872 /* Determine whether it's cheaper to write float constants as
5873 mov/movk pairs rather than loading them via adrp/ldr pairs. */
5874 unsigned HOST_WIDE_INT ival;
5876 if (GET_CODE (x) == CONST_DOUBLE
5877 && SCALAR_FLOAT_MODE_P (mode)
5878 && aarch64_reinterpret_float_as_int (x, &ival))
5880 scalar_int_mode imode = (mode == HFmode
5881 ? SImode
5882 : int_mode_for_mode (mode).require ());
5883 int num_instr = aarch64_internal_mov_immediate
5884 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
5885 return num_instr < 3;
5888 return false;
5891 /* Return TRUE if rtx X is the immediate constant 0.0. */
5892 bool
5893 aarch64_float_const_zero_rtx_p (rtx x)
5895 if (GET_MODE (x) == VOIDmode)
5896 return false;
5898 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
5899 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
5900 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
5903 /* Return TRUE if rtx X is an immediate constant that fits in a single
5904 MOVI immediate operation. */
5905 bool
5906 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
5908 if (!TARGET_SIMD)
5909 return false;
5911 machine_mode vmode;
5912 scalar_int_mode imode;
5913 unsigned HOST_WIDE_INT ival;
5915 if (GET_CODE (x) == CONST_DOUBLE
5916 && SCALAR_FLOAT_MODE_P (mode))
5918 if (!aarch64_reinterpret_float_as_int (x, &ival))
5919 return false;
5921 /* We make a general exception for 0. */
5922 if (aarch64_float_const_zero_rtx_p (x))
5923 return true;
5925 imode = int_mode_for_mode (mode).require ();
5927 else if (GET_CODE (x) == CONST_INT
5928 && is_a <scalar_int_mode> (mode, &imode))
5929 ival = INTVAL (x);
5930 else
5931 return false;
5933 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we use
5934 a 128-bit vector mode. */
5935 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
5937 vmode = aarch64_simd_container_mode (imode, width);
5938 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
5940 return aarch64_simd_valid_immediate (v_op, NULL);
5944 /* Return the fixed registers used for condition codes. */
5946 static bool
5947 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
5949 *p1 = CC_REGNUM;
5950 *p2 = INVALID_REGNUM;
5951 return true;
5954 /* This function is used by the call expanders of the machine description.
5955 RESULT is the register in which the result is returned. It's NULL for
5956 "call" and "sibcall".
5957 MEM is the location of the function call.
5958 SIBCALL indicates whether this function call is normal call or sibling call.
5959 It will generate different pattern accordingly. */
5961 void
5962 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
5964 rtx call, callee, tmp;
5965 rtvec vec;
5966 machine_mode mode;
5968 gcc_assert (MEM_P (mem));
5969 callee = XEXP (mem, 0);
5970 mode = GET_MODE (callee);
5971 gcc_assert (mode == Pmode);
5973 /* Decide if we should generate indirect calls by loading the
5974 address of the callee into a register before performing
5975 the branch-and-link. */
5976 if (SYMBOL_REF_P (callee)
5977 ? (aarch64_is_long_call_p (callee)
5978 || aarch64_is_noplt_call_p (callee))
5979 : !REG_P (callee))
5980 XEXP (mem, 0) = force_reg (mode, callee);
5982 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
5984 if (result != NULL_RTX)
5985 call = gen_rtx_SET (result, call);
5987 if (sibcall)
5988 tmp = ret_rtx;
5989 else
5990 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
5992 vec = gen_rtvec (2, call, tmp);
5993 call = gen_rtx_PARALLEL (VOIDmode, vec);
5995 aarch64_emit_call_insn (call);
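/* The emitted pattern is therefore a two-element PARALLEL; roughly
   (a sketch, with incidental details elided):

     normal call with a result:
       (parallel [(set (reg RESULT) (call (mem FNADDR) (const_int 0)))
                  (clobber (reg:DI LR_REGNUM))])

     sibling call without a result:
       (parallel [(call (mem FNADDR) (const_int 0))
                  (return)])  */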
5998 /* Emit call insn with PAT and do aarch64-specific handling. */
6000 void
6001 aarch64_emit_call_insn (rtx pat)
6003 rtx insn = emit_call_insn (pat);
6005 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6006 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6007 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6010 machine_mode
6011 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6013 /* All floating point compares return CCFP if it is an equality
6014 comparison, and CCFPE otherwise. */
6015 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6017 switch (code)
6019 case EQ:
6020 case NE:
6021 case UNORDERED:
6022 case ORDERED:
6023 case UNLT:
6024 case UNLE:
6025 case UNGT:
6026 case UNGE:
6027 case UNEQ:
6028 return CCFPmode;
6030 case LT:
6031 case LE:
6032 case GT:
6033 case GE:
6034 case LTGT:
6035 return CCFPEmode;
6037 default:
6038 gcc_unreachable ();
6042 /* Equality comparisons of short modes against zero can be performed
6043 using the TST instruction with the appropriate bitmask. */
6044 if (y == const0_rtx && REG_P (x)
6045 && (code == EQ || code == NE)
6046 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6047 return CC_NZmode;
6049 /* Similarly, comparisons of zero_extends from shorter modes can
6050 be performed using an ANDS with an immediate mask. */
6051 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6052 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6053 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6054 && (code == EQ || code == NE))
6055 return CC_NZmode;
6057 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6058 && y == const0_rtx
6059 && (code == EQ || code == NE || code == LT || code == GE)
6060 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6061 || GET_CODE (x) == NEG
6062 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6063 && CONST_INT_P (XEXP (x, 2)))))
6064 return CC_NZmode;
6066 /* A compare with a shifted operand. Because of canonicalization,
6067 the comparison will have to be swapped when we emit the assembly
6068 code. */
6069 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6070 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6071 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6072 || GET_CODE (x) == LSHIFTRT
6073 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6074 return CC_SWPmode;
6076 /* Similarly for a negated operand, but we can only do this for
6077 equalities. */
6078 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6079 && (REG_P (y) || GET_CODE (y) == SUBREG)
6080 && (code == EQ || code == NE)
6081 && GET_CODE (x) == NEG)
6082 return CC_Zmode;
6084 /* A test for unsigned overflow. */
6085 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6086 && code == NE
6087 && GET_CODE (x) == PLUS
6088 && GET_CODE (y) == ZERO_EXTEND)
6089 return CC_Cmode;
6091 /* For everything else, return CCmode. */
6092 return CCmode;
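/* As an example of the CC_SWPmode case above (a sketch; register numbers are
   arbitrary): the canonical form of "compare x2 with x1 << 3" is

     (compare (ashift (reg x1) (const_int 3)) (reg x2))

   which must be output as "cmp x2, x1, lsl 3", i.e. with the operands
   swapped.  aarch64_get_condition_code_1 below compensates by swapping the
   condition as well, e.g. mapping GT to "lt" under CC_SWPmode.  */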
6095 static int
6096 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6099 aarch64_get_condition_code (rtx x)
6101 machine_mode mode = GET_MODE (XEXP (x, 0));
6102 enum rtx_code comp_code = GET_CODE (x);
6104 if (GET_MODE_CLASS (mode) != MODE_CC)
6105 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6106 return aarch64_get_condition_code_1 (mode, comp_code);
6109 static int
6110 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6112 switch (mode)
6114 case E_CCFPmode:
6115 case E_CCFPEmode:
6116 switch (comp_code)
6118 case GE: return AARCH64_GE;
6119 case GT: return AARCH64_GT;
6120 case LE: return AARCH64_LS;
6121 case LT: return AARCH64_MI;
6122 case NE: return AARCH64_NE;
6123 case EQ: return AARCH64_EQ;
6124 case ORDERED: return AARCH64_VC;
6125 case UNORDERED: return AARCH64_VS;
6126 case UNLT: return AARCH64_LT;
6127 case UNLE: return AARCH64_LE;
6128 case UNGT: return AARCH64_HI;
6129 case UNGE: return AARCH64_PL;
6130 default: return -1;
6132 break;
6134 case E_CCmode:
6135 switch (comp_code)
6137 case NE: return AARCH64_NE;
6138 case EQ: return AARCH64_EQ;
6139 case GE: return AARCH64_GE;
6140 case GT: return AARCH64_GT;
6141 case LE: return AARCH64_LE;
6142 case LT: return AARCH64_LT;
6143 case GEU: return AARCH64_CS;
6144 case GTU: return AARCH64_HI;
6145 case LEU: return AARCH64_LS;
6146 case LTU: return AARCH64_CC;
6147 default: return -1;
6149 break;
6151 case E_CC_SWPmode:
6152 switch (comp_code)
6154 case NE: return AARCH64_NE;
6155 case EQ: return AARCH64_EQ;
6156 case GE: return AARCH64_LE;
6157 case GT: return AARCH64_LT;
6158 case LE: return AARCH64_GE;
6159 case LT: return AARCH64_GT;
6160 case GEU: return AARCH64_LS;
6161 case GTU: return AARCH64_CC;
6162 case LEU: return AARCH64_CS;
6163 case LTU: return AARCH64_HI;
6164 default: return -1;
6166 break;
6168 case E_CC_NZmode:
6169 switch (comp_code)
6171 case NE: return AARCH64_NE;
6172 case EQ: return AARCH64_EQ;
6173 case GE: return AARCH64_PL;
6174 case LT: return AARCH64_MI;
6175 default: return -1;
6177 break;
6179 case E_CC_Zmode:
6180 switch (comp_code)
6182 case NE: return AARCH64_NE;
6183 case EQ: return AARCH64_EQ;
6184 default: return -1;
6186 break;
6188 case E_CC_Cmode:
6189 switch (comp_code)
6191 case NE: return AARCH64_CS;
6192 case EQ: return AARCH64_CC;
6193 default: return -1;
6195 break;
6197 default:
6198 return -1;
6201 return -1;
6204 bool
6205 aarch64_const_vec_all_same_in_range_p (rtx x,
6206 HOST_WIDE_INT minval,
6207 HOST_WIDE_INT maxval)
6209 rtx elt;
6210 return (const_vec_duplicate_p (x, &elt)
6211 && CONST_INT_P (elt)
6212 && IN_RANGE (INTVAL (elt), minval, maxval));
6215 bool
6216 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6218 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6221 /* Return true if VEC is a constant in which every element is in the range
6222 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6224 static bool
6225 aarch64_const_vec_all_in_range_p (rtx vec,
6226 HOST_WIDE_INT minval,
6227 HOST_WIDE_INT maxval)
6229 if (GET_CODE (vec) != CONST_VECTOR
6230 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6231 return false;
6233 int nunits;
6234 if (!CONST_VECTOR_STEPPED_P (vec))
6235 nunits = const_vector_encoded_nelts (vec);
6236 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6237 return false;
6239 for (int i = 0; i < nunits; i++)
6241 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6242 if (!CONST_INT_P (vec_elem)
6243 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6244 return false;
6246 return true;
6249 /* N Z C V. */
6250 #define AARCH64_CC_V 1
6251 #define AARCH64_CC_C (1 << 1)
6252 #define AARCH64_CC_Z (1 << 2)
6253 #define AARCH64_CC_N (1 << 3)
6255 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6256 static const int aarch64_nzcv_codes[] =
6258 0, /* EQ, Z == 1. */
6259 AARCH64_CC_Z, /* NE, Z == 0. */
6260 0, /* CS, C == 1. */
6261 AARCH64_CC_C, /* CC, C == 0. */
6262 0, /* MI, N == 1. */
6263 AARCH64_CC_N, /* PL, N == 0. */
6264 0, /* VS, V == 1. */
6265 AARCH64_CC_V, /* VC, V == 0. */
6266 0, /* HI, C == 1 && Z == 0. */
6267 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6268 AARCH64_CC_V, /* GE, N == V. */
6269 0, /* LT, N != V. */
6270 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6271 0, /* LE, !(Z == 0 && N == V). */
6272 0, /* AL, Any. */
6273 0 /* NV, Any. */
6276 /* Print floating-point vector immediate operand X to F, negating it
6277 first if NEGATE is true. Return true on success, false if it isn't
6278 a constant we can handle. */
6280 static bool
6281 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6283 rtx elt;
6285 if (!const_vec_duplicate_p (x, &elt))
6286 return false;
6288 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6289 if (negate)
6290 r = real_value_negate (&r);
6292 /* We only handle the SVE single-bit immediates here. */
6293 if (real_equal (&r, &dconst0))
6294 asm_fprintf (f, "0.0");
6295 else if (real_equal (&r, &dconst1))
6296 asm_fprintf (f, "1.0");
6297 else if (real_equal (&r, &dconsthalf))
6298 asm_fprintf (f, "0.5");
6299 else
6300 return false;
6302 return true;
6305 /* Print operand X to file F in a target specific manner according to CODE.
6306 The acceptable formatting commands given by CODE are:
6307 'c': An integer or symbol address without a preceding #
6308 sign.
6309 'C': Take the duplicated element in a vector constant
6310 and print it in hex.
6311 'D': Take the duplicated element in a vector constant
6312 and print it as an unsigned integer, in decimal.
6313 'e': Print the sign/zero-extend size as a character 8->b,
6314 16->h, 32->w.
6315 'p': Prints N such that 2^N == X (X must be a power of 2 and
6316 a const_int).
6317 'P': Print the number of non-zero bits in X (a const_int).
6318 'H': Print the higher numbered register of a pair (TImode)
6319 of regs.
6320 'm': Print a condition (eq, ne, etc).
6321 'M': Same as 'm', but invert condition.
6322 'N': Take the duplicated element in a vector constant
6323 and print the negative of it in decimal.
6324 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6325 'S/T/U/V': Print a FP/SIMD register name for a register list.
6326 The register printed is the FP/SIMD register name
6327 of X + 0/1/2/3 for S/T/U/V.
6328 'R': Print a scalar FP/SIMD register name + 1.
6329 'X': Print bottom 16 bits of integer constant in hex.
6330 'w/x': Print a general register name or the zero register
6331 (32-bit or 64-bit).
6332 '0': Print a normal operand; if it's a general register,
6333 we assume DImode.
6334 'k': Print NZCV for conditional compare instructions.
6335 'A': Output address constant representing the first
6336 argument of X, specifying a relocation offset
6337 if appropriate.
6338 'L': Output constant address specified by X
6339 with a relocation offset if appropriate.
6340 'G': Prints address of X, specifying a PC relative
6341 relocation mode if appropriate.
6342 'y': Output address of LDP or STP - this is used for
6343 some LDP/STPs which don't use a PARALLEL in their
6344 pattern (so the mode needs to be adjusted).
6345 'z': Output address of a typical LDP or STP. */
6347 static void
6348 aarch64_print_operand (FILE *f, rtx x, int code)
6350 rtx elt;
6351 switch (code)
6353 case 'c':
6354 switch (GET_CODE (x))
6356 case CONST_INT:
6357 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6358 break;
6360 case SYMBOL_REF:
6361 output_addr_const (f, x);
6362 break;
6364 case CONST:
6365 if (GET_CODE (XEXP (x, 0)) == PLUS
6366 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6368 output_addr_const (f, x);
6369 break;
6371 /* Fall through. */
6373 default:
6374 output_operand_lossage ("unsupported operand for code '%c'", code);
6376 break;
6378 case 'e':
6380 int n;
6382 if (!CONST_INT_P (x)
6383 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6385 output_operand_lossage ("invalid operand for '%%%c'", code);
6386 return;
6389 switch (n)
6391 case 3:
6392 fputc ('b', f);
6393 break;
6394 case 4:
6395 fputc ('h', f);
6396 break;
6397 case 5:
6398 fputc ('w', f);
6399 break;
6400 default:
6401 output_operand_lossage ("invalid operand for '%%%c'", code);
6402 return;
6405 break;
6407 case 'p':
6409 int n;
6411 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6413 output_operand_lossage ("invalid operand for '%%%c'", code);
6414 return;
6417 asm_fprintf (f, "%d", n);
6419 break;
6421 case 'P':
6422 if (!CONST_INT_P (x))
6424 output_operand_lossage ("invalid operand for '%%%c'", code);
6425 return;
6428 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6429 break;
6431 case 'H':
6432 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6434 output_operand_lossage ("invalid operand for '%%%c'", code);
6435 return;
6438 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6439 break;
6441 case 'M':
6442 case 'm':
6444 int cond_code;
6445 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6446 if (x == const_true_rtx)
6448 if (code == 'M')
6449 fputs ("nv", f);
6450 return;
6453 if (!COMPARISON_P (x))
6455 output_operand_lossage ("invalid operand for '%%%c'", code);
6456 return;
6459 cond_code = aarch64_get_condition_code (x);
6460 gcc_assert (cond_code >= 0);
6461 if (code == 'M')
6462 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6463 fputs (aarch64_condition_codes[cond_code], f);
6465 break;
6467 case 'N':
6468 if (!const_vec_duplicate_p (x, &elt))
6470 output_operand_lossage ("invalid vector constant");
6471 return;
6474 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6475 asm_fprintf (f, "%wd", -INTVAL (elt));
6476 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6477 && aarch64_print_vector_float_operand (f, x, true))
6479 else
6481 output_operand_lossage ("invalid vector constant");
6482 return;
6484 break;
6486 case 'b':
6487 case 'h':
6488 case 's':
6489 case 'd':
6490 case 'q':
6491 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6493 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6494 return;
6496 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6497 break;
6499 case 'S':
6500 case 'T':
6501 case 'U':
6502 case 'V':
6503 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6505 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6506 return;
6508 asm_fprintf (f, "%c%d",
6509 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6510 REGNO (x) - V0_REGNUM + (code - 'S'));
6511 break;
6513 case 'R':
6514 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6516 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6517 return;
6519 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6520 break;
6522 case 'X':
6523 if (!CONST_INT_P (x))
6525 output_operand_lossage ("invalid operand for '%%%c'", code);
6526 return;
6528 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6529 break;
6531 case 'C':
6533 /* Print a replicated constant in hex. */
6534 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6536 output_operand_lossage ("invalid operand for '%%%c'", code);
6537 return;
6539 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6540 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6542 break;
6544 case 'D':
6546 /* Print a replicated constant in decimal, treating it as
6547 unsigned. */
6548 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6550 output_operand_lossage ("invalid operand for '%%%c'", code);
6551 return;
6553 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6554 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6556 break;
6558 case 'w':
6559 case 'x':
6560 if (x == const0_rtx
6561 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6563 asm_fprintf (f, "%czr", code);
6564 break;
6567 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6569 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6570 break;
6573 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6575 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6576 break;
6579 /* Fall through */
6581 case 0:
6582 if (x == NULL)
6584 output_operand_lossage ("missing operand");
6585 return;
6588 switch (GET_CODE (x))
6590 case REG:
6591 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6592 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6593 else
6594 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6595 break;
6597 case MEM:
6598 output_address (GET_MODE (x), XEXP (x, 0));
6599 break;
6601 case LABEL_REF:
6602 case SYMBOL_REF:
6603 output_addr_const (asm_out_file, x);
6604 break;
6606 case CONST_INT:
6607 asm_fprintf (f, "%wd", INTVAL (x));
6608 break;
6610 case CONST:
6611 if (!VECTOR_MODE_P (GET_MODE (x)))
6613 output_addr_const (asm_out_file, x);
6614 break;
6616 /* fall through */
6618 case CONST_VECTOR:
6619 if (!const_vec_duplicate_p (x, &elt))
6621 output_operand_lossage ("invalid vector constant");
6622 return;
6625 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6626 asm_fprintf (f, "%wd", INTVAL (elt));
6627 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6628 && aarch64_print_vector_float_operand (f, x, false))
6630 else
6632 output_operand_lossage ("invalid vector constant");
6633 return;
6635 break;
6637 case CONST_DOUBLE:
6638 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6639 be getting CONST_DOUBLEs holding integers. */
6640 gcc_assert (GET_MODE (x) != VOIDmode);
6641 if (aarch64_float_const_zero_rtx_p (x))
6643 fputc ('0', f);
6644 break;
6646 else if (aarch64_float_const_representable_p (x))
6648 #define buf_size 20
6649 char float_buf[buf_size] = {'\0'};
6650 real_to_decimal_for_mode (float_buf,
6651 CONST_DOUBLE_REAL_VALUE (x),
6652 buf_size, buf_size,
6653 1, GET_MODE (x));
6654 asm_fprintf (asm_out_file, "%s", float_buf);
6655 break;
6656 #undef buf_size
6658 output_operand_lossage ("invalid constant");
6659 return;
6660 default:
6661 output_operand_lossage ("invalid operand");
6662 return;
6664 break;
6666 case 'A':
6667 if (GET_CODE (x) == HIGH)
6668 x = XEXP (x, 0);
6670 switch (aarch64_classify_symbolic_expression (x))
6672 case SYMBOL_SMALL_GOT_4G:
6673 asm_fprintf (asm_out_file, ":got:");
6674 break;
6676 case SYMBOL_SMALL_TLSGD:
6677 asm_fprintf (asm_out_file, ":tlsgd:");
6678 break;
6680 case SYMBOL_SMALL_TLSDESC:
6681 asm_fprintf (asm_out_file, ":tlsdesc:");
6682 break;
6684 case SYMBOL_SMALL_TLSIE:
6685 asm_fprintf (asm_out_file, ":gottprel:");
6686 break;
6688 case SYMBOL_TLSLE24:
6689 asm_fprintf (asm_out_file, ":tprel:");
6690 break;
6692 case SYMBOL_TINY_GOT:
6693 gcc_unreachable ();
6694 break;
6696 default:
6697 break;
6699 output_addr_const (asm_out_file, x);
6700 break;
6702 case 'L':
6703 switch (aarch64_classify_symbolic_expression (x))
6705 case SYMBOL_SMALL_GOT_4G:
6706 asm_fprintf (asm_out_file, ":lo12:");
6707 break;
6709 case SYMBOL_SMALL_TLSGD:
6710 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6711 break;
6713 case SYMBOL_SMALL_TLSDESC:
6714 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6715 break;
6717 case SYMBOL_SMALL_TLSIE:
6718 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6719 break;
6721 case SYMBOL_TLSLE12:
6722 asm_fprintf (asm_out_file, ":tprel_lo12:");
6723 break;
6725 case SYMBOL_TLSLE24:
6726 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6727 break;
6729 case SYMBOL_TINY_GOT:
6730 asm_fprintf (asm_out_file, ":got:");
6731 break;
6733 case SYMBOL_TINY_TLSIE:
6734 asm_fprintf (asm_out_file, ":gottprel:");
6735 break;
6737 default:
6738 break;
6740 output_addr_const (asm_out_file, x);
6741 break;
6743 case 'G':
6744 switch (aarch64_classify_symbolic_expression (x))
6746 case SYMBOL_TLSLE24:
6747 asm_fprintf (asm_out_file, ":tprel_hi12:");
6748 break;
6749 default:
6750 break;
6752 output_addr_const (asm_out_file, x);
6753 break;
6755 case 'k':
6757 HOST_WIDE_INT cond_code;
6759 if (!CONST_INT_P (x))
6761 output_operand_lossage ("invalid operand for '%%%c'", code);
6762 return;
6765 cond_code = INTVAL (x);
6766 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
6767 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
6769 break;
6771 case 'y':
6772 case 'z':
6774 machine_mode mode = GET_MODE (x);
6776 if (GET_CODE (x) != MEM
6777 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
6779 output_operand_lossage ("invalid operand for '%%%c'", code);
6780 return;
6783 if (code == 'y')
6784 /* LDP/STP which uses a single double-width memory operand.
6785 Adjust the mode to appear like a typical LDP/STP.
6786 Currently this is supported for 16-byte accesses only. */
6787 mode = DFmode;
6789 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
6790 output_operand_lossage ("invalid operand prefix '%%%c'", code);
6792 break;
6794 default:
6795 output_operand_lossage ("invalid operand prefix '%%%c'", code);
6796 return;
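/* A few illustrative expansions (a sketch; the operand values are arbitrary):

     %w0 / %x0 on a general register print e.g. "w3" / "x3", and print
       "wzr" / "xzr" for a zero constant;
     %X0 on (const_int 0x12345) prints "0x2345";
     %C0 on a vector constant that duplicates 0x1f prints "0x1f";
     %m0 on (ge (reg cc) (const_int 0)) prints "ge", while %M0 prints the
       inverse condition "lt".  */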
6800 /* Print address 'x' of a memory access with mode 'mode'.
6801 'type' is the context required by aarch64_classify_address: ADDR_QUERY_M
6802 for a normal memory access, ADDR_QUERY_LDP_STP for LDP/STP, or ADDR_QUERY_ANY. */
6803 static bool
6804 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
6805 aarch64_addr_query_type type)
6807 struct aarch64_address_info addr;
6808 unsigned int size;
6810 /* Check all addresses are Pmode - including ILP32. */
6811 gcc_assert (GET_MODE (x) == Pmode);
6813 if (aarch64_classify_address (&addr, x, mode, true, type))
6814 switch (addr.type)
6816 case ADDRESS_REG_IMM:
6817 if (known_eq (addr.const_offset, 0))
6818 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
6819 else if (aarch64_sve_data_mode_p (mode))
6821 HOST_WIDE_INT vnum
6822 = exact_div (addr.const_offset,
6823 BYTES_PER_SVE_VECTOR).to_constant ();
6824 asm_fprintf (f, "[%s, #%wd, mul vl]",
6825 reg_names[REGNO (addr.base)], vnum);
6827 else if (aarch64_sve_pred_mode_p (mode))
6829 HOST_WIDE_INT vnum
6830 = exact_div (addr.const_offset,
6831 BYTES_PER_SVE_PRED).to_constant ();
6832 asm_fprintf (f, "[%s, #%wd, mul vl]",
6833 reg_names[REGNO (addr.base)], vnum);
6835 else
6836 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
6837 INTVAL (addr.offset));
6838 return true;
6840 case ADDRESS_REG_REG:
6841 if (addr.shift == 0)
6842 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
6843 reg_names [REGNO (addr.offset)]);
6844 else
6845 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
6846 reg_names [REGNO (addr.offset)], addr.shift);
6847 return true;
6849 case ADDRESS_REG_UXTW:
6850 if (addr.shift == 0)
6851 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
6852 REGNO (addr.offset) - R0_REGNUM);
6853 else
6854 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
6855 REGNO (addr.offset) - R0_REGNUM, addr.shift);
6856 return true;
6858 case ADDRESS_REG_SXTW:
6859 if (addr.shift == 0)
6860 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
6861 REGNO (addr.offset) - R0_REGNUM);
6862 else
6863 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
6864 REGNO (addr.offset) - R0_REGNUM, addr.shift);
6865 return true;
6867 case ADDRESS_REG_WB:
6868 /* Writeback is only supported for fixed-width modes. */
6869 size = GET_MODE_SIZE (mode).to_constant ();
6870 switch (GET_CODE (x))
6872 case PRE_INC:
6873 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
6874 return true;
6875 case POST_INC:
6876 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
6877 return true;
6878 case PRE_DEC:
6879 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
6880 return true;
6881 case POST_DEC:
6882 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
6883 return true;
6884 case PRE_MODIFY:
6885 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
6886 INTVAL (addr.offset));
6887 return true;
6888 case POST_MODIFY:
6889 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
6890 INTVAL (addr.offset));
6891 return true;
6892 default:
6893 break;
6895 break;
6897 case ADDRESS_LO_SUM:
6898 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
6899 output_addr_const (f, addr.offset);
6900 asm_fprintf (f, "]");
6901 return true;
6903 case ADDRESS_SYMBOLIC:
6904 output_addr_const (f, x);
6905 return true;
6908 return false;
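/* For SVE data and predicate modes, the constant offset is expressed as a
   multiple of the vector length.  For example (a sketch; the register is
   arbitrary), an SVE data-mode access at const_offset == 2 * BYTES_PER_SVE_VECTOR
   from x0 is printed as "[x0, #2, mul vl]", matching the ADDRESS_REG_IMM case
   above.  */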
6911 /* Print address 'x' of an LDP/STP with mode 'mode'. */
6912 static bool
6913 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
6915 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
6918 /* Print address 'x' of a memory access with mode 'mode'. */
6919 static void
6920 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
6922 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
6923 output_addr_const (f, x);
6926 bool
6927 aarch64_label_mentioned_p (rtx x)
6929 const char *fmt;
6930 int i;
6932 if (GET_CODE (x) == LABEL_REF)
6933 return true;
6935 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
6936 referencing instruction, but they are constant offsets, not
6937 symbols. */
6938 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6939 return false;
6941 fmt = GET_RTX_FORMAT (GET_CODE (x));
6942 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
6944 if (fmt[i] == 'E')
6946 int j;
6948 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
6949 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
6950 return 1;
6952 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
6953 return 1;
6956 return 0;
6959 /* Implement REGNO_REG_CLASS. */
6961 enum reg_class
6962 aarch64_regno_regclass (unsigned regno)
6964 if (GP_REGNUM_P (regno))
6965 return GENERAL_REGS;
6967 if (regno == SP_REGNUM)
6968 return STACK_REG;
6970 if (regno == FRAME_POINTER_REGNUM
6971 || regno == ARG_POINTER_REGNUM)
6972 return POINTER_REGS;
6974 if (FP_REGNUM_P (regno))
6975 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
6977 if (PR_REGNUM_P (regno))
6978 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
6980 return NO_REGS;
6983 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
6984 If OFFSET is out of range, return an offset of an anchor point
6985 that is in range. Return 0 otherwise. */
6987 static HOST_WIDE_INT
6988 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
6989 machine_mode mode)
6991 /* Does it look like we'll need a 16-byte load/store-pair operation? */
6992 if (size > 16)
6993 return (offset + 0x400) & ~0x7f0;
6995 /* For offsets that aren't a multiple of the access size, the limit is
6996 -256...255. */
6997 if (offset & (size - 1))
6999 /* BLKmode typically uses LDP of X-registers. */
7000 if (mode == BLKmode)
7001 return (offset + 512) & ~0x3ff;
7002 return (offset + 0x100) & ~0x1ff;
7005 /* Small negative offsets are supported. */
7006 if (IN_RANGE (offset, -256, 0))
7007 return 0;
7009 if (mode == TImode || mode == TFmode)
7010 return (offset + 0x100) & ~0x1ff;
7012 /* Use a 12-bit unsigned offset scaled by the access size. */
7013 return offset & (~0xfff * size);
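/* Two worked examples (a sketch with arbitrary offsets):

     size == 4, offset == 0x1fff (not a multiple of 4):
       anchor = (0x1fff + 0x100) & ~0x1ff = 0x2000, residual = -1,
       which is within the signed 9-bit unscaled range.

     size == 8, offset == 0x9010 (a multiple of 8):
       anchor = 0x9010 & (~0xfff * 8) = 0x9010 & -0x8000 = 0x8000,
       residual = 0x1010, which fits the unsigned 12-bit offset scaled by 8.

   The caller (aarch64_legitimize_address below) adds the anchor to the base
   and keeps the residual as the new displacement.  */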
7016 static rtx
7017 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7019 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7020 where mask is selected by alignment and size of the offset.
7021 We try to pick as large a range for the offset as possible to
7022 maximize the chance of a CSE. However, for aligned addresses
7023 we limit the range to 4k so that structures with different sized
7024 elements are likely to use the same base. We need to be careful
7025 not to split a CONST for some forms of address expression, otherwise
7026 it will generate sub-optimal code. */
7028 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7030 rtx base = XEXP (x, 0);
7031 rtx offset_rtx = XEXP (x, 1);
7032 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7034 if (GET_CODE (base) == PLUS)
7036 rtx op0 = XEXP (base, 0);
7037 rtx op1 = XEXP (base, 1);
7039 /* Force any scaling into a temp for CSE. */
7040 op0 = force_reg (Pmode, op0);
7041 op1 = force_reg (Pmode, op1);
7043 /* Let the pointer register be in op0. */
7044 if (REG_POINTER (op1))
7045 std::swap (op0, op1);
7047 /* If the pointer is virtual or frame related, then we know that
7048 virtual register instantiation or register elimination is going
7049 to apply a second constant. We want the two constants folded
7050 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7051 if (virt_or_elim_regno_p (REGNO (op0)))
7053 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7054 NULL_RTX, true, OPTAB_DIRECT);
7055 return gen_rtx_PLUS (Pmode, base, op1);
7058 /* Otherwise, in order to encourage CSE (and thence loop strength
7059 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7060 base = expand_binop (Pmode, add_optab, op0, op1,
7061 NULL_RTX, true, OPTAB_DIRECT);
7062 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7065 HOST_WIDE_INT size;
7066 if (GET_MODE_SIZE (mode).is_constant (&size))
7068 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7069 mode);
7070 if (base_offset != 0)
7072 base = plus_constant (Pmode, base, base_offset);
7073 base = force_operand (base, NULL_RTX);
7074 return plus_constant (Pmode, base, offset - base_offset);
7079 return x;
7082 /* Return the reload icode required for a constant pool in mode. */
7083 static enum insn_code
7084 aarch64_constant_pool_reload_icode (machine_mode mode)
7086 switch (mode)
7088 case E_SFmode:
7089 return CODE_FOR_aarch64_reload_movcpsfdi;
7091 case E_DFmode:
7092 return CODE_FOR_aarch64_reload_movcpdfdi;
7094 case E_TFmode:
7095 return CODE_FOR_aarch64_reload_movcptfdi;
7097 case E_V8QImode:
7098 return CODE_FOR_aarch64_reload_movcpv8qidi;
7100 case E_V16QImode:
7101 return CODE_FOR_aarch64_reload_movcpv16qidi;
7103 case E_V4HImode:
7104 return CODE_FOR_aarch64_reload_movcpv4hidi;
7106 case E_V8HImode:
7107 return CODE_FOR_aarch64_reload_movcpv8hidi;
7109 case E_V2SImode:
7110 return CODE_FOR_aarch64_reload_movcpv2sidi;
7112 case E_V4SImode:
7113 return CODE_FOR_aarch64_reload_movcpv4sidi;
7115 case E_V2DImode:
7116 return CODE_FOR_aarch64_reload_movcpv2didi;
7118 case E_V2DFmode:
7119 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7121 default:
7122 gcc_unreachable ();
7125 gcc_unreachable ();
7127 static reg_class_t
7128 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7129 reg_class_t rclass,
7130 machine_mode mode,
7131 secondary_reload_info *sri)
7133 if (BYTES_BIG_ENDIAN
7134 && reg_class_subset_p (rclass, FP_REGS)
7135 && (MEM_P (x) || (REG_P (x) && !HARD_REGISTER_P (x)))
7136 && aarch64_sve_data_mode_p (mode))
7138 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7139 return NO_REGS;
7142 /* If we have to disable direct literal pool loads and stores because the
7143 function is too big, then we need a scratch register. */
7144 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7145 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7146 || targetm.vector_mode_supported_p (GET_MODE (x)))
7147 && !aarch64_pcrelative_literal_loads)
7149 sri->icode = aarch64_constant_pool_reload_icode (mode);
7150 return NO_REGS;
7153 /* Without the TARGET_SIMD instructions we cannot move a Q register
7154 to a Q register directly. We need a scratch. */
7155 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7156 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7157 && reg_class_subset_p (rclass, FP_REGS))
7159 if (mode == TFmode)
7160 sri->icode = CODE_FOR_aarch64_reload_movtf;
7161 else if (mode == TImode)
7162 sri->icode = CODE_FOR_aarch64_reload_movti;
7163 return NO_REGS;
7166 /* A TFmode or TImode memory access should be handled via an FP register,
7167 because AArch64 has richer addressing modes for LDR/STR instructions
7168 than for LDP/STP instructions. */
7169 if (TARGET_FLOAT && rclass == GENERAL_REGS
7170 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7171 return FP_REGS;
7173 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
7174 return GENERAL_REGS;
7176 return NO_REGS;
7179 static bool
7180 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7182 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7184 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7185 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7186 if (frame_pointer_needed)
7187 return to == HARD_FRAME_POINTER_REGNUM;
7188 return true;
7191 poly_int64
7192 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7194 aarch64_layout_frame ();
7196 if (to == HARD_FRAME_POINTER_REGNUM)
7198 if (from == ARG_POINTER_REGNUM)
7199 return cfun->machine->frame.hard_fp_offset;
7201 if (from == FRAME_POINTER_REGNUM)
7202 return cfun->machine->frame.hard_fp_offset
7203 - cfun->machine->frame.locals_offset;
7206 if (to == STACK_POINTER_REGNUM)
7208 if (from == FRAME_POINTER_REGNUM)
7209 return cfun->machine->frame.frame_size
7210 - cfun->machine->frame.locals_offset;
7213 return cfun->machine->frame.frame_size;
7216 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7217 previous frame. */
7220 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7222 if (count != 0)
7223 return const0_rtx;
7224 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7228 static void
7229 aarch64_asm_trampoline_template (FILE *f)
7231 if (TARGET_ILP32)
7233 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7234 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7236 else
7238 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7239 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7241 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7242 assemble_aligned_integer (4, const0_rtx);
7243 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7244 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
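/* For LP64, and assuming the usual register assignment (IP1 is x17 and the
   static chain is x18), the template above expands to roughly:

       ldr     x17, .+16
       ldr     x18, .+20
       br      x17
       .word   0
       .xword  0          // replaced with the function address
       .xword  0          // replaced with the static chain value

   aarch64_trampoline_init below fills in the two trailing pointer slots.
   This is an illustrative sketch, not verbatim assembler output.  */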
7247 static void
7248 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7250 rtx fnaddr, mem, a_tramp;
7251 const int tramp_code_sz = 16;
7253 /* We don't need to copy the trailing D-words; we fill those in below. */
7254 emit_block_move (m_tramp, assemble_trampoline_template (),
7255 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7256 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7257 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7258 if (GET_MODE (fnaddr) != ptr_mode)
7259 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7260 emit_move_insn (mem, fnaddr);
7262 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7263 emit_move_insn (mem, chain_value);
7265 /* XXX We should really define a "clear_cache" pattern and use
7266 gen_clear_cache(). */
7267 a_tramp = XEXP (m_tramp, 0);
7268 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7269 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7270 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7271 ptr_mode);
7274 static unsigned char
7275 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7277 /* ??? Logically we should only need to provide a value when
7278 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7279 can hold MODE, but at the moment we need to handle all modes.
7280 Just ignore any runtime parts for registers that can't store them. */
7281 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7282 unsigned int nregs;
7283 switch (regclass)
7285 case CALLER_SAVE_REGS:
7286 case POINTER_REGS:
7287 case GENERAL_REGS:
7288 case ALL_REGS:
7289 case POINTER_AND_FP_REGS:
7290 case FP_REGS:
7291 case FP_LO_REGS:
7292 if (aarch64_sve_data_mode_p (mode)
7293 && constant_multiple_p (GET_MODE_SIZE (mode),
7294 BYTES_PER_SVE_VECTOR, &nregs))
7295 return nregs;
7296 return (aarch64_vector_data_mode_p (mode)
7297 ? CEIL (lowest_size, UNITS_PER_VREG)
7298 : CEIL (lowest_size, UNITS_PER_WORD));
7299 case STACK_REG:
7300 case PR_REGS:
7301 case PR_LO_REGS:
7302 case PR_HI_REGS:
7303 return 1;
7305 case NO_REGS:
7306 return 0;
7308 default:
7309 break;
7311 gcc_unreachable ();
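/* Some illustrative results (a sketch, assuming 8-byte general registers and
   16-byte Advanced SIMD registers):

     TImode (16 bytes) in GENERAL_REGS     -> CEIL (16, 8)  == 2 registers
     V4SImode (16 bytes) in FP_REGS        -> CEIL (16, 16) == 1 register
     an SVE data mode of N full vectors    -> N registers
     any mode in PR_LO_REGS / PR_HI_REGS   -> 1 register  */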
7314 static reg_class_t
7315 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7317 if (regclass == POINTER_REGS)
7318 return GENERAL_REGS;
7320 if (regclass == STACK_REG)
7322 if (REG_P (x)
7323 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7324 return regclass;
7326 return NO_REGS;
7329 /* Register elimination can result in a request for
7330 SP+constant->FP_REGS. We cannot support such operations, which
7331 use SP as the source and an FP_REG as the destination, so reject
7332 them outright. */
7333 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7335 rtx lhs = XEXP (x, 0);
7337 /* Look through a possible SUBREG introduced by ILP32. */
7338 if (GET_CODE (lhs) == SUBREG)
7339 lhs = SUBREG_REG (lhs);
7341 gcc_assert (REG_P (lhs));
7342 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7343 POINTER_REGS));
7344 return NO_REGS;
7347 return regclass;
7350 void
7351 aarch64_asm_output_labelref (FILE* f, const char *name)
7353 asm_fprintf (f, "%U%s", name);
7356 static void
7357 aarch64_elf_asm_constructor (rtx symbol, int priority)
7359 if (priority == DEFAULT_INIT_PRIORITY)
7360 default_ctor_section_asm_out_constructor (symbol, priority);
7361 else
7363 section *s;
7364 /* Priority is known to be in the range [0, 65535], so 18 bytes
7365 would be enough, but the compiler might not know that. To avoid
7366 a -Wformat-truncation false positive, use a larger size. */
7367 char buf[23];
7368 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7369 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7370 switch_to_section (s);
7371 assemble_align (POINTER_SIZE);
7372 assemble_aligned_integer (POINTER_BYTES, symbol);
7376 static void
7377 aarch64_elf_asm_destructor (rtx symbol, int priority)
7379 if (priority == DEFAULT_INIT_PRIORITY)
7380 default_dtor_section_asm_out_destructor (symbol, priority);
7381 else
7383 section *s;
7384 /* Priority is known to be in the range [0, 65535], so 18 bytes
7385 would be enough, but the compiler might not know that. To avoid
7386 a -Wformat-truncation false positive, use a larger size. */
7387 char buf[23];
7388 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7389 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7390 switch_to_section (s);
7391 assemble_align (POINTER_SIZE);
7392 assemble_aligned_integer (POINTER_BYTES, symbol);
7396 const char*
7397 aarch64_output_casesi (rtx *operands)
7399 char buf[100];
7400 char label[100];
7401 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7402 int index;
7403 static const char *const patterns[4][2] =
7406 "ldrb\t%w3, [%0,%w1,uxtw]",
7407 "add\t%3, %4, %w3, sxtb #2"
7410 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7411 "add\t%3, %4, %w3, sxth #2"
7414 "ldr\t%w3, [%0,%w1,uxtw #2]",
7415 "add\t%3, %4, %w3, sxtw #2"
7417 /* We assume that DImode is only generated when not optimizing and
7418 that we don't really need 64-bit address offsets. That would
7419 imply an object file with 8GB of code in a single function! */
7421 "ldr\t%w3, [%0,%w1,uxtw #2]",
7422 "add\t%3, %4, %w3, sxtw #2"
7426 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7428 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7429 index = exact_log2 (GET_MODE_SIZE (mode));
7431 gcc_assert (index >= 0 && index <= 3);
7433 /* Table size reduction still needs to be implemented, by changing the code below. */
7434 output_asm_insn (patterns[index][0], operands);
7435 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7436 snprintf (buf, sizeof (buf),
7437 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7438 output_asm_insn (buf, operands);
7439 output_asm_insn (patterns[index][1], operands);
7440 output_asm_insn ("br\t%3", operands);
7441 assemble_label (asm_out_file, label);
7442 return "";
7446 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7447 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7448 operator. */
7451 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7453 if (shift >= 0 && shift <= 3)
7455 int size;
7456 for (size = 8; size <= 32; size *= 2)
7458 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7459 if (mask == bits << shift)
7460 return size;
7463 return 0;
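/* Example results (a sketch):

     aarch64_uxt_size (0, 0xff)     == 8   -- UXTB
     aarch64_uxt_size (2, 0x3fc)    == 8   -- UXTB with LSL #2
     aarch64_uxt_size (1, 0x1fffe)  == 16  -- UXTH with LSL #1
     aarch64_uxt_size (0, 0x1ff)    == 0   -- not a UXTB/UXTH/UXTW mask  */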
7466 /* Constant pools are per-function only when PC-relative
7467 literal loads are enabled or we are using the large memory
7468 model. */
7470 static inline bool
7471 aarch64_can_use_per_function_literal_pools_p (void)
7473 return (aarch64_pcrelative_literal_loads
7474 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7477 static bool
7478 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7480 /* FIXME: In an ideal world this would work similarly
7481 to the logic in aarch64_select_rtx_section, but that
7482 breaks bootstrap in gccgo. For now we work around
7483 this by returning false here. */
7484 return false;
7487 /* Select appropriate section for constants depending
7488 on where we place literal pools. */
7490 static section *
7491 aarch64_select_rtx_section (machine_mode mode,
7492 rtx x,
7493 unsigned HOST_WIDE_INT align)
7495 if (aarch64_can_use_per_function_literal_pools_p ())
7496 return function_section (current_function_decl);
7498 return default_elf_select_rtx_section (mode, x, align);
7501 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7502 void
7503 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7504 HOST_WIDE_INT offset)
7506 /* When using per-function literal pools, we must ensure that any code
7507 section is aligned to the minimal instruction length, lest we get
7508 errors from the assembler about "unaligned instructions". */
7509 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7510 ASM_OUTPUT_ALIGN (f, 2);
7513 /* Costs. */
7515 /* Helper function for rtx cost calculation. Strip a shift expression
7516 from X. Returns the inner operand if successful, or the original
7517 expression on failure. */
7518 static rtx
7519 aarch64_strip_shift (rtx x)
7521 rtx op = x;
7523 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7524 we can convert both to ROR during final output. */
7525 if ((GET_CODE (op) == ASHIFT
7526 || GET_CODE (op) == ASHIFTRT
7527 || GET_CODE (op) == LSHIFTRT
7528 || GET_CODE (op) == ROTATERT
7529 || GET_CODE (op) == ROTATE)
7530 && CONST_INT_P (XEXP (op, 1)))
7531 return XEXP (op, 0);
7533 if (GET_CODE (op) == MULT
7534 && CONST_INT_P (XEXP (op, 1))
7535 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7536 return XEXP (op, 0);
7538 return x;
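/* For example (a sketch): (ashift (reg x1) (const_int 3)) and
   (mult (reg x1) (const_int 8)) both strip to (reg x1), since the multiply
   is by a power of two, whereas (mult (reg x1) (const_int 12)) is returned
   unchanged.  */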
7541 /* Helper function for rtx cost calculation. Strip an extend
7542 expression from X. Returns the inner operand if successful, or the
7543 original expression on failure. We deal with a number of possible
7544 canonicalization variations here. If STRIP_SHIFT is true, then
7545 we can strip off a shift also. */
7546 static rtx
7547 aarch64_strip_extend (rtx x, bool strip_shift)
7549 scalar_int_mode mode;
7550 rtx op = x;
7552 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7553 return op;
7555 /* Zero and sign extraction of a widened value. */
7556 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7557 && XEXP (op, 2) == const0_rtx
7558 && GET_CODE (XEXP (op, 0)) == MULT
7559 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7560 XEXP (op, 1)))
7561 return XEXP (XEXP (op, 0), 0);
7563 /* It can also be represented (for zero-extend) as an AND with an
7564 immediate. */
7565 if (GET_CODE (op) == AND
7566 && GET_CODE (XEXP (op, 0)) == MULT
7567 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7568 && CONST_INT_P (XEXP (op, 1))
7569 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7570 INTVAL (XEXP (op, 1))) != 0)
7571 return XEXP (XEXP (op, 0), 0);
7573 /* Now handle extended register, as this may also have an optional
7574 left shift by 1..4. */
7575 if (strip_shift
7576 && GET_CODE (op) == ASHIFT
7577 && CONST_INT_P (XEXP (op, 1))
7578 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7579 op = XEXP (op, 0);
7581 if (GET_CODE (op) == ZERO_EXTEND
7582 || GET_CODE (op) == SIGN_EXTEND)
7583 op = XEXP (op, 0);
7585 if (op != x)
7586 return op;
7588 return x;
7591 /* Return true iff CODE is a shift supported in combination
7592 with arithmetic instructions. */
7594 static bool
7595 aarch64_shift_p (enum rtx_code code)
7597 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7601 /* Return true iff X is a cheap shift without a sign extend. */
7603 static bool
7604 aarch64_cheap_mult_shift_p (rtx x)
7606 rtx op0, op1;
7608 op0 = XEXP (x, 0);
7609 op1 = XEXP (x, 1);
7611 if (!(aarch64_tune_params.extra_tuning_flags
7612 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7613 return false;
7615 if (GET_CODE (op0) == SIGN_EXTEND)
7616 return false;
7618 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7619 && UINTVAL (op1) <= 4)
7620 return true;
7622 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7623 return false;
7625 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7627 if (l2 > 0 && l2 <= 4)
7628 return true;
7630 return false;
7633 /* Helper function for rtx cost calculation. Calculate the cost of
7634 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7635 Return the calculated cost of the expression, recursing manually in to
7636 operands where needed. */
7638 static int
7639 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7641 rtx op0, op1;
7642 const struct cpu_cost_table *extra_cost
7643 = aarch64_tune_params.insn_extra_cost;
7644 int cost = 0;
7645 bool compound_p = (outer == PLUS || outer == MINUS);
7646 machine_mode mode = GET_MODE (x);
7648 gcc_checking_assert (code == MULT);
7650 op0 = XEXP (x, 0);
7651 op1 = XEXP (x, 1);
7653 if (VECTOR_MODE_P (mode))
7654 mode = GET_MODE_INNER (mode);
7656 /* Integer multiply/fma. */
7657 if (GET_MODE_CLASS (mode) == MODE_INT)
7659 /* The multiply will be canonicalized as a shift, so cost it as such. */
7660 if (aarch64_shift_p (GET_CODE (x))
7661 || (CONST_INT_P (op1)
7662 && exact_log2 (INTVAL (op1)) > 0))
7664 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7665 || GET_CODE (op0) == SIGN_EXTEND;
7666 if (speed)
7668 if (compound_p)
7670 /* If the shift is considered cheap,
7671 then don't add any cost. */
7672 if (aarch64_cheap_mult_shift_p (x))
7674 else if (REG_P (op1))
7675 /* ARITH + shift-by-register. */
7676 cost += extra_cost->alu.arith_shift_reg;
7677 else if (is_extend)
7678 /* ARITH + extended register. We don't have a cost field
7679 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7680 cost += extra_cost->alu.extend_arith;
7681 else
7682 /* ARITH + shift-by-immediate. */
7683 cost += extra_cost->alu.arith_shift;
7685 else
7686 /* LSL (immediate). */
7687 cost += extra_cost->alu.shift;
7690 /* Strip extends as we will have costed them in the case above. */
7691 if (is_extend)
7692 op0 = aarch64_strip_extend (op0, true);
7694 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7696 return cost;
7699 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7700 compound and let the below cases handle it. After all, MNEG is a
7701 special-case alias of MSUB. */
7702 if (GET_CODE (op0) == NEG)
7704 op0 = XEXP (op0, 0);
7705 compound_p = true;
7708 /* Integer multiplies or FMAs have zero/sign extending variants. */
7709 if ((GET_CODE (op0) == ZERO_EXTEND
7710 && GET_CODE (op1) == ZERO_EXTEND)
7711 || (GET_CODE (op0) == SIGN_EXTEND
7712 && GET_CODE (op1) == SIGN_EXTEND))
7714 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7715 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7717 if (speed)
7719 if (compound_p)
7720 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7721 cost += extra_cost->mult[0].extend_add;
7722 else
7723 /* MUL/SMULL/UMULL. */
7724 cost += extra_cost->mult[0].extend;
7727 return cost;
7730 /* This is either an integer multiply or a MADD. In both cases
7731 we want to recurse and cost the operands. */
7732 cost += rtx_cost (op0, mode, MULT, 0, speed);
7733 cost += rtx_cost (op1, mode, MULT, 1, speed);
7735 if (speed)
7737 if (compound_p)
7738 /* MADD/MSUB. */
7739 cost += extra_cost->mult[mode == DImode].add;
7740 else
7741 /* MUL. */
7742 cost += extra_cost->mult[mode == DImode].simple;
7745 return cost;
7747 else
7749 if (speed)
7751 /* Floating-point FMA/FMUL can also support negations of the
7752 operands, unless the rounding mode is upward or downward in
7753 which case FNMUL is different from FMUL with operand negation. */
7754 bool neg0 = GET_CODE (op0) == NEG;
7755 bool neg1 = GET_CODE (op1) == NEG;
7756 if (compound_p || !flag_rounding_math || (neg0 && neg1))
7758 if (neg0)
7759 op0 = XEXP (op0, 0);
7760 if (neg1)
7761 op1 = XEXP (op1, 0);
7764 if (compound_p)
7765 /* FMADD/FNMADD/FNMSUB/FMSUB. */
7766 cost += extra_cost->fp[mode == DFmode].fma;
7767 else
7768 /* FMUL/FNMUL. */
7769 cost += extra_cost->fp[mode == DFmode].mult;
7772 cost += rtx_cost (op0, mode, MULT, 0, speed);
7773 cost += rtx_cost (op1, mode, MULT, 1, speed);
7774 return cost;
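/* Implement TARGET_ADDRESS_COST.  Return the cost of addressing mode X
   for a memory access of mode MODE, using the per-tuning address cost
   tables.  */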
7778 static int
7779 aarch64_address_cost (rtx x,
7780 machine_mode mode,
7781 addr_space_t as ATTRIBUTE_UNUSED,
7782 bool speed)
7784 enum rtx_code c = GET_CODE (x);
7785 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
7786 struct aarch64_address_info info;
7787 int cost = 0;
7788 info.shift = 0;
7790 if (!aarch64_classify_address (&info, x, mode, false))
7792 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
7794 /* This is a CONST or SYMBOL ref which will be split
7795 in a different way depending on the code model in use.
7796 Cost it through the generic infrastructure. */
7797 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
7798 /* Divide through by the cost of one instruction to
7799 bring it to the same units as the address costs. */
7800 cost_symbol_ref /= COSTS_N_INSNS (1);
7801 /* The cost is then the cost of preparing the address,
7802 followed by an immediate (possibly 0) offset. */
7803 return cost_symbol_ref + addr_cost->imm_offset;
7805 else
7807 /* This is most likely a jump table from a case
7808 statement. */
7809 return addr_cost->register_offset;
7813 switch (info.type)
7815 case ADDRESS_LO_SUM:
7816 case ADDRESS_SYMBOLIC:
7817 case ADDRESS_REG_IMM:
7818 cost += addr_cost->imm_offset;
7819 break;
7821 case ADDRESS_REG_WB:
7822 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
7823 cost += addr_cost->pre_modify;
7824 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
7825 cost += addr_cost->post_modify;
7826 else
7827 gcc_unreachable ();
7829 break;
7831 case ADDRESS_REG_REG:
7832 cost += addr_cost->register_offset;
7833 break;
7835 case ADDRESS_REG_SXTW:
7836 cost += addr_cost->register_sextend;
7837 break;
7839 case ADDRESS_REG_UXTW:
7840 cost += addr_cost->register_zextend;
7841 break;
7843 default:
7844 gcc_unreachable ();
7848 if (info.shift > 0)
7850 /* For the sake of calculating the cost of the shifted register
7851 component, we can treat same sized modes in the same way. */
7852 if (known_eq (GET_MODE_BITSIZE (mode), 16))
7853 cost += addr_cost->addr_scale_costs.hi;
7854 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
7855 cost += addr_cost->addr_scale_costs.si;
7856 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
7857 cost += addr_cost->addr_scale_costs.di;
7858 else
7859 /* We can't tell, or this is a 128-bit vector. */
7860 cost += addr_cost->addr_scale_costs.ti;
7863 return cost;
7866 /* Return the cost of a branch. If SPEED_P is true then the compiler is
7867 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
7868 to be taken. */
7871 aarch64_branch_cost (bool speed_p, bool predictable_p)
7873 /* When optimizing for speed, use the cost of unpredictable branches. */
7874 const struct cpu_branch_cost *branch_costs =
7875 aarch64_tune_params.branch_costs;
7877 if (!speed_p || predictable_p)
7878 return branch_costs->predictable;
7879 else
7880 return branch_costs->unpredictable;
7883 /* Return true if the RTX X in mode MODE is a zero or sign extract
7884 usable in an ADD or SUB (extended register) instruction. */
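/* For example, (sign_extend:DI (reg:SI w1)) qualifies, matching the
   "add x0, xn, w1, sxtw" form of the instruction.  */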
7885 static bool
7886 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
7888 /* Catch add with a sign extract.
7889 This is add_<optab><mode>_multp2. */
7890 if (GET_CODE (x) == SIGN_EXTRACT
7891 || GET_CODE (x) == ZERO_EXTRACT)
7893 rtx op0 = XEXP (x, 0);
7894 rtx op1 = XEXP (x, 1);
7895 rtx op2 = XEXP (x, 2);
7897 if (GET_CODE (op0) == MULT
7898 && CONST_INT_P (op1)
7899 && op2 == const0_rtx
7900 && CONST_INT_P (XEXP (op0, 1))
7901 && aarch64_is_extend_from_extract (mode,
7902 XEXP (op0, 1),
7903 op1))
7905 return true;
7908 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
7909 No shift. */
7910 else if (GET_CODE (x) == SIGN_EXTEND
7911 || GET_CODE (x) == ZERO_EXTEND)
7912 return REG_P (XEXP (x, 0));
7914 return false;
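/* Return true if U is an UNSPEC number corresponding to one of the
   FRINT* floating-point round-to-integral instructions.  */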
7917 static bool
7918 aarch64_frint_unspec_p (unsigned int u)
7920 switch (u)
7922 case UNSPEC_FRINTZ:
7923 case UNSPEC_FRINTP:
7924 case UNSPEC_FRINTM:
7925 case UNSPEC_FRINTA:
7926 case UNSPEC_FRINTN:
7927 case UNSPEC_FRINTX:
7928 case UNSPEC_FRINTI:
7929 return true;
7931 default:
7932 return false;
7936 /* Return true iff X is an rtx that will match an extr instruction
7937 i.e. as described in the *extr<mode>5_insn family of patterns.
7938 OP0 and OP1 will be set to the operands of the shifts involved
7939 on success and will be NULL_RTX otherwise. */
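/* For example, (ior:DI (ashift:DI (reg:DI x0) (const_int 16))
		(lshiftrt:DI (reg:DI x1) (const_int 48)))
   matches, since the shift amounts sum to the mode bitsize (64).  */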
7941 static bool
7942 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
7944 rtx op0, op1;
7945 scalar_int_mode mode;
7946 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
7947 return false;
7949 *res_op0 = NULL_RTX;
7950 *res_op1 = NULL_RTX;
7952 if (GET_CODE (x) != IOR)
7953 return false;
7955 op0 = XEXP (x, 0);
7956 op1 = XEXP (x, 1);
7958 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
7959 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
7961 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
7962 if (GET_CODE (op1) == ASHIFT)
7963 std::swap (op0, op1);
7965 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
7966 return false;
7968 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
7969 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
7971 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
7972 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
7974 *res_op0 = XEXP (op0, 0);
7975 *res_op1 = XEXP (op1, 0);
7976 return true;
7980 return false;
7983 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
7984 storing it in *COST. Result is true if the total cost of the operation
7985 has now been calculated. */
7986 static bool
7987 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
7989 rtx inner;
7990 rtx comparator;
7991 enum rtx_code cmpcode;
7993 if (COMPARISON_P (op0))
7995 inner = XEXP (op0, 0);
7996 comparator = XEXP (op0, 1);
7997 cmpcode = GET_CODE (op0);
7999 else
8001 inner = op0;
8002 comparator = const0_rtx;
8003 cmpcode = NE;
8006 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8008 /* Conditional branch. */
8009 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8010 return true;
8011 else
8013 if (cmpcode == NE || cmpcode == EQ)
8015 if (comparator == const0_rtx)
8017 /* TBZ/TBNZ/CBZ/CBNZ. */
8018 if (GET_CODE (inner) == ZERO_EXTRACT)
8019 /* TBZ/TBNZ. */
8020 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8021 ZERO_EXTRACT, 0, speed);
8022 else
8023 /* CBZ/CBNZ. */
8024 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8026 return true;
8029 else if (cmpcode == LT || cmpcode == GE)
8031 /* TBZ/TBNZ. */
8032 if (comparator == const0_rtx)
8033 return true;
8037 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8039 /* CCMP. */
8040 if (GET_CODE (op1) == COMPARE)
8042 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8043 if (XEXP (op1, 1) == const0_rtx)
8044 *cost += 1;
8045 if (speed)
8047 machine_mode mode = GET_MODE (XEXP (op1, 0));
8048 const struct cpu_cost_table *extra_cost
8049 = aarch64_tune_params.insn_extra_cost;
8051 if (GET_MODE_CLASS (mode) == MODE_INT)
8052 *cost += extra_cost->alu.arith;
8053 else
8054 *cost += extra_cost->fp[mode == DFmode].compare;
8056 return true;
8059 /* It's a conditional operation based on the status flags,
8060 so it must be some flavor of CSEL. */
8062 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8063 if (GET_CODE (op1) == NEG
8064 || GET_CODE (op1) == NOT
8065 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8066 op1 = XEXP (op1, 0);
8067 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8069 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8070 op1 = XEXP (op1, 0);
8071 op2 = XEXP (op2, 0);
8074 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8075 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8076 return true;
8079 /* We don't know what this is, cost all operands. */
8080 return false;
8083 /* Check whether X is a bitfield operation of the form shift + extend that
8084 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8085 operand to which the bitfield operation is applied. Otherwise return
8086 NULL_RTX. */
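/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI w0) (const_int 3)))
   is recognized and costed as a single UBFX, with (reg:HI w0) returned
   as the operand to recurse into.  */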
8088 static rtx
8089 aarch64_extend_bitfield_pattern_p (rtx x)
8091 rtx_code outer_code = GET_CODE (x);
8092 machine_mode outer_mode = GET_MODE (x);
8094 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8095 && outer_mode != SImode && outer_mode != DImode)
8096 return NULL_RTX;
8098 rtx inner = XEXP (x, 0);
8099 rtx_code inner_code = GET_CODE (inner);
8100 machine_mode inner_mode = GET_MODE (inner);
8101 rtx op = NULL_RTX;
8103 switch (inner_code)
8105 case ASHIFT:
8106 if (CONST_INT_P (XEXP (inner, 1))
8107 && (inner_mode == QImode || inner_mode == HImode))
8108 op = XEXP (inner, 0);
8109 break;
8110 case LSHIFTRT:
8111 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8112 && (inner_mode == QImode || inner_mode == HImode))
8113 op = XEXP (inner, 0);
8114 break;
8115 case ASHIFTRT:
8116 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8117 && (inner_mode == QImode || inner_mode == HImode))
8118 op = XEXP (inner, 0);
8119 break;
8120 default:
8121 break;
8124 return op;
8127 /* Return true if the mask and a shift amount from an RTX of the form
8128 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8129 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
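/* For example, in SImode a mask of 0xff0 together with a shift of 4 is
   accepted: (0xff0 >> 4) + 1 is a power of two and no mask bits overlap
   the low four (shifted-out) bits.  */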
8131 bool
8132 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8133 rtx shft_amnt)
8135 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8136 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8137 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8138 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8141 /* Calculate the cost of calculating X, storing it in *COST. Result
8142 is true if the total cost of the operation has now been calculated. */
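/* The return value follows the TARGET_RTX_COSTS convention: true means
   *COST is final, false means the generic rtx_cost machinery should
   still recurse into the operands.  */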
8143 static bool
8144 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8145 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8147 rtx op0, op1, op2;
8148 const struct cpu_cost_table *extra_cost
8149 = aarch64_tune_params.insn_extra_cost;
8150 int code = GET_CODE (x);
8151 scalar_int_mode int_mode;
8153 /* By default, assume that everything has equivalent cost to the
8154 cheapest instruction. Any additional costs are applied as a delta
8155 above this default. */
8156 *cost = COSTS_N_INSNS (1);
8158 switch (code)
8160 case SET:
8161 /* The cost depends entirely on the operands to SET. */
8162 *cost = 0;
8163 op0 = SET_DEST (x);
8164 op1 = SET_SRC (x);
8166 switch (GET_CODE (op0))
8168 case MEM:
8169 if (speed)
8171 rtx address = XEXP (op0, 0);
8172 if (VECTOR_MODE_P (mode))
8173 *cost += extra_cost->ldst.storev;
8174 else if (GET_MODE_CLASS (mode) == MODE_INT)
8175 *cost += extra_cost->ldst.store;
8176 else if (mode == SFmode)
8177 *cost += extra_cost->ldst.storef;
8178 else if (mode == DFmode)
8179 *cost += extra_cost->ldst.stored;
8181 *cost +=
8182 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8183 0, speed));
8186 *cost += rtx_cost (op1, mode, SET, 1, speed);
8187 return true;
8189 case SUBREG:
8190 if (! REG_P (SUBREG_REG (op0)))
8191 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8193 /* Fall through. */
8194 case REG:
8195 /* The cost is one per vector-register copied. */
8196 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8198 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8199 *cost = COSTS_N_INSNS (nregs);
8201 /* const0_rtx is in general free, but we will use an
8202 instruction to set a register to 0. */
8203 else if (REG_P (op1) || op1 == const0_rtx)
8205 /* The cost is 1 per register copied. */
8206 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8207 *cost = COSTS_N_INSNS (nregs);
8209 else
8210 /* Cost is just the cost of the RHS of the set. */
8211 *cost += rtx_cost (op1, mode, SET, 1, speed);
8212 return true;
8214 case ZERO_EXTRACT:
8215 case SIGN_EXTRACT:
8216 /* Bit-field insertion. Strip any redundant widening of
8217 the RHS to meet the width of the target. */
8218 if (GET_CODE (op1) == SUBREG)
8219 op1 = SUBREG_REG (op1);
8220 if ((GET_CODE (op1) == ZERO_EXTEND
8221 || GET_CODE (op1) == SIGN_EXTEND)
8222 && CONST_INT_P (XEXP (op0, 1))
8223 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8224 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8225 op1 = XEXP (op1, 0);
8227 if (CONST_INT_P (op1))
8229 /* MOV immediate is assumed to always be cheap. */
8230 *cost = COSTS_N_INSNS (1);
8232 else
8234 /* BFM. */
8235 if (speed)
8236 *cost += extra_cost->alu.bfi;
8237 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8240 return true;
8242 default:
8243 /* We can't make sense of this, assume default cost. */
8244 *cost = COSTS_N_INSNS (1);
8245 return false;
8247 return false;
8249 case CONST_INT:
8250 /* If an instruction can incorporate a constant within the
8251 instruction, the instruction's expression avoids calling
8252 rtx_cost() on the constant. If rtx_cost() is called on a
8253 constant, then it is usually because the constant must be
8254 moved into a register by one or more instructions.
8256 The exception is constant 0, which can be expressed
8257 as XZR/WZR and is therefore free. The one case that is not free is
8258 (set (reg) (const0_rtx)), in which case we must cost
8259 the move. However, we can catch that when we cost the SET, so
8260 we don't need to consider that here. */
8261 if (x == const0_rtx)
8262 *cost = 0;
8263 else
8265 /* To an approximation, building any other constant is
8266 proportionally expensive to the number of instructions
8267 required to build that constant. This is true whether we
8268 are compiling for SPEED or otherwise. */
8269 if (!is_a <scalar_int_mode> (mode, &int_mode))
8270 int_mode = word_mode;
8271 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8272 (NULL_RTX, x, false, int_mode));
8274 return true;
8276 case CONST_DOUBLE:
8278 /* First determine number of instructions to do the move
8279 as an integer constant. */
8280 if (!aarch64_float_const_representable_p (x)
8281 && !aarch64_can_const_movi_rtx_p (x, mode)
8282 && aarch64_float_const_rtx_p (x))
8284 unsigned HOST_WIDE_INT ival;
8285 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8286 gcc_assert (succeed);
8288 scalar_int_mode imode = (mode == HFmode
8289 ? SImode
8290 : int_mode_for_mode (mode).require ());
8291 int ncost = aarch64_internal_mov_immediate
8292 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8293 *cost += COSTS_N_INSNS (ncost);
8294 return true;
8297 if (speed)
8299 /* mov[df,sf]_aarch64. */
8300 if (aarch64_float_const_representable_p (x))
8301 /* FMOV (scalar immediate). */
8302 *cost += extra_cost->fp[mode == DFmode].fpconst;
8303 else if (!aarch64_float_const_zero_rtx_p (x))
8305 /* This will be a load from memory. */
8306 if (mode == DFmode)
8307 *cost += extra_cost->ldst.loadd;
8308 else
8309 *cost += extra_cost->ldst.loadf;
8311 else
8312 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8313 or MOV v0.s[0], wzr - neither of which is modeled by the
8314 cost tables. Just use the default cost. */
8319 return true;
8321 case MEM:
8322 if (speed)
8324 /* For loads we want the base cost of a load, plus an
8325 approximation for the additional cost of the addressing
8326 mode. */
8327 rtx address = XEXP (x, 0);
8328 if (VECTOR_MODE_P (mode))
8329 *cost += extra_cost->ldst.loadv;
8330 else if (GET_MODE_CLASS (mode) == MODE_INT)
8331 *cost += extra_cost->ldst.load;
8332 else if (mode == SFmode)
8333 *cost += extra_cost->ldst.loadf;
8334 else if (mode == DFmode)
8335 *cost += extra_cost->ldst.loadd;
8337 *cost +=
8338 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8339 0, speed));
8342 return true;
8344 case NEG:
8345 op0 = XEXP (x, 0);
8347 if (VECTOR_MODE_P (mode))
8349 if (speed)
8351 /* FNEG. */
8352 *cost += extra_cost->vect.alu;
8354 return false;
8357 if (GET_MODE_CLASS (mode) == MODE_INT)
8359 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8360 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8362 /* CSETM. */
8363 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8364 return true;
8367 /* Cost this as SUB wzr, X. */
8368 op0 = CONST0_RTX (mode);
8369 op1 = XEXP (x, 0);
8370 goto cost_minus;
8373 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8375 /* Support (neg(fma...)) as a single instruction only if
8376 sign of zeros is unimportant. This matches the decision
8377 making in aarch64.md. */
8378 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8380 /* FNMADD. */
8381 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8382 return true;
8384 if (GET_CODE (op0) == MULT)
8386 /* FNMUL. */
8387 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8388 return true;
8390 if (speed)
8391 /* FNEG. */
8392 *cost += extra_cost->fp[mode == DFmode].neg;
8393 return false;
8396 return false;
8398 case CLRSB:
8399 case CLZ:
8400 if (speed)
8402 if (VECTOR_MODE_P (mode))
8403 *cost += extra_cost->vect.alu;
8404 else
8405 *cost += extra_cost->alu.clz;
8408 return false;
8410 case COMPARE:
8411 op0 = XEXP (x, 0);
8412 op1 = XEXP (x, 1);
8414 if (op1 == const0_rtx
8415 && GET_CODE (op0) == AND)
8417 x = op0;
8418 mode = GET_MODE (op0);
8419 goto cost_logic;
8422 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8424 /* TODO: A write to the CC flags possibly costs extra, this
8425 needs encoding in the cost tables. */
8427 mode = GET_MODE (op0);
8428 /* ANDS. */
8429 if (GET_CODE (op0) == AND)
8431 x = op0;
8432 goto cost_logic;
8435 if (GET_CODE (op0) == PLUS)
8437 /* ADDS (and CMN alias). */
8438 x = op0;
8439 goto cost_plus;
8442 if (GET_CODE (op0) == MINUS)
8444 /* SUBS. */
8445 x = op0;
8446 goto cost_minus;
8449 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8450 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8451 && CONST_INT_P (XEXP (op0, 2)))
8453 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8454 Handle it here directly rather than going to cost_logic
8455 since we know the immediate generated for the TST is valid
8456 so we can avoid creating an intermediate rtx for it only
8457 for costing purposes. */
8458 if (speed)
8459 *cost += extra_cost->alu.logical;
8461 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8462 ZERO_EXTRACT, 0, speed);
8463 return true;
8466 if (GET_CODE (op1) == NEG)
8468 /* CMN. */
8469 if (speed)
8470 *cost += extra_cost->alu.arith;
8472 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8473 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8474 return true;
8477 /* CMP.
8479 Compare can freely swap the order of operands, and
8480 canonicalization puts the more complex operation first.
8481 But the integer MINUS logic expects the shift/extend
8482 operation in op1. */
8483 if (! (REG_P (op0)
8484 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8486 op0 = XEXP (x, 1);
8487 op1 = XEXP (x, 0);
8489 goto cost_minus;
8492 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8494 /* FCMP. */
8495 if (speed)
8496 *cost += extra_cost->fp[mode == DFmode].compare;
8498 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8500 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8501 /* FCMP supports constant 0.0 for no extra cost. */
8502 return true;
8504 return false;
8507 if (VECTOR_MODE_P (mode))
8509 /* Vector compare. */
8510 if (speed)
8511 *cost += extra_cost->vect.alu;
8513 if (aarch64_float_const_zero_rtx_p (op1))
8515 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8516 cost. */
8517 return true;
8519 return false;
8521 return false;
8523 case MINUS:
8525 op0 = XEXP (x, 0);
8526 op1 = XEXP (x, 1);
8528 cost_minus:
8529 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8531 /* Detect valid immediates. */
8532 if ((GET_MODE_CLASS (mode) == MODE_INT
8533 || (GET_MODE_CLASS (mode) == MODE_CC
8534 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8535 && CONST_INT_P (op1)
8536 && aarch64_uimm12_shift (INTVAL (op1)))
8538 if (speed)
8539 /* SUB(S) (immediate). */
8540 *cost += extra_cost->alu.arith;
8541 return true;
8544 /* Look for SUB (extended register). */
8545 if (is_a <scalar_int_mode> (mode, &int_mode)
8546 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8548 if (speed)
8549 *cost += extra_cost->alu.extend_arith;
8551 op1 = aarch64_strip_extend (op1, true);
8552 *cost += rtx_cost (op1, VOIDmode,
8553 (enum rtx_code) GET_CODE (op1), 0, speed);
8554 return true;
8557 rtx new_op1 = aarch64_strip_extend (op1, false);
8559 /* Cost this as an FMA-alike operation. */
8560 if ((GET_CODE (new_op1) == MULT
8561 || aarch64_shift_p (GET_CODE (new_op1)))
8562 && code != COMPARE)
8564 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8565 (enum rtx_code) code,
8566 speed);
8567 return true;
8570 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8572 if (speed)
8574 if (VECTOR_MODE_P (mode))
8576 /* Vector SUB. */
8577 *cost += extra_cost->vect.alu;
8579 else if (GET_MODE_CLASS (mode) == MODE_INT)
8581 /* SUB(S). */
8582 *cost += extra_cost->alu.arith;
8584 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8586 /* FSUB. */
8587 *cost += extra_cost->fp[mode == DFmode].addsub;
8590 return true;
8593 case PLUS:
8595 rtx new_op0;
8597 op0 = XEXP (x, 0);
8598 op1 = XEXP (x, 1);
8600 cost_plus:
8601 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8602 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8604 /* CSINC. */
8605 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8606 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8607 return true;
8610 if (GET_MODE_CLASS (mode) == MODE_INT
8611 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8612 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8614 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8616 if (speed)
8617 /* ADD (immediate). */
8618 *cost += extra_cost->alu.arith;
8619 return true;
8622 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8624 /* Look for ADD (extended register). */
8625 if (is_a <scalar_int_mode> (mode, &int_mode)
8626 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8628 if (speed)
8629 *cost += extra_cost->alu.extend_arith;
8631 op0 = aarch64_strip_extend (op0, true);
8632 *cost += rtx_cost (op0, VOIDmode,
8633 (enum rtx_code) GET_CODE (op0), 0, speed);
8634 return true;
8637 /* Strip any extend, leave shifts behind as we will
8638 cost them through mult_cost. */
8639 new_op0 = aarch64_strip_extend (op0, false);
8641 if (GET_CODE (new_op0) == MULT
8642 || aarch64_shift_p (GET_CODE (new_op0)))
8644 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8645 speed);
8646 return true;
8649 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8651 if (speed)
8653 if (VECTOR_MODE_P (mode))
8655 /* Vector ADD. */
8656 *cost += extra_cost->vect.alu;
8658 else if (GET_MODE_CLASS (mode) == MODE_INT)
8660 /* ADD. */
8661 *cost += extra_cost->alu.arith;
8663 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8665 /* FADD. */
8666 *cost += extra_cost->fp[mode == DFmode].addsub;
8669 return true;
8672 case BSWAP:
8673 *cost = COSTS_N_INSNS (1);
8675 if (speed)
8677 if (VECTOR_MODE_P (mode))
8678 *cost += extra_cost->vect.alu;
8679 else
8680 *cost += extra_cost->alu.rev;
8682 return false;
8684 case IOR:
8685 if (aarch_rev16_p (x))
8687 *cost = COSTS_N_INSNS (1);
8689 if (speed)
8691 if (VECTOR_MODE_P (mode))
8692 *cost += extra_cost->vect.alu;
8693 else
8694 *cost += extra_cost->alu.rev;
8696 return true;
8699 if (aarch64_extr_rtx_p (x, &op0, &op1))
8701 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8702 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8703 if (speed)
8704 *cost += extra_cost->alu.shift;
8706 return true;
8708 /* Fall through. */
8709 case XOR:
8710 case AND:
8711 cost_logic:
8712 op0 = XEXP (x, 0);
8713 op1 = XEXP (x, 1);
8715 if (VECTOR_MODE_P (mode))
8717 if (speed)
8718 *cost += extra_cost->vect.alu;
8719 return true;
8722 if (code == AND
8723 && GET_CODE (op0) == MULT
8724 && CONST_INT_P (XEXP (op0, 1))
8725 && CONST_INT_P (op1)
8726 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
8727 INTVAL (op1)) != 0)
8729 /* This is a UBFM/SBFM. */
8730 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
8731 if (speed)
8732 *cost += extra_cost->alu.bfx;
8733 return true;
8736 if (is_int_mode (mode, &int_mode))
8738 if (CONST_INT_P (op1))
8740 /* We have a mask + shift version of a UBFIZ
8741 i.e. the *andim_ashift<mode>_bfiz pattern. */
8742 if (GET_CODE (op0) == ASHIFT
8743 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
8744 XEXP (op0, 1)))
8746 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8747 (enum rtx_code) code, 0, speed);
8748 if (speed)
8749 *cost += extra_cost->alu.bfx;
8751 return true;
8753 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8755 /* We possibly get the immediate for free; this is not
8756 modelled. */
8757 *cost += rtx_cost (op0, int_mode,
8758 (enum rtx_code) code, 0, speed);
8759 if (speed)
8760 *cost += extra_cost->alu.logical;
8762 return true;
8765 else
8767 rtx new_op0 = op0;
8769 /* Handle ORN, EON, or BIC. */
8770 if (GET_CODE (op0) == NOT)
8771 op0 = XEXP (op0, 0);
8773 new_op0 = aarch64_strip_shift (op0);
8775 /* If we had a shift on op0 then this is a logical-shift-
8776 by-register/immediate operation. Otherwise, this is just
8777 a logical operation. */
8778 if (speed)
8780 if (new_op0 != op0)
8782 /* Shift by immediate. */
8783 if (CONST_INT_P (XEXP (op0, 1)))
8784 *cost += extra_cost->alu.log_shift;
8785 else
8786 *cost += extra_cost->alu.log_shift_reg;
8788 else
8789 *cost += extra_cost->alu.logical;
8792 /* In both cases we want to cost both operands. */
8793 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
8794 0, speed);
8795 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
8796 1, speed);
8798 return true;
8801 return false;
8803 case NOT:
8804 x = XEXP (x, 0);
8805 op0 = aarch64_strip_shift (x);
8807 if (VECTOR_MODE_P (mode))
8809 /* Vector NOT. */
8810 *cost += extra_cost->vect.alu;
8811 return false;
8814 /* MVN-shifted-reg. */
8815 if (op0 != x)
8817 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
8819 if (speed)
8820 *cost += extra_cost->alu.log_shift;
8822 return true;
8824 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
8825 Handle the second form here taking care that 'a' in the above can
8826 be a shift. */
8827 else if (GET_CODE (op0) == XOR)
8829 rtx newop0 = XEXP (op0, 0);
8830 rtx newop1 = XEXP (op0, 1);
8831 rtx op0_stripped = aarch64_strip_shift (newop0);
8833 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
8834 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
8836 if (speed)
8838 if (op0_stripped != newop0)
8839 *cost += extra_cost->alu.log_shift;
8840 else
8841 *cost += extra_cost->alu.logical;
8844 return true;
8846 /* MVN. */
8847 if (speed)
8848 *cost += extra_cost->alu.logical;
8850 return false;
8852 case ZERO_EXTEND:
8854 op0 = XEXP (x, 0);
8855 /* If a value is written in SI mode, then zero extended to DI
8856 mode, the operation will in general be free as a write to
8857 a 'w' register implicitly zeroes the upper bits of an 'x'
8858 register. However, if this is
8860 (set (reg) (zero_extend (reg)))
8862 we must cost the explicit register move. */
8863 if (mode == DImode
8864 && GET_MODE (op0) == SImode
8865 && outer == SET)
8867 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
8869 /* If OP_COST is non-zero, then the cost of the zero extend
8870 is effectively the cost of the inner operation. Otherwise
8871 we have a MOV instruction and we take the cost from the MOV
8872 itself. This is true independently of whether we are
8873 optimizing for space or time. */
8874 if (op_cost)
8875 *cost = op_cost;
8877 return true;
8879 else if (MEM_P (op0))
8881 /* All loads can zero extend to any size for free. */
8882 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
8883 return true;
8886 op0 = aarch64_extend_bitfield_pattern_p (x);
8887 if (op0)
8889 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
8890 if (speed)
8891 *cost += extra_cost->alu.bfx;
8892 return true;
8895 if (speed)
8897 if (VECTOR_MODE_P (mode))
8899 /* UMOV. */
8900 *cost += extra_cost->vect.alu;
8902 else
8904 /* We generate an AND instead of UXTB/UXTH. */
8905 *cost += extra_cost->alu.logical;
8908 return false;
8910 case SIGN_EXTEND:
8911 if (MEM_P (XEXP (x, 0)))
8913 /* LDRSH. */
8914 if (speed)
8916 rtx address = XEXP (XEXP (x, 0), 0);
8917 *cost += extra_cost->ldst.load_sign_extend;
8919 *cost +=
8920 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8921 0, speed));
8923 return true;
8926 op0 = aarch64_extend_bitfield_pattern_p (x);
8927 if (op0)
8929 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
8930 if (speed)
8931 *cost += extra_cost->alu.bfx;
8932 return true;
8935 if (speed)
8937 if (VECTOR_MODE_P (mode))
8938 *cost += extra_cost->vect.alu;
8939 else
8940 *cost += extra_cost->alu.extend;
8942 return false;
8944 case ASHIFT:
8945 op0 = XEXP (x, 0);
8946 op1 = XEXP (x, 1);
8948 if (CONST_INT_P (op1))
8950 if (speed)
8952 if (VECTOR_MODE_P (mode))
8954 /* Vector shift (immediate). */
8955 *cost += extra_cost->vect.alu;
8957 else
8959 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
8960 aliases. */
8961 *cost += extra_cost->alu.shift;
8965 /* We can incorporate zero/sign extend for free. */
8966 if (GET_CODE (op0) == ZERO_EXTEND
8967 || GET_CODE (op0) == SIGN_EXTEND)
8968 op0 = XEXP (op0, 0);
8970 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
8971 return true;
8973 else
8975 if (VECTOR_MODE_P (mode))
8977 if (speed)
8978 /* Vector shift (register). */
8979 *cost += extra_cost->vect.alu;
8981 else
8983 if (speed)
8984 /* LSLV. */
8985 *cost += extra_cost->alu.shift_reg;
8987 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
8988 && CONST_INT_P (XEXP (op1, 1))
8989 && known_eq (INTVAL (XEXP (op1, 1)),
8990 GET_MODE_BITSIZE (mode) - 1))
8992 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
8993 /* We already demanded XEXP (op1, 0) to be REG_P, so
8994 don't recurse into it. */
8995 return true;
8998 return false; /* All arguments need to be in registers. */
9001 case ROTATE:
9002 case ROTATERT:
9003 case LSHIFTRT:
9004 case ASHIFTRT:
9005 op0 = XEXP (x, 0);
9006 op1 = XEXP (x, 1);
9008 if (CONST_INT_P (op1))
9010 /* ASR (immediate) and friends. */
9011 if (speed)
9013 if (VECTOR_MODE_P (mode))
9014 *cost += extra_cost->vect.alu;
9015 else
9016 *cost += extra_cost->alu.shift;
9019 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9020 return true;
9022 else
9024 if (VECTOR_MODE_P (mode))
9026 if (speed)
9027 /* Vector shift (register). */
9028 *cost += extra_cost->vect.alu;
9030 else
9032 if (speed)
9033 /* ASR (register) and friends. */
9034 *cost += extra_cost->alu.shift_reg;
9036 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9037 && CONST_INT_P (XEXP (op1, 1))
9038 && known_eq (INTVAL (XEXP (op1, 1)),
9039 GET_MODE_BITSIZE (mode) - 1))
9041 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9042 /* We already demanded XEXP (op1, 0) to be REG_P, so
9043 don't recurse into it. */
9044 return true;
9047 return false; /* All arguments need to be in registers. */
9050 case SYMBOL_REF:
9052 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9053 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9055 /* LDR. */
9056 if (speed)
9057 *cost += extra_cost->ldst.load;
9059 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9060 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9062 /* ADRP, followed by ADD. */
9063 *cost += COSTS_N_INSNS (1);
9064 if (speed)
9065 *cost += 2 * extra_cost->alu.arith;
9067 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9068 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9070 /* ADR. */
9071 if (speed)
9072 *cost += extra_cost->alu.arith;
9075 if (flag_pic)
9077 /* One extra load instruction, after accessing the GOT. */
9078 *cost += COSTS_N_INSNS (1);
9079 if (speed)
9080 *cost += extra_cost->ldst.load;
9082 return true;
9084 case HIGH:
9085 case LO_SUM:
9086 /* ADRP/ADD (immediate). */
9087 if (speed)
9088 *cost += extra_cost->alu.arith;
9089 return true;
9091 case ZERO_EXTRACT:
9092 case SIGN_EXTRACT:
9093 /* UBFX/SBFX. */
9094 if (speed)
9096 if (VECTOR_MODE_P (mode))
9097 *cost += extra_cost->vect.alu;
9098 else
9099 *cost += extra_cost->alu.bfx;
9102 /* We can trust that the immediates used will be correct (there
9103 are no by-register forms), so we need only cost op0. */
9104 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9105 return true;
9107 case MULT:
9108 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9109 /* aarch64_rtx_mult_cost always handles recursion to its
9110 operands. */
9111 return true;
9113 case MOD:
9114 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9115 ANDs and a CSNEG. Assume here that a CSNEG costs the same as an
9116 unconditional negate. This case should only ever be reached through
9117 the set_smod_pow2_cheap check in expmed.c. */
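	/* The expansion is roughly, for x % 8 in SImode:
	     negs  w1, w0
	     and   w0, w0, 7
	     and   w1, w1, 7
	     csneg w0, w0, w1, mi
	   hence the baseline of four instructions below.  */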
9118 if (CONST_INT_P (XEXP (x, 1))
9119 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9120 && (mode == SImode || mode == DImode))
9122 /* We expand to 4 instructions. Reset the baseline. */
9123 *cost = COSTS_N_INSNS (4);
9125 if (speed)
9126 *cost += 2 * extra_cost->alu.logical
9127 + 2 * extra_cost->alu.arith;
9129 return true;
9132 /* Fall-through. */
9133 case UMOD:
9134 if (speed)
9136 /* Slightly prefer UMOD over SMOD. */
9137 if (VECTOR_MODE_P (mode))
9138 *cost += extra_cost->vect.alu;
9139 else if (GET_MODE_CLASS (mode) == MODE_INT)
9140 *cost += (extra_cost->mult[mode == DImode].add
9141 + extra_cost->mult[mode == DImode].idiv
9142 + (code == MOD ? 1 : 0));
9144 return false; /* All arguments need to be in registers. */
9146 case DIV:
9147 case UDIV:
9148 case SQRT:
9149 if (speed)
9151 if (VECTOR_MODE_P (mode))
9152 *cost += extra_cost->vect.alu;
9153 else if (GET_MODE_CLASS (mode) == MODE_INT)
9154 /* There is no integer SQRT, so only DIV and UDIV can get
9155 here. */
9156 *cost += (extra_cost->mult[mode == DImode].idiv
9157 /* Slightly prefer UDIV over SDIV. */
9158 + (code == DIV ? 1 : 0));
9159 else
9160 *cost += extra_cost->fp[mode == DFmode].div;
9162 return false; /* All arguments need to be in registers. */
9164 case IF_THEN_ELSE:
9165 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9166 XEXP (x, 2), cost, speed);
9168 case EQ:
9169 case NE:
9170 case GT:
9171 case GTU:
9172 case LT:
9173 case LTU:
9174 case GE:
9175 case GEU:
9176 case LE:
9177 case LEU:
9179 return false; /* All arguments must be in registers. */
9181 case FMA:
9182 op0 = XEXP (x, 0);
9183 op1 = XEXP (x, 1);
9184 op2 = XEXP (x, 2);
9186 if (speed)
9188 if (VECTOR_MODE_P (mode))
9189 *cost += extra_cost->vect.alu;
9190 else
9191 *cost += extra_cost->fp[mode == DFmode].fma;
9194 /* FMSUB, FNMADD, and FNMSUB are free. */
9195 if (GET_CODE (op0) == NEG)
9196 op0 = XEXP (op0, 0);
9198 if (GET_CODE (op2) == NEG)
9199 op2 = XEXP (op2, 0);
9201 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9202 and the by-element operand as operand 0. */
9203 if (GET_CODE (op1) == NEG)
9204 op1 = XEXP (op1, 0);
9206 /* Catch vector-by-element operations. The by-element operand can
9207 either be (vec_duplicate (vec_select (x))) or just
9208 (vec_select (x)), depending on whether we are multiplying by
9209 a vector or a scalar.
9211 Canonicalization is not very good in these cases, FMA4 will put the
9212 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9213 if (GET_CODE (op0) == VEC_DUPLICATE)
9214 op0 = XEXP (op0, 0);
9215 else if (GET_CODE (op1) == VEC_DUPLICATE)
9216 op1 = XEXP (op1, 0);
9218 if (GET_CODE (op0) == VEC_SELECT)
9219 op0 = XEXP (op0, 0);
9220 else if (GET_CODE (op1) == VEC_SELECT)
9221 op1 = XEXP (op1, 0);
9223 /* If the remaining parameters are not registers,
9224 get the cost to put them into registers. */
9225 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9226 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9227 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9228 return true;
9230 case FLOAT:
9231 case UNSIGNED_FLOAT:
9232 if (speed)
9233 *cost += extra_cost->fp[mode == DFmode].fromint;
9234 return false;
9236 case FLOAT_EXTEND:
9237 if (speed)
9239 if (VECTOR_MODE_P (mode))
9241 /* Vector widening conversion. */
9242 *cost += extra_cost->vect.alu;
9244 else
9245 *cost += extra_cost->fp[mode == DFmode].widen;
9247 return false;
9249 case FLOAT_TRUNCATE:
9250 if (speed)
9252 if (VECTOR_MODE_P (mode))
9254 /* Vector conversion. */
9255 *cost += extra_cost->vect.alu;
9257 else
9258 *cost += extra_cost->fp[mode == DFmode].narrow;
9260 return false;
9262 case FIX:
9263 case UNSIGNED_FIX:
9264 x = XEXP (x, 0);
9265 /* Strip the rounding part. They will all be implemented
9266 by the fcvt* family of instructions anyway. */
9267 if (GET_CODE (x) == UNSPEC)
9269 unsigned int uns_code = XINT (x, 1);
9271 if (uns_code == UNSPEC_FRINTA
9272 || uns_code == UNSPEC_FRINTM
9273 || uns_code == UNSPEC_FRINTN
9274 || uns_code == UNSPEC_FRINTP
9275 || uns_code == UNSPEC_FRINTZ)
9276 x = XVECEXP (x, 0, 0);
9279 if (speed)
9281 if (VECTOR_MODE_P (mode))
9282 *cost += extra_cost->vect.alu;
9283 else
9284 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9287 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9288 fixed-point fcvt. */
9289 if (GET_CODE (x) == MULT
9290 && ((VECTOR_MODE_P (mode)
9291 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9292 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9294 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9295 0, speed);
9296 return true;
9299 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9300 return true;
9302 case ABS:
9303 if (VECTOR_MODE_P (mode))
9305 /* ABS (vector). */
9306 if (speed)
9307 *cost += extra_cost->vect.alu;
9309 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9311 op0 = XEXP (x, 0);
9313 /* FABD, which is analogous to FADD. */
9314 if (GET_CODE (op0) == MINUS)
9316 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9317 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9318 if (speed)
9319 *cost += extra_cost->fp[mode == DFmode].addsub;
9321 return true;
9323 /* Simple FABS is analogous to FNEG. */
9324 if (speed)
9325 *cost += extra_cost->fp[mode == DFmode].neg;
9327 else
9329 /* Integer ABS will either be split into
9330 two arithmetic instructions, or will be an ABS
9331 (scalar), which we don't model. */
9332 *cost = COSTS_N_INSNS (2);
9333 if (speed)
9334 *cost += 2 * extra_cost->alu.arith;
9336 return false;
9338 case SMAX:
9339 case SMIN:
9340 if (speed)
9342 if (VECTOR_MODE_P (mode))
9343 *cost += extra_cost->vect.alu;
9344 else
9346 /* FMAXNM/FMINNM/FMAX/FMIN.
9347 TODO: This may not be accurate for all implementations, but
9348 we do not model this in the cost tables. */
9349 *cost += extra_cost->fp[mode == DFmode].addsub;
9352 return false;
9354 case UNSPEC:
9355 /* The floating point round to integer frint* instructions. */
9356 if (aarch64_frint_unspec_p (XINT (x, 1)))
9358 if (speed)
9359 *cost += extra_cost->fp[mode == DFmode].roundint;
9361 return false;
9364 if (XINT (x, 1) == UNSPEC_RBIT)
9366 if (speed)
9367 *cost += extra_cost->alu.rev;
9369 return false;
9371 break;
9373 case TRUNCATE:
9375 /* Decompose <su>muldi3_highpart. */
9376 if (/* (truncate:DI */
9377 mode == DImode
9378 /* (lshiftrt:TI */
9379 && GET_MODE (XEXP (x, 0)) == TImode
9380 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9381 /* (mult:TI */
9382 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9383 /* (ANY_EXTEND:TI (reg:DI))
9384 (ANY_EXTEND:TI (reg:DI))) */
9385 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9386 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9387 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9388 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9389 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9390 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9391 /* (const_int 64) */
9392 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9393 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9395 /* UMULH/SMULH. */
9396 if (speed)
9397 *cost += extra_cost->mult[mode == DImode].extend;
9398 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9399 mode, MULT, 0, speed);
9400 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9401 mode, MULT, 1, speed);
9402 return true;
9405 /* Fall through. */
9406 default:
9407 break;
9410 if (dump_file
9411 && flag_aarch64_verbose_cost)
9412 fprintf (dump_file,
9413 "\nFailed to cost RTX. Assuming default cost.\n");
9415 return true;
9418 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
9419 calculated for X. This cost is stored in *COST. Returns true
9420 if the total cost of X was calculated. */
9421 static bool
9422 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9423 int param, int *cost, bool speed)
9425 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9427 if (dump_file
9428 && flag_aarch64_verbose_cost)
9430 print_rtl_single (dump_file, x);
9431 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9432 speed ? "Hot" : "Cold",
9433 *cost, result ? "final" : "partial");
9436 return result;
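/* Implement TARGET_REGISTER_MOVE_COST.  Return the cost of moving a value
   of mode MODE between register classes FROM_I and TO_I.  */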
9439 static int
9440 aarch64_register_move_cost (machine_mode mode,
9441 reg_class_t from_i, reg_class_t to_i)
9443 enum reg_class from = (enum reg_class) from_i;
9444 enum reg_class to = (enum reg_class) to_i;
9445 const struct cpu_regmove_cost *regmove_cost
9446 = aarch64_tune_params.regmove_cost;
9448 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9449 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
9450 to = GENERAL_REGS;
9452 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
9453 from = GENERAL_REGS;
9455 /* Moving between a GPR and the stack register costs the same as GP2GP. */
9456 if ((from == GENERAL_REGS && to == STACK_REG)
9457 || (to == GENERAL_REGS && from == STACK_REG))
9458 return regmove_cost->GP2GP;
9460 /* To/from the stack register, we move via the GPRs. */
9461 if (to == STACK_REG || from == STACK_REG)
9462 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9463 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9465 if (known_eq (GET_MODE_SIZE (mode), 16))
9467 /* 128-bit operations on general registers require 2 instructions. */
9468 if (from == GENERAL_REGS && to == GENERAL_REGS)
9469 return regmove_cost->GP2GP * 2;
9470 else if (from == GENERAL_REGS)
9471 return regmove_cost->GP2FP * 2;
9472 else if (to == GENERAL_REGS)
9473 return regmove_cost->FP2GP * 2;
9475 /* When AdvSIMD instructions are disabled it is not possible to move
9476 a 128-bit value directly between Q registers. This is handled in
9477 secondary reload. A general register is used as a scratch to move
9478 the upper DI value and the lower DI value is moved directly,
9479 hence the cost is the sum of three moves. */
9480 if (! TARGET_SIMD)
9481 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9483 return regmove_cost->FP2FP;
9486 if (from == GENERAL_REGS && to == GENERAL_REGS)
9487 return regmove_cost->GP2GP;
9488 else if (from == GENERAL_REGS)
9489 return regmove_cost->GP2FP;
9490 else if (to == GENERAL_REGS)
9491 return regmove_cost->FP2GP;
9493 return regmove_cost->FP2FP;
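/* Implement TARGET_MEMORY_MOVE_COST.  A single per-tuning cost is used
   for all modes and register classes.  */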
9496 static int
9497 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9498 reg_class_t rclass ATTRIBUTE_UNUSED,
9499 bool in ATTRIBUTE_UNUSED)
9501 return aarch64_tune_params.memmov_cost;
9504 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9505 to optimize 1.0/sqrt. */
9507 static bool
9508 use_rsqrt_p (machine_mode mode)
9510 return (!flag_trapping_math
9511 && flag_unsafe_math_optimizations
9512 && ((aarch64_tune_params.approx_modes->recip_sqrt
9513 & AARCH64_APPROX_MODE (mode))
9514 || flag_mrecip_low_precision_sqrt));
9517 /* Function to decide when to use the approximate reciprocal square root
9518 builtin. */
9520 static tree
9521 aarch64_builtin_reciprocal (tree fndecl)
9523 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9525 if (!use_rsqrt_p (mode))
9526 return NULL_TREE;
9527 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9530 typedef rtx (*rsqrte_type) (rtx, rtx);
9532 /* Select reciprocal square root initial estimate insn depending on machine
9533 mode. */
9535 static rsqrte_type
9536 get_rsqrte_type (machine_mode mode)
9538 switch (mode)
9540 case E_DFmode: return gen_aarch64_rsqrtedf;
9541 case E_SFmode: return gen_aarch64_rsqrtesf;
9542 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9543 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9544 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9545 default: gcc_unreachable ();
9549 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9551 /* Select reciprocal square root series step insn depending on machine mode. */
9553 static rsqrts_type
9554 get_rsqrts_type (machine_mode mode)
9556 switch (mode)
9558 case E_DFmode: return gen_aarch64_rsqrtsdf;
9559 case E_SFmode: return gen_aarch64_rsqrtssf;
9560 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9561 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9562 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9563 default: gcc_unreachable ();
9567 /* Emit instruction sequence to compute either the approximate square root
9568 or its approximate reciprocal, depending on the flag RECP, and return
9569 whether the sequence was emitted or not. */
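/* The initial FRSQRTE estimate is refined with Newton-Raphson steps of
   the form x' = x * (3 - d * x * x) / 2, where (3 - a * b) / 2 is the
   quantity computed by the FRSQRTS instruction.  */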
9571 bool
9572 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9574 machine_mode mode = GET_MODE (dst);
9576 if (GET_MODE_INNER (mode) == HFmode)
9578 gcc_assert (!recp);
9579 return false;
9582 if (!recp)
9584 if (!(flag_mlow_precision_sqrt
9585 || (aarch64_tune_params.approx_modes->sqrt
9586 & AARCH64_APPROX_MODE (mode))))
9587 return false;
9589 if (flag_finite_math_only
9590 || flag_trapping_math
9591 || !flag_unsafe_math_optimizations
9592 || optimize_function_for_size_p (cfun))
9593 return false;
9595 else
9596 /* Caller assumes we cannot fail. */
9597 gcc_assert (use_rsqrt_p (mode));
9599 machine_mode mmsk = mode_for_int_vector (mode).require ();
9600 rtx xmsk = gen_reg_rtx (mmsk);
9601 if (!recp)
9602 /* When calculating the approximate square root, compare the
9603 argument with 0.0 and create a mask. */
9604 emit_insn (gen_rtx_SET (xmsk,
9605 gen_rtx_NEG (mmsk,
9606 gen_rtx_EQ (mmsk, src,
9607 CONST0_RTX (mode)))));
9609 /* Estimate the approximate reciprocal square root. */
9610 rtx xdst = gen_reg_rtx (mode);
9611 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9613 /* Iterate over the series twice for SF and thrice for DF. */
9614 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9616 /* Optionally iterate over the series once less for faster performance
9617 while sacrificing some accuracy. */
9618 if ((recp && flag_mrecip_low_precision_sqrt)
9619 || (!recp && flag_mlow_precision_sqrt))
9620 iterations--;
9622 /* Iterate over the series to calculate the approximate reciprocal square
9623 root. */
9624 rtx x1 = gen_reg_rtx (mode);
9625 while (iterations--)
9627 rtx x2 = gen_reg_rtx (mode);
9628 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9630 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9632 if (iterations > 0)
9633 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9636 if (!recp)
9638 /* Qualify the approximate reciprocal square root when the argument is
9639 0.0 by squashing the intermediate result to 0.0. */
9640 rtx xtmp = gen_reg_rtx (mmsk);
9641 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9642 gen_rtx_SUBREG (mmsk, xdst, 0)));
9643 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9645 /* Calculate the approximate square root. */
9646 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9649 /* Finalize the approximation. */
9650 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9652 return true;
9655 typedef rtx (*recpe_type) (rtx, rtx);
9657 /* Select reciprocal initial estimate insn depending on machine mode. */
9659 static recpe_type
9660 get_recpe_type (machine_mode mode)
9662 switch (mode)
9664 case E_SFmode: return (gen_aarch64_frecpesf);
9665 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9666 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9667 case E_DFmode: return (gen_aarch64_frecpedf);
9668 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9669 default: gcc_unreachable ();
9673 typedef rtx (*recps_type) (rtx, rtx, rtx);
9675 /* Select reciprocal series step insn depending on machine mode. */
9677 static recps_type
9678 get_recps_type (machine_mode mode)
9680 switch (mode)
9682 case E_SFmode: return (gen_aarch64_frecpssf);
9683 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9684 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9685 case E_DFmode: return (gen_aarch64_frecpsdf);
9686 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9687 default: gcc_unreachable ();
9691 /* Emit the instruction sequence to compute the approximation for the division
9692 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
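/* The initial FRECPE estimate of 1/DEN is refined with Newton-Raphson
   steps of the form x' = x * (2 - d * x), where (2 - a * b) is the
   quantity computed by the FRECPS instruction; the quotient is then
   obtained as NUM * (1/DEN).  */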
9694 bool
9695 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9697 machine_mode mode = GET_MODE (quo);
9699 if (GET_MODE_INNER (mode) == HFmode)
9700 return false;
9702 bool use_approx_division_p = (flag_mlow_precision_div
9703 || (aarch64_tune_params.approx_modes->division
9704 & AARCH64_APPROX_MODE (mode)));
9706 if (!flag_finite_math_only
9707 || flag_trapping_math
9708 || !flag_unsafe_math_optimizations
9709 || optimize_function_for_size_p (cfun)
9710 || !use_approx_division_p)
9711 return false;
9713 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9714 return false;
9716 /* Estimate the approximate reciprocal. */
9717 rtx xrcp = gen_reg_rtx (mode);
9718 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
9720 /* Iterate over the series twice for SF and thrice for DF. */
9721 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9723 /* Optionally iterate over the series once less for faster performance,
9724 while sacrificing some accuracy. */
9725 if (flag_mlow_precision_div)
9726 iterations--;
9728 /* Iterate over the series to calculate the approximate reciprocal. */
9729 rtx xtmp = gen_reg_rtx (mode);
9730 while (iterations--)
9732 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
9734 if (iterations > 0)
9735 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9738 if (num != CONST1_RTX (mode))
9740 /* As the approximate reciprocal of DEN is already calculated, only
9741 calculate the approximate division when NUM is not 1.0. */
9742 rtx xnum = force_reg (mode, num);
9743 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
9746 /* Finalize the approximation. */
9747 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
9748 return true;
9751 /* Return the number of instructions that can be issued per cycle. */
9752 static int
9753 aarch64_sched_issue_rate (void)
9755 return aarch64_tune_params.issue_rate;
9758 static int
9759 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9761 int issue_rate = aarch64_sched_issue_rate ();
9763 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
9767 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9768 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9769 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9771 static int
9772 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
9773 int ready_index)
9775 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
9779 /* Vectorizer cost model target hooks. */
9781 /* Implement targetm.vectorize.builtin_vectorization_cost. */
9782 static int
9783 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
9784 tree vectype,
9785 int misalign ATTRIBUTE_UNUSED)
9787 unsigned elements;
9788 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
9789 bool fp = false;
9791 if (vectype != NULL)
9792 fp = FLOAT_TYPE_P (vectype);
9794 switch (type_of_cost)
9796 case scalar_stmt:
9797 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
9799 case scalar_load:
9800 return costs->scalar_load_cost;
9802 case scalar_store:
9803 return costs->scalar_store_cost;
9805 case vector_stmt:
9806 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9808 case vector_load:
9809 return costs->vec_align_load_cost;
9811 case vector_store:
9812 return costs->vec_store_cost;
9814 case vec_to_scalar:
9815 return costs->vec_to_scalar_cost;
9817 case scalar_to_vec:
9818 return costs->scalar_to_vec_cost;
9820 case unaligned_load:
9821 case vector_gather_load:
9822 return costs->vec_unalign_load_cost;
9824 case unaligned_store:
9825 case vector_scatter_store:
9826 return costs->vec_unalign_store_cost;
9828 case cond_branch_taken:
9829 return costs->cond_taken_branch_cost;
9831 case cond_branch_not_taken:
9832 return costs->cond_not_taken_branch_cost;
9834 case vec_perm:
9835 return costs->vec_permute_cost;
9837 case vec_promote_demote:
9838 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9840 case vec_construct:
9841 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
9842 return elements / 2 + 1;
9844 default:
9845 gcc_unreachable ();
9849 /* Implement targetm.vectorize.add_stmt_cost. */
9850 static unsigned
9851 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
9852 struct _stmt_vec_info *stmt_info, int misalign,
9853 enum vect_cost_model_location where)
9855 unsigned *cost = (unsigned *) data;
9856 unsigned retval = 0;
9858 if (flag_vect_cost_model)
9860 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
9861 int stmt_cost =
9862 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
9864 /* Statements in an inner loop relative to the loop being
9865 vectorized are weighted more heavily. The value here is
9866 arbitrary and could potentially be improved with analysis. */
9867 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
9868 count *= 50; /* FIXME */
9870 retval = (unsigned) (count * stmt_cost);
9871 cost[where] += retval;
9874 return retval;
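/* For illustration only: with the hook above, a vector statement of cost 4
   and count 1 that belongs to the vector body (WHERE == vect_body) but sits
   in an inner loop relative to the loop being vectorized is accumulated as
   1 * 50 * 4 = 200 because of the arbitrary times-50 weighting, while the
   same statement outside an inner loop contributes just 4.  */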
9877 static void initialize_aarch64_code_model (struct gcc_options *);
9879 /* Parse the TO_PARSE string and put the architecture struct that it
9880 selects into RES and the architectural features into ISA_FLAGS.
9881 Return an aarch64_parse_opt_result describing the parse result.
9882 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
9884 static enum aarch64_parse_opt_result
9885 aarch64_parse_arch (const char *to_parse, const struct processor **res,
9886 unsigned long *isa_flags)
9888 char *ext;
9889 const struct processor *arch;
9890 char *str = (char *) alloca (strlen (to_parse) + 1);
9891 size_t len;
9893 strcpy (str, to_parse);
9895 ext = strchr (str, '+');
9897 if (ext != NULL)
9898 len = ext - str;
9899 else
9900 len = strlen (str);
9902 if (len == 0)
9903 return AARCH64_PARSE_MISSING_ARG;
9906 /* Loop through the list of supported ARCHes to find a match. */
9907 for (arch = all_architectures; arch->name != NULL; arch++)
9909 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
9911 unsigned long isa_temp = arch->flags;
9913 if (ext != NULL)
9915 /* TO_PARSE string contains at least one extension. */
9916 enum aarch64_parse_opt_result ext_res
9917 = aarch64_parse_extension (ext, &isa_temp);
9919 if (ext_res != AARCH64_PARSE_OK)
9920 return ext_res;
9922 /* Extension parsing was successful. Confirm the result
9923 arch and ISA flags. */
9924 *res = arch;
9925 *isa_flags = isa_temp;
9926 return AARCH64_PARSE_OK;
9930 /* ARCH name not found in list. */
9931 /* ARCH name not found in the list. */
9931 return AARCH64_PARSE_INVALID_ARG;
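/* For illustration only (assuming "armv8.2-a" and the "sve" modifier are
   present in all_architectures and the extension table): given the string
   from

     -march=armv8.2-a+sve

   the parser above splits at the first '+', matches "armv8.2-a" against
   all_architectures and hands "+sve" to aarch64_parse_extension to adjust
   the ISA flags.  An empty architecture name, as in "-march=+sve", results
   in AARCH64_PARSE_MISSING_ARG.  */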
9934 /* Parse the TO_PARSE string and put the result tuning in RES and the
9935 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
9936 describing the parse result. If there is an error parsing, RES and
9937 ISA_FLAGS are left unchanged. */
9939 static enum aarch64_parse_opt_result
9940 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
9941 unsigned long *isa_flags)
9943 char *ext;
9944 const struct processor *cpu;
9945 char *str = (char *) alloca (strlen (to_parse) + 1);
9946 size_t len;
9948 strcpy (str, to_parse);
9950 ext = strchr (str, '+');
9952 if (ext != NULL)
9953 len = ext - str;
9954 else
9955 len = strlen (str);
9957 if (len == 0)
9958 return AARCH64_PARSE_MISSING_ARG;
9961 /* Loop through the list of supported CPUs to find a match. */
9962 for (cpu = all_cores; cpu->name != NULL; cpu++)
9964 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
9966 unsigned long isa_temp = cpu->flags;
9969 if (ext != NULL)
9971 /* TO_PARSE string contains at least one extension. */
9972 enum aarch64_parse_opt_result ext_res
9973 = aarch64_parse_extension (ext, &isa_temp);
9975 if (ext_res != AARCH64_PARSE_OK)
9976 return ext_res;
9978 /* Extension parsing was successful. Confirm the result
9979 cpu and ISA flags. */
9980 *res = cpu;
9981 *isa_flags = isa_temp;
9982 return AARCH64_PARSE_OK;
9986 /* CPU name not found in list. */
9987 return AARCH64_PARSE_INVALID_ARG;
9990 /* Parse the TO_PARSE string and put the cpu it selects into RES.
9991 Return an aarch64_parse_opt_result describing the parse result.
9992 If the parsing fails, RES is left unchanged. */
9994 static enum aarch64_parse_opt_result
9995 aarch64_parse_tune (const char *to_parse, const struct processor **res)
9997 const struct processor *cpu;
9998 char *str = (char *) alloca (strlen (to_parse) + 1);
10000 strcpy (str, to_parse);
10002 /* Loop through the list of supported CPUs to find a match. */
10003 for (cpu = all_cores; cpu->name != NULL; cpu++)
10005 if (strcmp (cpu->name, str) == 0)
10007 *res = cpu;
10008 return AARCH64_PARSE_OK;
10012 /* CPU name not found in list. */
10013 return AARCH64_PARSE_INVALID_ARG;
10016 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10017 described in FLAG. If it is, return the index bit for that fusion type.
10018 If not, error (printing OPTION_NAME) and return zero. */
10020 static unsigned int
10021 aarch64_parse_one_option_token (const char *token,
10022 size_t length,
10023 const struct aarch64_flag_desc *flag,
10024 const char *option_name)
10026 for (; flag->name != NULL; flag++)
10028 if (length == strlen (flag->name)
10029 && !strncmp (flag->name, token, length))
10030 return flag->flag;
10033 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10034 return 0;
10037 /* Parse OPTION, which is a '.'-separated list of flags to enable.
10038 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10039 default state we inherit from the CPU tuning structures. OPTION_NAME
10040 gives the top-level option we are parsing in the -moverride string,
10041 for use in error messages. */
10043 static unsigned int
10044 aarch64_parse_boolean_options (const char *option,
10045 const struct aarch64_flag_desc *flags,
10046 unsigned int initial_state,
10047 const char *option_name)
10049 const char separator = '.';
10050 const char* specs = option;
10051 const char* ntoken = option;
10052 unsigned int found_flags = initial_state;
10054 while ((ntoken = strchr (specs, separator)))
10056 size_t token_length = ntoken - specs;
10057 unsigned token_ops = aarch64_parse_one_option_token (specs,
10058 token_length,
10059 flags,
10060 option_name);
10061 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10062 in the token stream, reset the supported operations. So:
10064 adrp+add.cmp+branch.none.adrp+add
10066 would have the result of turning on only adrp+add fusion. */
10067 if (!token_ops)
10068 found_flags = 0;
10070 found_flags |= token_ops;
10071 specs = ++ntoken;
10074 /* The string ended with a trailing separator; report an error. */
10075 if (!(*specs))
10077 error ("%s string ill-formed\n", option_name);
10078 return 0;
10081 /* We still have one more token to parse. */
10082 size_t token_length = strlen (specs);
10083 unsigned token_ops = aarch64_parse_one_option_token (specs,
10084 token_length,
10085 flags,
10086 option_name);
10087 if (!token_ops)
10088 found_flags = 0;
10090 found_flags |= token_ops;
10091 return found_flags;
10094 /* Support for overriding instruction fusion. */
10096 static void
10097 aarch64_parse_fuse_string (const char *fuse_string,
10098 struct tune_params *tune)
10100 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10101 aarch64_fusible_pairs,
10102 tune->fusible_ops,
10103 "fuse=");
10106 /* Support for overriding other tuning flags. */
10108 static void
10109 aarch64_parse_tune_string (const char *tune_string,
10110 struct tune_params *tune)
10112 tune->extra_tuning_flags
10113 = aarch64_parse_boolean_options (tune_string,
10114 aarch64_tuning_flags,
10115 tune->extra_tuning_flags,
10116 "tune=");
10119 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10120 we understand. If it is, extract the option string and hand it off to
10121 the appropriate function. */
10123 void
10124 aarch64_parse_one_override_token (const char* token,
10125 size_t length,
10126 struct tune_params *tune)
10128 const struct aarch64_tuning_override_function *fn
10129 = aarch64_tuning_override_functions;
10131 const char *option_part = strchr (token, '=');
10132 if (!option_part)
10134 error ("tuning string missing in option (%s)", token);
10135 return;
10138 /* Get the length of the option name. */
10139 length = option_part - token;
10140 /* Skip the '=' to get to the option string. */
10141 option_part++;
10143 for (; fn->name != NULL; fn++)
10145 if (!strncmp (fn->name, token, length))
10147 fn->parse_override (option_part, tune);
10148 return;
10152 error ("unknown tuning option (%s)", token);
10153 return;
10156 /* Validate and clamp the TLS size against the limits imposed by the code model in OPTS. */
10158 static void
10159 initialize_aarch64_tls_size (struct gcc_options *opts)
10161 if (aarch64_tls_size == 0)
10162 aarch64_tls_size = 24;
10164 switch (opts->x_aarch64_cmodel_var)
10166 case AARCH64_CMODEL_TINY:
10167 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
10168 needs two instructions to address, so we clamp the size to 24 bits. */
10169 if (aarch64_tls_size > 24)
10170 aarch64_tls_size = 24;
10171 break;
10172 case AARCH64_CMODEL_SMALL:
10173 /* The maximum TLS size allowed under small is 4G. */
10174 if (aarch64_tls_size > 32)
10175 aarch64_tls_size = 32;
10176 break;
10177 case AARCH64_CMODEL_LARGE:
10178 /* The maximum TLS size allowed under large is 16E.
10179 FIXME: 16E needs a 64-bit offset, but we only support 48-bit offsets for now. */
10180 if (aarch64_tls_size > 48)
10181 aarch64_tls_size = 48;
10182 break;
10183 default:
10184 gcc_unreachable ();
10187 return;
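/* For illustration only: under -mcmodel=tiny a request such as
   -mtls-size=32 is clamped to 24 by the code above, since tiny allows a TLS
   segment of at most 1M; under -mcmodel=small the same request is accepted
   and only values above 32 would be clamped.  */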
10190 /* Parse STRING looking for options in the format:
10191 string :: option:string
10192 option :: name=substring
10193 name :: {a-z}
10194 substring :: defined by option. */
10196 static void
10197 aarch64_parse_override_string (const char* input_string,
10198 struct tune_params* tune)
10200 const char separator = ':';
10201 size_t string_length = strlen (input_string) + 1;
10202 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10203 char *string = string_root;
10204 strncpy (string, input_string, string_length);
10205 string[string_length - 1] = '\0';
10207 char* ntoken = string;
10209 while ((ntoken = strchr (string, separator)))
10211 size_t token_length = ntoken - string;
10212 /* NUL-terminate this substring so it can be treated as a string. */
10213 *ntoken = '\0';
10214 aarch64_parse_one_override_token (string, token_length, tune);
10215 string = ++ntoken;
10218 /* One last option to parse. */
10219 aarch64_parse_one_override_token (string, strlen (string), tune);
10220 free (string_root);
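/* For illustration only: putting the routines above together, an -moverride
   string is first split on ':' into name=value tokens and each value is then
   split on '.' into individual flags, so a string such as

     -moverride=fuse=adrp+add.cmp+branch

   enables the adrp+add and cmp+branch fusion pairs on top of the CPU's
   defaults (the pair names are taken from the fusion example earlier in this
   file; other accepted names come from aarch64_fusible_pairs).  */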
10224 static void
10225 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10227 /* PR 70044: We have to be careful about being called multiple times for the
10228 same function. This means all changes should be repeatable. */
10230 /* If the frame pointer is enabled, set it to a special value that behaves
10231 similar to frame pointer omission. If we don't do this all leaf functions
10232 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10233 If flag_omit_frame_pointer has this special value, we must force the
10234 frame pointer if not in a leaf function. We also need to force it in a
10235 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10236 if (opts->x_flag_omit_frame_pointer == 0)
10237 opts->x_flag_omit_frame_pointer = 2;
10239 /* If not optimizing for size, set the default
10240 alignment to what the target wants. */
10241 if (!opts->x_optimize_size)
10243 if (opts->x_align_loops <= 0)
10244 opts->x_align_loops = aarch64_tune_params.loop_align;
10245 if (opts->x_align_jumps <= 0)
10246 opts->x_align_jumps = aarch64_tune_params.jump_align;
10247 if (opts->x_align_functions <= 0)
10248 opts->x_align_functions = aarch64_tune_params.function_align;
10251 /* We default to no pc-relative literal loads. */
10253 aarch64_pcrelative_literal_loads = false;
10255 /* If -mpc-relative-literal-loads is set on the command line, this
10256 implies that the user asked for PC relative literal loads. */
10257 if (opts->x_pcrelative_literal_loads == 1)
10258 aarch64_pcrelative_literal_loads = true;
10260 /* In the tiny memory model it makes no sense to disallow PC relative
10261 literal pool loads. */
10262 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10263 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10264 aarch64_pcrelative_literal_loads = true;
10266 /* When enabling the lower precision Newton series for the square root, also
10267 enable it for the reciprocal square root, since the latter is an
10268 intermediary step for the former. */
10269 if (flag_mlow_precision_sqrt)
10270 flag_mrecip_low_precision_sqrt = true;
10273 /* 'Unpack' the internal tuning structs and update the options
10274 in OPTS. The caller must have set up selected_tune and selected_arch
10275 as all the other target-specific codegen decisions are
10276 derived from them. */
10278 void
10279 aarch64_override_options_internal (struct gcc_options *opts)
10281 aarch64_tune_flags = selected_tune->flags;
10282 aarch64_tune = selected_tune->sched_core;
10283 /* Make a copy of the tuning parameters attached to the core, which
10284 we may later overwrite. */
10285 aarch64_tune_params = *(selected_tune->tune);
10286 aarch64_architecture_version = selected_arch->architecture_version;
10288 if (opts->x_aarch64_override_tune_string)
10289 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10290 &aarch64_tune_params);
10292 /* This target defaults to strict volatile bitfields. */
10293 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10294 opts->x_flag_strict_volatile_bitfields = 1;
10296 initialize_aarch64_code_model (opts);
10297 initialize_aarch64_tls_size (opts);
10299 int queue_depth = 0;
10300 switch (aarch64_tune_params.autoprefetcher_model)
10302 case tune_params::AUTOPREFETCHER_OFF:
10303 queue_depth = -1;
10304 break;
10305 case tune_params::AUTOPREFETCHER_WEAK:
10306 queue_depth = 0;
10307 break;
10308 case tune_params::AUTOPREFETCHER_STRONG:
10309 queue_depth = max_insn_queue_index + 1;
10310 break;
10311 default:
10312 gcc_unreachable ();
10315 /* We don't mind passing in global_options_set here as we don't use
10316 the *options_set structs anyway. */
10317 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10318 queue_depth,
10319 opts->x_param_values,
10320 global_options_set.x_param_values);
10322 /* Set up parameters to be used in prefetching algorithm. Do not
10323 override the defaults unless we are tuning for a core we have
10324 researched values for. */
10325 if (aarch64_tune_params.prefetch->num_slots > 0)
10326 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10327 aarch64_tune_params.prefetch->num_slots,
10328 opts->x_param_values,
10329 global_options_set.x_param_values);
10330 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10331 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10332 aarch64_tune_params.prefetch->l1_cache_size,
10333 opts->x_param_values,
10334 global_options_set.x_param_values);
10335 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10336 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10337 aarch64_tune_params.prefetch->l1_cache_line_size,
10338 opts->x_param_values,
10339 global_options_set.x_param_values);
10340 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10341 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10342 aarch64_tune_params.prefetch->l2_cache_size,
10343 opts->x_param_values,
10344 global_options_set.x_param_values);
10346 /* Use the alternative scheduling-pressure algorithm by default. */
10347 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10348 opts->x_param_values,
10349 global_options_set.x_param_values);
10351 /* Enable software prefetching at the specified optimization level for
10352 CPUs that have prefetch tuning parameters. Lower the optimization level
10353 threshold by 1 when profiling is enabled. */
10354 if (opts->x_flag_prefetch_loop_arrays < 0
10355 && !opts->x_optimize_size
10356 && aarch64_tune_params.prefetch->default_opt_level >= 0
10357 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10358 opts->x_flag_prefetch_loop_arrays = 1;
10360 aarch64_override_options_after_change_1 (opts);
10363 /* Print a hint with a suggestion for a core or architecture name that
10364 most closely resembles what the user passed in STR. ARCH is true if
10365 the user is asking for an architecture name. ARCH is false if the user
10366 is asking for a core name. */
10368 static void
10369 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10371 auto_vec<const char *> candidates;
10372 const struct processor *entry = arch ? all_architectures : all_cores;
10373 for (; entry->name != NULL; entry++)
10374 candidates.safe_push (entry->name);
10375 char *s;
10376 const char *hint = candidates_list_and_hint (str, s, candidates);
10377 if (hint)
10378 inform (input_location, "valid arguments are: %s;"
10379 " did you mean %qs?", s, hint);
10380 XDELETEVEC (s);
10383 /* Print a hint with a suggestion for a core name that most closely resembles
10384 what the user passed in STR. */
10386 inline static void
10387 aarch64_print_hint_for_core (const char *str)
10389 aarch64_print_hint_for_core_or_arch (str, false);
10392 /* Print a hint with a suggestion for an architecture name that most closely
10393 resembles what the user passed in STR. */
10395 inline static void
10396 aarch64_print_hint_for_arch (const char *str)
10398 aarch64_print_hint_for_core_or_arch (str, true);
10401 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10402 specified in STR and throw errors if appropriate. Put the results, if
10403 they are valid, in RES and ISA_FLAGS. Return whether the option is
10404 valid. */
10406 static bool
10407 aarch64_validate_mcpu (const char *str, const struct processor **res,
10408 unsigned long *isa_flags)
10410 enum aarch64_parse_opt_result parse_res
10411 = aarch64_parse_cpu (str, res, isa_flags);
10413 if (parse_res == AARCH64_PARSE_OK)
10414 return true;
10416 switch (parse_res)
10418 case AARCH64_PARSE_MISSING_ARG:
10419 error ("missing cpu name in %<-mcpu=%s%>", str);
10420 break;
10421 case AARCH64_PARSE_INVALID_ARG:
10422 error ("unknown value %qs for -mcpu", str);
10423 aarch64_print_hint_for_core (str);
10424 break;
10425 case AARCH64_PARSE_INVALID_FEATURE:
10426 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10427 break;
10428 default:
10429 gcc_unreachable ();
10432 return false;
10435 /* Validate a command-line -march option. Parse the arch and extensions
10436 (if any) specified in STR and throw errors if appropriate. Put the
10437 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10438 option is valid. */
10440 static bool
10441 aarch64_validate_march (const char *str, const struct processor **res,
10442 unsigned long *isa_flags)
10444 enum aarch64_parse_opt_result parse_res
10445 = aarch64_parse_arch (str, res, isa_flags);
10447 if (parse_res == AARCH64_PARSE_OK)
10448 return true;
10450 switch (parse_res)
10452 case AARCH64_PARSE_MISSING_ARG:
10453 error ("missing arch name in %<-march=%s%>", str);
10454 break;
10455 case AARCH64_PARSE_INVALID_ARG:
10456 error ("unknown value %qs for -march", str);
10457 aarch64_print_hint_for_arch (str);
10458 break;
10459 case AARCH64_PARSE_INVALID_FEATURE:
10460 error ("invalid feature modifier in %<-march=%s%>", str);
10461 break;
10462 default:
10463 gcc_unreachable ();
10466 return false;
10469 /* Validate a command-line -mtune option. Parse the cpu
10470 specified in STR and throw errors if appropriate. Put the
10471 result, if it is valid, in RES. Return whether the option is
10472 valid. */
10474 static bool
10475 aarch64_validate_mtune (const char *str, const struct processor **res)
10477 enum aarch64_parse_opt_result parse_res
10478 = aarch64_parse_tune (str, res);
10480 if (parse_res == AARCH64_PARSE_OK)
10481 return true;
10483 switch (parse_res)
10485 case AARCH64_PARSE_MISSING_ARG:
10486 error ("missing cpu name in %<-mtune=%s%>", str);
10487 break;
10488 case AARCH64_PARSE_INVALID_ARG:
10489 error ("unknown value %qs for -mtune", str);
10490 aarch64_print_hint_for_core (str);
10491 break;
10492 default:
10493 gcc_unreachable ();
10495 return false;
10498 /* Return the CPU corresponding to the enum CPU.
10499 If it doesn't specify a cpu, return the default. */
10501 static const struct processor *
10502 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10504 if (cpu != aarch64_none)
10505 return &all_cores[cpu];
10507 /* The & 0x3f is to extract the bottom 6 bits that encode the
10508 default cpu as selected by the --with-cpu GCC configure option
10509 in config.gcc.
10510 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10511 flags mechanism should be reworked to make it more sane. */
10512 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10515 /* Return the architecture corresponding to the enum ARCH.
10516 If it doesn't specify a valid architecture, return the default. */
10518 static const struct processor *
10519 aarch64_get_arch (enum aarch64_arch arch)
10521 if (arch != aarch64_no_arch)
10522 return &all_architectures[arch];
10524 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10526 return &all_architectures[cpu->arch];
10529 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10531 static poly_uint16
10532 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10534 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10535 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10536 deciding which .md file patterns to use and when deciding whether
10537 something is a legitimate address or constant. */
10538 if (value == SVE_SCALABLE || value == SVE_128)
10539 return poly_uint16 (2, 2);
10540 else
10541 return (int) value / 64;
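/* For illustration only: with the conversion above, -msve-vector-bits=256
   yields a fixed VG of 256 / 64 = 4 and -msve-vector-bits=512 yields 8,
   while both "scalable" and 128 produce the indeterminate poly_uint16 (2, 2),
   i.e. 2 + 2 * x granules for some unknown runtime value x >= 0.  */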
10544 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10545 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10546 tuning structs. In particular it must set selected_tune and
10547 aarch64_isa_flags that define the available ISA features and tuning
10548 decisions. It must also set selected_arch as this will be used to
10549 output the .arch asm tags for each function. */
10551 static void
10552 aarch64_override_options (void)
10554 unsigned long cpu_isa = 0;
10555 unsigned long arch_isa = 0;
10556 aarch64_isa_flags = 0;
10558 bool valid_cpu = true;
10559 bool valid_tune = true;
10560 bool valid_arch = true;
10562 selected_cpu = NULL;
10563 selected_arch = NULL;
10564 selected_tune = NULL;
10566 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10567 If either of -march or -mtune is given, they override their
10568 respective component of -mcpu. */
10569 if (aarch64_cpu_string)
10570 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10571 &cpu_isa);
10573 if (aarch64_arch_string)
10574 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10575 &arch_isa);
10577 if (aarch64_tune_string)
10578 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10580 /* If the user did not specify a processor, choose the default
10581 one for them. This will be the CPU set during configuration using
10582 --with-cpu, otherwise it is "generic". */
10583 if (!selected_cpu)
10585 if (selected_arch)
10587 selected_cpu = &all_cores[selected_arch->ident];
10588 aarch64_isa_flags = arch_isa;
10589 explicit_arch = selected_arch->arch;
10591 else
10593 /* Get default configure-time CPU. */
10594 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10595 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10598 if (selected_tune)
10599 explicit_tune_core = selected_tune->ident;
10601 /* If both -mcpu and -march are specified, check that they are architecturally
10602 compatible; warn if they are not, and prefer the -march ISA flags. */
10603 else if (selected_arch)
10605 if (selected_arch->arch != selected_cpu->arch)
10607 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10608 all_architectures[selected_cpu->arch].name,
10609 selected_arch->name);
10611 aarch64_isa_flags = arch_isa;
10612 explicit_arch = selected_arch->arch;
10613 explicit_tune_core = selected_tune ? selected_tune->ident
10614 : selected_cpu->ident;
10616 else
10618 /* -mcpu but no -march. */
10619 aarch64_isa_flags = cpu_isa;
10620 explicit_tune_core = selected_tune ? selected_tune->ident
10621 : selected_cpu->ident;
10622 gcc_assert (selected_cpu);
10623 selected_arch = &all_architectures[selected_cpu->arch];
10624 explicit_arch = selected_arch->arch;
10627 /* Set the arch as well, since we will need it when outputting
10628 the .arch directive in assembly. */
10629 if (!selected_arch)
10631 gcc_assert (selected_cpu);
10632 selected_arch = &all_architectures[selected_cpu->arch];
10635 if (!selected_tune)
10636 selected_tune = selected_cpu;
10638 #ifndef HAVE_AS_MABI_OPTION
10639 /* The compiler may have been configured with 2.23.* binutils, which does
10640 not have support for ILP32. */
10641 if (TARGET_ILP32)
10642 error ("assembler does not support -mabi=ilp32");
10643 #endif
10645 /* Convert -msve-vector-bits to a VG count. */
10646 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10648 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10649 sorry ("return address signing is only supported for -mabi=lp64");
10651 /* Make sure we properly set up the explicit options. */
10652 if ((aarch64_cpu_string && valid_cpu)
10653 || (aarch64_tune_string && valid_tune))
10654 gcc_assert (explicit_tune_core != aarch64_none);
10656 if ((aarch64_cpu_string && valid_cpu)
10657 || (aarch64_arch_string && valid_arch))
10658 gcc_assert (explicit_arch != aarch64_no_arch);
10660 aarch64_override_options_internal (&global_options);
10662 /* Save these options as the default ones in case we push and pop them later
10663 while processing functions with potential target attributes. */
10664 target_option_default_node = target_option_current_node
10665 = build_target_option_node (&global_options);
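/* For illustration only (cortex-a53 and armv8.2-a are used purely as example
   entries expected in all_cores and all_architectures): a combination such
   as -mcpu=cortex-a53 -march=armv8.2-a keeps cortex-a53 for tuning but takes
   its ISA flags from -march, and triggers the "conflicts with" warning above
   because cortex-a53 implements armv8-a rather than armv8.2-a.  */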
10668 /* Implement targetm.override_options_after_change. */
10670 static void
10671 aarch64_override_options_after_change (void)
10673 aarch64_override_options_after_change_1 (&global_options);
10676 static struct machine_function *
10677 aarch64_init_machine_status (void)
10679 struct machine_function *machine;
10680 machine = ggc_cleared_alloc<machine_function> ();
10681 return machine;
10684 void
10685 aarch64_init_expanders (void)
10687 init_machine_status = aarch64_init_machine_status;
10690 /* Select the code model to use based on OPTS, taking the PIC flags into account. */
10691 static void
10692 initialize_aarch64_code_model (struct gcc_options *opts)
10694 if (opts->x_flag_pic)
10696 switch (opts->x_aarch64_cmodel_var)
10698 case AARCH64_CMODEL_TINY:
10699 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10700 break;
10701 case AARCH64_CMODEL_SMALL:
10702 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10703 aarch64_cmodel = (flag_pic == 2
10704 ? AARCH64_CMODEL_SMALL_PIC
10705 : AARCH64_CMODEL_SMALL_SPIC);
10706 #else
10707 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10708 #endif
10709 break;
10710 case AARCH64_CMODEL_LARGE:
10711 sorry ("code model %qs with -f%s", "large",
10712 opts->x_flag_pic > 1 ? "PIC" : "pic");
10713 break;
10714 default:
10715 gcc_unreachable ();
10718 else
10719 aarch64_cmodel = opts->x_aarch64_cmodel_var;
10722 /* Implement TARGET_OPTION_SAVE. */
10724 static void
10725 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10727 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10730 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10731 using the information saved in PTR. */
10733 static void
10734 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
10736 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
10737 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10738 opts->x_explicit_arch = ptr->x_explicit_arch;
10739 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
10740 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
10742 aarch64_override_options_internal (opts);
10745 /* Implement TARGET_OPTION_PRINT. */
10747 static void
10748 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
10750 const struct processor *cpu
10751 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10752 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
10753 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
10754 std::string extension
10755 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
10757 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
10758 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
10759 arch->name, extension.c_str ());
10762 static GTY(()) tree aarch64_previous_fndecl;
10764 void
10765 aarch64_reset_previous_fndecl (void)
10767 aarch64_previous_fndecl = NULL;
10770 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
10771 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
10772 make sure optab availability predicates are recomputed when necessary. */
10774 void
10775 aarch64_save_restore_target_globals (tree new_tree)
10777 if (TREE_TARGET_GLOBALS (new_tree))
10778 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
10779 else if (new_tree == target_option_default_node)
10780 restore_target_globals (&default_target_globals);
10781 else
10782 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
10785 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
10786 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
10787 of the function, if such exists. This function may be called multiple
10788 times on a single function so use aarch64_previous_fndecl to avoid
10789 setting up identical state. */
10791 static void
10792 aarch64_set_current_function (tree fndecl)
10794 if (!fndecl || fndecl == aarch64_previous_fndecl)
10795 return;
10797 tree old_tree = (aarch64_previous_fndecl
10798 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
10799 : NULL_TREE);
10801 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10803 /* If current function has no attributes but the previous one did,
10804 use the default node. */
10805 if (!new_tree && old_tree)
10806 new_tree = target_option_default_node;
10808 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
10809 the default have been handled by aarch64_save_restore_target_globals from
10810 aarch64_pragma_target_parse. */
10811 if (old_tree == new_tree)
10812 return;
10814 aarch64_previous_fndecl = fndecl;
10816 /* First set the target options. */
10817 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
10819 aarch64_save_restore_target_globals (new_tree);
10822 /* Enum describing the various ways we can handle attributes.
10823 In many cases we can reuse the generic option handling machinery. */
10825 enum aarch64_attr_opt_type
10827 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
10828 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
10829 aarch64_attr_enum, /* Attribute sets an enum variable. */
10830 aarch64_attr_custom /* Attribute requires a custom handling function. */
10833 /* All the information needed to handle a target attribute.
10834 NAME is the name of the attribute.
10835 ATTR_TYPE specifies the type of behavior of the attribute as described
10836 in the definition of enum aarch64_attr_opt_type.
10837 ALLOW_NEG is true if the attribute supports a "no-" form.
10838 HANDLER is the function that takes the attribute string as an argument.
10839 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
10840 OPT_NUM is the enum specifying the option that the attribute modifies.
10841 This is needed for attributes that mirror the behavior of a command-line
10842 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
10843 aarch64_attr_enum. */
10845 struct aarch64_attribute_info
10847 const char *name;
10848 enum aarch64_attr_opt_type attr_type;
10849 bool allow_neg;
10850 bool (*handler) (const char *);
10851 enum opt_code opt_num;
10854 /* Handle the ARCH_STR argument to the arch= target attribute. */
10856 static bool
10857 aarch64_handle_attr_arch (const char *str)
10859 const struct processor *tmp_arch = NULL;
10860 enum aarch64_parse_opt_result parse_res
10861 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
10863 if (parse_res == AARCH64_PARSE_OK)
10865 gcc_assert (tmp_arch);
10866 selected_arch = tmp_arch;
10867 explicit_arch = selected_arch->arch;
10868 return true;
10871 switch (parse_res)
10873 case AARCH64_PARSE_MISSING_ARG:
10874 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
10875 break;
10876 case AARCH64_PARSE_INVALID_ARG:
10877 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
10878 aarch64_print_hint_for_arch (str);
10879 break;
10880 case AARCH64_PARSE_INVALID_FEATURE:
10881 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
10882 break;
10883 default:
10884 gcc_unreachable ();
10887 return false;
10890 /* Handle the argument CPU_STR to the cpu= target attribute. */
10892 static bool
10893 aarch64_handle_attr_cpu (const char *str)
10895 const struct processor *tmp_cpu = NULL;
10896 enum aarch64_parse_opt_result parse_res
10897 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
10899 if (parse_res == AARCH64_PARSE_OK)
10901 gcc_assert (tmp_cpu);
10902 selected_tune = tmp_cpu;
10903 explicit_tune_core = selected_tune->ident;
10905 selected_arch = &all_architectures[tmp_cpu->arch];
10906 explicit_arch = selected_arch->arch;
10907 return true;
10910 switch (parse_res)
10912 case AARCH64_PARSE_MISSING_ARG:
10913 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
10914 break;
10915 case AARCH64_PARSE_INVALID_ARG:
10916 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
10917 aarch64_print_hint_for_core (str);
10918 break;
10919 case AARCH64_PARSE_INVALID_FEATURE:
10920 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
10921 break;
10922 default:
10923 gcc_unreachable ();
10926 return false;
10929 /* Handle the argument STR to the tune= target attribute. */
10931 static bool
10932 aarch64_handle_attr_tune (const char *str)
10934 const struct processor *tmp_tune = NULL;
10935 enum aarch64_parse_opt_result parse_res
10936 = aarch64_parse_tune (str, &tmp_tune);
10938 if (parse_res == AARCH64_PARSE_OK)
10940 gcc_assert (tmp_tune);
10941 selected_tune = tmp_tune;
10942 explicit_tune_core = selected_tune->ident;
10943 return true;
10946 switch (parse_res)
10948 case AARCH64_PARSE_INVALID_ARG:
10949 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
10950 aarch64_print_hint_for_core (str);
10951 break;
10952 default:
10953 gcc_unreachable ();
10956 return false;
10959 /* Parse an architecture extensions target attribute string specified in STR.
10960 For example "+fp+nosimd". Show any errors if needed. Return TRUE
10961 if successful. Update aarch64_isa_flags to reflect the ISA features
10962 modified. */
10964 static bool
10965 aarch64_handle_attr_isa_flags (char *str)
10967 enum aarch64_parse_opt_result parse_res;
10968 unsigned long isa_flags = aarch64_isa_flags;
10970 /* We allow "+nothing" in the beginning to clear out all architectural
10971 features if the user wants to handpick specific features. */
10972 if (strncmp ("+nothing", str, 8) == 0)
10974 isa_flags = 0;
10975 str += 8;
10978 parse_res = aarch64_parse_extension (str, &isa_flags);
10980 if (parse_res == AARCH64_PARSE_OK)
10982 aarch64_isa_flags = isa_flags;
10983 return true;
10986 switch (parse_res)
10988 case AARCH64_PARSE_MISSING_ARG:
10989 error ("missing value in %<target()%> pragma or attribute");
10990 break;
10992 case AARCH64_PARSE_INVALID_FEATURE:
10993 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
10994 break;
10996 default:
10997 gcc_unreachable ();
11000 return false;
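/* For illustration only: with the handler above,

     __attribute__ ((target ("+nothing+fp")))
     void fp_only (void);

   first clears all architectural feature bits because of the leading
   "+nothing" and then re-enables just the "fp" extension (assuming "fp" is a
   recognised modifier, as in the "+fp+nosimd" example above), whereas a
   plain "+nosimd" starts from the current aarch64_isa_flags and only removes
   the SIMD bit.  */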
11003 /* The target attributes that we support. On top of these we also support just
11004 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11005 handled explicitly in aarch64_process_one_target_attr. */
11007 static const struct aarch64_attribute_info aarch64_attributes[] =
11009 { "general-regs-only", aarch64_attr_mask, false, NULL,
11010 OPT_mgeneral_regs_only },
11011 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11012 OPT_mfix_cortex_a53_835769 },
11013 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11014 OPT_mfix_cortex_a53_843419 },
11015 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11016 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11017 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11018 OPT_momit_leaf_frame_pointer },
11019 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11020 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11021 OPT_march_ },
11022 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11023 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11024 OPT_mtune_ },
11025 { "sign-return-address", aarch64_attr_enum, false, NULL,
11026 OPT_msign_return_address_ },
11027 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
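/* For illustration only, some ways the table above can be exercised
   ("trad" is assumed here to be one of the values accepted by -mtls-dialect):

     __attribute__ ((target ("strict-align"))) void f1 (void);
     __attribute__ ((target ("no-omit-leaf-frame-pointer"))) void f2 (void);
     __attribute__ ((target ("tls-dialect=trad"))) void f3 (void);

   "strict-align" is an aarch64_attr_mask entry and simply sets a bit in
   target_flags, "omit-leaf-frame-pointer" allows the negated "no-" form, and
   "tls-dialect" is an aarch64_attr_enum entry whose argument is validated by
   the generic option machinery.  */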
11030 /* Parse ARG_STR which contains the definition of one target attribute.
11031 Show appropriate errors if any or return true if the attribute is valid. */
11033 static bool
11034 aarch64_process_one_target_attr (char *arg_str)
11036 bool invert = false;
11038 size_t len = strlen (arg_str);
11040 if (len == 0)
11042 error ("malformed %<target()%> pragma or attribute");
11043 return false;
11046 char *str_to_check = (char *) alloca (len + 1);
11047 strcpy (str_to_check, arg_str);
11049 /* Skip leading whitespace. */
11050 while (*str_to_check == ' ' || *str_to_check == '\t')
11051 str_to_check++;
11053 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11054 It is easier to detect and handle it explicitly here rather than going
11055 through the machinery for the rest of the target attributes in this
11056 function. */
11057 if (*str_to_check == '+')
11058 return aarch64_handle_attr_isa_flags (str_to_check);
11060 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11062 invert = true;
11063 str_to_check += 3;
11065 char *arg = strchr (str_to_check, '=');
11067 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11068 and point ARG to "foo". */
11069 if (arg)
11071 *arg = '\0';
11072 arg++;
11074 const struct aarch64_attribute_info *p_attr;
11075 bool found = false;
11076 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11078 /* If the names don't match up, or the user has given an argument
11079 to an attribute that doesn't accept one, or didn't give an argument
11080 to an attribute that expects one, fail to match. */
11081 if (strcmp (str_to_check, p_attr->name) != 0)
11082 continue;
11084 found = true;
11085 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11086 || p_attr->attr_type == aarch64_attr_enum;
11088 if (attr_need_arg_p ^ (arg != NULL))
11090 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11091 return false;
11094 /* If the name matches but the attribute does not allow "no-" versions
11095 then we can't match. */
11096 if (invert && !p_attr->allow_neg)
11098 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11099 return false;
11102 switch (p_attr->attr_type)
11104 /* Has a custom handler registered.
11105 For example, cpu=, arch=, tune=. */
11106 case aarch64_attr_custom:
11107 gcc_assert (p_attr->handler);
11108 if (!p_attr->handler (arg))
11109 return false;
11110 break;
11112 /* Either set or unset a boolean option. */
11113 case aarch64_attr_bool:
11115 struct cl_decoded_option decoded;
11117 generate_option (p_attr->opt_num, NULL, !invert,
11118 CL_TARGET, &decoded);
11119 aarch64_handle_option (&global_options, &global_options_set,
11120 &decoded, input_location);
11121 break;
11123 /* Set or unset a bit in the target_flags. aarch64_handle_option
11124 should know what mask to apply given the option number. */
11125 case aarch64_attr_mask:
11127 struct cl_decoded_option decoded;
11128 /* We only need to specify the option number.
11129 aarch64_handle_option will know which mask to apply. */
11130 decoded.opt_index = p_attr->opt_num;
11131 decoded.value = !invert;
11132 aarch64_handle_option (&global_options, &global_options_set,
11133 &decoded, input_location);
11134 break;
11136 /* Use the option setting machinery to set an option to an enum. */
11137 case aarch64_attr_enum:
11139 gcc_assert (arg);
11140 bool valid;
11141 int value;
11142 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11143 &value, CL_TARGET);
11144 if (valid)
11146 set_option (&global_options, NULL, p_attr->opt_num, value,
11147 NULL, DK_UNSPECIFIED, input_location,
11148 global_dc);
11150 else
11152 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11154 break;
11156 default:
11157 gcc_unreachable ();
11161 /* If we reached here we either have found an attribute and validated
11162 it or didn't match any. If we matched an attribute but its arguments
11163 were malformed we will have returned false already. */
11164 return found;
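/* For illustration only: tracing the function above with the string
   "no-fix-cortex-a53-835769", the leading "no-" sets INVERT, the remainder
   matches the aarch64_attr_bool entry for that erratum workaround (which has
   allow_neg set), and generate_option is called with !invert so the
   workaround is disabled for the function.  A string starting with '+', such
   as "+crc", bypasses the table entirely and is treated as an ISA-flags
   update via aarch64_handle_attr_isa_flags.  */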
11167 /* Count how many times the character C appears in
11168 NULL-terminated string STR. */
11170 static unsigned int
11171 num_occurences_in_str (char c, char *str)
11173 unsigned int res = 0;
11174 while (*str != '\0')
11176 if (*str == c)
11177 res++;
11179 str++;
11182 return res;
11185 /* Parse the tree in ARGS that contains the target attribute information
11186 and update the global target options space. */
11188 bool
11189 aarch64_process_target_attr (tree args)
11191 if (TREE_CODE (args) == TREE_LIST)
11195 tree head = TREE_VALUE (args);
11196 if (head)
11198 if (!aarch64_process_target_attr (head))
11199 return false;
11201 args = TREE_CHAIN (args);
11202 } while (args);
11204 return true;
11207 if (TREE_CODE (args) != STRING_CST)
11209 error ("attribute %<target%> argument not a string");
11210 return false;
11213 size_t len = strlen (TREE_STRING_POINTER (args));
11214 char *str_to_check = (char *) alloca (len + 1);
11215 strcpy (str_to_check, TREE_STRING_POINTER (args));
11217 if (len == 0)
11219 error ("malformed %<target()%> pragma or attribute");
11220 return false;
11223 /* Used to catch empty strings between commas, i.e.
11224 attribute ((target ("attr1,,attr2"))). */
11225 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11227 /* Handle multiple target attributes separated by ','. */
11228 char *token = strtok (str_to_check, ",");
11230 unsigned int num_attrs = 0;
11231 while (token)
11233 num_attrs++;
11234 if (!aarch64_process_one_target_attr (token))
11236 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11237 return false;
11240 token = strtok (NULL, ",");
11243 if (num_attrs != num_commas + 1)
11245 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11246 return false;
11249 return true;
11252 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11253 process attribute ((target ("..."))). */
11255 static bool
11256 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11258 struct cl_target_option cur_target;
11259 bool ret;
11260 tree old_optimize;
11261 tree new_target, new_optimize;
11262 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11264 /* If what we're processing is the current pragma string then the
11265 target option node is already stored in target_option_current_node
11266 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11267 having to re-parse the string. This is especially useful to keep
11268 arm_neon.h compile times down since that header contains a lot
11269 of intrinsics enclosed in pragmas. */
11270 if (!existing_target && args == current_target_pragma)
11272 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11273 return true;
11275 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11277 old_optimize = build_optimization_node (&global_options);
11278 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11280 /* If the function changed the optimization levels as well as setting
11281 target options, start with the optimizations specified. */
11282 if (func_optimize && func_optimize != old_optimize)
11283 cl_optimization_restore (&global_options,
11284 TREE_OPTIMIZATION (func_optimize));
11286 /* Save the current target options to restore at the end. */
11287 cl_target_option_save (&cur_target, &global_options);
11289 /* If fndecl already has some target attributes applied to it, unpack
11290 them so that we add this attribute on top of them, rather than
11291 overwriting them. */
11292 if (existing_target)
11294 struct cl_target_option *existing_options
11295 = TREE_TARGET_OPTION (existing_target);
11297 if (existing_options)
11298 cl_target_option_restore (&global_options, existing_options);
11300 else
11301 cl_target_option_restore (&global_options,
11302 TREE_TARGET_OPTION (target_option_current_node));
11304 ret = aarch64_process_target_attr (args);
11306 /* Set up any additional state. */
11307 if (ret)
11309 aarch64_override_options_internal (&global_options);
11310 /* Initialize SIMD builtins if we haven't already.
11311 Set current_target_pragma to NULL for the duration so that
11312 the builtin initialization code doesn't try to tag the functions
11313 being built with the attributes specified by any current pragma, thus
11314 going into an infinite recursion. */
11315 if (TARGET_SIMD)
11317 tree saved_current_target_pragma = current_target_pragma;
11318 current_target_pragma = NULL;
11319 aarch64_init_simd_builtins ();
11320 current_target_pragma = saved_current_target_pragma;
11322 new_target = build_target_option_node (&global_options);
11324 else
11325 new_target = NULL;
11327 new_optimize = build_optimization_node (&global_options);
11329 if (fndecl && ret)
11331 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11333 if (old_optimize != new_optimize)
11334 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11337 cl_target_option_restore (&global_options, &cur_target);
11339 if (old_optimize != new_optimize)
11340 cl_optimization_restore (&global_options,
11341 TREE_OPTIMIZATION (old_optimize));
11342 return ret;
11345 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11346 tri-bool options (yes, no, don't care) and the default value is
11347 DEF, determine whether to reject inlining. */
11349 static bool
11350 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11351 int dont_care, int def)
11353 /* If the callee doesn't care, always allow inlining. */
11354 if (callee == dont_care)
11355 return true;
11357 /* If the caller doesn't care, always allow inlining. */
11358 if (caller == dont_care)
11359 return true;
11361 /* Otherwise, allow inlining if either the callee and caller values
11362 agree, or if the callee is using the default value. */
11363 return (callee == caller || callee == def);
11366 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11367 to inline CALLEE into CALLER based on target-specific info.
11368 Make sure that the caller and callee have compatible architectural
11369 features. Then go through the other possible target attributes
11370 and see if they can block inlining. Try not to reject always_inline
11371 callees unless they are incompatible architecturally. */
11373 static bool
11374 aarch64_can_inline_p (tree caller, tree callee)
11376 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11377 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11379 /* If callee has no option attributes, then it is ok to inline. */
11380 if (!callee_tree)
11381 return true;
11383 struct cl_target_option *caller_opts
11384 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11385 : target_option_default_node);
11387 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11390 /* Callee's ISA flags should be a subset of the caller's. */
11391 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11392 != callee_opts->x_aarch64_isa_flags)
11393 return false;
11395 /* Allow non-strict aligned functions inlining into strict
11396 aligned ones. */
11397 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11398 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11399 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11400 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11401 return false;
11403 bool always_inline = lookup_attribute ("always_inline",
11404 DECL_ATTRIBUTES (callee));
11406 /* If the architectural features match up and the callee is always_inline
11407 then the other attributes don't matter. */
11408 if (always_inline)
11409 return true;
11411 if (caller_opts->x_aarch64_cmodel_var
11412 != callee_opts->x_aarch64_cmodel_var)
11413 return false;
11415 if (caller_opts->x_aarch64_tls_dialect
11416 != callee_opts->x_aarch64_tls_dialect)
11417 return false;
11419 /* Honour explicit requests to workaround errata. */
11420 if (!aarch64_tribools_ok_for_inlining_p (
11421 caller_opts->x_aarch64_fix_a53_err835769,
11422 callee_opts->x_aarch64_fix_a53_err835769,
11423 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11424 return false;
11426 if (!aarch64_tribools_ok_for_inlining_p (
11427 caller_opts->x_aarch64_fix_a53_err843419,
11428 callee_opts->x_aarch64_fix_a53_err843419,
11429 2, TARGET_FIX_ERR_A53_843419))
11430 return false;
11432 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11433 caller and callee and they don't match up, reject inlining. */
11434 if (!aarch64_tribools_ok_for_inlining_p (
11435 caller_opts->x_flag_omit_leaf_frame_pointer,
11436 callee_opts->x_flag_omit_leaf_frame_pointer,
11437 2, 1))
11438 return false;
11440 /* If the callee has specific tuning overrides, respect them. */
11441 if (callee_opts->x_aarch64_override_tune_string != NULL
11442 && caller_opts->x_aarch64_override_tune_string == NULL)
11443 return false;
11445 /* If the user specified tuning override strings for the
11446 caller and callee and they don't match up, reject inlining.
11447 We just do a string compare here, we don't analyze the meaning
11448 of the string, as it would be too costly for little gain. */
11449 if (callee_opts->x_aarch64_override_tune_string
11450 && caller_opts->x_aarch64_override_tune_string
11451 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11452 caller_opts->x_aarch64_override_tune_string) != 0))
11453 return false;
11455 return true;
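/* For illustration only: a consequence of the checks above is that a callee
   carrying extra ISA bits, e.g.

     __attribute__ ((target ("+crc"))) int crc_helper (int x);

   cannot be inlined into a caller built without the CRC flag, because the
   callee's ISA flags are no longer a subset of the caller's; conversely a
   callee compiled without strict alignment may be inlined into a
   -mstrict-align caller, but not the other way round.  */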
11458 /* Return true if SYMBOL_REF X binds locally. */
11460 static bool
11461 aarch64_symbol_binds_local_p (const_rtx x)
11463 return (SYMBOL_REF_DECL (x)
11464 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11465 : SYMBOL_REF_LOCAL_P (x));
11468 /* Return true if SYMBOL_REF X is thread-local. */
11469 static bool
11470 aarch64_tls_symbol_p (rtx x)
11472 if (! TARGET_HAVE_TLS)
11473 return false;
11475 if (GET_CODE (x) != SYMBOL_REF)
11476 return false;
11478 return SYMBOL_REF_TLS_MODEL (x) != 0;
11481 /* Classify a TLS symbol into one of the TLS kinds. */
11482 enum aarch64_symbol_type
11483 aarch64_classify_tls_symbol (rtx x)
11485 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11487 switch (tls_kind)
11489 case TLS_MODEL_GLOBAL_DYNAMIC:
11490 case TLS_MODEL_LOCAL_DYNAMIC:
11491 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11493 case TLS_MODEL_INITIAL_EXEC:
11494 switch (aarch64_cmodel)
11496 case AARCH64_CMODEL_TINY:
11497 case AARCH64_CMODEL_TINY_PIC:
11498 return SYMBOL_TINY_TLSIE;
11499 default:
11500 return SYMBOL_SMALL_TLSIE;
11503 case TLS_MODEL_LOCAL_EXEC:
11504 if (aarch64_tls_size == 12)
11505 return SYMBOL_TLSLE12;
11506 else if (aarch64_tls_size == 24)
11507 return SYMBOL_TLSLE24;
11508 else if (aarch64_tls_size == 32)
11509 return SYMBOL_TLSLE32;
11510 else if (aarch64_tls_size == 48)
11511 return SYMBOL_TLSLE48;
11512 else
11513 gcc_unreachable ();
11515 case TLS_MODEL_EMULATED:
11516 case TLS_MODEL_NONE:
11517 return SYMBOL_FORCE_TO_MEM;
11519 default:
11520 gcc_unreachable ();
11524 /* Return the correct method for accessing X + OFFSET, where X is either
11525 a SYMBOL_REF or LABEL_REF. */
11527 enum aarch64_symbol_type
11528 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11530 if (GET_CODE (x) == LABEL_REF)
11532 switch (aarch64_cmodel)
11534 case AARCH64_CMODEL_LARGE:
11535 return SYMBOL_FORCE_TO_MEM;
11537 case AARCH64_CMODEL_TINY_PIC:
11538 case AARCH64_CMODEL_TINY:
11539 return SYMBOL_TINY_ABSOLUTE;
11541 case AARCH64_CMODEL_SMALL_SPIC:
11542 case AARCH64_CMODEL_SMALL_PIC:
11543 case AARCH64_CMODEL_SMALL:
11544 return SYMBOL_SMALL_ABSOLUTE;
11546 default:
11547 gcc_unreachable ();
11551 if (GET_CODE (x) == SYMBOL_REF)
11553 if (aarch64_tls_symbol_p (x))
11554 return aarch64_classify_tls_symbol (x);
11556 switch (aarch64_cmodel)
11558 case AARCH64_CMODEL_TINY:
11559 /* When we retrieve symbol + offset address, we have to make sure
11560 the offset does not cause overflow of the final address. But
11561 we have no way of knowing the address of symbol at compile time
11562 so we can't accurately say if the distance between the PC and
11563 symbol + offset is outside the addressable range of +/-1M in the
11564 TINY code model. So we rely on images not being greater than
11565 1M, cap the offset at 1M, and anything beyond 1M will have to
11566 be loaded using an alternative mechanism. Furthermore, if the
11567 symbol is a weak reference to something that isn't known to
11568 resolve to a symbol in this module, then force to memory. */
11569 if ((SYMBOL_REF_WEAK (x)
11570 && !aarch64_symbol_binds_local_p (x))
11571 || !IN_RANGE (offset, -1048575, 1048575))
11572 return SYMBOL_FORCE_TO_MEM;
11573 return SYMBOL_TINY_ABSOLUTE;
11575 case AARCH64_CMODEL_SMALL:
11576 /* Same reasoning as the tiny code model, but the offset cap here is
11577 4G. */
11578 if ((SYMBOL_REF_WEAK (x)
11579 && !aarch64_symbol_binds_local_p (x))
11580 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11581 HOST_WIDE_INT_C (4294967264)))
11582 return SYMBOL_FORCE_TO_MEM;
11583 return SYMBOL_SMALL_ABSOLUTE;
11585 case AARCH64_CMODEL_TINY_PIC:
11586 if (!aarch64_symbol_binds_local_p (x))
11587 return SYMBOL_TINY_GOT;
11588 return SYMBOL_TINY_ABSOLUTE;
11590 case AARCH64_CMODEL_SMALL_SPIC:
11591 case AARCH64_CMODEL_SMALL_PIC:
11592 if (!aarch64_symbol_binds_local_p (x))
11593 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11594 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11595 return SYMBOL_SMALL_ABSOLUTE;
11597 case AARCH64_CMODEL_LARGE:
11598 /* This is alright even in PIC code as the constant
11599 pool reference is always PC relative and within
11600 the same translation unit. */
11601 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11602 return SYMBOL_SMALL_ABSOLUTE;
11603 else
11604 return SYMBOL_FORCE_TO_MEM;
11606 default:
11607 gcc_unreachable ();
11611 /* By default push everything into the constant pool. */
11612 return SYMBOL_FORCE_TO_MEM;
11615 bool
11616 aarch64_constant_address_p (rtx x)
11618 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11621 bool
11622 aarch64_legitimate_pic_operand_p (rtx x)
11624 if (GET_CODE (x) == SYMBOL_REF
11625 || (GET_CODE (x) == CONST
11626 && GET_CODE (XEXP (x, 0)) == PLUS
11627 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11628 return false;
11630 return true;
11633 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11634 that should be rematerialized rather than spilled. */
11636 static bool
11637 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11639 /* Support CSE and rematerialization of common constants. */
11640 if (CONST_INT_P (x) || CONST_DOUBLE_P (x) || GET_CODE (x) == CONST_VECTOR)
11641 return true;
11643 /* Do not allow vector struct mode constants for Advanced SIMD.
11644 We could support 0 and -1 easily, but they need support in
11645 aarch64-simd.md. */
11646 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11647 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11648 return false;
11650 /* Do not allow wide int constants - this requires support in movti. */
11651 if (CONST_WIDE_INT_P (x))
11652 return false;
11654 /* Only accept variable-length vector constants if they can be
11655 handled directly.
11657 ??? It would be possible to handle rematerialization of other
11658 constants via secondary reloads. */
11659 if (vec_flags & VEC_ANY_SVE)
11660 return aarch64_simd_valid_immediate (x, NULL);
11662 if (GET_CODE (x) == HIGH)
11663 x = XEXP (x, 0);
11665 /* Accept polynomial constants that can be calculated by using the
11666 destination of a move as the sole temporary. Constants that
11667 require a second temporary cannot be rematerialized (they can't be
11668 forced to memory and also aren't legitimate constants). */
11669 poly_int64 offset;
11670 if (poly_int_rtx_p (x, &offset))
11671 return aarch64_offset_temporaries (false, offset) <= 1;
11673 /* If an offset is being added to something else, we need to allow the
11674 base to be moved into the destination register, meaning that there
11675 are no free temporaries for the offset. */
11676 x = strip_offset (x, &offset);
11677 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11678 return false;
11680 /* Do not allow const (plus (anchor_symbol, const_int)). */
11681 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11682 return false;
11684 /* Treat symbols as constants.  Avoid TLS symbols, as they are complex;
11685 spilling them is better than rematerializing them.  */
11686 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11687 return true;
11689 /* Label references are always constant. */
11690 if (GET_CODE (x) == LABEL_REF)
11691 return true;
11693 return false;
11697 aarch64_load_tp (rtx target)
11699 if (!target
11700 || GET_MODE (target) != Pmode
11701 || !register_operand (target, Pmode))
11702 target = gen_reg_rtx (Pmode);
11704 /* Can return in any reg. */
11705 emit_insn (gen_aarch64_load_tp_hard (target));
11706 return target;
11709 /* On AAPCS systems, this is the "struct __va_list". */
11710 static GTY(()) tree va_list_type;
11712 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11713 Return the type to use as __builtin_va_list.
11715 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11717 struct __va_list
11719 void *__stack;
11720 void *__gr_top;
11721 void *__vr_top;
11722 int __gr_offs;
11723 int __vr_offs;
11724 }; */
11726 static tree
11727 aarch64_build_builtin_va_list (void)
11729 tree va_list_name;
11730 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11732 /* Create the type. */
11733 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11734 /* Give it the required name. */
11735 va_list_name = build_decl (BUILTINS_LOCATION,
11736 TYPE_DECL,
11737 get_identifier ("__va_list"),
11738 va_list_type);
11739 DECL_ARTIFICIAL (va_list_name) = 1;
11740 TYPE_NAME (va_list_type) = va_list_name;
11741 TYPE_STUB_DECL (va_list_type) = va_list_name;
11743 /* Create the fields. */
11744 f_stack = build_decl (BUILTINS_LOCATION,
11745 FIELD_DECL, get_identifier ("__stack"),
11746 ptr_type_node);
11747 f_grtop = build_decl (BUILTINS_LOCATION,
11748 FIELD_DECL, get_identifier ("__gr_top"),
11749 ptr_type_node);
11750 f_vrtop = build_decl (BUILTINS_LOCATION,
11751 FIELD_DECL, get_identifier ("__vr_top"),
11752 ptr_type_node);
11753 f_groff = build_decl (BUILTINS_LOCATION,
11754 FIELD_DECL, get_identifier ("__gr_offs"),
11755 integer_type_node);
11756 f_vroff = build_decl (BUILTINS_LOCATION,
11757 FIELD_DECL, get_identifier ("__vr_offs"),
11758 integer_type_node);
11760 /* Tell the tree-stdarg pass about our internal offset fields.
11761 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
11762 purposes, to identify whether the code is updating the va_list internal
11763 offset fields in an irregular way.  */
11764 va_list_gpr_counter_field = f_groff;
11765 va_list_fpr_counter_field = f_vroff;
11767 DECL_ARTIFICIAL (f_stack) = 1;
11768 DECL_ARTIFICIAL (f_grtop) = 1;
11769 DECL_ARTIFICIAL (f_vrtop) = 1;
11770 DECL_ARTIFICIAL (f_groff) = 1;
11771 DECL_ARTIFICIAL (f_vroff) = 1;
11773 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
11774 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
11775 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
11776 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
11777 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
11779 TYPE_FIELDS (va_list_type) = f_stack;
11780 DECL_CHAIN (f_stack) = f_grtop;
11781 DECL_CHAIN (f_grtop) = f_vrtop;
11782 DECL_CHAIN (f_vrtop) = f_groff;
11783 DECL_CHAIN (f_groff) = f_vroff;
11785 /* Compute its layout. */
11786 layout_type (va_list_type);
11788 return va_list_type;
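/* Illustrative note: for the LP64 ABI, the record built above is laid out as

     offset 0    void *__stack;
     offset 8    void *__gr_top;
     offset 16   void *__vr_top;
     offset 24   int   __gr_offs;
     offset 28   int   __vr_offs;

   giving sizeof (__va_list) == 32 with 8-byte alignment.  */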
11791 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
11792 static void
11793 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
11795 const CUMULATIVE_ARGS *cum;
11796 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11797 tree stack, grtop, vrtop, groff, vroff;
11798 tree t;
11799 int gr_save_area_size = cfun->va_list_gpr_size;
11800 int vr_save_area_size = cfun->va_list_fpr_size;
11801 int vr_offset;
11803 cum = &crtl->args.info;
11804 if (cfun->va_list_gpr_size)
11805 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
11806 cfun->va_list_gpr_size);
11807 if (cfun->va_list_fpr_size)
11808 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
11809 * UNITS_PER_VREG, cfun->va_list_fpr_size);
11811 if (!TARGET_FLOAT)
11813 gcc_assert (cum->aapcs_nvrn == 0);
11814 vr_save_area_size = 0;
11817 f_stack = TYPE_FIELDS (va_list_type_node);
11818 f_grtop = DECL_CHAIN (f_stack);
11819 f_vrtop = DECL_CHAIN (f_grtop);
11820 f_groff = DECL_CHAIN (f_vrtop);
11821 f_vroff = DECL_CHAIN (f_groff);
11823 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
11824 NULL_TREE);
11825 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
11826 NULL_TREE);
11827 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
11828 NULL_TREE);
11829 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
11830 NULL_TREE);
11831 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
11832 NULL_TREE);
11834 /* Emit code to initialize STACK, which points to the next varargs stack
11835 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
11836 by named arguments. STACK is 8-byte aligned. */
11837 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
11838 if (cum->aapcs_stack_size > 0)
11839 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
11840 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
11841 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11843 /* Emit code to initialize GRTOP, the top of the GR save area.
11844 virtual_incoming_args_rtx should have been 16 byte aligned. */
11845 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
11846 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
11847 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11849 /* Emit code to initialize VRTOP, the top of the VR save area.
11850 This address is gr_save_area_bytes below GRTOP, rounded
11851 down to the next 16-byte boundary. */
11852 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
11853 vr_offset = ROUND_UP (gr_save_area_size,
11854 STACK_BOUNDARY / BITS_PER_UNIT);
11856 if (vr_offset)
11857 t = fold_build_pointer_plus_hwi (t, -vr_offset);
11858 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
11859 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11861 /* Emit code to initialize GROFF, the offset from GRTOP of the
11862 next GPR argument. */
11863 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
11864 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
11865 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11867 /* Likewise emit code to initialize VROFF, the offset from VRTOP
11868 of the next VR argument.  */
11869 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
11870 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
11871 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
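/* A rough C-level sketch of the initialization emitted above (illustrative
   only, using the local variable names of this function):

     ap.__stack   = incoming_args + cum->aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - ROUND_UP (gr_save_area_size,
					      STACK_BOUNDARY / BITS_PER_UNIT);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;

   where incoming_args stands for virtual_incoming_args_rtx.  */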
11874 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
11876 static tree
11877 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
11878 gimple_seq *post_p ATTRIBUTE_UNUSED)
11880 tree addr;
11881 bool indirect_p;
11882 bool is_ha; /* is HFA or HVA. */
11883 bool dw_align; /* double-word align. */
11884 machine_mode ag_mode = VOIDmode;
11885 int nregs;
11886 machine_mode mode;
11888 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11889 tree stack, f_top, f_off, off, arg, roundup, on_stack;
11890 HOST_WIDE_INT size, rsize, adjust, align;
11891 tree t, u, cond1, cond2;
11893 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11894 if (indirect_p)
11895 type = build_pointer_type (type);
11897 mode = TYPE_MODE (type);
11899 f_stack = TYPE_FIELDS (va_list_type_node);
11900 f_grtop = DECL_CHAIN (f_stack);
11901 f_vrtop = DECL_CHAIN (f_grtop);
11902 f_groff = DECL_CHAIN (f_vrtop);
11903 f_vroff = DECL_CHAIN (f_groff);
11905 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
11906 f_stack, NULL_TREE);
11907 size = int_size_in_bytes (type);
11908 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
11910 dw_align = false;
11911 adjust = 0;
11912 if (aarch64_vfp_is_call_or_return_candidate (mode,
11913 type,
11914 &ag_mode,
11915 &nregs,
11916 &is_ha))
11918 /* No frontends can create types with variable-sized modes, so we
11919 shouldn't be asked to pass or return them. */
11920 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
11922 /* TYPE passed in fp/simd registers. */
11923 if (!TARGET_FLOAT)
11924 aarch64_err_no_fpadvsimd (mode, "varargs");
11926 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
11927 unshare_expr (valist), f_vrtop, NULL_TREE);
11928 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
11929 unshare_expr (valist), f_vroff, NULL_TREE);
11931 rsize = nregs * UNITS_PER_VREG;
11933 if (is_ha)
11935 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
11936 adjust = UNITS_PER_VREG - ag_size;
11938 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
11939 && size < UNITS_PER_VREG)
11941 adjust = UNITS_PER_VREG - size;
11944 else
11946 /* TYPE passed in general registers. */
11947 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
11948 unshare_expr (valist), f_grtop, NULL_TREE);
11949 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
11950 unshare_expr (valist), f_groff, NULL_TREE);
11951 rsize = ROUND_UP (size, UNITS_PER_WORD);
11952 nregs = rsize / UNITS_PER_WORD;
11954 if (align > 8)
11955 dw_align = true;
11957 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
11958 && size < UNITS_PER_WORD)
11960 adjust = UNITS_PER_WORD - size;
11964 /* Get a local temporary for the field value. */
11965 off = get_initialized_tmp_var (f_off, pre_p, NULL);
11967 /* Emit code to branch if off >= 0. */
11968 t = build2 (GE_EXPR, boolean_type_node, off,
11969 build_int_cst (TREE_TYPE (off), 0));
11970 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
11972 if (dw_align)
11974 /* Emit: offs = (offs + 15) & -16. */
11975 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
11976 build_int_cst (TREE_TYPE (off), 15));
11977 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
11978 build_int_cst (TREE_TYPE (off), -16));
11979 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
11981 else
11982 roundup = NULL;
11984 /* Update ap.__[g|v]r_offs */
11985 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
11986 build_int_cst (TREE_TYPE (off), rsize));
11987 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
11989 /* String up. */
11990 if (roundup)
11991 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
11993 /* [cond2] if (ap.__[g|v]r_offs > 0) */
11994 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
11995 build_int_cst (TREE_TYPE (f_off), 0));
11996 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
11998 /* String up: make sure the assignment happens before the use. */
11999 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12000 COND_EXPR_ELSE (cond1) = t;
12002 /* Prepare the trees handling the argument that is passed on the stack;
12003 the top level node will store in ON_STACK. */
12004 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12005 if (align > 8)
12007 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12008 t = fold_convert (intDI_type_node, arg);
12009 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12010 build_int_cst (TREE_TYPE (t), 15));
12011 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12012 build_int_cst (TREE_TYPE (t), -16));
12013 t = fold_convert (TREE_TYPE (arg), t);
12014 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12016 else
12017 roundup = NULL;
12018 /* Advance ap.__stack */
12019 t = fold_convert (intDI_type_node, arg);
12020 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12021 build_int_cst (TREE_TYPE (t), size + 7));
12022 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12023 build_int_cst (TREE_TYPE (t), -8));
12024 t = fold_convert (TREE_TYPE (arg), t);
12025 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12026 /* String up roundup and advance. */
12027 if (roundup)
12028 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12029 /* String up with arg */
12030 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12031 /* Big-endianness related address adjustment. */
12032 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12033 && size < UNITS_PER_WORD)
12035 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12036 size_int (UNITS_PER_WORD - size));
12037 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12040 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12041 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12043 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12044 t = off;
12045 if (adjust)
12046 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12047 build_int_cst (TREE_TYPE (off), adjust));
12049 t = fold_convert (sizetype, t);
12050 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12052 if (is_ha)
12054 /* type ha; // treat as "struct {ftype field[n];}"
12055 ... [computing offs]
12056 for (i = 0; i <nregs; ++i, offs += 16)
12057 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12058 return ha; */
12059 int i;
12060 tree tmp_ha, field_t, field_ptr_t;
12062 /* Declare a local variable. */
12063 tmp_ha = create_tmp_var_raw (type, "ha");
12064 gimple_add_tmp_var (tmp_ha);
12066 /* Establish the base type. */
12067 switch (ag_mode)
12069 case E_SFmode:
12070 field_t = float_type_node;
12071 field_ptr_t = float_ptr_type_node;
12072 break;
12073 case E_DFmode:
12074 field_t = double_type_node;
12075 field_ptr_t = double_ptr_type_node;
12076 break;
12077 case E_TFmode:
12078 field_t = long_double_type_node;
12079 field_ptr_t = long_double_ptr_type_node;
12080 break;
12081 case E_HFmode:
12082 field_t = aarch64_fp16_type_node;
12083 field_ptr_t = aarch64_fp16_ptr_type_node;
12084 break;
12085 case E_V2SImode:
12086 case E_V4SImode:
12088 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12089 field_t = build_vector_type_for_mode (innertype, ag_mode);
12090 field_ptr_t = build_pointer_type (field_t);
12092 break;
12093 default:
12094 gcc_assert (0);
12097 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12098 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12099 addr = t;
12100 t = fold_convert (field_ptr_t, addr);
12101 t = build2 (MODIFY_EXPR, field_t,
12102 build1 (INDIRECT_REF, field_t, tmp_ha),
12103 build1 (INDIRECT_REF, field_t, t));
12105 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12106 for (i = 1; i < nregs; ++i)
12108 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12109 u = fold_convert (field_ptr_t, addr);
12110 u = build2 (MODIFY_EXPR, field_t,
12111 build2 (MEM_REF, field_t, tmp_ha,
12112 build_int_cst (field_ptr_t,
12113 (i *
12114 int_size_in_bytes (field_t)))),
12115 build1 (INDIRECT_REF, field_t, u));
12116 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12119 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12120 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12123 COND_EXPR_ELSE (cond2) = t;
12124 addr = fold_convert (build_pointer_type (type), cond1);
12125 addr = build_va_arg_indirect_ref (addr);
12127 if (indirect_p)
12128 addr = build_va_arg_indirect_ref (addr);
12130 return addr;
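/* A rough sketch of the va_arg sequence built above, for an argument that
   would go in general registers (illustrative pseudo-C only; the FP/SIMD
   case uses __vr_top/__vr_offs and 16-byte register slots instead):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                   // register save area already used up
     if (alignof (type) > 8)
       off = (off + 15) & -16;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;                   // this argument did not fit
     addr = ap.__gr_top + off;          // plus any big-endian padding adjust
     goto done;

   on_stack:
     arg = ap.__stack;
     if (alignof (type) > 8)
       arg = (arg + 15) & -16;
     ap.__stack = (arg + size + 7) & -8;
     addr = arg;                        // plus any big-endian padding adjust
   done:
     return addr;                       // dereferenced again if indirect_p  */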
12133 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12135 static void
12136 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12137 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12138 int no_rtl)
12140 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12141 CUMULATIVE_ARGS local_cum;
12142 int gr_saved = cfun->va_list_gpr_size;
12143 int vr_saved = cfun->va_list_fpr_size;
12145 /* The caller has advanced CUM up to, but not beyond, the last named
12146 argument. Advance a local copy of CUM past the last "real" named
12147 argument, to find out how many registers are left over. */
12148 local_cum = *cum;
12149 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12151 /* Find out how many registers we need to save.
12152 Honor the tree-stdarg analysis results.  */
12153 if (cfun->va_list_gpr_size)
12154 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12155 cfun->va_list_gpr_size / UNITS_PER_WORD);
12156 if (cfun->va_list_fpr_size)
12157 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12158 cfun->va_list_fpr_size / UNITS_PER_VREG);
12160 if (!TARGET_FLOAT)
12162 gcc_assert (local_cum.aapcs_nvrn == 0);
12163 vr_saved = 0;
12166 if (!no_rtl)
12168 if (gr_saved > 0)
12170 rtx ptr, mem;
12172 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12173 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12174 - gr_saved * UNITS_PER_WORD);
12175 mem = gen_frame_mem (BLKmode, ptr);
12176 set_mem_alias_set (mem, get_varargs_alias_set ());
12178 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12179 mem, gr_saved);
12181 if (vr_saved > 0)
12183 /* We can't use move_block_from_reg, because it will use
12184 the wrong mode, storing D regs only. */
12185 machine_mode mode = TImode;
12186 int off, i, vr_start;
12188 /* Set OFF to the offset from virtual_incoming_args_rtx of
12189 the first vector register. The VR save area lies below
12190 the GR one, and is aligned to 16 bytes. */
12191 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12192 STACK_BOUNDARY / BITS_PER_UNIT);
12193 off -= vr_saved * UNITS_PER_VREG;
12195 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12196 for (i = 0; i < vr_saved; ++i)
12198 rtx ptr, mem;
12200 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12201 mem = gen_frame_mem (mode, ptr);
12202 set_mem_alias_set (mem, get_varargs_alias_set ());
12203 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12204 off += UNITS_PER_VREG;
12209 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12210 any complication of having crtl->args.pretend_args_size changed. */
12211 cfun->machine->frame.saved_varargs_size
12212 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12213 STACK_BOUNDARY / BITS_PER_UNIT)
12214 + vr_saved * UNITS_PER_VREG);
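/* Illustrative sketch of the resulting save-area layout, relative to
   virtual_incoming_args_rtx (higher addresses at the top):

       | stack-passed varargs ...    |
       +-----------------------------+  <- virtual_incoming_args_rtx (__gr_top)
       | x(ncrn) ... x7              |  gr_saved * 8 bytes
       +-----------------------------+  <- 16-byte boundary, with any padding
       | q(nvrn) ... q7              |     needed to reach it (__vr_top)
       +-----------------------------+  vr_saved * 16 bytes

   saved_varargs_size records the total size of both areas.  */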
12217 static void
12218 aarch64_conditional_register_usage (void)
12220 int i;
12221 if (!TARGET_FLOAT)
12223 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12225 fixed_regs[i] = 1;
12226 call_used_regs[i] = 1;
12229 if (!TARGET_SVE)
12230 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12232 fixed_regs[i] = 1;
12233 call_used_regs[i] = 1;
12237 /* Walk down the type tree of TYPE counting consecutive base elements.
12238 If *MODEP is VOIDmode, then set it to the first valid floating point
12239 type. If a non-floating point type is found, or if a floating point
12240 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12241 otherwise return the count in the sub-tree. */
12242 static int
12243 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12245 machine_mode mode;
12246 HOST_WIDE_INT size;
12248 switch (TREE_CODE (type))
12250 case REAL_TYPE:
12251 mode = TYPE_MODE (type);
12252 if (mode != DFmode && mode != SFmode
12253 && mode != TFmode && mode != HFmode)
12254 return -1;
12256 if (*modep == VOIDmode)
12257 *modep = mode;
12259 if (*modep == mode)
12260 return 1;
12262 break;
12264 case COMPLEX_TYPE:
12265 mode = TYPE_MODE (TREE_TYPE (type));
12266 if (mode != DFmode && mode != SFmode
12267 && mode != TFmode && mode != HFmode)
12268 return -1;
12270 if (*modep == VOIDmode)
12271 *modep = mode;
12273 if (*modep == mode)
12274 return 2;
12276 break;
12278 case VECTOR_TYPE:
12279 /* Use V2SImode and V4SImode as representatives of all 64-bit
12280 and 128-bit vector types. */
12281 size = int_size_in_bytes (type);
12282 switch (size)
12284 case 8:
12285 mode = V2SImode;
12286 break;
12287 case 16:
12288 mode = V4SImode;
12289 break;
12290 default:
12291 return -1;
12294 if (*modep == VOIDmode)
12295 *modep = mode;
12297 /* Vector modes are considered to be opaque: two vectors are
12298 equivalent for the purposes of being homogeneous aggregates
12299 if they are the same size. */
12300 if (*modep == mode)
12301 return 1;
12303 break;
12305 case ARRAY_TYPE:
12307 int count;
12308 tree index = TYPE_DOMAIN (type);
12310 /* Can't handle incomplete types nor sizes that are not
12311 fixed. */
12312 if (!COMPLETE_TYPE_P (type)
12313 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12314 return -1;
12316 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12317 if (count == -1
12318 || !index
12319 || !TYPE_MAX_VALUE (index)
12320 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12321 || !TYPE_MIN_VALUE (index)
12322 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12323 || count < 0)
12324 return -1;
12326 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12327 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12329 /* There must be no padding. */
12330 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12331 count * GET_MODE_BITSIZE (*modep)))
12332 return -1;
12334 return count;
12337 case RECORD_TYPE:
12339 int count = 0;
12340 int sub_count;
12341 tree field;
12343 /* Can't handle incomplete types nor sizes that are not
12344 fixed. */
12345 if (!COMPLETE_TYPE_P (type)
12346 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12347 return -1;
12349 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12351 if (TREE_CODE (field) != FIELD_DECL)
12352 continue;
12354 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12355 if (sub_count < 0)
12356 return -1;
12357 count += sub_count;
12360 /* There must be no padding. */
12361 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12362 count * GET_MODE_BITSIZE (*modep)))
12363 return -1;
12365 return count;
12368 case UNION_TYPE:
12369 case QUAL_UNION_TYPE:
12371 /* These aren't very interesting except in a degenerate case. */
12372 int count = 0;
12373 int sub_count;
12374 tree field;
12376 /* Can't handle incomplete types nor sizes that are not
12377 fixed. */
12378 if (!COMPLETE_TYPE_P (type)
12379 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12380 return -1;
12382 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12384 if (TREE_CODE (field) != FIELD_DECL)
12385 continue;
12387 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12388 if (sub_count < 0)
12389 return -1;
12390 count = count > sub_count ? count : sub_count;
12393 /* There must be no padding. */
12394 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12395 count * GET_MODE_BITSIZE (*modep)))
12396 return -1;
12398 return count;
12401 default:
12402 break;
12405 return -1;
12408 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12409 type as described in AAPCS64 \S 4.1.2.
12411 See the comment above aarch64_composite_type_p for the notes on MODE. */
12413 static bool
12414 aarch64_short_vector_p (const_tree type,
12415 machine_mode mode)
12417 poly_int64 size = -1;
12419 if (type && TREE_CODE (type) == VECTOR_TYPE)
12420 size = int_size_in_bytes (type);
12421 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12422 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12423 size = GET_MODE_SIZE (mode);
12425 return known_eq (size, 8) || known_eq (size, 16);
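/* For example (illustrative): int32x2_t and float16x4_t are 8-byte short
   vectors and int32x4_t and float64x2_t are 16-byte short vectors, whereas
   larger GNU vector types and variable-length SVE vectors are not "short"
   in the AAPCS64 sense.  */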
12428 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12429 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12430 array types. The C99 floating-point complex types are also considered
12431 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12432 types, which are GCC extensions and out of the scope of AAPCS64, are
12433 treated as composite types here as well.
12435 Note that MODE itself is not sufficient in determining whether a type
12436 is such a composite type or not. This is because
12437 stor-layout.c:compute_record_mode may have already changed the MODE
12438 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12439 structure with only one field may have its MODE set to the mode of the
12440 field. Also an integer mode whose size matches the size of the
12441 RECORD_TYPE type may be used to substitute the original mode
12442 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12443 solely relied on. */
12445 static bool
12446 aarch64_composite_type_p (const_tree type,
12447 machine_mode mode)
12449 if (aarch64_short_vector_p (type, mode))
12450 return false;
12452 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12453 return true;
12455 if (mode == BLKmode
12456 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12457 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12458 return true;
12460 return false;
12463 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12464 shall be passed or returned in simd/fp register(s) (providing these
12465 parameter passing registers are available).
12467 Upon successful return, *COUNT returns the number of needed registers,
12468 *BASE_MODE returns the mode of the individual register and, when IS_HA
12469 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12470 floating-point aggregate or a homogeneous short-vector aggregate. */
12472 static bool
12473 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12474 const_tree type,
12475 machine_mode *base_mode,
12476 int *count,
12477 bool *is_ha)
12479 machine_mode new_mode = VOIDmode;
12480 bool composite_p = aarch64_composite_type_p (type, mode);
12482 if (is_ha != NULL) *is_ha = false;
12484 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12485 || aarch64_short_vector_p (type, mode))
12487 *count = 1;
12488 new_mode = mode;
12490 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12492 if (is_ha != NULL) *is_ha = true;
12493 *count = 2;
12494 new_mode = GET_MODE_INNER (mode);
12496 else if (type && composite_p)
12498 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12500 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12502 if (is_ha != NULL) *is_ha = true;
12503 *count = ag_count;
12505 else
12506 return false;
12508 else
12509 return false;
12511 *base_mode = new_mode;
12512 return true;
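/* Some illustrative AAPCS64 examples (not part of the code above):

     struct { float x, y, z; };      // HFA of 3 SFmode values -> s0-s2
     _Complex double                 // treated as an HFA of 2 DFmode values
     struct { float32x4_t a, b; };   // HVA of 2 short vectors -> q0-q1
     struct { double d; float f; };  // mixed base types -> not an HFA
     struct { float f[5]; };         // more members than HA_MAX_NUM_FLDS,
                                     // so not an HFA  */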
12515 /* Implement TARGET_STRUCT_VALUE_RTX. */
12517 static rtx
12518 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12519 int incoming ATTRIBUTE_UNUSED)
12521 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12524 /* Implements target hook vector_mode_supported_p. */
12525 static bool
12526 aarch64_vector_mode_supported_p (machine_mode mode)
12528 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12529 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12532 /* Return appropriate SIMD container
12533 for MODE within a vector of WIDTH bits. */
12534 static machine_mode
12535 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12537 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12538 switch (mode)
12540 case E_DFmode:
12541 return VNx2DFmode;
12542 case E_SFmode:
12543 return VNx4SFmode;
12544 case E_HFmode:
12545 return VNx8HFmode;
12546 case E_DImode:
12547 return VNx2DImode;
12548 case E_SImode:
12549 return VNx4SImode;
12550 case E_HImode:
12551 return VNx8HImode;
12552 case E_QImode:
12553 return VNx16QImode;
12554 default:
12555 return word_mode;
12558 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12559 if (TARGET_SIMD)
12561 if (known_eq (width, 128))
12562 switch (mode)
12564 case E_DFmode:
12565 return V2DFmode;
12566 case E_SFmode:
12567 return V4SFmode;
12568 case E_HFmode:
12569 return V8HFmode;
12570 case E_SImode:
12571 return V4SImode;
12572 case E_HImode:
12573 return V8HImode;
12574 case E_QImode:
12575 return V16QImode;
12576 case E_DImode:
12577 return V2DImode;
12578 default:
12579 break;
12581 else
12582 switch (mode)
12584 case E_SFmode:
12585 return V2SFmode;
12586 case E_HFmode:
12587 return V4HFmode;
12588 case E_SImode:
12589 return V2SImode;
12590 case E_HImode:
12591 return V4HImode;
12592 case E_QImode:
12593 return V8QImode;
12594 default:
12595 break;
12598 return word_mode;
12601 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12602 static machine_mode
12603 aarch64_preferred_simd_mode (scalar_mode mode)
12605 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12606 return aarch64_simd_container_mode (mode, bits);
12609 /* Return a list of possible vector sizes for the vectorizer
12610 to iterate over. */
12611 static void
12612 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12614 if (TARGET_SVE)
12615 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12616 sizes->safe_push (16);
12617 sizes->safe_push (8);
12620 /* Implement TARGET_MANGLE_TYPE. */
12622 static const char *
12623 aarch64_mangle_type (const_tree type)
12625 /* The AArch64 ABI documents say that "__va_list" has to be
12626 mangled as if it is in the "std" namespace.  */
12627 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12628 return "St9__va_list";
12630 /* Half-precision float. */
12631 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12632 return "Dh";
12634 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12635 builtin types. */
12636 if (TYPE_NAME (type) != NULL)
12637 return aarch64_mangle_builtin_type (type);
12639 /* Use the default mangling. */
12640 return NULL;
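/* For example (illustrative, following the Itanium C++ ABI conventions):

     void f (__builtin_va_list);   // mangles as _Z1fSt9__va_list
     void g (__fp16);              // mangles as _Z1gDh  */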
12643 /* Find the first rtx_insn before insn that will generate an assembly
12644 instruction. */
12646 static rtx_insn *
12647 aarch64_prev_real_insn (rtx_insn *insn)
12649 if (!insn)
12650 return NULL;
12654 insn = prev_real_insn (insn);
12656 while (insn && recog_memoized (insn) < 0);
12658 return insn;
12661 static bool
12662 is_madd_op (enum attr_type t1)
12664 unsigned int i;
12665 /* A number of these may be AArch32 only. */
12666 enum attr_type mlatypes[] = {
12667 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12668 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12669 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12672 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12674 if (t1 == mlatypes[i])
12675 return true;
12678 return false;
12681 /* Check if there is a register dependency between a load and the insn
12682 for which we hold recog_data. */
12684 static bool
12685 dep_between_memop_and_curr (rtx memop)
12687 rtx load_reg;
12688 int opno;
12690 gcc_assert (GET_CODE (memop) == SET);
12692 if (!REG_P (SET_DEST (memop)))
12693 return false;
12695 load_reg = SET_DEST (memop);
12696 for (opno = 1; opno < recog_data.n_operands; opno++)
12698 rtx operand = recog_data.operand[opno];
12699 if (REG_P (operand)
12700 && reg_overlap_mentioned_p (load_reg, operand))
12701 return true;
12704 return false;
12708 /* When working around the Cortex-A53 erratum 835769,
12709 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12710 instruction and has a preceding memory instruction such that a NOP
12711 should be inserted between them. */
12713 bool
12714 aarch64_madd_needs_nop (rtx_insn* insn)
12716 enum attr_type attr_type;
12717 rtx_insn *prev;
12718 rtx body;
12720 if (!TARGET_FIX_ERR_A53_835769)
12721 return false;
12723 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12724 return false;
12726 attr_type = get_attr_type (insn);
12727 if (!is_madd_op (attr_type))
12728 return false;
12730 prev = aarch64_prev_real_insn (insn);
12731 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12732 Restore recog state to INSN to avoid state corruption. */
12733 extract_constrain_insn_cached (insn);
12735 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
12736 return false;
12738 body = single_set (prev);
12740 /* If the previous insn is a memory op and there is no dependency between
12741 it and the DImode madd, emit a NOP between them. If body is NULL then we
12742 have a complex memory operation, probably a load/store pair.
12743 Be conservative for now and emit a NOP. */
12744 if (GET_MODE (recog_data.operand[0]) == DImode
12745 && (!body || !dep_between_memop_and_curr (body)))
12746 return true;
12748 return false;
12753 /* Implement FINAL_PRESCAN_INSN. */
12755 void
12756 aarch64_final_prescan_insn (rtx_insn *insn)
12758 if (aarch64_madd_needs_nop (insn))
12759 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
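/* For example (illustrative), with -mfix-cortex-a53-835769 the sequence

       ldr   x3, [x2]
       madd  x0, x1, x4, x0

   is printed with a "nop // between mem op and mult-accumulate" between the
   two instructions, because the 64-bit multiply-accumulate does not consume
   the loaded value (when it does, no NOP is needed; load/store pairs are
   handled conservatively and always get the NOP).  */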
12763 /* Return the equivalent letter for size. */
12764 static char
12765 sizetochar (int size)
12767 switch (size)
12769 case 64: return 'd';
12770 case 32: return 's';
12771 case 16: return 'h';
12772 case 8 : return 'b';
12773 default: gcc_unreachable ();
12777 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
12778 instruction. */
12780 bool
12781 aarch64_sve_index_immediate_p (rtx base_or_step)
12783 return (CONST_INT_P (base_or_step)
12784 && IN_RANGE (INTVAL (base_or_step), -16, 15));
12787 /* Return true if X is a valid immediate for the SVE ADD and SUB
12788 instructions. Negate X first if NEGATE_P is true. */
12790 bool
12791 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
12793 rtx elt;
12795 if (!const_vec_duplicate_p (x, &elt)
12796 || !CONST_INT_P (elt))
12797 return false;
12799 HOST_WIDE_INT val = INTVAL (elt);
12800 if (negate_p)
12801 val = -val;
12802 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
12804 if (val & 0xff)
12805 return IN_RANGE (val, 0, 0xff);
12806 return IN_RANGE (val, 0, 0xff00);
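/* For example (illustrative): 0...255 are accepted as-is, multiples of 256
   up to 65280 are accepted as "#imm, lsl #8" (e.g. "add z0.s, z0.s, #52,
   lsl #8" for 13312), and a value such as 257 is rejected because it has
   bits set both below and above bit 8.  */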
12809 /* Return true if X is a valid immediate operand for an SVE logical
12810 instruction such as AND. */
12812 bool
12813 aarch64_sve_bitmask_immediate_p (rtx x)
12815 rtx elt;
12817 return (const_vec_duplicate_p (x, &elt)
12818 && CONST_INT_P (elt)
12819 && aarch64_bitmask_imm (INTVAL (elt),
12820 GET_MODE_INNER (GET_MODE (x))));
12823 /* Return true if X is a valid immediate for the SVE DUP and CPY
12824 instructions. */
12826 bool
12827 aarch64_sve_dup_immediate_p (rtx x)
12829 rtx elt;
12831 if (!const_vec_duplicate_p (x, &elt)
12832 || !CONST_INT_P (elt))
12833 return false;
12835 HOST_WIDE_INT val = INTVAL (elt);
12836 if (val & 0xff)
12837 return IN_RANGE (val, -0x80, 0x7f);
12838 return IN_RANGE (val, -0x8000, 0x7f00);
12841 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
12842 SIGNED_P says whether the operand is signed rather than unsigned. */
12844 bool
12845 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
12847 rtx elt;
12849 return (const_vec_duplicate_p (x, &elt)
12850 && CONST_INT_P (elt)
12851 && (signed_p
12852 ? IN_RANGE (INTVAL (elt), -16, 15)
12853 : IN_RANGE (INTVAL (elt), 0, 127)));
12856 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
12857 instruction. Negate X first if NEGATE_P is true. */
12859 bool
12860 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
12862 rtx elt;
12863 REAL_VALUE_TYPE r;
12865 if (!const_vec_duplicate_p (x, &elt)
12866 || GET_CODE (elt) != CONST_DOUBLE)
12867 return false;
12869 r = *CONST_DOUBLE_REAL_VALUE (elt);
12871 if (negate_p)
12872 r = real_value_negate (&r);
12874 if (real_equal (&r, &dconst1))
12875 return true;
12876 if (real_equal (&r, &dconsthalf))
12877 return true;
12878 return false;
12881 /* Return true if X is a valid immediate operand for an SVE FMUL
12882 instruction. */
12884 bool
12885 aarch64_sve_float_mul_immediate_p (rtx x)
12887 rtx elt;
12889 /* GCC will never generate a multiply with an immediate of 2, so there is no
12890 point testing for it (even though it is a valid constant). */
12891 return (const_vec_duplicate_p (x, &elt)
12892 && GET_CODE (elt) == CONST_DOUBLE
12893 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
12896 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
12897 for the Advanced SIMD operation described by WHICH and INSN. If INFO
12898 is nonnull, use it to describe valid immediates. */
12899 static bool
12900 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
12901 simd_immediate_info *info,
12902 enum simd_immediate_check which,
12903 simd_immediate_info::insn_type insn)
12905 /* Try a 4-byte immediate with LSL. */
12906 for (unsigned int shift = 0; shift < 32; shift += 8)
12907 if ((val32 & (0xff << shift)) == val32)
12909 if (info)
12910 *info = simd_immediate_info (SImode, val32 >> shift, insn,
12911 simd_immediate_info::LSL, shift);
12912 return true;
12915 /* Try a 2-byte immediate with LSL. */
12916 unsigned int imm16 = val32 & 0xffff;
12917 if (imm16 == (val32 >> 16))
12918 for (unsigned int shift = 0; shift < 16; shift += 8)
12919 if ((imm16 & (0xff << shift)) == imm16)
12921 if (info)
12922 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
12923 simd_immediate_info::LSL, shift);
12924 return true;
12927 /* Try a 4-byte immediate with MSL, except for cases that MVN
12928 can handle. */
12929 if (which == AARCH64_CHECK_MOV)
12930 for (unsigned int shift = 8; shift < 24; shift += 8)
12932 unsigned int low = (1 << shift) - 1;
12933 if (((val32 & (0xff << shift)) | low) == val32)
12935 if (info)
12936 *info = simd_immediate_info (SImode, val32 >> shift, insn,
12937 simd_immediate_info::MSL, shift);
12938 return true;
12942 return false;
12945 /* Return true if replicating VAL64 is a valid immediate for the
12946 Advanced SIMD operation described by WHICH. If INFO is nonnull,
12947 use it to describe valid immediates. */
12948 static bool
12949 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
12950 simd_immediate_info *info,
12951 enum simd_immediate_check which)
12953 unsigned int val32 = val64 & 0xffffffff;
12954 unsigned int val16 = val64 & 0xffff;
12955 unsigned int val8 = val64 & 0xff;
12957 if (val32 == (val64 >> 32))
12959 if ((which & AARCH64_CHECK_ORR) != 0
12960 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
12961 simd_immediate_info::MOV))
12962 return true;
12964 if ((which & AARCH64_CHECK_BIC) != 0
12965 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
12966 simd_immediate_info::MVN))
12967 return true;
12969 /* Try using a replicated byte. */
12970 if (which == AARCH64_CHECK_MOV
12971 && val16 == (val32 >> 16)
12972 && val8 == (val16 >> 8))
12974 if (info)
12975 *info = simd_immediate_info (QImode, val8);
12976 return true;
12980 /* Try using a bit-to-bytemask. */
12981 if (which == AARCH64_CHECK_MOV)
12983 unsigned int i;
12984 for (i = 0; i < 64; i += 8)
12986 unsigned char byte = (val64 >> i) & 0xff;
12987 if (byte != 0 && byte != 0xff)
12988 break;
12990 if (i == 64)
12992 if (info)
12993 *info = simd_immediate_info (DImode, val64);
12994 return true;
12997 return false;
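/* Some 64-bit patterns accepted above, for illustration:

     0x00ff00ff00ff00ff   movi v0.8h, #255               (2-byte, LSL #0)
     0x0000120000001200   movi v0.4s, #18, lsl #8        (4-byte, LSL #8)
     0x0000ffff0000ffff   movi v0.4s, #255, msl #8       (4-byte, MSL)
     0x2525252525252525   movi v0.16b, #37               (replicated byte)
     0xff00ffffff0000ff   movi d0, #0xff00ffffff0000ff   (bit-to-bytemask)  */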
13000 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13001 instruction. If INFO is nonnull, use it to describe valid immediates. */
13003 static bool
13004 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13005 simd_immediate_info *info)
13007 scalar_int_mode mode = DImode;
13008 unsigned int val32 = val64 & 0xffffffff;
13009 if (val32 == (val64 >> 32))
13011 mode = SImode;
13012 unsigned int val16 = val32 & 0xffff;
13013 if (val16 == (val32 >> 16))
13015 mode = HImode;
13016 unsigned int val8 = val16 & 0xff;
13017 if (val8 == (val16 >> 8))
13018 mode = QImode;
13021 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13022 if (IN_RANGE (val, -0x80, 0x7f))
13024 /* DUP with no shift. */
13025 if (info)
13026 *info = simd_immediate_info (mode, val);
13027 return true;
13029 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13031 /* DUP with LSL #8. */
13032 if (info)
13033 *info = simd_immediate_info (mode, val);
13034 return true;
13036 if (aarch64_bitmask_imm (val64, mode))
13038 /* DUPM. */
13039 if (info)
13040 *info = simd_immediate_info (mode, val);
13041 return true;
13043 return false;
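/* For illustration: 0x0000003f0000003f selects SImode and matches
   "dup z0.s, #63"; 0x3f003f003f003f00 selects HImode and matches
   "dup z0.h, #63, lsl #8"; 0x00ff00ff00ff00ff fails both DUP forms but
   is a valid bitmask immediate, so it is matched as DUPM.  */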
13046 /* Return true if OP is a valid SIMD immediate for the operation
13047 described by WHICH. If INFO is nonnull, use it to describe valid
13048 immediates. */
13049 bool
13050 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13051 enum simd_immediate_check which)
13053 machine_mode mode = GET_MODE (op);
13054 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13055 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13056 return false;
13058 scalar_mode elt_mode = GET_MODE_INNER (mode);
13059 rtx elt = NULL, base, step;
13060 unsigned int n_elts;
13061 if (const_vec_duplicate_p (op, &elt))
13062 n_elts = 1;
13063 else if ((vec_flags & VEC_SVE_DATA)
13064 && const_vec_series_p (op, &base, &step))
13066 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13067 if (!aarch64_sve_index_immediate_p (base)
13068 || !aarch64_sve_index_immediate_p (step))
13069 return false;
13071 if (info)
13072 *info = simd_immediate_info (elt_mode, base, step);
13073 return true;
13075 else if (GET_CODE (op) == CONST_VECTOR
13076 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13077 /* N_ELTS set above. */;
13078 else
13079 return false;
13081 /* Handle PFALSE and PTRUE. */
13082 if (vec_flags & VEC_SVE_PRED)
13083 return (op == CONST0_RTX (mode)
13084 || op == CONSTM1_RTX (mode));
13086 scalar_float_mode elt_float_mode;
13087 if (elt
13088 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode)
13089 && (aarch64_float_const_zero_rtx_p (elt)
13090 || aarch64_float_const_representable_p (elt)))
13092 if (info)
13093 *info = simd_immediate_info (elt_float_mode, elt);
13094 return true;
13097 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13098 if (elt_size > 8)
13099 return false;
13101 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13103 /* Expand the vector constant out into a byte vector, with the least
13104 significant byte of the register first. */
13105 auto_vec<unsigned char, 16> bytes;
13106 bytes.reserve (n_elts * elt_size);
13107 for (unsigned int i = 0; i < n_elts; i++)
13109 if (!elt || n_elts != 1)
13110 /* The vector is provided in gcc endian-neutral fashion.
13111 For aarch64_be, it must be laid out in the vector register
13112 in reverse order. */
13113 elt = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
13115 if (elt_mode != elt_int_mode)
13116 elt = gen_lowpart (elt_int_mode, elt);
13118 if (!CONST_INT_P (elt))
13119 return false;
13121 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13122 for (unsigned int byte = 0; byte < elt_size; byte++)
13124 bytes.quick_push (elt_val & 0xff);
13125 elt_val >>= BITS_PER_UNIT;
13129 /* The immediate must repeat every eight bytes. */
13130 unsigned int nbytes = bytes.length ();
13131 for (unsigned i = 8; i < nbytes; ++i)
13132 if (bytes[i] != bytes[i - 8])
13133 return false;
13135 /* Get the repeating 8-byte value as an integer. No endian correction
13136 is needed here because bytes is already in lsb-first order. */
13137 unsigned HOST_WIDE_INT val64 = 0;
13138 for (unsigned int i = 0; i < 8; i++)
13139 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13140 << (i * BITS_PER_UNIT));
13142 if (vec_flags & VEC_SVE_DATA)
13143 return aarch64_sve_valid_immediate (val64, info);
13144 else
13145 return aarch64_advsimd_valid_immediate (val64, info, which);
13148 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13149 has a step in the range accepted by the SVE INDEX instruction.  Return
13150 the step if so, otherwise return null.  */
13152 aarch64_check_zero_based_sve_index_immediate (rtx x)
13154 rtx base, step;
13155 if (const_vec_series_p (x, &base, &step)
13156 && base == const0_rtx
13157 && aarch64_sve_index_immediate_p (step))
13158 return step;
13159 return NULL_RTX;
13162 /* Check whether immediate shift constants are within range.  */
13163 bool
13164 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13166 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13167 if (left)
13168 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13169 else
13170 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13173 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13174 operation of width WIDTH at bit position POS. */
13177 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13179 gcc_assert (CONST_INT_P (width));
13180 gcc_assert (CONST_INT_P (pos));
13182 unsigned HOST_WIDE_INT mask
13183 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13184 return GEN_INT (mask << UINTVAL (pos));
13187 bool
13188 aarch64_mov_operand_p (rtx x, machine_mode mode)
13190 if (GET_CODE (x) == HIGH
13191 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13192 return true;
13194 if (CONST_INT_P (x))
13195 return true;
13197 if (VECTOR_MODE_P (GET_MODE (x)))
13198 return aarch64_simd_valid_immediate (x, NULL);
13200 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13201 return true;
13203 if (aarch64_sve_cnt_immediate_p (x))
13204 return true;
13206 return aarch64_classify_symbolic_expression (x)
13207 == SYMBOL_TINY_ABSOLUTE;
13210 /* Return a const_int vector of VAL. */
13212 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13214 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13215 return gen_const_vec_duplicate (mode, c);
13218 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13220 bool
13221 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13223 machine_mode vmode;
13225 vmode = aarch64_simd_container_mode (mode, 64);
13226 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13227 return aarch64_simd_valid_immediate (op_v, NULL);
13230 /* Construct and return a PARALLEL RTX vector with elements numbering the
13231 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13232 the vector - from the perspective of the architecture. This does not
13233 line up with GCC's perspective on lane numbers, so we end up with
13234 different masks depending on our target endian-ness. The diagram
13235 below may help. We must draw the distinction when building masks
13236 which select one half of the vector. An instruction selecting
13237 architectural low-lanes for a big-endian target, must be described using
13238 a mask selecting GCC high-lanes.
13240 Big-Endian Little-Endian
13242 GCC 0 1 2 3 3 2 1 0
13243 | x | x | x | x | | x | x | x | x |
13244 Architecture 3 2 1 0 3 2 1 0
13246 Low Mask: { 2, 3 } { 0, 1 }
13247 High Mask: { 0, 1 } { 2, 3 }
13249 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13252 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13254 rtvec v = rtvec_alloc (nunits / 2);
13255 int high_base = nunits / 2;
13256 int low_base = 0;
13257 int base;
13258 rtx t1;
13259 int i;
13261 if (BYTES_BIG_ENDIAN)
13262 base = high ? low_base : high_base;
13263 else
13264 base = high ? high_base : low_base;
13266 for (i = 0; i < nunits / 2; i++)
13267 RTVEC_ELT (v, i) = GEN_INT (base + i);
13269 t1 = gen_rtx_PARALLEL (mode, v);
13270 return t1;
13273 /* Check OP for validity as a PARALLEL RTX vector with elements
13274 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13275 from the perspective of the architecture. See the diagram above
13276 aarch64_simd_vect_par_cnst_half for more details. */
13278 bool
13279 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13280 bool high)
13282 int nelts;
13283 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13284 return false;
13286 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13287 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13288 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13289 int i = 0;
13291 if (count_op != count_ideal)
13292 return false;
13294 for (i = 0; i < count_ideal; i++)
13296 rtx elt_op = XVECEXP (op, 0, i);
13297 rtx elt_ideal = XVECEXP (ideal, 0, i);
13299 if (!CONST_INT_P (elt_op)
13300 || INTVAL (elt_ideal) != INTVAL (elt_op))
13301 return false;
13303 return true;
13306 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13307 HIGH (exclusive). */
13308 void
13309 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13310 const_tree exp)
13312 HOST_WIDE_INT lane;
13313 gcc_assert (CONST_INT_P (operand));
13314 lane = INTVAL (operand);
13316 if (lane < low || lane >= high)
13318 if (exp)
13319 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13320 else
13321 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13325 /* Perform endian correction on lane number N, which indexes a vector
13326 of mode MODE, and return the result as an SImode rtx. */
13329 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13331 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13334 /* Return TRUE if OP is a valid vector addressing mode. */
13336 bool
13337 aarch64_simd_mem_operand_p (rtx op)
13339 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13340 || REG_P (XEXP (op, 0)));
13343 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13345 bool
13346 aarch64_sve_ld1r_operand_p (rtx op)
13348 struct aarch64_address_info addr;
13349 scalar_mode mode;
13351 return (MEM_P (op)
13352 && is_a <scalar_mode> (GET_MODE (op), &mode)
13353 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13354 && addr.type == ADDRESS_REG_IMM
13355 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13358 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13359 The conditions for STR are the same. */
13360 bool
13361 aarch64_sve_ldr_operand_p (rtx op)
13363 struct aarch64_address_info addr;
13365 return (MEM_P (op)
13366 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13367 false, ADDR_QUERY_ANY)
13368 && addr.type == ADDRESS_REG_IMM);
13371 /* Emit a register copy from operand to operand, taking care not to
13372 early-clobber source registers in the process.
13374 COUNT is the number of components into which the copy needs to be
13375 decomposed. */
13376 void
13377 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13378 unsigned int count)
13380 unsigned int i;
13381 int rdest = REGNO (operands[0]);
13382 int rsrc = REGNO (operands[1]);
13384 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13385 || rdest < rsrc)
13386 for (i = 0; i < count; i++)
13387 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13388 gen_rtx_REG (mode, rsrc + i));
13389 else
13390 for (i = 0; i < count; i++)
13391 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13392 gen_rtx_REG (mode, rsrc + count - i - 1));
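/* For example (illustrative), copying a 2-register value from {v1, v2}
   to {v2, v3} must move v2 into v3 before v1 into v2, whereas copying
   from {v2, v3} to {v1, v2} must move v2 into v1 first; the REGNO
   comparison above picks the safe direction.  */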
13395 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13396 one of VSTRUCT modes: OI, CI, or XI. */
13398 aarch64_simd_attr_length_rglist (machine_mode mode)
13400 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13401 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13404 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13405 alignment of a vector to 128 bits. SVE predicates have an alignment of
13406 16 bits. */
13407 static HOST_WIDE_INT
13408 aarch64_simd_vector_alignment (const_tree type)
13410 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13411 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13412 be set for non-predicate vectors of booleans. Modes are the most
13413 direct way we have of identifying real SVE predicate types. */
13414 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13415 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13416 return MIN (align, 128);
13419 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13420 static HOST_WIDE_INT
13421 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13423 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13425 /* If the length of the vector is fixed, try to align to that length,
13426 otherwise don't try to align at all. */
13427 HOST_WIDE_INT result;
13428 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13429 result = TYPE_ALIGN (TREE_TYPE (type));
13430 return result;
13432 return TYPE_ALIGN (type);
13435 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13436 static bool
13437 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13439 if (is_packed)
13440 return false;
13442 /* For fixed-length vectors, check that the vectorizer will aim for
13443 full-vector alignment. This isn't true for generic GCC vectors
13444 that are wider than the ABI maximum of 128 bits. */
13445 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13446 && (wi::to_widest (TYPE_SIZE (type))
13447 != aarch64_vectorize_preferred_vector_alignment (type)))
13448 return false;
13450 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13451 return true;
13454 /* Return true if the vector misalignment factor is supported by the
13455 target. */
13456 static bool
13457 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13458 const_tree type, int misalignment,
13459 bool is_packed)
13461 if (TARGET_SIMD && STRICT_ALIGNMENT)
13463 /* Return false if the movmisalign pattern is not supported for this mode.  */
13464 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13465 return false;
13467 /* Misalignment factor is unknown at compile time. */
13468 if (misalignment == -1)
13469 return false;
13471 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13472 is_packed);
13475 /* If VALS is a vector constant that can be loaded into a register
13476 using DUP, generate instructions to do so and return an RTX to
13477 assign to the register. Otherwise return NULL_RTX. */
13478 static rtx
13479 aarch64_simd_dup_constant (rtx vals)
13481 machine_mode mode = GET_MODE (vals);
13482 machine_mode inner_mode = GET_MODE_INNER (mode);
13483 rtx x;
13485 if (!const_vec_duplicate_p (vals, &x))
13486 return NULL_RTX;
13488 /* We can load this constant by using DUP and a constant in a
13489 single scalar register.  This will be cheaper than a vector
13490 load.  */
13491 x = copy_to_mode_reg (inner_mode, x);
13492 return gen_vec_duplicate (mode, x);
13496 /* Generate code to load VALS, which is a PARALLEL containing only
13497 constants (for vec_init) or CONST_VECTOR, efficiently into a
13498 register. Returns an RTX to copy into the register, or NULL_RTX
13499 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13500 static rtx
13501 aarch64_simd_make_constant (rtx vals)
13503 machine_mode mode = GET_MODE (vals);
13504 rtx const_dup;
13505 rtx const_vec = NULL_RTX;
13506 int n_const = 0;
13507 int i;
13509 if (GET_CODE (vals) == CONST_VECTOR)
13510 const_vec = vals;
13511 else if (GET_CODE (vals) == PARALLEL)
13513 /* A CONST_VECTOR must contain only CONST_INTs and
13514 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13515 Only store valid constants in a CONST_VECTOR. */
13516 int n_elts = XVECLEN (vals, 0);
13517 for (i = 0; i < n_elts; ++i)
13519 rtx x = XVECEXP (vals, 0, i);
13520 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13521 n_const++;
13523 if (n_const == n_elts)
13524 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13526 else
13527 gcc_unreachable ();
13529 if (const_vec != NULL_RTX
13530 && aarch64_simd_valid_immediate (const_vec, NULL))
13531 /* Load using MOVI/MVNI. */
13532 return const_vec;
13533 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13534 /* Loaded using DUP. */
13535 return const_dup;
13536 else if (const_vec != NULL_RTX)
13537 /* Load from constant pool. We can not take advantage of single-cycle
13538 LD1 because we need a PC-relative addressing mode. */
13539 return const_vec;
13540 else
13541 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13542 We can not construct an initializer. */
13543 return NULL_RTX;
13546 /* Expand a vector initialisation sequence, such that TARGET is
13547 initialised to contain VALS. */
13549 void
13550 aarch64_expand_vector_init (rtx target, rtx vals)
13552 machine_mode mode = GET_MODE (target);
13553 scalar_mode inner_mode = GET_MODE_INNER (mode);
13554 /* The number of vector elements. */
13555 int n_elts = XVECLEN (vals, 0);
13556 /* The number of vector elements which are not constant. */
13557 int n_var = 0;
13558 rtx any_const = NULL_RTX;
13559 /* The first element of vals. */
13560 rtx v0 = XVECEXP (vals, 0, 0);
13561 bool all_same = true;
13563 /* Count the number of variable elements to initialise. */
13564 for (int i = 0; i < n_elts; ++i)
13566 rtx x = XVECEXP (vals, 0, i);
13567 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13568 ++n_var;
13569 else
13570 any_const = x;
13572 all_same &= rtx_equal_p (x, v0);
13575 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13576 how best to handle this. */
13577 if (n_var == 0)
13579 rtx constant = aarch64_simd_make_constant (vals);
13580 if (constant != NULL_RTX)
13582 emit_move_insn (target, constant);
13583 return;
13587 /* Splat a single non-constant element if we can. */
13588 if (all_same)
13590 rtx x = copy_to_mode_reg (inner_mode, v0);
13591 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13592 return;
13595 enum insn_code icode = optab_handler (vec_set_optab, mode);
13596 gcc_assert (icode != CODE_FOR_nothing);
13598 /* If there are only variable elements, try to optimize
13599 the insertion using dup for the most common element
13600 followed by insertions. */
13602 /* The algorithm will fill matches[*][0] with the earliest matching element,
13603 and matches[X][1] with the count of duplicate elements (if X is the
13604 earliest element which has duplicates). */
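/* An illustrative walk-through (not generated code): for the all-variable
   initialiser { A, B, A, A } the loop below records
   matches[0] = { 0, 3 }, matches[1] = { 1, 1 }, matches[2] = { 0, 0 } and
   matches[3] = { 0, 0 }, so MAXELEMENT ends up as 0 and the emitted
   sequence is a DUP of A followed by a single INS of B into lane 1.  */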
13606 if (n_var == n_elts && n_elts <= 16)
13608 int matches[16][2] = {0};
13609 for (int i = 0; i < n_elts; i++)
13611 for (int j = 0; j <= i; j++)
13613 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13615 matches[i][0] = j;
13616 matches[j][1]++;
13617 break;
13621 int maxelement = 0;
13622 int maxv = 0;
13623 for (int i = 0; i < n_elts; i++)
13624 if (matches[i][1] > maxv)
13626 maxelement = i;
13627 maxv = matches[i][1];
13630 /* Create a duplicate of the most common element. */
13631 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13632 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13634 /* Insert the rest. */
13635 for (int i = 0; i < n_elts; i++)
13637 rtx x = XVECEXP (vals, 0, i);
13638 if (matches[i][0] == maxelement)
13639 continue;
13640 x = copy_to_mode_reg (inner_mode, x);
13641 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13643 return;
13646 /* Initialise a vector which is part-variable. We want to first try
13647 to build those lanes which are constant in the most efficient way we
13648 can. */
13649 if (n_var != n_elts)
13651 rtx copy = copy_rtx (vals);
13653 /* Load constant part of vector. We really don't care what goes into the
13654 parts we will overwrite, but we're more likely to be able to load the
13655 constant efficiently if it has fewer, larger, repeating parts
13656 (see aarch64_simd_valid_immediate). */
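/* An illustrative walk-through (not generated code): for a four-element
   initialiser { X, 1, Y, 1 } with X and Y variable, the loop below
   replaces lane 0 with the constant found at lane 0 ^ 1 (i.e. 1) and then
   lane 2 with the constant now at lane 2 ^ 2 (also 1), so COPY becomes
   { 1, 1, 1, 1 }, which the recursive call can load with a single MOVI or
   DUP; X and Y are inserted into their lanes afterwards.  */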
13657 for (int i = 0; i < n_elts; i++)
13659 rtx x = XVECEXP (vals, 0, i);
13660 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13661 continue;
13662 rtx subst = any_const;
13663 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13665 /* Look in the copied vector, as more elements are const. */
13666 rtx test = XVECEXP (copy, 0, i ^ bit);
13667 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13669 subst = test;
13670 break;
13673 XVECEXP (copy, 0, i) = subst;
13675 aarch64_expand_vector_init (target, copy);
13678 /* Insert the variable lanes directly. */
13679 for (int i = 0; i < n_elts; i++)
13681 rtx x = XVECEXP (vals, 0, i);
13682 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13683 continue;
13684 x = copy_to_mode_reg (inner_mode, x);
13685 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13689 static unsigned HOST_WIDE_INT
13690 aarch64_shift_truncation_mask (machine_mode mode)
13692 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
13693 return 0;
13694 return GET_MODE_UNIT_BITSIZE (mode) - 1;
13697 /* Select a format to encode pointers in exception handling data. */
13699 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
13701 int type;
13702 switch (aarch64_cmodel)
13704 case AARCH64_CMODEL_TINY:
13705 case AARCH64_CMODEL_TINY_PIC:
13706 case AARCH64_CMODEL_SMALL:
13707 case AARCH64_CMODEL_SMALL_PIC:
13708 case AARCH64_CMODEL_SMALL_SPIC:
13709 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13710 for everything. */
13711 type = DW_EH_PE_sdata4;
13712 break;
13713 default:
13714 /* No assumptions here. 8-byte relocs required. */
13715 type = DW_EH_PE_sdata8;
13716 break;
13718 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
13721 /* The last .arch and .tune assembly strings that we printed. */
13722 static std::string aarch64_last_printed_arch_string;
13723 static std::string aarch64_last_printed_tune_string;
13725 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
13726 by the function fndecl. */
13728 void
13729 aarch64_declare_function_name (FILE *stream, const char* name,
13730 tree fndecl)
13732 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13734 struct cl_target_option *targ_options;
13735 if (target_parts)
13736 targ_options = TREE_TARGET_OPTION (target_parts);
13737 else
13738 targ_options = TREE_TARGET_OPTION (target_option_current_node);
13739 gcc_assert (targ_options);
13741 const struct processor *this_arch
13742 = aarch64_get_arch (targ_options->x_explicit_arch);
13744 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
13745 std::string extension
13746 = aarch64_get_extension_string_for_isa_flags (isa_flags,
13747 this_arch->flags);
13748 /* Only update the assembler .arch string if it is distinct from the last
13749 such string we printed. */
13750 std::string to_print = this_arch->name + extension;
13751 if (to_print != aarch64_last_printed_arch_string)
13753 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
13754 aarch64_last_printed_arch_string = to_print;
13757 /* Print the cpu name we're tuning for in the comments; this might be
13758 useful to readers of the generated asm. Do it only when it changes
13759 from function to function and verbose assembly is requested. */
13760 const struct processor *this_tune
13761 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
13763 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
13765 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
13766 this_tune->name);
13767 aarch64_last_printed_tune_string = this_tune->name;
13770 /* Don't forget the type directive for ELF. */
13771 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
13772 ASM_OUTPUT_LABEL (stream, name);
13775 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
13777 static void
13778 aarch64_start_file (void)
13780 struct cl_target_option *default_options
13781 = TREE_TARGET_OPTION (target_option_default_node);
13783 const struct processor *default_arch
13784 = aarch64_get_arch (default_options->x_explicit_arch);
13785 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
13786 std::string extension
13787 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
13788 default_arch->flags);
13790 aarch64_last_printed_arch_string = default_arch->name + extension;
13791 aarch64_last_printed_tune_string = "";
13792 asm_fprintf (asm_out_file, "\t.arch %s\n",
13793 aarch64_last_printed_arch_string.c_str ());
13795 default_file_start ();
13798 /* Emit load exclusive. */
13800 static void
13801 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
13802 rtx mem, rtx model_rtx)
13804 rtx (*gen) (rtx, rtx, rtx);
13806 switch (mode)
13808 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
13809 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
13810 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
13811 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
13812 default:
13813 gcc_unreachable ();
13816 emit_insn (gen (rval, mem, model_rtx));
13819 /* Emit store exclusive. */
13821 static void
13822 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
13823 rtx rval, rtx mem, rtx model_rtx)
13825 rtx (*gen) (rtx, rtx, rtx, rtx);
13827 switch (mode)
13829 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
13830 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
13831 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
13832 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
13833 default:
13834 gcc_unreachable ();
13837 emit_insn (gen (bval, rval, mem, model_rtx));
13840 /* Mark the previous jump instruction as unlikely. */
13842 static void
13843 aarch64_emit_unlikely_jump (rtx insn)
13845 rtx_insn *jump = emit_jump_insn (insn);
13846 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
13849 /* Expand a compare and swap pattern. */
13851 void
13852 aarch64_expand_compare_and_swap (rtx operands[])
13854 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
13855 machine_mode mode, cmp_mode;
13856 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
13857 int idx;
13858 gen_cas_fn gen;
13859 const gen_cas_fn split_cas[] =
13861 gen_aarch64_compare_and_swapqi,
13862 gen_aarch64_compare_and_swaphi,
13863 gen_aarch64_compare_and_swapsi,
13864 gen_aarch64_compare_and_swapdi
13866 const gen_cas_fn atomic_cas[] =
13868 gen_aarch64_compare_and_swapqi_lse,
13869 gen_aarch64_compare_and_swaphi_lse,
13870 gen_aarch64_compare_and_swapsi_lse,
13871 gen_aarch64_compare_and_swapdi_lse
13874 bval = operands[0];
13875 rval = operands[1];
13876 mem = operands[2];
13877 oldval = operands[3];
13878 newval = operands[4];
13879 is_weak = operands[5];
13880 mod_s = operands[6];
13881 mod_f = operands[7];
13882 mode = GET_MODE (mem);
13883 cmp_mode = mode;
13885 /* Normally the succ memory model must be stronger than fail, but in the
13886 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
13887 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
13889 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
13890 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
13891 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
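/* For instance (illustrative only), a call such as
   __atomic_compare_exchange_n (p, &expected, desired, 0,
   __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
   arrives here with MOD_S == RELEASE and MOD_F == ACQUIRE, and is
   promoted to ACQ_REL by the check above.  */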
13893 switch (mode)
13895 case E_QImode:
13896 case E_HImode:
13897 /* For short modes, we're going to perform the comparison in SImode,
13898 so do the zero-extension now. */
13899 cmp_mode = SImode;
13900 rval = gen_reg_rtx (SImode);
13901 oldval = convert_modes (SImode, mode, oldval, true);
13902 /* Fall through. */
13904 case E_SImode:
13905 case E_DImode:
13906 /* Force the value into a register if needed. */
13907 if (!aarch64_plus_operand (oldval, mode))
13908 oldval = force_reg (cmp_mode, oldval);
13909 break;
13911 default:
13912 gcc_unreachable ();
13915 switch (mode)
13917 case E_QImode: idx = 0; break;
13918 case E_HImode: idx = 1; break;
13919 case E_SImode: idx = 2; break;
13920 case E_DImode: idx = 3; break;
13921 default:
13922 gcc_unreachable ();
13924 if (TARGET_LSE)
13925 gen = atomic_cas[idx];
13926 else
13927 gen = split_cas[idx];
13929 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
13931 if (mode == QImode || mode == HImode)
13932 emit_move_insn (operands[1], gen_lowpart (mode, rval));
13934 x = gen_rtx_REG (CCmode, CC_REGNUM);
13935 x = gen_rtx_EQ (SImode, x, const0_rtx);
13936 emit_insn (gen_rtx_SET (bval, x));
13939 /* Test whether the target supports using an atomic load-operate instruction.
13940 CODE is the operation and AFTER is TRUE if the data in memory after the
13941 operation should be returned and FALSE if the data before the operation
13942 should be returned. Returns FALSE if the operation isn't supported by the
13943 architecture. */
13945 bool
13946 aarch64_atomic_ldop_supported_p (enum rtx_code code)
13948 if (!TARGET_LSE)
13949 return false;
13951 switch (code)
13953 case SET:
13954 case AND:
13955 case IOR:
13956 case XOR:
13957 case MINUS:
13958 case PLUS:
13959 return true;
13960 default:
13961 return false;
13965 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
13966 sequence implementing an atomic operation. */
13968 static void
13969 aarch64_emit_post_barrier (enum memmodel model)
13971 const enum memmodel base_model = memmodel_base (model);
13973 if (is_mm_sync (model)
13974 && (base_model == MEMMODEL_ACQUIRE
13975 || base_model == MEMMODEL_ACQ_REL
13976 || base_model == MEMMODEL_SEQ_CST))
13978 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
13982 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
13983 for the data in memory. EXPECTED is the value expected to be in memory.
13984 DESIRED is the value to store to memory. MEM is the memory location. MODEL
13985 is the memory ordering to use. */
13987 void
13988 aarch64_gen_atomic_cas (rtx rval, rtx mem,
13989 rtx expected, rtx desired,
13990 rtx model)
13992 rtx (*gen) (rtx, rtx, rtx, rtx);
13993 machine_mode mode;
13995 mode = GET_MODE (mem);
13997 switch (mode)
13999 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14000 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14001 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14002 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14003 default:
14004 gcc_unreachable ();
14007 /* Move the expected value into the CAS destination register. */
14008 emit_insn (gen_rtx_SET (rval, expected));
14010 /* Emit the CAS. */
14011 emit_insn (gen (rval, mem, desired, model));
14013 /* Compare the expected value with the value loaded by the CAS, to establish
14014 whether the swap was made. */
14015 aarch64_gen_compare_reg (EQ, rval, expected);
14018 /* Split a compare and swap pattern. */
14020 void
14021 aarch64_split_compare_and_swap (rtx operands[])
14023 rtx rval, mem, oldval, newval, scratch;
14024 machine_mode mode;
14025 bool is_weak;
14026 rtx_code_label *label1, *label2;
14027 rtx x, cond;
14028 enum memmodel model;
14029 rtx model_rtx;
14031 rval = operands[0];
14032 mem = operands[1];
14033 oldval = operands[2];
14034 newval = operands[3];
14035 is_weak = (operands[4] != const0_rtx);
14036 model_rtx = operands[5];
14037 scratch = operands[7];
14038 mode = GET_MODE (mem);
14039 model = memmodel_from_int (INTVAL (model_rtx));
14041 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14042 loop:
14043 .label1:
14044 LD[A]XR rval, [mem]
14045 CBNZ rval, .label2
14046 ST[L]XR scratch, newval, [mem]
14047 CBNZ scratch, .label1
14048 .label2:
14049 CMP rval, 0. */
14050 bool strong_zero_p = !is_weak && oldval == const0_rtx;
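/* Illustrative only: this shape typically comes from a strong
   compare-and-swap whose expected value is a literal zero, e.g. a
   try-lock such as

     long expected = 0;
     ok = __atomic_compare_exchange_n (&lock, &expected, 1, 0,
                                       __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);

   where OLDVAL expands to const0_rtx.  */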
14052 label1 = NULL;
14053 if (!is_weak)
14055 label1 = gen_label_rtx ();
14056 emit_label (label1);
14058 label2 = gen_label_rtx ();
14060 /* The initial load can be relaxed for a __sync operation since a final
14061 barrier will be emitted to stop code hoisting. */
14062 if (is_mm_sync (model))
14063 aarch64_emit_load_exclusive (mode, rval, mem,
14064 GEN_INT (MEMMODEL_RELAXED));
14065 else
14066 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14068 if (strong_zero_p)
14070 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14071 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14072 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14073 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14075 else
14077 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14078 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14079 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14080 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14081 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14084 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14086 if (!is_weak)
14088 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14089 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14090 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14091 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14093 else
14095 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14096 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14097 emit_insn (gen_rtx_SET (cond, x));
14100 emit_label (label2);
14101 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14102 to set the condition flags. If this is not used it will be removed by
14103 later passes. */
14104 if (strong_zero_p)
14106 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14107 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14108 emit_insn (gen_rtx_SET (cond, x));
14110 /* Emit any final barrier needed for a __sync operation. */
14111 if (is_mm_sync (model))
14112 aarch64_emit_post_barrier (model);
14115 /* Emit a BIC instruction. */
14117 static void
14118 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14120 rtx shift_rtx = GEN_INT (shift);
14121 rtx (*gen) (rtx, rtx, rtx, rtx);
14123 switch (mode)
14125 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14126 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14127 default:
14128 gcc_unreachable ();
14131 emit_insn (gen (dst, s2, shift_rtx, s1));
14134 /* Emit an atomic swap. */
14136 static void
14137 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14138 rtx mem, rtx model)
14140 rtx (*gen) (rtx, rtx, rtx, rtx);
14142 switch (mode)
14144 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14145 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14146 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14147 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14148 default:
14149 gcc_unreachable ();
14152 emit_insn (gen (dst, mem, value, model));
14155 /* Operations supported by aarch64_emit_atomic_load_op. */
14157 enum aarch64_atomic_load_op_code
14159 AARCH64_LDOP_PLUS, /* A + B */
14160 AARCH64_LDOP_XOR, /* A ^ B */
14161 AARCH64_LDOP_OR, /* A | B */
14162 AARCH64_LDOP_BIC /* A & ~B */
14165 /* Emit an atomic load-operate. */
14167 static void
14168 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14169 machine_mode mode, rtx dst, rtx src,
14170 rtx mem, rtx model)
14172 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14173 const aarch64_atomic_load_op_fn plus[] =
14175 gen_aarch64_atomic_loadaddqi,
14176 gen_aarch64_atomic_loadaddhi,
14177 gen_aarch64_atomic_loadaddsi,
14178 gen_aarch64_atomic_loadadddi
14180 const aarch64_atomic_load_op_fn eor[] =
14182 gen_aarch64_atomic_loadeorqi,
14183 gen_aarch64_atomic_loadeorhi,
14184 gen_aarch64_atomic_loadeorsi,
14185 gen_aarch64_atomic_loadeordi
14187 const aarch64_atomic_load_op_fn ior[] =
14189 gen_aarch64_atomic_loadsetqi,
14190 gen_aarch64_atomic_loadsethi,
14191 gen_aarch64_atomic_loadsetsi,
14192 gen_aarch64_atomic_loadsetdi
14194 const aarch64_atomic_load_op_fn bic[] =
14196 gen_aarch64_atomic_loadclrqi,
14197 gen_aarch64_atomic_loadclrhi,
14198 gen_aarch64_atomic_loadclrsi,
14199 gen_aarch64_atomic_loadclrdi
14201 aarch64_atomic_load_op_fn gen;
14202 int idx = 0;
14204 switch (mode)
14206 case E_QImode: idx = 0; break;
14207 case E_HImode: idx = 1; break;
14208 case E_SImode: idx = 2; break;
14209 case E_DImode: idx = 3; break;
14210 default:
14211 gcc_unreachable ();
14214 switch (code)
14216 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14217 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14218 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14219 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14220 default:
14221 gcc_unreachable ();
14224 emit_insn (gen (dst, mem, src, model));
14227 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14228 location to store the data read from memory. OUT_RESULT is the location to
14229 store the result of the operation. MEM is the memory location to read and
14230 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14231 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14232 be NULL. */
14234 void
14235 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14236 rtx mem, rtx value, rtx model_rtx)
14238 machine_mode mode = GET_MODE (mem);
14239 machine_mode wmode = (mode == DImode ? DImode : SImode);
14240 const bool short_mode = (mode < SImode);
14241 aarch64_atomic_load_op_code ldop_code;
14242 rtx src;
14243 rtx x;
14245 if (out_data)
14246 out_data = gen_lowpart (mode, out_data);
14248 if (out_result)
14249 out_result = gen_lowpart (mode, out_result);
14251 /* Make sure the value is in a register, putting it into a destination
14252 register if it needs to be manipulated. */
14253 if (!register_operand (value, mode)
14254 || code == AND || code == MINUS)
14256 src = out_result ? out_result : out_data;
14257 emit_move_insn (src, gen_lowpart (mode, value));
14259 else
14260 src = value;
14261 gcc_assert (register_operand (src, mode));
14263 /* Preprocess the data for the operation as necessary. If the operation is
14264 a SET then emit a swap instruction and finish. */
14265 switch (code)
14267 case SET:
14268 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14269 return;
14271 case MINUS:
14272 /* Negate the value and treat it as a PLUS. */
14274 rtx neg_src;
14276 /* Resize the value if necessary. */
14277 if (short_mode)
14278 src = gen_lowpart (wmode, src);
14280 neg_src = gen_rtx_NEG (wmode, src);
14281 emit_insn (gen_rtx_SET (src, neg_src));
14283 if (short_mode)
14284 src = gen_lowpart (mode, src);
14286 /* Fall-through. */
14287 case PLUS:
14288 ldop_code = AARCH64_LDOP_PLUS;
14289 break;
14291 case IOR:
14292 ldop_code = AARCH64_LDOP_OR;
14293 break;
14295 case XOR:
14296 ldop_code = AARCH64_LDOP_XOR;
14297 break;
14299 case AND:
14301 rtx not_src;
14303 /* Resize the value if necessary. */
14304 if (short_mode)
14305 src = gen_lowpart (wmode, src);
14307 not_src = gen_rtx_NOT (wmode, src);
14308 emit_insn (gen_rtx_SET (src, not_src));
14310 if (short_mode)
14311 src = gen_lowpart (mode, src);
14313 ldop_code = AARCH64_LDOP_BIC;
14314 break;
14316 default:
14317 /* The operation can't be done with atomic instructions. */
14318 gcc_unreachable ();
14321 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14323 /* If necessary, calculate the data in memory after the update by redoing the
14324 operation from values in registers. */
14325 if (!out_result)
14326 return;
14328 if (short_mode)
14330 src = gen_lowpart (wmode, src);
14331 out_data = gen_lowpart (wmode, out_data);
14332 out_result = gen_lowpart (wmode, out_result);
14335 x = NULL_RTX;
14337 switch (code)
14339 case MINUS:
14340 case PLUS:
14341 x = gen_rtx_PLUS (wmode, out_data, src);
14342 break;
14343 case IOR:
14344 x = gen_rtx_IOR (wmode, out_data, src);
14345 break;
14346 case XOR:
14347 x = gen_rtx_XOR (wmode, out_data, src);
14348 break;
14349 case AND:
14350 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14351 return;
14352 default:
14353 gcc_unreachable ();
14356 emit_set_insn (out_result, x);
14358 return;
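/* A minimal, self-contained sketch (illustrative only, not used by the
   compiler) of the two identities exploited above when mapping MINUS and
   AND onto the LSE load-operate instructions: an atomic fetch-subtract is
   a fetch-add of the negated value, and an atomic fetch-and is a
   "load-clear" (old & ~src) of the complemented value.  The sketches model
   only the data operation, not the atomicity.  */

static unsigned int
sketch_ldadd (unsigned int *mem, unsigned int src)
{
  unsigned int old = *mem;
  *mem = old + src;		/* LDADD data operation.  */
  return old;
}

static unsigned int
sketch_ldclr (unsigned int *mem, unsigned int src)
{
  unsigned int old = *mem;
  *mem = old & ~src;		/* LDCLR data operation.  */
  return old;
}

/* fetch_sub (mem, v) == sketch_ldadd (mem, -v);
   fetch_and (mem, v) == sketch_ldclr (mem, ~v).  */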
14361 /* Split an atomic operation. */
14363 void
14364 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14365 rtx value, rtx model_rtx, rtx cond)
14367 machine_mode mode = GET_MODE (mem);
14368 machine_mode wmode = (mode == DImode ? DImode : SImode);
14369 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14370 const bool is_sync = is_mm_sync (model);
14371 rtx_code_label *label;
14372 rtx x;
14374 /* Split the atomic operation into a sequence. */
14375 label = gen_label_rtx ();
14376 emit_label (label);
14378 if (new_out)
14379 new_out = gen_lowpart (wmode, new_out);
14380 if (old_out)
14381 old_out = gen_lowpart (wmode, old_out);
14382 else
14383 old_out = new_out;
14384 value = simplify_gen_subreg (wmode, value, mode, 0);
14386 /* The initial load can be relaxed for a __sync operation since a final
14387 barrier will be emitted to stop code hoisting. */
14388 if (is_sync)
14389 aarch64_emit_load_exclusive (mode, old_out, mem,
14390 GEN_INT (MEMMODEL_RELAXED));
14391 else
14392 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14394 switch (code)
14396 case SET:
14397 new_out = value;
14398 break;
14400 case NOT:
14401 x = gen_rtx_AND (wmode, old_out, value);
14402 emit_insn (gen_rtx_SET (new_out, x));
14403 x = gen_rtx_NOT (wmode, new_out);
14404 emit_insn (gen_rtx_SET (new_out, x));
14405 break;
14407 case MINUS:
14408 if (CONST_INT_P (value))
14410 value = GEN_INT (-INTVAL (value));
14411 code = PLUS;
14413 /* Fall through. */
14415 default:
14416 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14417 emit_insn (gen_rtx_SET (new_out, x));
14418 break;
14421 aarch64_emit_store_exclusive (mode, cond, mem,
14422 gen_lowpart (mode, new_out), model_rtx);
14424 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14425 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14426 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14427 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14429 /* Emit any final barrier needed for a __sync operation. */
14430 if (is_sync)
14431 aarch64_emit_post_barrier (model);
14434 static void
14435 aarch64_init_libfuncs (void)
14437 /* Half-precision float operations. The compiler handles all operations
14438 with NULL libfuncs by converting to SFmode. */
14440 /* Conversions. */
14441 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14442 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14444 /* Arithmetic. */
14445 set_optab_libfunc (add_optab, HFmode, NULL);
14446 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14447 set_optab_libfunc (smul_optab, HFmode, NULL);
14448 set_optab_libfunc (neg_optab, HFmode, NULL);
14449 set_optab_libfunc (sub_optab, HFmode, NULL);
14451 /* Comparisons. */
14452 set_optab_libfunc (eq_optab, HFmode, NULL);
14453 set_optab_libfunc (ne_optab, HFmode, NULL);
14454 set_optab_libfunc (lt_optab, HFmode, NULL);
14455 set_optab_libfunc (le_optab, HFmode, NULL);
14456 set_optab_libfunc (ge_optab, HFmode, NULL);
14457 set_optab_libfunc (gt_optab, HFmode, NULL);
14458 set_optab_libfunc (unord_optab, HFmode, NULL);
14461 /* Target hook for c_mode_for_suffix. */
14462 static machine_mode
14463 aarch64_c_mode_for_suffix (char suffix)
14465 if (suffix == 'q')
14466 return TFmode;
14468 return VOIDmode;
14471 /* We can only represent floating point constants which will fit in
14472 "quarter-precision" values. These values are characterised by
14473 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14476 (-1)^s * (n/16) * 2^r
14478 Where:
14479 's' is the sign bit.
14480 'n' is an integer in the range 16 <= n <= 31.
14481 'r' is an integer in the range -3 <= r <= 4. */
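/* A small, self-contained sketch (illustrative only, independent of the
   bit-level checks below): a value is a valid "quarter-precision"
   immediate exactly when it matches the formula above for some n in
   [16, 31] and r in [-3, 4].  For example 0.5 (n = 16, r = -1) and
   31.0 (n = 31, r = 4) qualify, while 0.1 does not.  */

static int
sketch_quarter_precision_p (double x)
{
  if (x < 0)
    x = -x;
  for (int r = -3; r <= 4; r++)
    for (int n = 16; n <= 31; n++)
      {
        double candidate = (double) n / 16.0;
        for (int i = 0; i < r; i++)
          candidate *= 2.0;
        for (int i = r; i < 0; i++)
          candidate /= 2.0;
        if (x == candidate)
          return 1;
      }
  return 0;
}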
14483 /* Return true iff X can be represented by a quarter-precision
14484 floating point immediate operand X. Note, we cannot represent 0.0. */
14485 bool
14486 aarch64_float_const_representable_p (rtx x)
14488 /* This represents our current view of how many bits
14489 make up the mantissa. */
14490 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14491 int exponent;
14492 unsigned HOST_WIDE_INT mantissa, mask;
14493 REAL_VALUE_TYPE r, m;
14494 bool fail;
14496 if (!CONST_DOUBLE_P (x))
14497 return false;
14499 /* We don't support HFmode constants yet. */
14500 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14501 return false;
14503 r = *CONST_DOUBLE_REAL_VALUE (x);
14505 /* We cannot represent infinities, NaNs or +/-zero. We won't
14506 know if we have +zero until we analyse the mantissa, but we
14507 can reject the other invalid values. */
14508 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14509 || REAL_VALUE_MINUS_ZERO (r))
14510 return false;
14512 /* Extract exponent. */
14513 r = real_value_abs (&r);
14514 exponent = REAL_EXP (&r);
14516 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14517 highest (sign) bit, with a fixed binary point at bit point_pos.
14518 m1 holds the low part of the mantissa, m2 the high part.
14519 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14520 bits for the mantissa, this can fail (low bits will be lost). */
14521 real_ldexp (&m, &r, point_pos - exponent);
14522 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14524 /* If the low part of the mantissa has bits set we cannot represent
14525 the value. */
14526 if (w.ulow () != 0)
14527 return false;
14528 /* We have rejected the lower HOST_WIDE_INT, so update our
14529 understanding of how many bits lie in the mantissa and
14530 look only at the high HOST_WIDE_INT. */
14531 mantissa = w.elt (1);
14532 point_pos -= HOST_BITS_PER_WIDE_INT;
14534 /* We can only represent values with a mantissa of the form 1.xxxx. */
14535 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14536 if ((mantissa & mask) != 0)
14537 return false;
14539 /* Having filtered unrepresentable values, we may now remove all
14540 but the highest 5 bits. */
14541 mantissa >>= point_pos - 5;
14543 /* We cannot represent the value 0.0, so reject it. This is handled
14544 elsewhere. */
14545 if (mantissa == 0)
14546 return false;
14548 /* Then, as bit 4 is always set, we can mask it off, leaving
14549 the mantissa in the range [0, 15]. */
14550 mantissa &= ~(1 << 4);
14551 gcc_assert (mantissa <= 15);
14553 /* GCC internally does not use IEEE754-like encoding (where normalized
14554 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14555 Our mantissa values are shifted 4 places to the left relative to
14556 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14557 by 5 places to correct for GCC's representation. */
14558 exponent = 5 - exponent;
14560 return (exponent >= 0 && exponent <= 7);
14563 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14564 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14565 output MOVI/MVNI, ORR or BIC immediate. */
14566 char*
14567 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14568 enum simd_immediate_check which)
14570 bool is_valid;
14571 static char templ[40];
14572 const char *mnemonic;
14573 const char *shift_op;
14574 unsigned int lane_count = 0;
14575 char element_char;
14577 struct simd_immediate_info info;
14579 /* This will return true to show const_vector is legal for use as either
14580 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14581 It will also update INFO to show how the immediate should be generated.
14582 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14583 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14584 gcc_assert (is_valid);
14586 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14587 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14589 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14591 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14592 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14593 move immediate path. */
14594 if (aarch64_float_const_zero_rtx_p (info.value))
14595 info.value = GEN_INT (0);
14596 else
14598 const unsigned int buf_size = 20;
14599 char float_buf[buf_size] = {'\0'};
14600 real_to_decimal_for_mode (float_buf,
14601 CONST_DOUBLE_REAL_VALUE (info.value),
14602 buf_size, buf_size, 1, info.elt_mode);
14604 if (lane_count == 1)
14605 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14606 else
14607 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14608 lane_count, element_char, float_buf);
14609 return templ;
14613 gcc_assert (CONST_INT_P (info.value));
14615 if (which == AARCH64_CHECK_MOV)
14617 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14618 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14619 if (lane_count == 1)
14620 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14621 mnemonic, UINTVAL (info.value));
14622 else if (info.shift)
14623 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14624 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14625 element_char, UINTVAL (info.value), shift_op, info.shift);
14626 else
14627 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14628 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14629 element_char, UINTVAL (info.value));
14631 else
14633 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14634 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14635 if (info.shift)
14636 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14637 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14638 element_char, UINTVAL (info.value), "lsl", info.shift);
14639 else
14640 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14641 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14642 element_char, UINTVAL (info.value));
14644 return templ;
14647 char*
14648 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14651 /* If a floating point number was passed and we desire to use it in an
14652 integer mode do the conversion to integer. */
14653 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14655 unsigned HOST_WIDE_INT ival;
14656 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14657 gcc_unreachable ();
14658 immediate = gen_int_mode (ival, mode);
14661 machine_mode vmode;
14662 /* Use a 64-bit vector mode for everything except DImode/DFmode, where
14663 we use a 128-bit vector mode. */
14664 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14666 vmode = aarch64_simd_container_mode (mode, width);
14667 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14668 return aarch64_output_simd_mov_immediate (v_op, width);
14671 /* Return the output string to use for moving immediate CONST_VECTOR
14672 into an SVE register. */
14674 char *
14675 aarch64_output_sve_mov_immediate (rtx const_vector)
14677 static char templ[40];
14678 struct simd_immediate_info info;
14679 char element_char;
14681 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14682 gcc_assert (is_valid);
14684 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14686 if (info.step)
14688 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14689 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14690 element_char, INTVAL (info.value), INTVAL (info.step));
14691 return templ;
14694 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14696 if (aarch64_float_const_zero_rtx_p (info.value))
14697 info.value = GEN_INT (0);
14698 else
14700 const int buf_size = 20;
14701 char float_buf[buf_size] = {};
14702 real_to_decimal_for_mode (float_buf,
14703 CONST_DOUBLE_REAL_VALUE (info.value),
14704 buf_size, buf_size, 1, info.elt_mode);
14706 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14707 element_char, float_buf);
14708 return templ;
14712 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14713 element_char, INTVAL (info.value));
14714 return templ;
14717 /* Return the asm format for a PTRUE instruction whose destination has
14718 mode MODE. SUFFIX is the element size suffix. */
14720 char *
14721 aarch64_output_ptrue (machine_mode mode, char suffix)
14723 unsigned int nunits;
14724 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
14725 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
14726 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
14727 else
14728 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
14729 return buf;
14732 /* Split operands into moves from op[1] + op[2] into op[0]. */
14734 void
14735 aarch64_split_combinev16qi (rtx operands[3])
14737 unsigned int dest = REGNO (operands[0]);
14738 unsigned int src1 = REGNO (operands[1]);
14739 unsigned int src2 = REGNO (operands[2]);
14740 machine_mode halfmode = GET_MODE (operands[1]);
14741 unsigned int halfregs = REG_NREGS (operands[1]);
14742 rtx destlo, desthi;
14744 gcc_assert (halfmode == V16QImode);
14746 if (src1 == dest && src2 == dest + halfregs)
14748 /* No-op move. Can't split to nothing; emit something. */
14749 emit_note (NOTE_INSN_DELETED);
14750 return;
14753 /* Preserve register attributes for variable tracking. */
14754 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
14755 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
14756 GET_MODE_SIZE (halfmode));
14758 /* Special case of reversed high/low parts. */
14759 if (reg_overlap_mentioned_p (operands[2], destlo)
14760 && reg_overlap_mentioned_p (operands[1], desthi))
14762 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14763 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
14764 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14766 else if (!reg_overlap_mentioned_p (operands[2], destlo))
14768 /* Try to avoid unnecessary moves if part of the result
14769 is in the right place already. */
14770 if (src1 != dest)
14771 emit_move_insn (destlo, operands[1]);
14772 if (src2 != dest + halfregs)
14773 emit_move_insn (desthi, operands[2]);
14775 else
14777 if (src2 != dest + halfregs)
14778 emit_move_insn (desthi, operands[2]);
14779 if (src1 != dest)
14780 emit_move_insn (destlo, operands[1]);
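/* A minimal scalar model (illustrative only) of the three-EOR sequence used
   above for the reversed high/low case: the standard XOR-swap identity
   exchanges two registers without needing a scratch, which is exactly what
   the three gen_xorv16qi3 instructions do for the two vector halves.  */

static void
sketch_xor_swap (unsigned long long *a, unsigned long long *b)
{
  *a ^= *b;	/* a = a0 ^ b0 */
  *b ^= *a;	/* b = b0 ^ (a0 ^ b0) = a0 */
  *a ^= *b;	/* a = (a0 ^ b0) ^ a0 = b0 */
}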
14784 /* vec_perm support. */
14786 struct expand_vec_perm_d
14788 rtx target, op0, op1;
14789 vec_perm_indices perm;
14790 machine_mode vmode;
14791 unsigned int vec_flags;
14792 bool one_vector_p;
14793 bool testing_p;
14796 /* Generate a variable permutation. */
14798 static void
14799 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
14801 machine_mode vmode = GET_MODE (target);
14802 bool one_vector_p = rtx_equal_p (op0, op1);
14804 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
14805 gcc_checking_assert (GET_MODE (op0) == vmode);
14806 gcc_checking_assert (GET_MODE (op1) == vmode);
14807 gcc_checking_assert (GET_MODE (sel) == vmode);
14808 gcc_checking_assert (TARGET_SIMD);
14810 if (one_vector_p)
14812 if (vmode == V8QImode)
14814 /* Expand the argument to a V16QI mode by duplicating it. */
14815 rtx pair = gen_reg_rtx (V16QImode);
14816 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
14817 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14819 else
14821 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
14824 else
14826 rtx pair;
14828 if (vmode == V8QImode)
14830 pair = gen_reg_rtx (V16QImode);
14831 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
14832 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14834 else
14836 pair = gen_reg_rtx (OImode);
14837 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
14838 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
14843 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
14844 NELT is the number of elements in the vector. */
14846 void
14847 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
14848 unsigned int nelt)
14850 machine_mode vmode = GET_MODE (target);
14851 bool one_vector_p = rtx_equal_p (op0, op1);
14852 rtx mask;
14854 /* The TBL instruction does not use a modulo index, so we must take care
14855 of that ourselves. */
14856 mask = aarch64_simd_gen_const_vector_dup (vmode,
14857 one_vector_p ? nelt - 1 : 2 * nelt - 1);
14858 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
14860 /* For big-endian, we also need to reverse the index within the vector
14861 (but not which vector). */
14862 if (BYTES_BIG_ENDIAN)
14864 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
14865 if (!one_vector_p)
14866 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
14867 sel = expand_simple_binop (vmode, XOR, sel, mask,
14868 NULL, 0, OPTAB_LIB_WIDEN);
14870 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
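/* A minimal scalar sketch (illustrative only) of the selector adjustment
   performed above: each index is reduced modulo the number of selectable
   elements with an AND mask, and on big-endian it is additionally reversed
   within its input vector by XORing with NELT - 1.  */

static unsigned int
sketch_tbl_index (unsigned int index, unsigned int nelt,
                  int one_vector_p, int big_endian_p)
{
  unsigned int mask = one_vector_p ? nelt - 1 : 2 * nelt - 1;
  index &= mask;		/* TBL has no modulo wrap of its own.  */
  if (big_endian_p)
    index ^= nelt - 1;		/* Reverse the lane within its vector.  */
  return index;
}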
14873 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
14875 static void
14876 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
14878 emit_insn (gen_rtx_SET (target,
14879 gen_rtx_UNSPEC (GET_MODE (target),
14880 gen_rtvec (2, op0, op1), code)));
14883 /* Expand an SVE vec_perm with the given operands. */
14885 void
14886 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
14888 machine_mode data_mode = GET_MODE (target);
14889 machine_mode sel_mode = GET_MODE (sel);
14890 /* Enforced by the pattern condition. */
14891 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
14893 /* Note: vec_perm indices are supposed to wrap when they go beyond the
14894 size of the two value vectors, i.e. the upper bits of the indices
14895 are effectively ignored. SVE TBL instead produces 0 for any
14896 out-of-range indices, so we need to modulo all the vec_perm indices
14897 to ensure they are all in range. */
14898 rtx sel_reg = force_reg (sel_mode, sel);
14900 /* Check if the sel only references the first values vector. */
14901 if (GET_CODE (sel) == CONST_VECTOR
14902 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
14904 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
14905 return;
14908 /* Check if the two values vectors are the same. */
14909 if (rtx_equal_p (op0, op1))
14911 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
14912 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
14913 NULL, 0, OPTAB_DIRECT);
14914 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
14915 return;
14918 /* Run TBL on each value vector and combine the results. */
14920 rtx res0 = gen_reg_rtx (data_mode);
14921 rtx res1 = gen_reg_rtx (data_mode);
14922 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
14923 if (GET_CODE (sel) != CONST_VECTOR
14924 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
14926 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
14927 2 * nunits - 1);
14928 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
14929 NULL, 0, OPTAB_DIRECT);
14931 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
14932 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
14933 NULL, 0, OPTAB_DIRECT);
14934 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
14935 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
14936 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
14937 else
14938 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
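/* A minimal scalar model (illustrative only) of the general two-vector case
   above.  SVE TBL yields zero for out-of-range indices, so once the
   selector has been reduced modulo 2 * NUNITS, each lane can be formed as
   TBL (op0, i) | TBL (op1, i - NUNITS): exactly one of the two lookups is
   in range and the other contributes zero.  */

static unsigned int
sketch_sve_tbl_lane (const unsigned int *op0, const unsigned int *op1,
                     unsigned int nunits, unsigned int sel_elt)
{
  unsigned int i = sel_elt % (2 * nunits);
  unsigned int from_op0 = i < nunits ? op0[i] : 0;
  unsigned int from_op1 = i >= nunits ? op1[i - nunits] : 0;
  return from_op0 | from_op1;
}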
14941 /* Recognize patterns suitable for the TRN instructions. */
14942 static bool
14943 aarch64_evpc_trn (struct expand_vec_perm_d *d)
14945 HOST_WIDE_INT odd;
14946 poly_uint64 nelt = d->perm.length ();
14947 rtx out, in0, in1, x;
14948 machine_mode vmode = d->vmode;
14950 if (GET_MODE_UNIT_SIZE (vmode) > 8)
14951 return false;
14953 /* Note that these are little-endian tests.
14954 We correct for big-endian later. */
14955 if (!d->perm[0].is_constant (&odd)
14956 || (odd != 0 && odd != 1)
14957 || !d->perm.series_p (0, 2, odd, 2)
14958 || !d->perm.series_p (1, 2, nelt + odd, 2))
14959 return false;
14961 /* Success! */
14962 if (d->testing_p)
14963 return true;
14965 in0 = d->op0;
14966 in1 = d->op1;
14967 /* We don't need a big-endian lane correction for SVE; see the comment
14968 at the head of aarch64-sve.md for details. */
14969 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
14971 x = in0, in0 = in1, in1 = x;
14972 odd = !odd;
14974 out = d->target;
14976 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
14977 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
14978 return true;
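/* Illustrative only: the selector shape recognised above.  For NELT
   elements per vector, TRN1 is { 0, NELT, 2, NELT + 2, ... } and TRN2 is
   { 1, NELT + 1, 3, NELT + 3, ... }; e.g. for NELT == 4 these are
   { 0, 4, 2, 6 } and { 1, 5, 3, 7 }.  A scalar generator for the pattern
   checked by the series_p calls:  */

static unsigned int
sketch_trn_index (unsigned int i, unsigned int nelt, unsigned int odd)
{
  return (i & 1) ? nelt + odd + (i - 1) : odd + i;
}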
14981 /* Recognize patterns suitable for the UZP instructions. */
14982 static bool
14983 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
14985 HOST_WIDE_INT odd;
14986 rtx out, in0, in1, x;
14987 machine_mode vmode = d->vmode;
14989 if (GET_MODE_UNIT_SIZE (vmode) > 8)
14990 return false;
14992 /* Note that these are little-endian tests.
14993 We correct for big-endian later. */
14994 if (!d->perm[0].is_constant (&odd)
14995 || (odd != 0 && odd != 1)
14996 || !d->perm.series_p (0, 1, odd, 2))
14997 return false;
14999 /* Success! */
15000 if (d->testing_p)
15001 return true;
15003 in0 = d->op0;
15004 in1 = d->op1;
15005 /* We don't need a big-endian lane correction for SVE; see the comment
15006 at the head of aarch64-sve.md for details. */
15007 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15009 x = in0, in0 = in1, in1 = x;
15010 odd = !odd;
15012 out = d->target;
15014 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15015 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15016 return true;
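/* Illustrative only: the selector shape recognised above is simply
   { ODD, ODD + 2, ODD + 4, ... }, i.e. every second element; for
   NELT == 4, UZP1 is { 0, 2, 4, 6 } and UZP2 is { 1, 3, 5, 7 }.  */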
15019 /* Recognize patterns suitable for the ZIP instructions. */
15020 static bool
15021 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15023 unsigned int high;
15024 poly_uint64 nelt = d->perm.length ();
15025 rtx out, in0, in1, x;
15026 machine_mode vmode = d->vmode;
15028 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15029 return false;
15031 /* Note that these are little-endian tests.
15032 We correct for big-endian later. */
15033 poly_uint64 first = d->perm[0];
15034 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15035 || !d->perm.series_p (0, 2, first, 1)
15036 || !d->perm.series_p (1, 2, first + nelt, 1))
15037 return false;
15038 high = maybe_ne (first, 0U);
15040 /* Success! */
15041 if (d->testing_p)
15042 return true;
15044 in0 = d->op0;
15045 in1 = d->op1;
15046 /* We don't need a big-endian lane correction for SVE; see the comment
15047 at the head of aarch64-sve.md for details. */
15048 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15050 x = in0, in0 = in1, in1 = x;
15051 high = !high;
15053 out = d->target;
15055 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15056 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15057 return true;
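/* Illustrative only: with FIRST either 0 (ZIP1) or NELT / 2 (ZIP2), the
   selector shape recognised above is
   { FIRST, FIRST + NELT, FIRST + 1, FIRST + NELT + 1, ... }; e.g. for
   NELT == 4, ZIP1 is { 0, 4, 1, 5 } and ZIP2 is { 2, 6, 3, 7 }.  A scalar
   generator for the pattern:  */

static unsigned int
sketch_zip_index (unsigned int i, unsigned int nelt, unsigned int first)
{
  return first + i / 2 + ((i & 1) ? nelt : 0);
}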
15060 /* Recognize patterns for the EXT insn. */
15062 static bool
15063 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15065 HOST_WIDE_INT location;
15066 rtx offset;
15068 /* The first element always refers to the first vector.
15069 Check if the extracted indices are increasing by one. */
15070 if (d->vec_flags == VEC_SVE_PRED
15071 || !d->perm[0].is_constant (&location)
15072 || !d->perm.series_p (0, 1, location, 1))
15073 return false;
15075 /* Success! */
15076 if (d->testing_p)
15077 return true;
15079 /* The case where (location == 0) is a no-op for both big- and little-endian,
15080 and is removed by the mid-end at optimization levels -O1 and higher.
15082 We don't need a big-endian lane correction for SVE; see the comment
15083 at the head of aarch64-sve.md for details. */
15084 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15086 /* After setup, we want the high elements of the first vector (stored
15087 at the LSB end of the register), and the low elements of the second
15088 vector (stored at the MSB end of the register). So swap. */
15089 std::swap (d->op0, d->op1);
15090 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15091 to_constant () is safe since this is restricted to Advanced SIMD
15092 vectors. */
15093 location = d->perm.length ().to_constant () - location;
15096 offset = GEN_INT (location);
15097 emit_set_insn (d->target,
15098 gen_rtx_UNSPEC (d->vmode,
15099 gen_rtvec (3, d->op0, d->op1, offset),
15100 UNSPEC_EXT));
15101 return true;
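/* Illustrative only: the selector recognised above is a run of consecutive
   indices starting at LOCATION, taking the tail of the first vector
   followed by the head of the second; e.g. for NELT == 4 and LOCATION == 1
   the selector is { 1, 2, 3, 4 }.  */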
15104 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15105 within each 64-bit, 32-bit or 16-bit granule. */
15107 static bool
15108 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15110 HOST_WIDE_INT diff;
15111 unsigned int i, size, unspec;
15112 machine_mode pred_mode;
15114 if (d->vec_flags == VEC_SVE_PRED
15115 || !d->one_vector_p
15116 || !d->perm[0].is_constant (&diff))
15117 return false;
15119 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15120 if (size == 8)
15122 unspec = UNSPEC_REV64;
15123 pred_mode = VNx2BImode;
15125 else if (size == 4)
15127 unspec = UNSPEC_REV32;
15128 pred_mode = VNx4BImode;
15130 else if (size == 2)
15132 unspec = UNSPEC_REV16;
15133 pred_mode = VNx8BImode;
15135 else
15136 return false;
15138 unsigned int step = diff + 1;
15139 for (i = 0; i < step; ++i)
15140 if (!d->perm.series_p (i, step, diff - i, step))
15141 return false;
15143 /* Success! */
15144 if (d->testing_p)
15145 return true;
15147 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15148 if (d->vec_flags == VEC_SVE_DATA)
15150 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15151 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15152 UNSPEC_MERGE_PTRUE);
15154 emit_set_insn (d->target, src);
15155 return true;
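/* Illustrative only: for byte elements, REV32 (SIZE == 4) corresponds to
   the selector { 3, 2, 1, 0, 7, 6, 5, 4, ... }: DIFF is 3, STEP is 4, and
   each 4-byte granule is reversed in place.  */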
15158 /* Recognize patterns for the REV insn, which reverses elements within
15159 a full vector. */
15161 static bool
15162 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15164 poly_uint64 nelt = d->perm.length ();
15166 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15167 return false;
15169 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15170 return false;
15172 /* Success! */
15173 if (d->testing_p)
15174 return true;
15176 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15177 emit_set_insn (d->target, src);
15178 return true;
15181 static bool
15182 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15184 rtx out = d->target;
15185 rtx in0;
15186 HOST_WIDE_INT elt;
15187 machine_mode vmode = d->vmode;
15188 rtx lane;
15190 if (d->vec_flags == VEC_SVE_PRED
15191 || d->perm.encoding ().encoded_nelts () != 1
15192 || !d->perm[0].is_constant (&elt))
15193 return false;
15195 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15196 return false;
15198 /* Success! */
15199 if (d->testing_p)
15200 return true;
15202 /* The generic preparation in aarch64_expand_vec_perm_const_1
15203 swaps the operand order and the permute indices if it finds
15204 d->perm[0] to be in the second operand. Thus, we can always
15205 use d->op0 and need not do any extra arithmetic to get the
15206 correct lane number. */
15207 in0 = d->op0;
15208 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15210 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15211 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15212 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15213 return true;
15216 static bool
15217 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15219 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15220 machine_mode vmode = d->vmode;
15222 /* Make sure that the indices are constant. */
15223 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15224 for (unsigned int i = 0; i < encoded_nelts; ++i)
15225 if (!d->perm[i].is_constant ())
15226 return false;
15228 if (d->testing_p)
15229 return true;
15231 /* Generic code will try constant permutation twice. Once with the
15232 original mode and again with the elements lowered to QImode.
15233 So wait and don't do the selector expansion ourselves. */
15234 if (vmode != V8QImode && vmode != V16QImode)
15235 return false;
15237 /* to_constant is safe since this routine is specific to Advanced SIMD
15238 vectors. */
15239 unsigned int nelt = d->perm.length ().to_constant ();
15240 for (unsigned int i = 0; i < nelt; ++i)
15241 /* If big-endian and two vectors we end up with a weird mixed-endian
15242 mode on NEON. Reverse the index within each word but not the word
15243 itself. to_constant is safe because we checked is_constant above. */
15244 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15245 ? d->perm[i].to_constant () ^ (nelt - 1)
15246 : d->perm[i].to_constant ());
15248 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15249 sel = force_reg (vmode, sel);
15251 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15252 return true;
15255 /* Try to implement D using an SVE TBL instruction. */
15257 static bool
15258 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15260 unsigned HOST_WIDE_INT nelt;
15262 /* Permuting two variable-length vectors could overflow the
15263 index range. */
15264 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15265 return false;
15267 if (d->testing_p)
15268 return true;
15270 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15271 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15272 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15273 return true;
15276 static bool
15277 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15279 /* The pattern matching functions above are written to look for a small
15280 number to begin the sequence (0, 1, N/2). If we begin with an index
15281 from the second operand, we can swap the operands. */
15282 poly_int64 nelt = d->perm.length ();
15283 if (known_ge (d->perm[0], nelt))
15285 d->perm.rotate_inputs (1);
15286 std::swap (d->op0, d->op1);
15289 if ((d->vec_flags == VEC_ADVSIMD
15290 || d->vec_flags == VEC_SVE_DATA
15291 || d->vec_flags == VEC_SVE_PRED)
15292 && known_gt (nelt, 1))
15294 if (aarch64_evpc_rev_local (d))
15295 return true;
15296 else if (aarch64_evpc_rev_global (d))
15297 return true;
15298 else if (aarch64_evpc_ext (d))
15299 return true;
15300 else if (aarch64_evpc_dup (d))
15301 return true;
15302 else if (aarch64_evpc_zip (d))
15303 return true;
15304 else if (aarch64_evpc_uzp (d))
15305 return true;
15306 else if (aarch64_evpc_trn (d))
15307 return true;
15308 if (d->vec_flags == VEC_SVE_DATA)
15309 return aarch64_evpc_sve_tbl (d);
15310 else if (d->vec_flags == VEC_ADVSIMD)
15311 return aarch64_evpc_tbl (d);
15313 return false;
15316 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15318 static bool
15319 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15320 rtx op1, const vec_perm_indices &sel)
15322 struct expand_vec_perm_d d;
15324 /* Check whether the mask can be applied to a single vector. */
15325 if (op0 && rtx_equal_p (op0, op1))
15326 d.one_vector_p = true;
15327 else if (sel.all_from_input_p (0))
15329 d.one_vector_p = true;
15330 op1 = op0;
15332 else if (sel.all_from_input_p (1))
15334 d.one_vector_p = true;
15335 op0 = op1;
15337 else
15338 d.one_vector_p = false;
15340 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15341 sel.nelts_per_input ());
15342 d.vmode = vmode;
15343 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15344 d.target = target;
15345 d.op0 = op0;
15346 d.op1 = op1;
15347 d.testing_p = !target;
15349 if (!d.testing_p)
15350 return aarch64_expand_vec_perm_const_1 (&d);
15352 rtx_insn *last = get_last_insn ();
15353 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15354 gcc_assert (last == get_last_insn ());
15356 return ret;
15359 /* Generate a byte permute mask for a register of mode MODE,
15360 which has NUNITS units. */
15363 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15365 /* We have to reverse each vector because we don't have
15366 a permuted load that can reverse-load according to ABI rules. */
15367 rtx mask;
15368 rtvec v = rtvec_alloc (16);
15369 unsigned int i, j;
15370 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15372 gcc_assert (BYTES_BIG_ENDIAN);
15373 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15375 for (i = 0; i < nunits; i++)
15376 for (j = 0; j < usize; j++)
15377 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15378 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15379 return force_reg (V16QImode, mask);
15382 /* Return true if X is a valid second operand for the SVE instruction
15383 that implements integer comparison OP_CODE. */
15385 static bool
15386 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15388 if (register_operand (x, VOIDmode))
15389 return true;
15391 switch (op_code)
15393 case LTU:
15394 case LEU:
15395 case GEU:
15396 case GTU:
15397 return aarch64_sve_cmp_immediate_p (x, false);
15398 case LT:
15399 case LE:
15400 case GE:
15401 case GT:
15402 case NE:
15403 case EQ:
15404 return aarch64_sve_cmp_immediate_p (x, true);
15405 default:
15406 gcc_unreachable ();
15410 /* Return the UNSPEC_COND_* code for comparison CODE. */
15412 static unsigned int
15413 aarch64_unspec_cond_code (rtx_code code)
15415 switch (code)
15417 case NE:
15418 return UNSPEC_COND_NE;
15419 case EQ:
15420 return UNSPEC_COND_EQ;
15421 case LT:
15422 return UNSPEC_COND_LT;
15423 case GT:
15424 return UNSPEC_COND_GT;
15425 case LE:
15426 return UNSPEC_COND_LE;
15427 case GE:
15428 return UNSPEC_COND_GE;
15429 case LTU:
15430 return UNSPEC_COND_LO;
15431 case GTU:
15432 return UNSPEC_COND_HI;
15433 case LEU:
15434 return UNSPEC_COND_LS;
15435 case GEU:
15436 return UNSPEC_COND_HS;
15437 case UNORDERED:
15438 return UNSPEC_COND_UO;
15439 default:
15440 gcc_unreachable ();
15444 /* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
15445 where <X> is the operation associated with comparison CODE. */
15447 static rtx
15448 aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
15449 rtx pred, rtx op0, rtx op1)
15451 rtvec vec = gen_rtvec (3, pred, op0, op1);
15452 return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
15455 /* Expand an SVE integer comparison:
15457 TARGET = CODE (OP0, OP1). */
15459 void
15460 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15462 machine_mode pred_mode = GET_MODE (target);
15463 machine_mode data_mode = GET_MODE (op0);
15465 if (!aarch64_sve_cmp_operand_p (code, op1))
15466 op1 = force_reg (data_mode, op1);
15468 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15469 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
15470 emit_insn (gen_set_clobber_cc (target, unspec));
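/* For example (illustrative only, register allocation will vary), a signed
   greater-than comparison of two VNx4SImode vectors expands to an
   UNSPEC_COND_GT governed by an all-true predicate and is eventually
   emitted as something like:

	ptrue	p1.s
	cmpgt	p0.s, p1/z, z0.s, z1.s

   The immediate form of the compare is used instead when OP1 is an
   immediate accepted by aarch64_sve_cmp_operand_p.  */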
15473 /* Emit an instruction:
15475 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15477 where <X> is the operation associated with comparison CODE. */
15479 static void
15480 aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
15481 rtx pred, rtx op0, rtx op1)
15483 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
15484 emit_set_insn (target, unspec);
15487 /* Emit:
15489 (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
15490 (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
15491 (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))
15493 where <Xi> is the operation associated with comparison CODEi. */
15495 static void
15496 aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
15497 machine_mode pred_mode, rtx ptrue,
15498 rtx op0, rtx op1)
15500 rtx tmp1 = gen_reg_rtx (pred_mode);
15501 aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
15502 rtx tmp2 = gen_reg_rtx (pred_mode);
15503 aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
15504 emit_set_insn (target, gen_rtx_AND (pred_mode,
15505 gen_rtx_IOR (pred_mode, tmp1, tmp2),
15506 ptrue));
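/* A rough sketch of how this is used for an LTGT comparison of two
   VNx4SFmode vectors (illustrative assembly, not verbatim output):

	fcmgt	p1.s, p0/z, z1.s, z0.s	// OP0 < OP1
	fcmgt	p2.s, p0/z, z0.s, z1.s	// OP0 > OP1
	orr	p3.b, p0/z, p1.b, p2.b	// (TMP1 | TMP2) & PTRUE

   where p0 holds the all-true predicate PTRUE.  */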
15509 /* If CAN_INVERT_P, emit an instruction:
15511 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15513 where <X> is the operation associated with comparison CODE. Otherwise
15514 emit:
15516 (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15517 (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))
15519 where the second instruction sets TARGET to the inverse of TMP. */
15521 static void
15522 aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
15523 machine_mode pred_mode, rtx ptrue, rtx pred,
15524 rtx op0, rtx op1, bool can_invert_p)
15526 if (can_invert_p)
15527 aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
15528 else
15530 rtx tmp = gen_reg_rtx (pred_mode);
15531 aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
15532 emit_set_insn (target, gen_rtx_AND (pred_mode,
15533 gen_rtx_NOT (pred_mode, tmp),
15534 ptrue));
15538 /* Expand an SVE floating-point comparison:
15540 TARGET = CODE (OP0, OP1)
15542 If CAN_INVERT_P is true, the caller can also handle inverted results;
15543 return true if the result is in fact inverted. */
15545 bool
15546 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15547 rtx op0, rtx op1, bool can_invert_p)
15549 machine_mode pred_mode = GET_MODE (target);
15550 machine_mode data_mode = GET_MODE (op0);
15552 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15553 switch (code)
15555 case UNORDERED:
15556 /* UNORDERED has no immediate form. */
15557 op1 = force_reg (data_mode, op1);
15558 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15559 return false;
15561 case LT:
15562 case LE:
15563 case GT:
15564 case GE:
15565 case EQ:
15566 case NE:
15567 /* There is native support for the comparison. */
15568 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15569 return false;
15571 case ORDERED:
15572 /* There is native support for the inverse comparison. */
15573 op1 = force_reg (data_mode, op1);
15574 aarch64_emit_inverted_unspec_cond (target, UNORDERED,
15575 pred_mode, ptrue, ptrue, op0, op1,
15576 can_invert_p);
15577 return can_invert_p;
15579 case LTGT:
15580 /* This is a trapping operation (LT or GT). */
15581 aarch64_emit_unspec_cond_or (target, LT, GT, pred_mode, ptrue, op0, op1);
15582 return false;
15584 case UNEQ:
15585 if (!flag_trapping_math)
15587 /* This would trap for signaling NaNs. */
15588 op1 = force_reg (data_mode, op1);
15589 aarch64_emit_unspec_cond_or (target, UNORDERED, EQ,
15590 pred_mode, ptrue, op0, op1);
15591 return false;
15593 /* fall through */
15595 case UNLT:
15596 case UNLE:
15597 case UNGT:
15598 case UNGE:
15600 rtx ordered = ptrue;
15601 if (flag_trapping_math)
15603 /* Only compare the elements that are known to be ordered. */
15604 ordered = gen_reg_rtx (pred_mode);
15605 op1 = force_reg (data_mode, op1);
15606 aarch64_emit_inverted_unspec_cond (ordered, UNORDERED, pred_mode,
15607 ptrue, ptrue, op0, op1, false);
15609 if (code == UNEQ)
15610 code = NE;
15611 else
15612 code = reverse_condition_maybe_unordered (code);
15613 aarch64_emit_inverted_unspec_cond (target, code, pred_mode, ptrue,
15614 ordered, op0, op1, can_invert_p);
15615 return can_invert_p;
15618 default:
15619 gcc_unreachable ();
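/* As a sketch of the trickiest path above: for UNGE with
   flag_trapping_math, and assuming CAN_INVERT_P, the expansion is
   roughly (illustrative assembly):

	fcmuo	p1.s, p0/z, z0.s, z1.s	// unordered elements
	not	p2.b, p0/z, p1.b	// ordered elements
	fcmgt	p3.s, p2/z, z1.s, z0.s	// ordered && (OP0 < OP1)

   and the caller then uses the inverse of p3, which is exactly
   UNGE (OP0, OP1).  */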
15623 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15624 of the data being selected and CMP_MODE is the mode of the values being
15625 compared. */
15627 void
15628 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15629 rtx *ops)
15631 machine_mode pred_mode
15632 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15633 GET_MODE_SIZE (cmp_mode)).require ();
15634 rtx pred = gen_reg_rtx (pred_mode);
15635 if (FLOAT_MODE_P (cmp_mode))
15637 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15638 ops[4], ops[5], true))
15639 std::swap (ops[1], ops[2]);
15641 else
15642 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15644 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15645 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
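/* For example (a sketch only): for a VNx4SImode vcond such as
   res = (a < b) ? c : d, the comparison above produces a predicate
   register and the UNSPEC_SEL becomes something like:

	cmpgt	p0.s, p1/z, z1.s, z0.s	// a < b
	sel	z4.s, p0, z2.s, z3.s	// res = p0 ? c : d

   If the float comparison had to be inverted, ops[1] and ops[2] have
   already been swapped above to compensate.  */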
15648 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15649 true. However, due to issues with register allocation it is preferable
15650 to avoid tying integer scalar and FP scalar modes. Executing integer
15651 operations in general registers is better than treating them as scalar
15652 vector operations. This reduces latency and avoids redundant int<->FP
15653 moves. So tie modes if they are either the same class, or vector modes
15654 with other vector modes, vector structs or any scalar mode. */
15656 static bool
15657 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15659 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15660 return true;
15662 /* We specifically want to allow elements of "structure" modes to
15663 be tieable to the structure. This more general condition allows
15664 other rarer situations too. The reason we don't extend this to
15665 predicate modes is that there are no predicate structure modes
15666 nor any specific instructions for extracting part of a predicate
15667 register. */
15668 if (aarch64_vector_data_mode_p (mode1)
15669 && aarch64_vector_data_mode_p (mode2))
15670 return true;
15672 /* Also allow any scalar modes with vectors. */
15673 if (aarch64_vector_mode_supported_p (mode1)
15674 || aarch64_vector_mode_supported_p (mode2))
15675 return true;
15677 return false;
15680 /* Return a new RTX holding the result of moving POINTER forward by
15681 AMOUNT bytes. */
15683 static rtx
15684 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15686 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15688 return adjust_automodify_address (pointer, GET_MODE (pointer),
15689 next, amount);
15692 /* Return a new RTX holding the result of moving POINTER forward by the
15693 size of the mode it points to. */
15695 static rtx
15696 aarch64_progress_pointer (rtx pointer)
15698 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15701 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15702 MODE bytes. */
15704 static void
15705 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15706 machine_mode mode)
15708 rtx reg = gen_reg_rtx (mode);
15710 /* "Cast" the pointers to the correct mode. */
15711 *src = adjust_address (*src, mode, 0);
15712 *dst = adjust_address (*dst, mode, 0);
15713 /* Emit the memcpy. */
15714 emit_move_insn (reg, *src);
15715 emit_move_insn (*dst, reg);
15716 /* Move the pointers forward. */
15717 *src = aarch64_progress_pointer (*src);
15718 *dst = aarch64_progress_pointer (*dst);
15721 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15722 we succeed, otherwise return false. */
15724 bool
15725 aarch64_expand_movmem (rtx *operands)
15727 unsigned int n;
15728 rtx dst = operands[0];
15729 rtx src = operands[1];
15730 rtx base;
15731 bool speed_p = !optimize_function_for_size_p (cfun);
15733 /* When optimizing for size, give a better estimate of the length of a
15734 memcpy call, but use the default otherwise. */
15735 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
15737 /* We can't do anything smart if the amount to copy is not constant. */
15738 if (!CONST_INT_P (operands[2]))
15739 return false;
15741 n = UINTVAL (operands[2]);
15743 /* Try to keep the number of instructions low. For cases below 16 bytes we
15744 need to make at most two moves. For cases above 16 bytes it will be one
15745 move for each 16-byte chunk, then at most two additional moves. */
15746 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
15747 return false;
15749 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
15750 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
15752 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
15753 src = adjust_automodify_address (src, VOIDmode, base, 0);
15755 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
15756 1-byte chunk. */
15757 if (n < 4)
15759 if (n >= 2)
15761 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
15762 n -= 2;
15765 if (n == 1)
15766 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
15768 return true;
15771 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
15772 4-byte chunk, partially overlapping with the previously copied chunk. */
15773 if (n < 8)
15775 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15776 n -= 4;
15777 if (n > 0)
15779 int move = n - 4;
15781 src = aarch64_move_pointer (src, move);
15782 dst = aarch64_move_pointer (dst, move);
15783 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15785 return true;
15788 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
15789 them, then (if applicable) an 8-byte chunk. */
15790 while (n >= 8)
15792 if (n / 16)
15794 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
15795 n -= 16;
15797 else
15799 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
15800 n -= 8;
15804 /* Finish the final bytes of the copy. We can always do this in one
15805 instruction. We either copy the exact amount we need, or partially
15806 overlap with the previous chunk we copied and copy 8 bytes. */
15807 if (n == 0)
15808 return true;
15809 else if (n == 1)
15810 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
15811 else if (n == 2)
15812 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
15813 else if (n == 4)
15814 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15815 else
15817 if (n == 3)
15819 src = aarch64_move_pointer (src, -1);
15820 dst = aarch64_move_pointer (dst, -1);
15821 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15823 else
15825 int move = n - 8;
15827 src = aarch64_move_pointer (src, move);
15828 dst = aarch64_move_pointer (dst, move);
15829 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
15833 return true;
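/* As a worked example of the expansion above (illustrative only): for a
   constant 23-byte copy, the heuristic count is 23/16 + 2 == 3, which is
   within the budget, so we emit one 16-byte (TImode) copy for bytes 0-15
   followed by an overlapping 8-byte (DImode) copy for bytes 15-22,
   i.e. roughly:

	ldr	q0, [x1]
	str	q0, [x0]
	ldur	x2, [x1, 15]
	stur	x2, [x0, 15]
  */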
15836 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
15837 SImode stores. Handle the case when the constant has identical
15838 bottom and top halves. This is beneficial when the two stores can be
15839 merged into an STP and we avoid synthesising potentially expensive
15840 immediates twice. Return true if such a split is possible. */
15842 bool
15843 aarch64_split_dimode_const_store (rtx dst, rtx src)
15845 rtx lo = gen_lowpart (SImode, src);
15846 rtx hi = gen_highpart_mode (SImode, DImode, src);
15848 bool size_p = optimize_function_for_size_p (cfun);
15850 if (!rtx_equal_p (lo, hi))
15851 return false;
15853 unsigned int orig_cost
15854 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
15855 unsigned int lo_cost
15856 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
15858 /* We want to transform:
15859 MOV x1, 49370
15860 MOVK x1, 0x140, lsl 16
15861 MOVK x1, 0xc0da, lsl 32
15862 MOVK x1, 0x140, lsl 48
15863 STR x1, [x0]
15864 into:
15865 MOV w1, 49370
15866 MOVK w1, 0x140, lsl 16
15867 STP w1, w1, [x0]
15868 So we want to perform this only when we save two instructions
15869 or more. When optimizing for size, however, accept any code size
15870 savings we can. */
15871 if (size_p && orig_cost <= lo_cost)
15872 return false;
15874 if (!size_p
15875 && (orig_cost <= lo_cost + 1))
15876 return false;
15878 rtx mem_lo = adjust_address (dst, SImode, 0);
15879 if (!aarch64_mem_pair_operand (mem_lo, SImode))
15880 return false;
15882 rtx tmp_reg = gen_reg_rtx (SImode);
15883 aarch64_expand_mov_immediate (tmp_reg, lo);
15884 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
15885 /* Don't emit an explicit store pair as this may not always be profitable.
15886 Let the sched-fusion logic decide whether to merge them. */
15887 emit_move_insn (mem_lo, tmp_reg);
15888 emit_move_insn (mem_hi, tmp_reg);
15890 return true;
15893 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
15895 static unsigned HOST_WIDE_INT
15896 aarch64_asan_shadow_offset (void)
15898 return (HOST_WIDE_INT_1 << 36);
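/* Implement TARGET_GEN_CCMP_FIRST. Generate the first comparison of a
   conditional-compare chain for TREEOP0 <CODE> TREEOP1, putting the
   preparation statements in *PREP_SEQ and the comparison itself in
   *GEN_SEQ. Return the CC-register comparison rtx on success, or
   NULL_RTX if the comparison cannot be handled. */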
15901 static rtx
15902 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
15903 int code, tree treeop0, tree treeop1)
15905 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
15906 rtx op0, op1;
15907 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
15908 insn_code icode;
15909 struct expand_operand ops[4];
15911 start_sequence ();
15912 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
15914 op_mode = GET_MODE (op0);
15915 if (op_mode == VOIDmode)
15916 op_mode = GET_MODE (op1);
15918 switch (op_mode)
15920 case E_QImode:
15921 case E_HImode:
15922 case E_SImode:
15923 cmp_mode = SImode;
15924 icode = CODE_FOR_cmpsi;
15925 break;
15927 case E_DImode:
15928 cmp_mode = DImode;
15929 icode = CODE_FOR_cmpdi;
15930 break;
15932 case E_SFmode:
15933 cmp_mode = SFmode;
15934 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
15935 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
15936 break;
15938 case E_DFmode:
15939 cmp_mode = DFmode;
15940 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
15941 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
15942 break;
15944 default:
15945 end_sequence ();
15946 return NULL_RTX;
15949 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
15950 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
15951 if (!op0 || !op1)
15953 end_sequence ();
15954 return NULL_RTX;
15956 *prep_seq = get_insns ();
15957 end_sequence ();
15959 create_fixed_operand (&ops[0], op0);
15960 create_fixed_operand (&ops[1], op1);
15962 start_sequence ();
15963 if (!maybe_expand_insn (icode, 2, ops))
15965 end_sequence ();
15966 return NULL_RTX;
15968 *gen_seq = get_insns ();
15969 end_sequence ();
15971 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
15972 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
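/* Implement TARGET_GEN_CCMP_NEXT. Generate a conditional compare that
   continues the chain started by PREV, comparing TREEOP0 <CMP_CODE>
   TREEOP1 and combining the result with PREV according to BIT_CODE
   (AND or IOR). As above, preparation statements go in *PREP_SEQ and
   the compare itself in *GEN_SEQ; return the new CC comparison rtx,
   or NULL_RTX on failure. */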
15975 static rtx
15976 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
15977 int cmp_code, tree treeop0, tree treeop1, int bit_code)
15979 rtx op0, op1, target;
15980 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
15981 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
15982 insn_code icode;
15983 struct expand_operand ops[6];
15984 int aarch64_cond;
15986 push_to_sequence (*prep_seq);
15987 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
15989 op_mode = GET_MODE (op0);
15990 if (op_mode == VOIDmode)
15991 op_mode = GET_MODE (op1);
15993 switch (op_mode)
15995 case E_QImode:
15996 case E_HImode:
15997 case E_SImode:
15998 cmp_mode = SImode;
15999 icode = CODE_FOR_ccmpsi;
16000 break;
16002 case E_DImode:
16003 cmp_mode = DImode;
16004 icode = CODE_FOR_ccmpdi;
16005 break;
16007 case E_SFmode:
16008 cmp_mode = SFmode;
16009 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16010 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16011 break;
16013 case E_DFmode:
16014 cmp_mode = DFmode;
16015 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16016 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16017 break;
16019 default:
16020 end_sequence ();
16021 return NULL_RTX;
16024 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16025 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16026 if (!op0 || !op1)
16028 end_sequence ();
16029 return NULL_RTX;
16031 *prep_seq = get_insns ();
16032 end_sequence ();
16034 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16035 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16037 if (bit_code != AND)
16039 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16040 GET_MODE (XEXP (prev, 0))),
16041 VOIDmode, XEXP (prev, 0), const0_rtx);
16042 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16045 create_fixed_operand (&ops[0], XEXP (prev, 0));
16046 create_fixed_operand (&ops[1], target);
16047 create_fixed_operand (&ops[2], op0);
16048 create_fixed_operand (&ops[3], op1);
16049 create_fixed_operand (&ops[4], prev);
16050 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16052 push_to_sequence (*gen_seq);
16053 if (!maybe_expand_insn (icode, 6, ops))
16055 end_sequence ();
16056 return NULL_RTX;
16059 *gen_seq = get_insns ();
16060 end_sequence ();
16062 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
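/* For example (an illustrative sketch, not verbatim output), a condition
   such as "a == 17 && b > 5" can be expanded through these two hooks
   into a compare followed by a conditional compare:

	cmp	w0, 17
	ccmp	w1, 5, 4, eq
	b.gt	.Ltaken

   where the CCMP only performs the second comparison if the first one
   succeeded, and otherwise sets the flags to the immediate NZCV value
   (here 4, i.e. "greater than" is false).  */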
16065 #undef TARGET_GEN_CCMP_FIRST
16066 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16068 #undef TARGET_GEN_CCMP_NEXT
16069 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16071 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16072 instruction fusion of some sort. */
16074 static bool
16075 aarch64_macro_fusion_p (void)
16077 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16081 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16082 should be kept together during scheduling. */
16084 static bool
16085 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16087 rtx set_dest;
16088 rtx prev_set = single_set (prev);
16089 rtx curr_set = single_set (curr);
16090 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
16091 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16093 if (!aarch64_macro_fusion_p ())
16094 return false;
16096 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16098 /* We are trying to match:
16099 prev (mov) == (set (reg r0) (const_int imm16))
16100 curr (movk) == (set (zero_extract (reg r0)
16101 (const_int 16)
16102 (const_int 16))
16103 (const_int imm16_1)) */
16105 set_dest = SET_DEST (curr_set);
16107 if (GET_CODE (set_dest) == ZERO_EXTRACT
16108 && CONST_INT_P (SET_SRC (curr_set))
16109 && CONST_INT_P (SET_SRC (prev_set))
16110 && CONST_INT_P (XEXP (set_dest, 2))
16111 && INTVAL (XEXP (set_dest, 2)) == 16
16112 && REG_P (XEXP (set_dest, 0))
16113 && REG_P (SET_DEST (prev_set))
16114 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16116 return true;
16120 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16123 /* We're trying to match:
16124 prev (adrp) == (set (reg r1)
16125 (high (symbol_ref ("SYM"))))
16126 curr (add) == (set (reg r0)
16127 (lo_sum (reg r1)
16128 (symbol_ref ("SYM"))))
16129 Note that r0 need not necessarily be the same as r1, especially
16130 during pre-regalloc scheduling. */
16132 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16133 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16135 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16136 && REG_P (XEXP (SET_SRC (curr_set), 0))
16137 && REGNO (XEXP (SET_SRC (curr_set), 0))
16138 == REGNO (SET_DEST (prev_set))
16139 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16140 XEXP (SET_SRC (curr_set), 1)))
16141 return true;
16145 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16148 /* We're trying to match:
16149 prev (movk) == (set (zero_extract (reg r0)
16150 (const_int 16)
16151 (const_int 32))
16152 (const_int imm16_1))
16153 curr (movk) == (set (zero_extract (reg r0)
16154 (const_int 16)
16155 (const_int 48))
16156 (const_int imm16_2)) */
16158 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16159 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16160 && REG_P (XEXP (SET_DEST (prev_set), 0))
16161 && REG_P (XEXP (SET_DEST (curr_set), 0))
16162 && REGNO (XEXP (SET_DEST (prev_set), 0))
16163 == REGNO (XEXP (SET_DEST (curr_set), 0))
16164 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16165 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16166 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16167 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16168 && CONST_INT_P (SET_SRC (prev_set))
16169 && CONST_INT_P (SET_SRC (curr_set)))
16170 return true;
16173 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16175 /* We're trying to match:
16176 prev (adrp) == (set (reg r0)
16177 (high (symbol_ref ("SYM"))))
16178 curr (ldr) == (set (reg r1)
16179 (mem (lo_sum (reg r0)
16180 (symbol_ref ("SYM")))))
16182 curr (ldr) == (set (reg r1)
16183 (zero_extend (mem
16184 (lo_sum (reg r0)
16185 (symbol_ref ("SYM")))))) */
16186 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16187 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16189 rtx curr_src = SET_SRC (curr_set);
16191 if (GET_CODE (curr_src) == ZERO_EXTEND)
16192 curr_src = XEXP (curr_src, 0);
16194 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16195 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16196 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16197 == REGNO (SET_DEST (prev_set))
16198 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16199 XEXP (SET_SRC (prev_set), 0)))
16200 return true;
16204 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16205 && aarch_crypto_can_dual_issue (prev, curr))
16206 return true;
16208 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16209 && any_condjump_p (curr))
16211 enum attr_type prev_type = get_attr_type (prev);
16213 unsigned int condreg1, condreg2;
16214 rtx cc_reg_1;
16215 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16216 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16218 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16219 && prev
16220 && modified_in_p (cc_reg_1, prev))
16222 /* FIXME: this misses some instructions that are considered simple
16223 arithmetic for ThunderX. Simple shifts are missed here. */
16224 if (prev_type == TYPE_ALUS_SREG
16225 || prev_type == TYPE_ALUS_IMM
16226 || prev_type == TYPE_LOGICS_REG
16227 || prev_type == TYPE_LOGICS_IMM)
16228 return true;
16232 if (prev_set
16233 && curr_set
16234 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16235 && any_condjump_p (curr))
16237 /* We're trying to match:
16238 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16239 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16240 (const_int 0))
16241 (label_ref ("SYM"))
16242 (pc)) */
16243 if (SET_DEST (curr_set) == (pc_rtx)
16244 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16245 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16246 && REG_P (SET_DEST (prev_set))
16247 && REGNO (SET_DEST (prev_set))
16248 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16250 /* Fuse ALU operations followed by conditional branch instruction. */
16251 switch (get_attr_type (prev))
16253 case TYPE_ALU_IMM:
16254 case TYPE_ALU_SREG:
16255 case TYPE_ADC_REG:
16256 case TYPE_ADC_IMM:
16257 case TYPE_ADCS_REG:
16258 case TYPE_ADCS_IMM:
16259 case TYPE_LOGIC_REG:
16260 case TYPE_LOGIC_IMM:
16261 case TYPE_CSEL:
16262 case TYPE_ADR:
16263 case TYPE_MOV_IMM:
16264 case TYPE_SHIFT_REG:
16265 case TYPE_SHIFT_IMM:
16266 case TYPE_BFM:
16267 case TYPE_RBIT:
16268 case TYPE_REV:
16269 case TYPE_EXTEND:
16270 return true;
16272 default:;
16277 return false;
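/* Two of the pairs recognised above, in (illustrative) assembly form:

	mov	x0, 16384		// AARCH64_FUSE_MOV_MOVK
	movk	x0, 0x1234, lsl 16

	adrp	x1, some_symbol		// AARCH64_FUSE_ADRP_ADD
	add	x1, x1, :lo12:some_symbol

   Keeping such pairs adjacent lets cores that implement the fusion
   execute them as a single macro-op.  */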
16280 /* Return true iff the instruction fusion described by OP is enabled. */
16282 bool
16283 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16285 return (aarch64_tune_params.fusible_ops & op) != 0;
16288 /* If MEM is in the form of [base+offset], extract the two parts
16289 of the address and store them in BASE and OFFSET; otherwise return
16290 false after clearing BASE and OFFSET. */
16292 bool
16293 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16295 rtx addr;
16297 gcc_assert (MEM_P (mem));
16299 addr = XEXP (mem, 0);
16301 if (REG_P (addr))
16303 *base = addr;
16304 *offset = const0_rtx;
16305 return true;
16308 if (GET_CODE (addr) == PLUS
16309 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16311 *base = XEXP (addr, 0);
16312 *offset = XEXP (addr, 1);
16313 return true;
16316 *base = NULL_RTX;
16317 *offset = NULL_RTX;
16319 return false;
16322 /* Types for scheduling fusion. */
16323 enum sched_fusion_type
16325 SCHED_FUSION_NONE = 0,
16326 SCHED_FUSION_LD_SIGN_EXTEND,
16327 SCHED_FUSION_LD_ZERO_EXTEND,
16328 SCHED_FUSION_LD,
16329 SCHED_FUSION_ST,
16330 SCHED_FUSION_NUM
16333 /* If INSN is a load or store whose address is in the form [base+offset],
16334 extract the two parts into BASE and OFFSET. Return the scheduling
16335 fusion type of this INSN. */
16337 static enum sched_fusion_type
16338 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16340 rtx x, dest, src;
16341 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16343 gcc_assert (INSN_P (insn));
16344 x = PATTERN (insn);
16345 if (GET_CODE (x) != SET)
16346 return SCHED_FUSION_NONE;
16348 src = SET_SRC (x);
16349 dest = SET_DEST (x);
16351 machine_mode dest_mode = GET_MODE (dest);
16353 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16354 return SCHED_FUSION_NONE;
16356 if (GET_CODE (src) == SIGN_EXTEND)
16358 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16359 src = XEXP (src, 0);
16360 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16361 return SCHED_FUSION_NONE;
16363 else if (GET_CODE (src) == ZERO_EXTEND)
16365 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16366 src = XEXP (src, 0);
16367 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16368 return SCHED_FUSION_NONE;
16371 if (GET_CODE (src) == MEM && REG_P (dest))
16372 extract_base_offset_in_addr (src, base, offset);
16373 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16375 fusion = SCHED_FUSION_ST;
16376 extract_base_offset_in_addr (dest, base, offset);
16378 else
16379 return SCHED_FUSION_NONE;
16381 if (*base == NULL_RTX || *offset == NULL_RTX)
16382 fusion = SCHED_FUSION_NONE;
16384 return fusion;
16387 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16389 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16390 and PRI are only calculated for these instructions. For other instructions,
16391 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16392 types of instruction fusion can be added by returning different priorities.
16394 It's important that irrelevant instructions get the largest FUSION_PRI. */
16396 static void
16397 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16398 int *fusion_pri, int *pri)
16400 int tmp, off_val;
16401 rtx base, offset;
16402 enum sched_fusion_type fusion;
16404 gcc_assert (INSN_P (insn));
16406 tmp = max_pri - 1;
16407 fusion = fusion_load_store (insn, &base, &offset);
16408 if (fusion == SCHED_FUSION_NONE)
16410 *pri = tmp;
16411 *fusion_pri = tmp;
16412 return;
16415 /* Set FUSION_PRI according to fusion type and base register. */
16416 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16418 /* Calculate PRI. */
16419 tmp /= 2;
16421 /* INSN with smaller offset goes first. */
16422 off_val = (int)(INTVAL (offset));
16423 if (off_val >= 0)
16424 tmp -= (off_val & 0xfffff);
16425 else
16426 tmp += ((- off_val) & 0xfffff);
16428 *pri = tmp;
16429 return;
16432 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16433 Adjust priority of sha1h instructions so they are scheduled before
16434 other SHA1 instructions. */
16436 static int
16437 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16439 rtx x = PATTERN (insn);
16441 if (GET_CODE (x) == SET)
16443 x = SET_SRC (x);
16445 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16446 return priority + 10;
16449 return priority;
16452 /* Given OPERANDS of consecutive load/store, check if we can merge
16453 them into ldp/stp. LOAD is true if they are load instructions.
16454 MODE is the mode of memory operands. */
16456 bool
16457 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16458 machine_mode mode)
16460 HOST_WIDE_INT offval_1, offval_2, msize;
16461 enum reg_class rclass_1, rclass_2;
16462 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16464 if (load)
16466 mem_1 = operands[1];
16467 mem_2 = operands[3];
16468 reg_1 = operands[0];
16469 reg_2 = operands[2];
16470 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16471 if (REGNO (reg_1) == REGNO (reg_2))
16472 return false;
16474 else
16476 mem_1 = operands[0];
16477 mem_2 = operands[2];
16478 reg_1 = operands[1];
16479 reg_2 = operands[3];
16482 /* The mems cannot be volatile. */
16483 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16484 return false;
16486 /* If we have SImode and slow unaligned ldp,
16487 check that the alignment is at least 8 bytes. */
16488 if (mode == SImode
16489 && (aarch64_tune_params.extra_tuning_flags
16490 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16491 && !optimize_size
16492 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16493 return false;
16495 /* Check if the addresses are in the form of [base+offset]. */
16496 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16497 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16498 return false;
16499 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16500 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16501 return false;
16503 /* Check if the bases are same. */
16504 if (!rtx_equal_p (base_1, base_2))
16505 return false;
16507 offval_1 = INTVAL (offset_1);
16508 offval_2 = INTVAL (offset_2);
16509 /* We should only be trying this for fixed-sized modes. There is no
16510 SVE LDP/STP instruction. */
16511 msize = GET_MODE_SIZE (mode).to_constant ();
16512 /* Check if the offsets are consecutive. */
16513 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16514 return false;
16516 /* Check if the addresses are clobbered by load. */
16517 if (load)
16519 if (reg_mentioned_p (reg_1, mem_1))
16520 return false;
16522 /* In increasing order, the last load can clobber the address. */
16523 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16524 return false;
16527 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16528 rclass_1 = FP_REGS;
16529 else
16530 rclass_1 = GENERAL_REGS;
16532 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16533 rclass_2 = FP_REGS;
16534 else
16535 rclass_2 = GENERAL_REGS;
16537 /* Check if the registers are of same class. */
16538 if (rclass_1 != rclass_2)
16539 return false;
16541 return true;
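/* For example (illustrative), the checks above allow

	ldr	w0, [x2]
	ldr	w1, [x2, 4]

   to be merged into

	ldp	w0, w1, [x2]

   but reject the pair if the first destination register also appears in
   the address, or if the two destinations are in different register
   classes (one general, one FP).  */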
16544 /* Given OPERANDS of consecutive load/store, check if we can merge
16545 them into ldp/stp by adjusting the offset. LOAD is true if they
16546 are load instructions. MODE is the mode of memory operands.
16548 Given the following consecutive stores:
16550 str w1, [xb, 0x100]
16551 str w1, [xb, 0x104]
16552 str w1, [xb, 0x108]
16553 str w1, [xb, 0x10c]
16555 Though the offsets are out of the range supported by stp, we can
16556 still pair them after adjusting the offset, like:
16558 add scratch, xb, 0x100
16559 stp w1, w1, [scratch]
16560 stp w1, w1, [scratch, 0x8]
16562 The peephole patterns detecting this opportunity should guarantee
16563 the scratch register is available. */
16565 bool
16566 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16567 scalar_mode mode)
16569 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16570 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16571 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16572 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16574 if (load)
16576 reg_1 = operands[0];
16577 mem_1 = operands[1];
16578 reg_2 = operands[2];
16579 mem_2 = operands[3];
16580 reg_3 = operands[4];
16581 mem_3 = operands[5];
16582 reg_4 = operands[6];
16583 mem_4 = operands[7];
16584 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16585 && REG_P (reg_3) && REG_P (reg_4));
16586 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16587 return false;
16589 else
16591 mem_1 = operands[0];
16592 reg_1 = operands[1];
16593 mem_2 = operands[2];
16594 reg_2 = operands[3];
16595 mem_3 = operands[4];
16596 reg_3 = operands[5];
16597 mem_4 = operands[6];
16598 reg_4 = operands[7];
16600 /* Skip if the memory operand is by itself valid for ldp/stp. */
16601 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16602 return false;
16604 /* The mems cannot be volatile. */
16605 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16606 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
16607 return false;
16609 /* Check if the addresses are in the form of [base+offset]. */
16610 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16611 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16612 return false;
16613 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16614 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16615 return false;
16616 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16617 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16618 return false;
16619 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16620 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16621 return false;
16623 /* Check if the bases are same. */
16624 if (!rtx_equal_p (base_1, base_2)
16625 || !rtx_equal_p (base_2, base_3)
16626 || !rtx_equal_p (base_3, base_4))
16627 return false;
16629 offval_1 = INTVAL (offset_1);
16630 offval_2 = INTVAL (offset_2);
16631 offval_3 = INTVAL (offset_3);
16632 offval_4 = INTVAL (offset_4);
16633 msize = GET_MODE_SIZE (mode);
16634 /* Check if the offsets are consecutive. */
16635 if ((offval_1 != (offval_2 + msize)
16636 || offval_1 != (offval_3 + msize * 2)
16637 || offval_1 != (offval_4 + msize * 3))
16638 && (offval_4 != (offval_3 + msize)
16639 || offval_4 != (offval_2 + msize * 2)
16640 || offval_4 != (offval_1 + msize * 3)))
16641 return false;
16643 /* Check if the addresses are clobbered by load. */
16644 if (load)
16646 if (reg_mentioned_p (reg_1, mem_1)
16647 || reg_mentioned_p (reg_2, mem_2)
16648 || reg_mentioned_p (reg_3, mem_3))
16649 return false;
16651 /* In increasing order, the last load can clobber the address. */
16652 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
16653 return false;
16656 /* If we have SImode and slow unaligned ldp,
16657 check that the alignment is at least 8 bytes. */
16658 if (mode == SImode
16659 && (aarch64_tune_params.extra_tuning_flags
16660 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16661 && !optimize_size
16662 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16663 return false;
16665 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16666 rclass_1 = FP_REGS;
16667 else
16668 rclass_1 = GENERAL_REGS;
16670 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16671 rclass_2 = FP_REGS;
16672 else
16673 rclass_2 = GENERAL_REGS;
16675 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
16676 rclass_3 = FP_REGS;
16677 else
16678 rclass_3 = GENERAL_REGS;
16680 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
16681 rclass_4 = FP_REGS;
16682 else
16683 rclass_4 = GENERAL_REGS;
16685 /* Check if the registers are of same class. */
16686 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
16687 return false;
16689 return true;
16692 /* Given OPERANDS of consecutive load/store, this function pairs them
16693 into ldp/stp after adjusting the offset. It depends on the fact
16694 that addresses of load/store instructions are in increasing order.
16695 MODE is the mode of the memory operands. CODE is the rtl operator
16696 which should be applied to all memory operands; it is SIGN_EXTEND,
16697 ZERO_EXTEND or UNKNOWN. */
16699 bool
16700 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
16701 scalar_mode mode, RTX_CODE code)
16703 rtx base, offset, t1, t2;
16704 rtx mem_1, mem_2, mem_3, mem_4;
16705 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
16707 if (load)
16709 mem_1 = operands[1];
16710 mem_2 = operands[3];
16711 mem_3 = operands[5];
16712 mem_4 = operands[7];
16714 else
16716 mem_1 = operands[0];
16717 mem_2 = operands[2];
16718 mem_3 = operands[4];
16719 mem_4 = operands[6];
16720 gcc_assert (code == UNKNOWN);
16723 extract_base_offset_in_addr (mem_1, &base, &offset);
16724 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
16726 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
16727 msize = GET_MODE_SIZE (mode);
16728 stp_off_limit = msize * 0x40;
16729 off_val = INTVAL (offset);
16730 abs_off = (off_val < 0) ? -off_val : off_val;
16731 new_off = abs_off % stp_off_limit;
16732 adj_off = abs_off - new_off;
16734 /* Further adjust to make sure all offsets are OK. */
16735 if ((new_off + msize * 2) >= stp_off_limit)
16737 adj_off += stp_off_limit;
16738 new_off -= stp_off_limit;
16741 /* Make sure the adjustment can be done with ADD/SUB instructions. */
16742 if (adj_off >= 0x1000)
16743 return false;
16745 if (off_val < 0)
16747 adj_off = -adj_off;
16748 new_off = -new_off;
16751 /* Create new memory references. */
16752 mem_1 = change_address (mem_1, VOIDmode,
16753 plus_constant (DImode, operands[8], new_off));
16755 /* Check if the adjusted address is OK for ldp/stp. */
16756 if (!aarch64_mem_pair_operand (mem_1, mode))
16757 return false;
16759 msize = GET_MODE_SIZE (mode);
16760 mem_2 = change_address (mem_2, VOIDmode,
16761 plus_constant (DImode,
16762 operands[8],
16763 new_off + msize));
16764 mem_3 = change_address (mem_3, VOIDmode,
16765 plus_constant (DImode,
16766 operands[8],
16767 new_off + msize * 2));
16768 mem_4 = change_address (mem_4, VOIDmode,
16769 plus_constant (DImode,
16770 operands[8],
16771 new_off + msize * 3));
16773 if (code == ZERO_EXTEND)
16775 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
16776 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
16777 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
16778 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
16780 else if (code == SIGN_EXTEND)
16782 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
16783 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
16784 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
16785 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
16788 if (load)
16790 operands[1] = mem_1;
16791 operands[3] = mem_2;
16792 operands[5] = mem_3;
16793 operands[7] = mem_4;
16795 else
16797 operands[0] = mem_1;
16798 operands[2] = mem_2;
16799 operands[4] = mem_3;
16800 operands[6] = mem_4;
16803 /* Emit adjusting instruction. */
16804 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
16805 /* Emit ldp/stp instructions. */
16806 t1 = gen_rtx_SET (operands[0], operands[1]);
16807 t2 = gen_rtx_SET (operands[2], operands[3]);
16808 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
16809 t1 = gen_rtx_SET (operands[4], operands[5]);
16810 t2 = gen_rtx_SET (operands[6], operands[7]);
16811 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
16812 return true;
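/* A worked example of the offset arithmetic above (matching the comment
   before aarch64_operands_adjust_ok_for_ldpstp): for SImode stores at
   base + 0x100 .. base + 0x10c, msize == 4 and stp_off_limit == 0x100,
   so new_off == 0 and adj_off == 0x100. adj_off fits in an ADD
   immediate (< 0x1000), so we emit roughly

	add	scratch, base, 0x100
	stp	w1, w1, [scratch]
	stp	w1, w1, [scratch, 8]

   with the four memory references rewritten to scratch + 0, 4, 8, 12.  */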
16815 /* Return true if a pseudo register should be created and used to hold
16816 the GOT address for PIC code. */
16818 bool
16819 aarch64_use_pseudo_pic_reg (void)
16821 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
16824 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
16826 static int
16827 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
16829 switch (XINT (x, 1))
16831 case UNSPEC_GOTSMALLPIC:
16832 case UNSPEC_GOTSMALLPIC28K:
16833 case UNSPEC_GOTTINYPIC:
16834 return 0;
16835 default:
16836 break;
16839 return default_unspec_may_trap_p (x, flags);
16843 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
16844 return the log2 of that value. Otherwise return -1. */
16847 aarch64_fpconst_pow_of_2 (rtx x)
16849 const REAL_VALUE_TYPE *r;
16851 if (!CONST_DOUBLE_P (x))
16852 return -1;
16854 r = CONST_DOUBLE_REAL_VALUE (x);
16856 if (REAL_VALUE_NEGATIVE (*r)
16857 || REAL_VALUE_ISNAN (*r)
16858 || REAL_VALUE_ISINF (*r)
16859 || !real_isinteger (r, DFmode))
16860 return -1;
16862 return exact_log2 (real_to_integer (r));
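/* For example, aarch64_fpconst_pow_of_2 returns 3 for 8.0 and 0 for 1.0,
   but -1 for 0.75, 3.0, -4.0 or NaN.  */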
16865 /* If X is a vector of equal CONST_DOUBLE values and that value is
16866 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
16869 aarch64_vec_fpconst_pow_of_2 (rtx x)
16871 int nelts;
16872 if (GET_CODE (x) != CONST_VECTOR
16873 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
16874 return -1;
16876 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
16877 return -1;
16879 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
16880 if (firstval <= 0)
16881 return -1;
16883 for (int i = 1; i < nelts; i++)
16884 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
16885 return -1;
16887 return firstval;
16890 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
16891 to float.
16893 __fp16 always promotes through this hook.
16894 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
16895 through the generic excess precision logic rather than here. */
16897 static tree
16898 aarch64_promoted_type (const_tree t)
16900 if (SCALAR_FLOAT_TYPE_P (t)
16901 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
16902 return float_type_node;
16904 return NULL_TREE;
16907 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
16909 static bool
16910 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
16911 optimization_type opt_type)
16913 switch (op)
16915 case rsqrt_optab:
16916 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
16918 default:
16919 return true;
16923 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
16925 static unsigned int
16926 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
16927 int *offset)
16929 /* Polynomial invariant 1 == (VG / 2) - 1. */
16930 gcc_assert (i == 1);
16931 *factor = 2;
16932 *offset = 1;
16933 return AARCH64_DWARF_VG;
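/* For example (a sketch): with a 256-bit SVE vector length, VG (the
   vector length in 64-bit granules) is 4, so the runtime value of
   indeterminate 1 is 4/2 - 1 == 1, and a poly_int such as 16 + 16x
   evaluates to 32, the byte size of one SVE data vector.  */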
16936 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
16937 if MODE is HFmode, and punt to the generic implementation otherwise. */
16939 static bool
16940 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
16942 return (mode == HFmode
16943 ? true
16944 : default_libgcc_floating_mode_supported_p (mode));
16947 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
16948 if MODE is HFmode, and punt to the generic implementation otherwise. */
16950 static bool
16951 aarch64_scalar_mode_supported_p (scalar_mode mode)
16953 return (mode == HFmode
16954 ? true
16955 : default_scalar_mode_supported_p (mode));
16958 /* Set the value of FLT_EVAL_METHOD.
16959 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
16961 0: evaluate all operations and constants, whose semantic type has at
16962 most the range and precision of type float, to the range and
16963 precision of float; evaluate all other operations and constants to
16964 the range and precision of the semantic type;
16966 N, where _FloatN is a supported interchange floating type
16967 evaluate all operations and constants, whose semantic type has at
16968 most the range and precision of _FloatN type, to the range and
16969 precision of the _FloatN type; evaluate all other operations and
16970 constants to the range and precision of the semantic type;
16972 If we have the ARMv8.2-A extensions then we support _Float16 in native
16973 precision, so we should set this to 16. Otherwise, we support the type,
16974 but want to evaluate expressions in float precision, so set this to
16975 0. */
16977 static enum flt_eval_method
16978 aarch64_excess_precision (enum excess_precision_type type)
16980 switch (type)
16982 case EXCESS_PRECISION_TYPE_FAST:
16983 case EXCESS_PRECISION_TYPE_STANDARD:
16984 /* We can calculate either in 16-bit range and precision or
16985 32-bit range and precision. Make that decision based on whether
16986 we have native support for the ARMv8.2-A 16-bit floating-point
16987 instructions or not. */
16988 return (TARGET_FP_F16INST
16989 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
16990 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
16991 case EXCESS_PRECISION_TYPE_IMPLICIT:
16992 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
16993 default:
16994 gcc_unreachable ();
16996 return FLT_EVAL_METHOD_UNPREDICTABLE;
16999 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17000 scheduled for speculative execution. Reject the long-running division
17001 and square-root instructions. */
17003 static bool
17004 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17006 switch (get_attr_type (insn))
17008 case TYPE_SDIV:
17009 case TYPE_UDIV:
17010 case TYPE_FDIVS:
17011 case TYPE_FDIVD:
17012 case TYPE_FSQRTS:
17013 case TYPE_FSQRTD:
17014 case TYPE_NEON_FP_SQRT_S:
17015 case TYPE_NEON_FP_SQRT_D:
17016 case TYPE_NEON_FP_SQRT_S_Q:
17017 case TYPE_NEON_FP_SQRT_D_Q:
17018 case TYPE_NEON_FP_DIV_S:
17019 case TYPE_NEON_FP_DIV_D:
17020 case TYPE_NEON_FP_DIV_S_Q:
17021 case TYPE_NEON_FP_DIV_D_Q:
17022 return false;
17023 default:
17024 return true;
17028 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17030 static int
17031 aarch64_compute_pressure_classes (reg_class *classes)
17033 int i = 0;
17034 classes[i++] = GENERAL_REGS;
17035 classes[i++] = FP_REGS;
17036 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17037 registers need to go in PR_LO_REGS at some point during their
17038 lifetime. Splitting it into two halves has the effect of making
17039 all predicates count against PR_LO_REGS, so that we try whenever
17040 possible to restrict the number of live predicates to 8. This
17041 greatly reduces the amount of spilling in certain loops. */
17042 classes[i++] = PR_LO_REGS;
17043 classes[i++] = PR_HI_REGS;
17044 return i;
17047 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17049 static bool
17050 aarch64_can_change_mode_class (machine_mode from,
17051 machine_mode to, reg_class_t)
17053 /* See the comment at the head of aarch64-sve.md for details. */
17054 if (BYTES_BIG_ENDIAN
17055 && (aarch64_sve_data_mode_p (from) != aarch64_sve_data_mode_p (to)))
17056 return false;
17057 return true;
17060 /* Target-specific selftests. */
17062 #if CHECKING_P
17064 namespace selftest {
17066 /* Selftest for the RTL loader.
17067 Verify that the RTL loader copes with a dump from
17068 print_rtx_function. This is essentially just a test that class
17069 function_reader can handle a real dump, but it also verifies
17070 that lookup_reg_by_dump_name correctly handles hard regs.
17071 The presence of hard reg names in the dump means that the test is
17072 target-specific, hence it is in this file. */
17074 static void
17075 aarch64_test_loading_full_dump ()
17077 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17079 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17081 rtx_insn *insn_1 = get_insn_by_uid (1);
17082 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17084 rtx_insn *insn_15 = get_insn_by_uid (15);
17085 ASSERT_EQ (INSN, GET_CODE (insn_15));
17086 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17088 /* Verify crtl->return_rtx. */
17089 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17090 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17091 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17094 /* Run all target-specific selftests. */
17096 static void
17097 aarch64_run_selftests (void)
17099 aarch64_test_loading_full_dump ();
17102 } // namespace selftest
17104 #endif /* #if CHECKING_P */
17106 #undef TARGET_ADDRESS_COST
17107 #define TARGET_ADDRESS_COST aarch64_address_cost
17109 /* This hook determines whether unnamed bitfields affect the alignment
17110 of the containing structure. The hook returns true if the structure
17111 should inherit the alignment requirements of an unnamed bitfield's
17112 type. */
17113 #undef TARGET_ALIGN_ANON_BITFIELD
17114 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17116 #undef TARGET_ASM_ALIGNED_DI_OP
17117 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17119 #undef TARGET_ASM_ALIGNED_HI_OP
17120 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17122 #undef TARGET_ASM_ALIGNED_SI_OP
17123 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17125 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17126 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17127 hook_bool_const_tree_hwi_hwi_const_tree_true
17129 #undef TARGET_ASM_FILE_START
17130 #define TARGET_ASM_FILE_START aarch64_start_file
17132 #undef TARGET_ASM_OUTPUT_MI_THUNK
17133 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17135 #undef TARGET_ASM_SELECT_RTX_SECTION
17136 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17138 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17139 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17141 #undef TARGET_BUILD_BUILTIN_VA_LIST
17142 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17144 #undef TARGET_CALLEE_COPIES
17145 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17147 #undef TARGET_CAN_ELIMINATE
17148 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17150 #undef TARGET_CAN_INLINE_P
17151 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17153 #undef TARGET_CANNOT_FORCE_CONST_MEM
17154 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17156 #undef TARGET_CASE_VALUES_THRESHOLD
17157 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17159 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17160 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17162 /* Only the least significant bit is used for initialization guard
17163 variables. */
17164 #undef TARGET_CXX_GUARD_MASK_BIT
17165 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17167 #undef TARGET_C_MODE_FOR_SUFFIX
17168 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17170 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17171 #undef TARGET_DEFAULT_TARGET_FLAGS
17172 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17173 #endif
17175 #undef TARGET_CLASS_MAX_NREGS
17176 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17178 #undef TARGET_BUILTIN_DECL
17179 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17181 #undef TARGET_BUILTIN_RECIPROCAL
17182 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17184 #undef TARGET_C_EXCESS_PRECISION
17185 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17187 #undef TARGET_EXPAND_BUILTIN
17188 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17190 #undef TARGET_EXPAND_BUILTIN_VA_START
17191 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17193 #undef TARGET_FOLD_BUILTIN
17194 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17196 #undef TARGET_FUNCTION_ARG
17197 #define TARGET_FUNCTION_ARG aarch64_function_arg
17199 #undef TARGET_FUNCTION_ARG_ADVANCE
17200 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17202 #undef TARGET_FUNCTION_ARG_BOUNDARY
17203 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17205 #undef TARGET_FUNCTION_ARG_PADDING
17206 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17208 #undef TARGET_GET_RAW_RESULT_MODE
17209 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17210 #undef TARGET_GET_RAW_ARG_MODE
17211 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17213 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17214 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17216 #undef TARGET_FUNCTION_VALUE
17217 #define TARGET_FUNCTION_VALUE aarch64_function_value
17219 #undef TARGET_FUNCTION_VALUE_REGNO_P
17220 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17222 #undef TARGET_GIMPLE_FOLD_BUILTIN
17223 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17225 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17226 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17228 #undef TARGET_INIT_BUILTINS
17229 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17231 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17232 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17233 aarch64_ira_change_pseudo_allocno_class
17235 #undef TARGET_LEGITIMATE_ADDRESS_P
17236 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17238 #undef TARGET_LEGITIMATE_CONSTANT_P
17239 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17241 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17242 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17243 aarch64_legitimize_address_displacement
17245 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17246 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17248 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17249 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17250 aarch64_libgcc_floating_mode_supported_p
17252 #undef TARGET_MANGLE_TYPE
17253 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17255 #undef TARGET_MEMORY_MOVE_COST
17256 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17258 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17259 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17261 #undef TARGET_MUST_PASS_IN_STACK
17262 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17264 /* This target hook should return true if accesses to volatile bitfields
17265 should use the narrowest mode possible. It should return false if these
17266 accesses should use the bitfield container type. */
17267 #undef TARGET_NARROW_VOLATILE_BITFIELD
17268 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
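
/* For reference: -256 is the most negative offset accepted by the signed
   9-bit unscaled addressing forms (LDUR/STUR), and 4095 is the largest
   unsigned 12-bit scaled offset for byte accesses.  */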

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
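/* (4 is 1 << 2, i.e. bit 2.)  */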

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
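
/* The global target hook table.  TARGET_INITIALIZER (provided via
   target-def.h) expands to an initializer built from the TARGET_* macros
   defined above; hooks that are not overridden here keep their default
   implementations.  */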
struct gcc_target targetm = TARGET_INITIALIZER;
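
/* Include the garbage-collector root tables that gengtype generates for
   this file.  */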
#include "gt-aarch64.h"