AArch64: Reassociate CONST in address expressions
gcc/config/aarch64/aarch64.cc
blob e6bd3fd0bb42c70603d5335402b89c9deeaf48d8
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2024 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #define INCLUDE_VECTOR
26 #include "config.h"
27 #include "system.h"
28 #include "coretypes.h"
29 #include "backend.h"
30 #include "target.h"
31 #include "rtl.h"
32 #include "tree.h"
33 #include "memmodel.h"
34 #include "gimple.h"
35 #include "cfghooks.h"
36 #include "cfgloop.h"
37 #include "df.h"
38 #include "tm_p.h"
39 #include "stringpool.h"
40 #include "attribs.h"
41 #include "optabs.h"
42 #include "regs.h"
43 #include "emit-rtl.h"
44 #include "recog.h"
45 #include "cgraph.h"
46 #include "diagnostic.h"
47 #include "insn-attr.h"
48 #include "alias.h"
49 #include "fold-const.h"
50 #include "stor-layout.h"
51 #include "calls.h"
52 #include "varasm.h"
53 #include "output.h"
54 #include "flags.h"
55 #include "explow.h"
56 #include "expr.h"
57 #include "reload.h"
58 #include "langhooks.h"
59 #include "opts.h"
60 #include "gimplify.h"
61 #include "dwarf2.h"
62 #include "gimple-iterator.h"
63 #include "tree-vectorizer.h"
64 #include "aarch64-cost-tables.h"
65 #include "dumpfile.h"
66 #include "builtins.h"
67 #include "rtl-iter.h"
68 #include "tm-constrs.h"
69 #include "sched-int.h"
70 #include "target-globals.h"
71 #include "common/common-target.h"
72 #include "cfgrtl.h"
73 #include "selftest.h"
74 #include "selftest-rtl.h"
75 #include "rtx-vector-builder.h"
76 #include "intl.h"
77 #include "expmed.h"
78 #include "function-abi.h"
79 #include "gimple-pretty-print.h"
80 #include "tree-ssa-loop-niter.h"
81 #include "fractional-cost.h"
82 #include "rtlanal.h"
83 #include "tree-dfa.h"
84 #include "asan.h"
85 #include "aarch64-feature-deps.h"
86 #include "config/arm/aarch-common.h"
87 #include "config/arm/aarch-common-protos.h"
88 #include "common/config/aarch64/cpuinfo.h"
89 #include "ssa.h"
90 #include "except.h"
91 #include "tree-pass.h"
92 #include "cfgbuild.h"
93 #include "symbol-summary.h"
94 #include "ipa-prop.h"
95 #include "ipa-fnsummary.h"
96 #include "hash-map.h"
98 /* This file should be included last. */
99 #include "target-def.h"
101 /* Defined for convenience. */
102 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
104 /* Flags that describe how a function shares certain architectural state
105 with its callers.
107 - AARCH64_STATE_SHARED indicates that the function does share the state
108 with callers.
110 - AARCH64_STATE_IN indicates that the function reads (or might read) the
111 incoming state. The converse is that the function ignores the incoming
112 state.
114 - AARCH64_STATE_OUT indicates that the function returns new state.
115 The converse is that the state on return is the same as it was on entry.
117 A function that partially modifies the state treats it as both IN
118 and OUT (because the value on return depends to some extent on the
119 value on input). */
120 constexpr auto AARCH64_STATE_SHARED = 1U << 0;
121 constexpr auto AARCH64_STATE_IN = 1U << 1;
122 constexpr auto AARCH64_STATE_OUT = 1U << 2;
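/* As an illustrative sketch (see aarch64_attribute_shared_state_flags
   below), the arm:: shared-state attributes map onto combinations of these
   flags as follows:

     arm::in("za")        -> AARCH64_STATE_SHARED | AARCH64_STATE_IN
     arm::out("za")       -> AARCH64_STATE_SHARED | AARCH64_STATE_OUT
     arm::inout("za")     -> AARCH64_STATE_SHARED | AARCH64_STATE_IN
                             | AARCH64_STATE_OUT
     arm::preserves("za") -> AARCH64_STATE_SHARED  */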
124 /* Information about a legitimate vector immediate operand. */
125 struct simd_immediate_info
127 enum insn_type { MOV, MVN, INDEX, PTRUE };
128 enum modifier_type { LSL, MSL };
130 simd_immediate_info () {}
131 simd_immediate_info (scalar_float_mode, rtx);
132 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
133 insn_type = MOV, modifier_type = LSL,
134 unsigned int = 0);
135 simd_immediate_info (scalar_mode, rtx, rtx);
136 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
138 /* The mode of the elements. */
139 scalar_mode elt_mode;
141 /* The instruction to use to move the immediate into a vector. */
142 insn_type insn;
144 union
146 /* For MOV and MVN. */
147 struct
149 /* The value of each element. */
150 rtx value;
152 /* The kind of shift modifier to use, and the number of bits to shift.
153 This is (LSL, 0) if no shift is needed. */
154 modifier_type modifier;
155 unsigned int shift;
156 } mov;
158 /* For INDEX. */
159 struct
161 /* The value of the first element and the step to be added for each
162 subsequent element. */
163 rtx base, step;
164 } index;
166 /* For PTRUE. */
167 aarch64_svpattern pattern;
168 } u;
171 /* Construct a floating-point immediate in which each element has mode
172 ELT_MODE_IN and value VALUE_IN. */
173 inline simd_immediate_info
174 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
175 : elt_mode (elt_mode_in), insn (MOV)
177 u.mov.value = value_in;
178 u.mov.modifier = LSL;
179 u.mov.shift = 0;
182 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
183 and value VALUE_IN. The other parameters are as for the structure
184 fields. */
185 inline simd_immediate_info
186 ::simd_immediate_info (scalar_int_mode elt_mode_in,
187 unsigned HOST_WIDE_INT value_in,
188 insn_type insn_in, modifier_type modifier_in,
189 unsigned int shift_in)
190 : elt_mode (elt_mode_in), insn (insn_in)
192 u.mov.value = gen_int_mode (value_in, elt_mode_in);
193 u.mov.modifier = modifier_in;
194 u.mov.shift = shift_in;
197 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
198 and where element I is equal to BASE_IN + I * STEP_IN. */
199 inline simd_immediate_info
200 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
201 : elt_mode (elt_mode_in), insn (INDEX)
203 u.index.base = base_in;
204 u.index.step = step_in;
207 /* Construct a predicate that controls elements of mode ELT_MODE_IN
208 and has PTRUE pattern PATTERN_IN. */
209 inline simd_immediate_info
210 ::simd_immediate_info (scalar_int_mode elt_mode_in,
211 aarch64_svpattern pattern_in)
212 : elt_mode (elt_mode_in), insn (PTRUE)
214 u.pattern = pattern_in;
217 namespace {
219 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
220 class pure_scalable_type_info
222 public:
223 /* Represents the result of analyzing a type. All values are nonzero,
224 in the possibly forlorn hope that accidental conversions to bool
225 trigger a warning. */
226 enum analysis_result
228 /* The type does not have an ABI identity; i.e. it doesn't contain
229 at least one object whose type is a Fundamental Data Type. */
230 NO_ABI_IDENTITY = 1,
232 /* The type is definitely a Pure Scalable Type. */
233 IS_PST,
235 /* The type is definitely not a Pure Scalable Type. */
236 ISNT_PST,
238 /* It doesn't matter for PCS purposes whether the type is a Pure
239 Scalable Type or not, since the type will be handled the same
240 way regardless.
242 Specifically, this means that if the type is a Pure Scalable Type,
243 there aren't enough argument registers to hold it, and so it will
244 need to be passed or returned in memory. If the type isn't a
245 Pure Scalable Type, it's too big to be passed or returned in core
246 or SIMD&FP registers, and so again will need to go in memory. */
247 DOESNT_MATTER
250 /* Aggregates of 17 bytes or more are normally passed and returned
251 in memory, so aggregates of that size can safely be analyzed as
252 DOESNT_MATTER. We need to be able to collect enough pieces to
253 represent a PST that is smaller than that. Since predicates are
254 2 bytes in size for -msve-vector-bits=128, that means we need to be
255 able to store at least 8 pieces.
257 We also need to be able to store enough pieces to represent
258 a single vector in each vector argument register and a single
259 predicate in each predicate argument register. This means that
260 we need at least 12 pieces. */
261 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
262 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
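/* For example, with the 8 SIMD&FP argument registers and 4 predicate
   argument registers assumed by the AAPCS64, MAX_PIECES works out to
   8 + 4 = 12, which satisfies both of the requirements above.  */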
264 /* Describes one piece of a PST. Each piece is one of:
266 - a single Scalable Vector Type (SVT)
267 - a single Scalable Predicate Type (SPT)
268 - a PST containing 2, 3 or 4 SVTs, with no padding
270 It either represents a single built-in type or a PST formed from
271 multiple homogeneous built-in types. */
272 struct piece
274 rtx get_rtx (unsigned int, unsigned int) const;
276 /* The number of vector and predicate registers that the piece
277 occupies. One of the two is always zero. */
278 unsigned int num_zr;
279 unsigned int num_pr;
281 /* The mode of the registers described above. */
282 machine_mode mode;
284 /* If this piece is formed from multiple homogeneous built-in types,
285 this is the mode of the built-in types, otherwise it is MODE. */
286 machine_mode orig_mode;
288 /* The offset in bytes of the piece from the start of the type. */
289 poly_uint64 offset;
292 /* Divides types analyzed as IS_PST into individual pieces. The pieces
293 are in memory order. */
294 auto_vec<piece, MAX_PIECES> pieces;
296 unsigned int num_zr () const;
297 unsigned int num_pr () const;
299 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
301 analysis_result analyze (const_tree);
302 bool analyze_registers (const_tree);
304 private:
305 analysis_result analyze_array (const_tree);
306 analysis_result analyze_record (const_tree);
307 void add_piece (const piece &);
311 /* The current code model. */
312 enum aarch64_code_model aarch64_cmodel;
314 enum aarch64_tp_reg aarch64_tpidr_register;
316 /* The number of 64-bit elements in an SVE vector. */
317 poly_uint16 aarch64_sve_vg;
319 #ifdef HAVE_AS_TLS
320 #undef TARGET_HAVE_TLS
321 #define TARGET_HAVE_TLS 1
322 #endif
324 static bool aarch64_composite_type_p (const_tree, machine_mode);
325 static bool aarch64_return_in_memory_1 (const_tree);
326 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
327 const_tree,
328 machine_mode *, int *,
329 bool *, bool);
330 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
331 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
332 static void aarch64_override_options_after_change (void);
333 static bool aarch64_vector_mode_supported_p (machine_mode);
334 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
335 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
336 const_tree type,
337 int misalignment,
338 bool is_packed);
339 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
340 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
341 aarch64_addr_query_type);
343 /* The processor for which instructions should be scheduled. */
344 enum aarch64_processor aarch64_tune = cortexa53;
346 /* Mask to specify which instruction scheduling options should be used. */
347 uint64_t aarch64_tune_flags = 0;
349 /* Global flag for PC relative loads. */
350 bool aarch64_pcrelative_literal_loads;
352 /* Global flag for whether frame pointer is enabled. */
353 bool aarch64_use_frame_pointer;
355 /* Support for command line parsing of boolean flags in the tuning
356 structures. */
357 struct aarch64_flag_desc
359 const char* name;
360 unsigned int flag;
363 #define AARCH64_FUSION_PAIR(name, internal_name) \
364 { name, AARCH64_FUSE_##internal_name },
365 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
367 { "none", AARCH64_FUSE_NOTHING },
368 #include "aarch64-fusion-pairs.def"
369 { "all", AARCH64_FUSE_ALL },
370 { NULL, AARCH64_FUSE_NOTHING }
373 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
374 { name, AARCH64_EXTRA_TUNE_##internal_name },
375 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
377 { "none", AARCH64_EXTRA_TUNE_NONE },
378 #include "aarch64-tuning-flags.def"
379 { "all", AARCH64_EXTRA_TUNE_ALL },
380 { NULL, AARCH64_EXTRA_TUNE_NONE }
383 /* Tuning parameters. */
384 #include "tuning_models/generic.h"
385 #include "tuning_models/generic_armv8_a.h"
386 #include "tuning_models/generic_armv9_a.h"
387 #include "tuning_models/cortexa35.h"
388 #include "tuning_models/cortexa53.h"
389 #include "tuning_models/cortexa57.h"
390 #include "tuning_models/cortexa72.h"
391 #include "tuning_models/cortexa73.h"
392 #include "tuning_models/exynosm1.h"
393 #include "tuning_models/thunderxt88.h"
394 #include "tuning_models/thunderx.h"
395 #include "tuning_models/tsv110.h"
396 #include "tuning_models/xgene1.h"
397 #include "tuning_models/emag.h"
398 #include "tuning_models/qdf24xx.h"
399 #include "tuning_models/saphira.h"
400 #include "tuning_models/thunderx2t99.h"
401 #include "tuning_models/thunderx3t110.h"
402 #include "tuning_models/neoversen1.h"
403 #include "tuning_models/ampere1.h"
404 #include "tuning_models/ampere1a.h"
405 #include "tuning_models/ampere1b.h"
406 #include "tuning_models/neoversev1.h"
407 #include "tuning_models/neoverse512tvb.h"
408 #include "tuning_models/neoversen2.h"
409 #include "tuning_models/neoversev2.h"
410 #include "tuning_models/a64fx.h"
412 /* Support for fine-grained override of the tuning structures. */
413 struct aarch64_tuning_override_function
415 const char* name;
416 void (*parse_override)(const char*, struct tune_params*);
419 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
420 static void aarch64_parse_tune_string (const char*, struct tune_params*);
421 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
423 static const struct aarch64_tuning_override_function
424 aarch64_tuning_override_functions[] =
426 { "fuse", aarch64_parse_fuse_string },
427 { "tune", aarch64_parse_tune_string },
428 { "sve_width", aarch64_parse_sve_width_string },
429 { NULL, NULL }
432 /* A processor implementing AArch64. */
433 struct processor
435 const char *name;
436 aarch64_processor ident;
437 aarch64_processor sched_core;
438 aarch64_arch arch;
439 aarch64_feature_flags flags;
440 const tune_params *tune;
443 /* Architectures implementing AArch64. */
444 static CONSTEXPR const processor all_architectures[] =
446 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
447 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
448 feature_deps::ARCH_IDENT ().enable, NULL},
449 #include "aarch64-arches.def"
450 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
453 /* Processor cores implementing AArch64. */
454 static const struct processor all_cores[] =
456 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
457 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
458 feature_deps::cpu_##IDENT, &COSTS##_tunings},
459 #include "aarch64-cores.def"
460 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
462 /* Internal representation of system registers. */
463 typedef struct {
464 const char *name;
465 /* Stringified sysreg encoding values, represented as
466 s<sn>_<op1>_c<cn>_c<cm>_<op2>. */
467 const char *encoding;
468 /* Flags affecting sysreg usage, such as read/write-only. */
469 unsigned properties;
470 /* Architectural features implied by sysreg. */
471 aarch64_feature_flags arch_reqs;
472 } sysreg_t;
474 /* An aarch64_feature_set initializer for a single feature,
475 AARCH64_FEATURE_<FEAT>. */
476 #define AARCH64_FEATURE(FEAT) AARCH64_FL_##FEAT
478 /* Used by AARCH64_FEATURES. */
479 #define AARCH64_OR_FEATURES_1(X, F1) \
480 AARCH64_FEATURE (F1)
481 #define AARCH64_OR_FEATURES_2(X, F1, F2) \
482 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_1 (X, F2))
483 #define AARCH64_OR_FEATURES_3(X, F1, ...) \
484 (AARCH64_FEATURE (F1) | AARCH64_OR_FEATURES_2 (X, __VA_ARGS__))
486 /* An aarch64_feature_set initializer for the N features listed in "...". */
487 #define AARCH64_FEATURES(N, ...) \
488 AARCH64_OR_FEATURES_##N (0, __VA_ARGS__)
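/* As an illustrative expansion (assuming feature flags named AARCH64_FL_SVE
   and AARCH64_FL_SVE2):

     AARCH64_FEATURES (2, SVE, SVE2)
       -> AARCH64_OR_FEATURES_2 (0, SVE, SVE2)
       -> (AARCH64_FEATURE (SVE) | AARCH64_OR_FEATURES_1 (0, SVE2))
       -> (AARCH64_FL_SVE | AARCH64_FL_SVE2)  */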
490 #define AARCH64_NO_FEATURES 0
492 /* Flags associated with the properties of system registers. These mainly
493 serve to mark particular registers as read-only or write-only. */
494 #define F_DEPRECATED (1 << 1)
495 #define F_REG_READ (1 << 2)
496 #define F_REG_WRITE (1 << 3)
497 #define F_ARCHEXT (1 << 4)
498 /* Flag indicating that the register name is an alias for another system register. */
499 #define F_REG_ALIAS (1 << 5)
500 /* Flag indicating registers which may be implemented with 128 bits. */
501 #define F_REG_128 (1 << 6)
503 /* Database of system registers, their encodings and architectural
504 requirements. */
505 const sysreg_t aarch64_sysregs[] =
507 #define CPENC(SN, OP1, CN, CM, OP2) "s"#SN"_"#OP1"_c"#CN"_c"#CM"_"#OP2
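/* For example, CPENC (3, 0, 4, 2, 2) stringizes to "s3_0_c4_c2_2",
   following the s<sn>_<op1>_c<cn>_c<cm>_<op2> format described for the
   sysreg_t encoding field above.  */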
508 #define SYSREG(NAME, ENC, FLAGS, ARCH) \
509 { NAME, ENC, FLAGS, ARCH },
510 #include "aarch64-sys-regs.def"
511 #undef CPENC
514 #undef AARCH64_NO_FEATURES
516 using sysreg_map_t = hash_map<nofree_string_hash, const sysreg_t *>;
517 static sysreg_map_t *sysreg_map = nullptr;
519 /* Map system register names to their hardware metadata: encoding,
520 feature flags and architectural feature requirements, all of which
521 are encoded in a sysreg_t struct. */
522 void
523 aarch64_register_sysreg (const char *name, const sysreg_t *metadata)
525 bool dup = sysreg_map->put (name, metadata);
526 gcc_checking_assert (!dup);
529 /* Lazily initialize the hash table for system register validation,
530 checking the validity of a supplied register name and returning
531 the register's associated metadata. */
532 static void
533 aarch64_init_sysregs (void)
535 gcc_assert (!sysreg_map);
536 sysreg_map = new sysreg_map_t;
539 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
541 const sysreg_t *reg = aarch64_sysregs + i;
542 aarch64_register_sysreg (reg->name, reg);
546 /* No direct access to the sysreg hash-map should be made. Doing so
547 risks accessing an uninitialized hash-map, and dereferencing the
548 returned double pointer without due care risks dereferencing a
549 null pointer. */
550 const sysreg_t *
551 aarch64_lookup_sysreg_map (const char *regname)
553 if (!sysreg_map)
554 aarch64_init_sysregs ();
556 const sysreg_t **sysreg_entry = sysreg_map->get (regname);
557 if (sysreg_entry != NULL)
558 return *sysreg_entry;
559 return NULL;
562 /* The current tuning set. */
563 struct tune_params aarch64_tune_params = generic_tunings;
565 /* If NAME is the name of an arm:: attribute that describes shared state,
566 return its associated AARCH64_STATE_* flags, otherwise return 0. */
567 static unsigned int
568 aarch64_attribute_shared_state_flags (const char *name)
570 if (strcmp (name, "in") == 0)
571 return AARCH64_STATE_SHARED | AARCH64_STATE_IN;
572 if (strcmp (name, "inout") == 0)
573 return AARCH64_STATE_SHARED | AARCH64_STATE_IN | AARCH64_STATE_OUT;
574 if (strcmp (name, "out") == 0)
575 return AARCH64_STATE_SHARED | AARCH64_STATE_OUT;
576 if (strcmp (name, "preserves") == 0)
577 return AARCH64_STATE_SHARED;
578 return 0;
581 /* See whether attribute list ATTRS has any sharing information
582 for state STATE_NAME. Return the associated state flags if so,
583 otherwise return 0. */
584 static unsigned int
585 aarch64_lookup_shared_state_flags (tree attrs, const char *state_name)
587 for (tree attr = attrs; attr; attr = TREE_CHAIN (attr))
589 if (!cxx11_attribute_p (attr))
590 continue;
592 auto ns = IDENTIFIER_POINTER (TREE_PURPOSE (TREE_PURPOSE (attr)));
593 if (strcmp (ns, "arm") != 0)
594 continue;
596 auto attr_name = IDENTIFIER_POINTER (TREE_VALUE (TREE_PURPOSE (attr)));
597 auto flags = aarch64_attribute_shared_state_flags (attr_name);
598 if (!flags)
599 continue;
601 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
603 tree value = TREE_VALUE (arg);
604 if (TREE_CODE (value) == STRING_CST
605 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
606 return flags;
609 return 0;
612 /* Return true if DECL creates a new scope for state STATE_STRING. */
613 static bool
614 aarch64_fndecl_has_new_state (const_tree decl, const char *state_name)
616 if (tree attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl)))
617 for (tree arg = TREE_VALUE (attr); arg; arg = TREE_CHAIN (arg))
619 tree value = TREE_VALUE (arg);
620 if (TREE_CODE (value) == STRING_CST
621 && strcmp (TREE_STRING_POINTER (value), state_name) == 0)
622 return true;
624 return false;
627 /* Return true if attribute argument VALUE is a recognized state string,
628 otherwise report an error. NAME is the name of the attribute to which
629 VALUE is being passed. */
630 static bool
631 aarch64_check_state_string (tree name, tree value)
633 if (TREE_CODE (value) != STRING_CST)
635 error ("the arguments to %qE must be constant strings", name);
636 return false;
639 const char *state_name = TREE_STRING_POINTER (value);
640 if (strcmp (state_name, "za") != 0
641 && strcmp (state_name, "zt0") != 0)
643 error ("unrecognized state string %qs", state_name);
644 return false;
647 return true;
650 /* qsort callback to compare two STRING_CSTs. */
651 static int
652 cmp_string_csts (const void *a, const void *b)
654 return strcmp (TREE_STRING_POINTER (*(const_tree const *) a),
655 TREE_STRING_POINTER (*(const_tree const *) b));
658 /* Canonicalize a list of state strings. ARGS contains the arguments to
659 a new attribute while OLD_ATTR, if nonnull, contains a previous attribute
660 of the same type. If CAN_MERGE_IN_PLACE, it is safe to adjust OLD_ATTR's
661 arguments and drop the new attribute. Otherwise, the new attribute must
662 be kept and ARGS must include the information in OLD_ATTR.
664 In both cases, the new arguments must be a sorted list of state strings
665 with duplicates removed.
667 Return true if new attribute should be kept, false if it should be
668 dropped. */
669 static bool
670 aarch64_merge_string_arguments (tree args, tree old_attr,
671 bool can_merge_in_place)
673 /* Get a sorted list of all state strings (including duplicates). */
674 auto add_args = [](vec<tree> &strings, const_tree args)
676 for (const_tree arg = args; arg; arg = TREE_CHAIN (arg))
677 if (TREE_CODE (TREE_VALUE (arg)) == STRING_CST)
678 strings.safe_push (TREE_VALUE (arg));
680 auto_vec<tree, 16> strings;
681 add_args (strings, args);
682 if (old_attr)
683 add_args (strings, TREE_VALUE (old_attr));
684 strings.qsort (cmp_string_csts);
686 /* The list can be empty if there was no previous attribute and if all
687 the new arguments are erroneous. Drop the attribute in that case. */
688 if (strings.is_empty ())
689 return false;
691 /* Destructively modify one of the argument lists, removing duplicates
692 on the fly. */
693 bool use_old_attr = old_attr && can_merge_in_place;
694 tree *end = use_old_attr ? &TREE_VALUE (old_attr) : &args;
695 tree prev = NULL_TREE;
696 for (tree arg : strings)
698 if (prev && simple_cst_equal (arg, prev))
699 continue;
700 prev = arg;
701 if (!*end)
702 *end = tree_cons (NULL_TREE, arg, NULL_TREE);
703 else
704 TREE_VALUE (*end) = arg;
705 end = &TREE_CHAIN (*end);
707 *end = NULL_TREE;
708 return !use_old_attr;
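/* A hypothetical example: merging a new arm::new("zt0", "za") attribute
   with an existing arm::new("za") yields the sorted, deduplicated list
   ("za", "zt0"), stored in OLD_ATTR when merging in place and in ARGS
   otherwise.  */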
711 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
713 static tree
714 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
715 int, bool *no_add_attrs)
717 /* Since we set fn_type_req to true, the caller should have checked
718 this for us. */
719 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
720 switch ((arm_pcs) fntype_abi (*node).id ())
722 case ARM_PCS_AAPCS64:
723 case ARM_PCS_SIMD:
724 return NULL_TREE;
726 case ARM_PCS_SVE:
727 error ("the %qE attribute cannot be applied to an SVE function type",
728 name);
729 *no_add_attrs = true;
730 return NULL_TREE;
732 case ARM_PCS_TLSDESC:
733 case ARM_PCS_UNKNOWN:
734 break;
736 gcc_unreachable ();
739 /* Return true if arm::new(ARGS) is compatible with the type of decl DECL,
740 otherwise report an error. */
741 static bool
742 aarch64_check_arm_new_against_type (tree args, tree decl)
744 tree type_attrs = TYPE_ATTRIBUTES (TREE_TYPE (decl));
745 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
747 tree value = TREE_VALUE (arg);
748 if (TREE_CODE (value) == STRING_CST)
750 const char *state_name = TREE_STRING_POINTER (value);
751 if (aarch64_lookup_shared_state_flags (type_attrs, state_name))
753 error_at (DECL_SOURCE_LOCATION (decl),
754 "cannot create a new %qs scope since %qs is shared"
755 " with callers", state_name, state_name);
756 return false;
760 return true;
763 /* Callback for arm::new attributes. */
764 static tree
765 handle_arm_new (tree *node, tree name, tree args, int, bool *no_add_attrs)
767 tree decl = *node;
768 if (TREE_CODE (decl) != FUNCTION_DECL)
770 error ("%qE attribute applies only to function definitions", name);
771 *no_add_attrs = true;
772 return NULL_TREE;
774 if (TREE_TYPE (decl) == error_mark_node)
776 *no_add_attrs = true;
777 return NULL_TREE;
780 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
781 aarch64_check_state_string (name, TREE_VALUE (arg));
783 if (!aarch64_check_arm_new_against_type (args, decl))
785 *no_add_attrs = true;
786 return NULL_TREE;
789 /* If there is an old attribute, we should try to update it in-place,
790 so that there is only one (definitive) arm::new attribute on the decl. */
791 tree old_attr = lookup_attribute ("arm", "new", DECL_ATTRIBUTES (decl));
792 if (!aarch64_merge_string_arguments (args, old_attr, true))
793 *no_add_attrs = true;
795 return NULL_TREE;
798 /* Callback for arm::{in,out,inout,preserves} attributes. */
799 static tree
800 handle_arm_shared (tree *node, tree name, tree args,
801 int, bool *no_add_attrs)
803 tree type = *node;
804 tree old_attrs = TYPE_ATTRIBUTES (type);
805 auto flags = aarch64_attribute_shared_state_flags (IDENTIFIER_POINTER (name));
806 for (tree arg = args; arg; arg = TREE_CHAIN (arg))
808 tree value = TREE_VALUE (arg);
809 if (aarch64_check_state_string (name, value))
811 const char *state_name = TREE_STRING_POINTER (value);
812 auto old_flags = aarch64_lookup_shared_state_flags (old_attrs,
813 state_name);
814 if (old_flags && old_flags != flags)
816 error ("inconsistent attributes for state %qs", state_name);
817 *no_add_attrs = true;
818 return NULL_TREE;
823 /* We can't update an old attribute in-place, since types are shared.
824 Instead make sure that this new attribute contains all the
825 information, so that the old attribute becomes redundant. */
826 tree old_attr = lookup_attribute ("arm", IDENTIFIER_POINTER (name),
827 old_attrs);
828 if (!aarch64_merge_string_arguments (args, old_attr, false))
829 *no_add_attrs = true;
831 return NULL_TREE;
834 /* Mutually-exclusive function type attributes for controlling PSTATE.SM. */
835 static const struct attribute_spec::exclusions attr_streaming_exclusions[] =
837 /* Attribute name exclusion applies to:
838 function, type, variable */
839 { "streaming", false, true, false },
840 { "streaming_compatible", false, true, false },
841 { NULL, false, false, false }
844 /* Table of machine attributes. */
845 static const attribute_spec aarch64_gnu_attributes[] =
847 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
848 affects_type_identity, handler, exclude } */
849 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
850 handle_aarch64_vector_pcs_attribute, NULL },
851 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
852 aarch64_sve::handle_arm_sve_vector_bits_attribute,
853 NULL },
854 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
855 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
856 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL }
859 static const scoped_attribute_specs aarch64_gnu_attribute_table =
861 "gnu", { aarch64_gnu_attributes }
864 static const attribute_spec aarch64_arm_attributes[] =
866 { "streaming", 0, 0, false, true, true, true,
867 NULL, attr_streaming_exclusions },
868 { "streaming_compatible", 0, 0, false, true, true, true,
869 NULL, attr_streaming_exclusions },
870 { "locally_streaming", 0, 0, true, false, false, false, NULL, NULL },
871 { "new", 1, -1, true, false, false, false,
872 handle_arm_new, NULL },
873 { "preserves", 1, -1, false, true, true, true,
874 handle_arm_shared, NULL },
875 { "in", 1, -1, false, true, true, true,
876 handle_arm_shared, NULL },
877 { "out", 1, -1, false, true, true, true,
878 handle_arm_shared, NULL },
879 { "inout", 1, -1, false, true, true, true,
880 handle_arm_shared, NULL }
883 static const scoped_attribute_specs aarch64_arm_attribute_table =
885 "arm", { aarch64_arm_attributes }
888 static const scoped_attribute_specs *const aarch64_attribute_table[] =
890 &aarch64_gnu_attribute_table,
891 &aarch64_arm_attribute_table
894 typedef enum aarch64_cond_code
896 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
897 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
898 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
900 aarch64_cc;
902 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
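/* For example, AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE;
   the enumeration above pairs each condition with its inverse (EQ/NE,
   CS/CC, MI/PL, ...), so flipping the low bit inverts the condition.  */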
905 /* The condition codes of the processor, and the inverse function. */
906 static const char * const aarch64_condition_codes[] =
908 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
909 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
912 /* The preferred condition codes for SVE conditions. */
913 static const char *const aarch64_sve_condition_codes[] =
915 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
916 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
919 /* Return the assembly token for svpattern value VALUE. */
921 static const char *
922 svpattern_token (enum aarch64_svpattern pattern)
924 switch (pattern)
926 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
927 AARCH64_FOR_SVPATTERN (CASE)
928 #undef CASE
929 case AARCH64_NUM_SVPATTERNS:
930 break;
932 gcc_unreachable ();
935 /* Return the location of a piece that is known to be passed or returned
936 in registers. FIRST_ZR is the first unused vector argument register
937 and FIRST_PR is the first unused predicate argument register. */
939 rtx
940 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
941 unsigned int first_pr) const
943 gcc_assert (VECTOR_MODE_P (mode)
944 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
945 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
947 if (num_zr > 0 && num_pr == 0)
948 return gen_rtx_REG (mode, first_zr);
950 if (num_zr == 0 && num_pr <= 2)
951 return gen_rtx_REG (mode, first_pr);
953 gcc_unreachable ();
956 /* Return the total number of vector registers required by the PST. */
958 unsigned int
959 pure_scalable_type_info::num_zr () const
961 unsigned int res = 0;
962 for (unsigned int i = 0; i < pieces.length (); ++i)
963 res += pieces[i].num_zr;
964 return res;
967 /* Return the total number of predicate registers required by the PST. */
969 unsigned int
970 pure_scalable_type_info::num_pr () const
972 unsigned int res = 0;
973 for (unsigned int i = 0; i < pieces.length (); ++i)
974 res += pieces[i].num_pr;
975 return res;
978 /* Return the location of a PST that is known to be passed or returned
979 in registers. FIRST_ZR is the first unused vector argument register
980 and FIRST_PR is the first unused predicate argument register. */
982 rtx
983 pure_scalable_type_info::get_rtx (machine_mode mode,
984 unsigned int first_zr,
985 unsigned int first_pr) const
987 /* Try to return a single REG if possible. This leads to better
988 code generation; it isn't required for correctness. */
989 if (mode == pieces[0].mode)
991 gcc_assert (pieces.length () == 1);
992 return pieces[0].get_rtx (first_zr, first_pr);
995 /* Build up a PARALLEL that contains the individual pieces. */
996 rtvec rtxes = rtvec_alloc (pieces.length ());
997 for (unsigned int i = 0; i < pieces.length (); ++i)
999 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1000 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1001 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1002 first_zr += pieces[i].num_zr;
1003 first_pr += pieces[i].num_pr;
1005 return gen_rtx_PARALLEL (mode, rtxes);
1008 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1009 in the AAPCS64. */
1011 pure_scalable_type_info::analysis_result
1012 pure_scalable_type_info::analyze (const_tree type)
1014 /* Prevent accidental reuse. */
1015 gcc_assert (pieces.is_empty ());
1017 /* No code will be generated for erroneous types, so we won't establish
1018 an ABI mapping. */
1019 if (type == error_mark_node)
1020 return NO_ABI_IDENTITY;
1022 /* Zero-sized types disappear in the language->ABI mapping. */
1023 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1024 return NO_ABI_IDENTITY;
1026 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1027 piece p = {};
1028 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1030 machine_mode mode = TYPE_MODE_RAW (type);
1031 gcc_assert (VECTOR_MODE_P (mode)
1032 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1034 p.mode = p.orig_mode = mode;
1035 add_piece (p);
1036 return IS_PST;
1039 /* Check for user-defined PSTs. */
1040 if (TREE_CODE (type) == ARRAY_TYPE)
1041 return analyze_array (type);
1042 if (TREE_CODE (type) == RECORD_TYPE)
1043 return analyze_record (type);
1045 return ISNT_PST;
1048 /* Analyze a type that is known not to be passed or returned in memory.
1049 Return true if it has an ABI identity and is a Pure Scalable Type. */
1051 bool
1052 pure_scalable_type_info::analyze_registers (const_tree type)
1054 analysis_result result = analyze (type);
1055 gcc_assert (result != DOESNT_MATTER);
1056 return result == IS_PST;
1059 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1061 pure_scalable_type_info::analysis_result
1062 pure_scalable_type_info::analyze_array (const_tree type)
1064 /* Analyze the element type. */
1065 pure_scalable_type_info element_info;
1066 analysis_result result = element_info.analyze (TREE_TYPE (type));
1067 if (result != IS_PST)
1068 return result;
1070 /* An array of unknown, flexible or variable length will be passed and
1071 returned by reference whatever we do. */
1072 tree nelts_minus_one = array_type_nelts (type);
1073 if (!tree_fits_uhwi_p (nelts_minus_one))
1074 return DOESNT_MATTER;
1076 /* Likewise if the array is constant-sized but too big to be interesting.
1077 The double checks against MAX_PIECES are to protect against overflow. */
1078 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1079 if (count > MAX_PIECES)
1080 return DOESNT_MATTER;
1081 count += 1;
1082 if (count * element_info.pieces.length () > MAX_PIECES)
1083 return DOESNT_MATTER;
1085 /* The above checks should have weeded out elements of unknown size. */
1086 poly_uint64 element_bytes;
1087 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1088 gcc_unreachable ();
1090 /* Build up the list of individual vectors and predicates. */
1091 gcc_assert (!element_info.pieces.is_empty ());
1092 for (unsigned int i = 0; i < count; ++i)
1093 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1095 piece p = element_info.pieces[j];
1096 p.offset += i * element_bytes;
1097 add_piece (p);
1099 return IS_PST;
1102 /* Subroutine of analyze for handling RECORD_TYPEs. */
1104 pure_scalable_type_info::analysis_result
1105 pure_scalable_type_info::analyze_record (const_tree type)
1107 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1109 if (TREE_CODE (field) != FIELD_DECL)
1110 continue;
1112 /* Zero-sized fields disappear in the language->ABI mapping. */
1113 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1114 continue;
1116 /* All fields with an ABI identity must be PSTs for the record as
1117 a whole to be a PST. If any individual field is too big to be
1118 interesting then the record is too. */
1119 pure_scalable_type_info field_info;
1120 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1121 if (subresult == NO_ABI_IDENTITY)
1122 continue;
1123 if (subresult != IS_PST)
1124 return subresult;
1126 /* Since all previous fields are PSTs, we ought to be able to track
1127 the field offset using poly_ints. */
1128 tree bitpos = bit_position (field);
1129 gcc_assert (poly_int_tree_p (bitpos));
1131 /* For the same reason, it shouldn't be possible to create a PST field
1132 whose offset isn't byte-aligned. */
1133 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1134 BITS_PER_UNIT);
1136 /* Punt if the record is too big to be interesting. */
1137 poly_uint64 bytepos;
1138 if (!wide_bytepos.to_uhwi (&bytepos)
1139 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1140 return DOESNT_MATTER;
1142 /* Add the individual vectors and predicates in the field to the
1143 record's list. */
1144 gcc_assert (!field_info.pieces.is_empty ());
1145 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1147 piece p = field_info.pieces[i];
1148 p.offset += bytepos;
1149 add_piece (p);
1152 /* Empty structures disappear in the language->ABI mapping. */
1153 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1156 /* Add P to the list of pieces in the type. */
1158 void
1159 pure_scalable_type_info::add_piece (const piece &p)
1161 /* Try to fold the new piece into the previous one to form a
1162 single-mode PST. For example, if we see three consecutive vectors
1163 of the same mode, we can represent them using the corresponding
1164 3-tuple mode.
1166 This is purely an optimization. */
1167 if (!pieces.is_empty ())
1169 piece &prev = pieces.last ();
1170 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1171 unsigned int nelems1, nelems2;
1172 if (prev.orig_mode == p.orig_mode
1173 && GET_MODE_CLASS (p.orig_mode) != MODE_VECTOR_BOOL
1174 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1175 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1176 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1177 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1178 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1179 && targetm.array_mode (p.orig_mode,
1180 nelems1 + nelems2).exists (&prev.mode))
1182 prev.num_zr += p.num_zr;
1183 prev.num_pr += p.num_pr;
1184 return;
1187 pieces.quick_push (p);
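/* A sketch of the folding above: two consecutive pieces of mode VNx4SImode
   at adjacent offsets would be merged into a single piece whose mode is the
   corresponding 2-vector tuple mode (VNx8SImode), provided that
   targetm.array_mode supplies it, with num_zr accumulating to 2.  */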
1190 /* Return true if at least one possible value of type TYPE includes at
1191 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1193 This is a relatively expensive test for some types, so it should
1194 generally be made as late as possible. */
1196 static bool
1197 aarch64_some_values_include_pst_objects_p (const_tree type)
1199 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1200 return false;
1202 if (aarch64_sve::builtin_type_p (type))
1203 return true;
1205 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1206 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1208 if (RECORD_OR_UNION_TYPE_P (type))
1209 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1210 if (TREE_CODE (field) == FIELD_DECL
1211 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1212 return true;
1214 return false;
1217 /* Return the descriptor of the SIMD ABI. */
1219 static const predefined_function_abi &
1220 aarch64_simd_abi (void)
1222 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1223 if (!simd_abi.initialized_p ())
1225 HARD_REG_SET full_reg_clobbers
1226 = default_function_abi.full_reg_clobbers ();
1227 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1228 if (FP_SIMD_SAVED_REGNUM_P (regno))
1229 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1230 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1232 return simd_abi;
1235 /* Return the descriptor of the SVE PCS. */
1237 static const predefined_function_abi &
1238 aarch64_sve_abi (void)
1240 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1241 if (!sve_abi.initialized_p ())
1243 HARD_REG_SET full_reg_clobbers
1244 = default_function_abi.full_reg_clobbers ();
1245 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1246 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1247 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1248 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1249 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1251 return sve_abi;
1254 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
1255 wraps, otherwise return X itself. */
1257 static rtx
1258 strip_salt (rtx x)
1260 rtx search = x;
1261 if (GET_CODE (search) == CONST)
1262 search = XEXP (search, 0);
1263 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
1264 x = XVECEXP (search, 0, 0);
1265 return x;
1268 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
1269 expression. */
1271 static rtx
1272 strip_offset_and_salt (rtx addr, poly_int64 *offset)
1274 return strip_salt (strip_offset (addr, offset));
1277 /* Generate code to enable conditional branches in functions over 1 MiB. */
1278 const char *
1279 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1280 const char * branch_format)
1282 rtx_code_label * tmp_label = gen_label_rtx ();
1283 char label_buf[256];
1284 char buffer[128];
1285 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1286 CODE_LABEL_NUMBER (tmp_label));
1287 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1288 rtx dest_label = operands[pos_label];
1289 operands[pos_label] = tmp_label;
1291 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1292 output_asm_insn (buffer, operands);
1294 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1295 operands[pos_label] = dest_label;
1296 output_asm_insn (buffer, operands);
1297 return "";
1300 void
1301 aarch64_err_no_fpadvsimd (machine_mode mode)
1303 if (TARGET_GENERAL_REGS_ONLY)
1304 if (FLOAT_MODE_P (mode))
1305 error ("%qs is incompatible with the use of floating-point types",
1306 "-mgeneral-regs-only");
1307 else
1308 error ("%qs is incompatible with the use of vector types",
1309 "-mgeneral-regs-only");
1310 else
1311 if (FLOAT_MODE_P (mode))
1312 error ("%qs feature modifier is incompatible with the use of"
1313 " floating-point types", "+nofp");
1314 else
1315 error ("%qs feature modifier is incompatible with the use of"
1316 " vector types", "+nofp");
1319 /* Report when we try to do something that requires SVE when SVE is disabled.
1320 This is an error of last resort and isn't very high-quality. It usually
1321 involves attempts to measure the vector length in some way. */
1322 static void
1323 aarch64_report_sve_required (void)
1325 static bool reported_p = false;
1327 /* Avoid reporting a slew of messages for a single oversight. */
1328 if (reported_p)
1329 return;
1331 error ("this operation requires the SVE ISA extension");
1332 inform (input_location, "you can enable SVE using the command-line"
1333 " option %<-march%>, or by using the %<target%>"
1334 " attribute or pragma");
1335 reported_p = true;
1338 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1339 registers. */
1340 inline bool
1341 pr_or_ffr_regnum_p (unsigned int regno)
1343 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1346 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1347 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1348 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1349 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1350 and GENERAL_REGS is lower than the memory cost (in this case the best class
1351 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1352 cost results in bad allocations with many redundant int<->FP moves which
1353 are expensive on various cores.
1354 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1355 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1356 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1357 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1358 The result of this is that it is no longer inefficient to have a higher
1359 memory move cost than the register move cost.
1362 static reg_class_t
1363 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1364 reg_class_t best_class)
1366 machine_mode mode;
1368 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1369 || !reg_class_subset_p (FP_REGS, allocno_class))
1370 return allocno_class;
1372 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1373 || !reg_class_subset_p (FP_REGS, best_class))
1374 return best_class;
1376 mode = PSEUDO_REGNO_MODE (regno);
1377 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1380 static unsigned int
1381 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1383 if (GET_MODE_UNIT_SIZE (mode) == 4)
1384 return aarch64_tune_params.min_div_recip_mul_sf;
1385 return aarch64_tune_params.min_div_recip_mul_df;
1388 /* Return the reassociation width of treeop OPC with mode MODE. */
1389 static int
1390 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1392 if (VECTOR_MODE_P (mode))
1393 return aarch64_tune_params.vec_reassoc_width;
1394 if (INTEGRAL_MODE_P (mode))
1395 return aarch64_tune_params.int_reassoc_width;
1396 /* Reassociation reduces the number of FMAs which may result in worse
1397 performance. Use a per-CPU setting for FMA reassociation which allows
1398 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
1399 CPUs with many FP pipes to enable reassociation.
1400 Since the reassociation pass doesn't understand FMA at all, assume
1401 that any FP addition might turn into FMA. */
1402 if (FLOAT_MODE_P (mode))
1403 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
1404 : aarch64_tune_params.fp_reassoc_width;
1405 return 1;
1408 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1409 unsigned
1410 aarch64_debugger_regno (unsigned regno)
1412 if (GP_REGNUM_P (regno))
1413 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1414 else if (regno == SP_REGNUM)
1415 return AARCH64_DWARF_SP;
1416 else if (FP_REGNUM_P (regno))
1417 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1418 else if (PR_REGNUM_P (regno))
1419 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1420 else if (regno == VG_REGNUM)
1421 return AARCH64_DWARF_VG;
1423 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1424 equivalent DWARF register. */
1425 return DWARF_FRAME_REGISTERS;
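/* For example, under the AArch64 DWARF register numbering this maps x0-x30
   to 0-30, sp to 31, v0-v31 to 64-95, p0-p15 to 48-63 and the vector
   granule register VG to 46.  */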
1428 /* Implement TARGET_DWARF_FRAME_REG_MODE. */
1429 static machine_mode
1430 aarch64_dwarf_frame_reg_mode (int regno)
1432 /* Predicate registers are call-clobbered in the EH ABI (which is
1433 ARM_PCS_AAPCS64), so they should not be described by CFI.
1434 Their size changes as VL changes, so any values computed by
1435 __builtin_init_dwarf_reg_size_table might not be valid for
1436 all frames. */
1437 if (PR_REGNUM_P (regno))
1438 return VOIDmode;
1439 return default_dwarf_frame_reg_mode (regno);
1442 /* If X is a CONST_DOUBLE, return its bit representation as a constant
1443 integer, otherwise return X unmodified. */
1444 static rtx
1445 aarch64_bit_representation (rtx x)
1447 if (CONST_DOUBLE_P (x))
1448 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
1449 return x;
1452 /* Return an estimate for the number of quadwords in an SVE vector. This is
1453 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
1454 static unsigned int
1455 aarch64_estimated_sve_vq ()
1457 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
1460 /* Return true if MODE is an SVE predicate mode. */
1461 static bool
1462 aarch64_sve_pred_mode_p (machine_mode mode)
1464 return (TARGET_SVE
1465 && (mode == VNx16BImode
1466 || mode == VNx8BImode
1467 || mode == VNx4BImode
1468 || mode == VNx2BImode));
1471 /* Three mutually-exclusive flags describing a vector or predicate type. */
1472 const unsigned int VEC_ADVSIMD = 1;
1473 const unsigned int VEC_SVE_DATA = 2;
1474 const unsigned int VEC_SVE_PRED = 4;
1475 /* Indicates a structure of 2, 3 or 4 vectors or predicates. */
1476 const unsigned int VEC_STRUCT = 8;
1477 /* Can be used in combination with VEC_SVE_DATA to indicate that the
1478 vector has fewer significant bytes than a full SVE vector. */
1479 const unsigned int VEC_PARTIAL = 16;
1480 /* Useful combinations of the above. */
1481 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1482 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
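/* Some illustrative classifications, following the switch statement below:

     V16QImode   -> VEC_ADVSIMD                  (128-bit Advanced SIMD)
     VNx4SImode  -> VEC_SVE_DATA                 (full SVE data vector)
     VNx2SImode  -> VEC_SVE_DATA | VEC_PARTIAL   (partial SVE vector)
     VNx32QImode -> VEC_SVE_DATA | VEC_STRUCT    (x2 SVE tuple)
     V2x16QImode -> VEC_ADVSIMD | VEC_STRUCT     (pair of 128-bit vectors)
     VNx16BImode -> VEC_SVE_PRED                 (SVE predicate)  */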
1484 /* Return a set of flags describing the vector properties of mode MODE.
1485 If ANY_TARGET_P is false (the default), ignore modes that are not supported
1486 by the current target. Otherwise categorize the modes that can be used
1487 with the set of all targets supported by the port. */
1489 static unsigned int
1490 aarch64_classify_vector_mode (machine_mode mode, bool any_target_p = false)
1492 if (aarch64_sve_pred_mode_p (mode))
1493 return VEC_SVE_PRED;
1495 /* Make the decision based on the mode's enum value rather than its
1496 properties, so that we keep the correct classification regardless
1497 of -msve-vector-bits. */
1498 switch (mode)
1500 /* Partial SVE QI vectors. */
1501 case E_VNx2QImode:
1502 case E_VNx4QImode:
1503 case E_VNx8QImode:
1504 /* Partial SVE HI vectors. */
1505 case E_VNx2HImode:
1506 case E_VNx4HImode:
1507 /* Partial SVE SI vector. */
1508 case E_VNx2SImode:
1509 /* Partial SVE HF vectors. */
1510 case E_VNx2HFmode:
1511 case E_VNx4HFmode:
1512 /* Partial SVE BF vectors. */
1513 case E_VNx2BFmode:
1514 case E_VNx4BFmode:
1515 /* Partial SVE SF vector. */
1516 case E_VNx2SFmode:
1517 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_PARTIAL : 0;
1519 case E_VNx16QImode:
1520 case E_VNx8HImode:
1521 case E_VNx4SImode:
1522 case E_VNx2DImode:
1523 case E_VNx8BFmode:
1524 case E_VNx8HFmode:
1525 case E_VNx4SFmode:
1526 case E_VNx2DFmode:
1527 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA : 0;
1529 /* x2 SVE vectors. */
1530 case E_VNx32QImode:
1531 case E_VNx16HImode:
1532 case E_VNx8SImode:
1533 case E_VNx4DImode:
1534 case E_VNx16BFmode:
1535 case E_VNx16HFmode:
1536 case E_VNx8SFmode:
1537 case E_VNx4DFmode:
1538 /* x3 SVE vectors. */
1539 case E_VNx48QImode:
1540 case E_VNx24HImode:
1541 case E_VNx12SImode:
1542 case E_VNx6DImode:
1543 case E_VNx24BFmode:
1544 case E_VNx24HFmode:
1545 case E_VNx12SFmode:
1546 case E_VNx6DFmode:
1547 /* x4 SVE vectors. */
1548 case E_VNx64QImode:
1549 case E_VNx32HImode:
1550 case E_VNx16SImode:
1551 case E_VNx8DImode:
1552 case E_VNx32BFmode:
1553 case E_VNx32HFmode:
1554 case E_VNx16SFmode:
1555 case E_VNx8DFmode:
1556 return (TARGET_SVE || any_target_p) ? VEC_SVE_DATA | VEC_STRUCT : 0;
1558 case E_OImode:
1559 case E_CImode:
1560 case E_XImode:
1561 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1563 /* Structures of 64-bit Advanced SIMD vectors. */
1564 case E_V2x8QImode:
1565 case E_V2x4HImode:
1566 case E_V2x2SImode:
1567 case E_V2x1DImode:
1568 case E_V2x4BFmode:
1569 case E_V2x4HFmode:
1570 case E_V2x2SFmode:
1571 case E_V2x1DFmode:
1572 case E_V3x8QImode:
1573 case E_V3x4HImode:
1574 case E_V3x2SImode:
1575 case E_V3x1DImode:
1576 case E_V3x4BFmode:
1577 case E_V3x4HFmode:
1578 case E_V3x2SFmode:
1579 case E_V3x1DFmode:
1580 case E_V4x8QImode:
1581 case E_V4x4HImode:
1582 case E_V4x2SImode:
1583 case E_V4x1DImode:
1584 case E_V4x4BFmode:
1585 case E_V4x4HFmode:
1586 case E_V4x2SFmode:
1587 case E_V4x1DFmode:
1588 return (TARGET_FLOAT || any_target_p)
1589 ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
1591 /* Structures of 128-bit Advanced SIMD vectors. */
1592 case E_V2x16QImode:
1593 case E_V2x8HImode:
1594 case E_V2x4SImode:
1595 case E_V2x2DImode:
1596 case E_V2x8BFmode:
1597 case E_V2x8HFmode:
1598 case E_V2x4SFmode:
1599 case E_V2x2DFmode:
1600 case E_V3x16QImode:
1601 case E_V3x8HImode:
1602 case E_V3x4SImode:
1603 case E_V3x2DImode:
1604 case E_V3x8BFmode:
1605 case E_V3x8HFmode:
1606 case E_V3x4SFmode:
1607 case E_V3x2DFmode:
1608 case E_V4x16QImode:
1609 case E_V4x8HImode:
1610 case E_V4x4SImode:
1611 case E_V4x2DImode:
1612 case E_V4x8BFmode:
1613 case E_V4x8HFmode:
1614 case E_V4x4SFmode:
1615 case E_V4x2DFmode:
1616 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD | VEC_STRUCT : 0;
1618 /* 64-bit Advanced SIMD vectors. */
1619 case E_V8QImode:
1620 case E_V4HImode:
1621 case E_V2SImode:
1622 case E_V1DImode:
1623 case E_V4HFmode:
1624 case E_V4BFmode:
1625 case E_V2SFmode:
1626 case E_V1DFmode:
1627 /* 128-bit Advanced SIMD vectors. */
1628 case E_V16QImode:
1629 case E_V8HImode:
1630 case E_V4SImode:
1631 case E_V2DImode:
1632 case E_V8HFmode:
1633 case E_V8BFmode:
1634 case E_V4SFmode:
1635 case E_V2DFmode:
1636 return (TARGET_FLOAT || any_target_p) ? VEC_ADVSIMD : 0;
1638 case E_VNx32BImode:
1639 return TARGET_SVE ? VEC_SVE_PRED | VEC_STRUCT : 0;
1641 default:
1642 return 0;
1646 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1647 bool
1648 aarch64_advsimd_struct_mode_p (machine_mode mode)
1650 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1651 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
1654 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
1655 static bool
1656 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
1658 return (aarch64_classify_vector_mode (mode)
1659 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
1662 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
1663 static bool
1664 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
1666 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
1669 /* Return true if MODE is any of the data vector modes, including
1670 structure modes. */
1671 static bool
1672 aarch64_vector_data_mode_p (machine_mode mode)
1674 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1677 /* Return true if MODE is any form of SVE mode, including predicates,
1678 vectors and structures. */
1679 bool
1680 aarch64_sve_mode_p (machine_mode mode)
1682 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
1685 /* Return true if MODE is an SVE data vector mode; either a single vector
1686 or a structure of vectors. */
1687 static bool
1688 aarch64_sve_data_mode_p (machine_mode mode)
1690 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1693 /* Return the number of defined bytes in one constituent vector of
1694 SVE mode MODE, which has vector flags VEC_FLAGS. */
1695 static poly_int64
1696 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
1698 if (vec_flags & VEC_PARTIAL)
1699 /* A single partial vector. */
1700 return GET_MODE_SIZE (mode);
1702 if (vec_flags & VEC_SVE_DATA)
1703 /* A single vector or a tuple. */
1704 return BYTES_PER_SVE_VECTOR;
1706 /* A single predicate. */
1707 gcc_assert (vec_flags & VEC_SVE_PRED);
1708 return BYTES_PER_SVE_PRED;
1711 /* If MODE holds an array of vectors, return the number of vectors
1712 in the array, otherwise return 1. */
1714 static unsigned int
1715 aarch64_ldn_stn_vectors (machine_mode mode)
1717 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1718 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
1719 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
1720 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
1721 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
1722 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
1723 return exact_div (GET_MODE_SIZE (mode),
1724 BYTES_PER_SVE_VECTOR).to_constant ();
1725 return 1;
1728 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
1729 corresponding vector structure mode. */
1730 static opt_machine_mode
1731 aarch64_advsimd_vector_array_mode (machine_mode mode,
1732 unsigned HOST_WIDE_INT nelems)
1734 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
1735 if (known_eq (GET_MODE_SIZE (mode), 8))
1736 flags |= VEC_PARTIAL;
1738 machine_mode struct_mode;
1739 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
1740 if (aarch64_classify_vector_mode (struct_mode) == flags
1741 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
1742 && known_eq (GET_MODE_NUNITS (struct_mode),
1743 GET_MODE_NUNITS (mode) * nelems))
1744 return struct_mode;
1745 return opt_machine_mode ();
1748 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
1750 opt_machine_mode
1751 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
1753 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
1754 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
1755 machine_mode mode;
1756 FOR_EACH_MODE_IN_CLASS (mode, mclass)
1757 if (inner_mode == GET_MODE_INNER (mode)
1758 && known_eq (nunits, GET_MODE_NUNITS (mode))
1759 && aarch64_sve_data_mode_p (mode))
1760 return mode;
1761 return opt_machine_mode ();
1764 /* Implement target hook TARGET_ARRAY_MODE. */
1765 static opt_machine_mode
1766 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1768 if (TARGET_SVE && GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
1770 /* Use VNx32BI for pairs of predicates, but explicitly reject giving
1771 a mode to other array sizes. Using integer modes requires a round
1772 trip through memory and generates terrible code. */
1773 if (nelems == 1)
1774 return mode;
1775 if (mode == VNx16BImode && nelems == 2)
1776 return VNx32BImode;
1777 return BLKmode;
1780 auto flags = aarch64_classify_vector_mode (mode);
1781 if (flags == VEC_SVE_DATA && IN_RANGE (nelems, 2, 4))
1782 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
1783 GET_MODE_NUNITS (mode) * nelems);
1785 if (flags == VEC_ADVSIMD && IN_RANGE (nelems, 2, 4))
1786 return aarch64_advsimd_vector_array_mode (mode, nelems);
1788 return opt_machine_mode ();
1791 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1792 static bool
1793 aarch64_array_mode_supported_p (machine_mode mode,
1794 unsigned HOST_WIDE_INT nelems)
1796 if (TARGET_BASE_SIMD
1797 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1798 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1799 && (nelems >= 2 && nelems <= 4))
1800 return true;
1802 return false;
1805 /* MODE is some form of SVE vector mode. For data modes, return the number
1806 of vector register bits that each element of MODE occupies, such as 64
1807 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
1808 in a 64-bit container). For predicate modes, return the number of
1809 data bits controlled by each significant predicate bit. */
1811 static unsigned int
1812 aarch64_sve_container_bits (machine_mode mode)
1814 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1815 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
1816 ? BITS_PER_SVE_VECTOR
1817 : GET_MODE_BITSIZE (mode));
1818 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
1821 /* Return the SVE predicate mode to use for elements that have
1822 ELEM_NBYTES bytes, if such a mode exists. */
1824 opt_machine_mode
1825 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1827 if (TARGET_SVE)
1829 if (elem_nbytes == 1)
1830 return VNx16BImode;
1831 if (elem_nbytes == 2)
1832 return VNx8BImode;
1833 if (elem_nbytes == 4)
1834 return VNx4BImode;
1835 if (elem_nbytes == 8)
1836 return VNx2BImode;
1838 return opt_machine_mode ();
1841 /* Return the SVE predicate mode that should be used to control
1842 SVE mode MODE. */
1844 machine_mode
1845 aarch64_sve_pred_mode (machine_mode mode)
1847 unsigned int bits = aarch64_sve_container_bits (mode);
1848 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
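/* For example: VNx4SImode uses 32-bit containers, so it is governed by
   VNx4BImode, whereas the partial vector VNx2SImode stores each 32-bit
   element in a 64-bit container and is therefore governed by VNx2BImode,
   the same predicate mode as VNx2DImode.  */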
1851 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1853 static opt_machine_mode
1854 aarch64_get_mask_mode (machine_mode mode)
1856 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1857 if (vec_flags & VEC_SVE_DATA)
1858 return aarch64_sve_pred_mode (mode);
1860 return default_get_mask_mode (mode);
1863 /* Return the integer element mode associated with SVE mode MODE. */
1865 static scalar_int_mode
1866 aarch64_sve_element_int_mode (machine_mode mode)
1868 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
1869 ? BITS_PER_SVE_VECTOR
1870 : GET_MODE_BITSIZE (mode));
1871 unsigned int elt_bits = vector_element_size (vector_bits,
1872 GET_MODE_NUNITS (mode));
1873 return int_mode_for_size (elt_bits, 0).require ();
1876 /* Return an integer element mode that contains exactly
1877 aarch64_sve_container_bits (MODE) bits. This is wider than
1878 aarch64_sve_element_int_mode if MODE is a partial vector,
1879 otherwise it's the same. */
1881 static scalar_int_mode
1882 aarch64_sve_container_int_mode (machine_mode mode)
1884 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
1887 /* Return the integer vector mode associated with SVE mode MODE.
1888 Unlike related_int_vector_mode, this can handle the case in which
1889 MODE is a predicate (and thus has a different total size). */
1891 machine_mode
1892 aarch64_sve_int_mode (machine_mode mode)
1894 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
1895 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
1898 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
1900 static opt_machine_mode
1901 aarch64_vectorize_related_mode (machine_mode vector_mode,
1902 scalar_mode element_mode,
1903 poly_uint64 nunits)
1905 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
1907 /* If we're operating on SVE vectors, try to return an SVE mode. */
1908 poly_uint64 sve_nunits;
1909 if ((vec_flags & VEC_SVE_DATA)
1910 && multiple_p (BYTES_PER_SVE_VECTOR,
1911 GET_MODE_SIZE (element_mode), &sve_nunits))
1913 machine_mode sve_mode;
1914 if (maybe_ne (nunits, 0U))
1916 /* Try to find a full or partial SVE mode with exactly
1917 NUNITS units. */
1918 if (multiple_p (sve_nunits, nunits)
1919 && aarch64_sve_data_mode (element_mode,
1920 nunits).exists (&sve_mode))
1921 return sve_mode;
1923 else
1925 /* Take the preferred number of units from the number of bytes
1926 that fit in VECTOR_MODE. We always start by "autodetecting"
1927 a full vector mode with preferred_simd_mode, so vectors
1928 chosen here will also be full vector modes. Then
1929 autovectorize_vector_modes tries smaller starting modes
1930 and thus smaller preferred numbers of units. */
1931 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
1932 if (aarch64_sve_data_mode (element_mode,
1933 sve_nunits).exists (&sve_mode))
1934 return sve_mode;
1938 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
1939 if (TARGET_SIMD
1940 && (vec_flags & VEC_ADVSIMD)
1941 && known_eq (nunits, 0U)
1942 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
1943 && maybe_ge (GET_MODE_BITSIZE (element_mode)
1944 * GET_MODE_NUNITS (vector_mode), 128U))
1946 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
1947 if (VECTOR_MODE_P (res))
1948 return res;
1951 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
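/* A worked instance of the 64-bit special case above: asking for a
   DImode-element vector related to V2SImode with NUNITS == 0 satisfies
   64 * 2 >= 128, so we return the 128-bit container for DImode elements
   (V2DImode) instead of falling through to the default hook.  */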
1954 /* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
1956 static bool
1957 aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
1959 machine_mode mode = TYPE_MODE (type);
1960 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1961 bool sve_p = (vec_flags & VEC_ANY_SVE);
1962 bool simd_p = (vec_flags & VEC_ADVSIMD);
1964 return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
1967 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1968 prefer to use the first arithmetic operand as the else value if
1969 the else value doesn't matter, since that exactly matches the SVE
1970 destructive merging form. For ternary operations we could either
1971 pick the first operand and use FMAD-like instructions or the last
1972 operand and use FMLA-like instructions; the latter seems more
1973 natural. */
1975 static tree
1976 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1978 return nops == 3 ? ops[2] : ops[0];
1981 /* Implement TARGET_HARD_REGNO_NREGS. */
1983 static unsigned int
1984 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1986 /* ??? Logically we should only need to provide a value when
1987 HARD_REGNO_MODE_OK says that the combination is valid,
1988 but at the moment we need to handle all modes. Just ignore
1989 any runtime parts for registers that can't store them. */
1990 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1991 switch (aarch64_regno_regclass (regno))
1993 case FP_REGS:
1994 case FP_LO_REGS:
1995 case FP_LO8_REGS:
1997 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1998 if (vec_flags & VEC_SVE_DATA)
1999 return exact_div (GET_MODE_SIZE (mode),
2000 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2001 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
2002 return GET_MODE_SIZE (mode).to_constant () / 8;
2003 return CEIL (lowest_size, UNITS_PER_VREG);
2006 case PR_REGS:
2007 case PR_LO_REGS:
2008 case PR_HI_REGS:
2009 return mode == VNx32BImode ? 2 : 1;
2011 case FFR_REGS:
2012 case PR_AND_FFR_REGS:
2013 case FAKE_REGS:
2014 return 1;
2016 default:
2017 return CEIL (lowest_size, UNITS_PER_WORD);
2019 gcc_unreachable ();
2022 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2024 static bool
2025 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2027 if (mode == V8DImode)
2028 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
2029 && multiple_p (regno - R0_REGNUM, 2);
2031 if (GET_MODE_CLASS (mode) == MODE_CC)
2032 return regno == CC_REGNUM;
2034 if (regno == VG_REGNUM)
2035 /* This must have the same size as _Unwind_Word. */
2036 return mode == DImode;
2038 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2039 if (vec_flags == VEC_SVE_PRED)
2040 return pr_or_ffr_regnum_p (regno);
2042 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
2043 return PR_REGNUM_P (regno);
2045 if (pr_or_ffr_regnum_p (regno))
2046 return false;
2048 /* These registers are abstract; their modes don't matter. */
2049 if (FAKE_REGNUM_P (regno))
2050 return true;
2052 if (regno == SP_REGNUM)
2053 /* The purpose of comparing with ptr_mode is to support the
2054 global register variable associated with the stack pointer
2055 register via the syntax of asm ("wsp") in ILP32. */
2056 return mode == Pmode || mode == ptr_mode;
2058 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2059 return mode == Pmode;
2061 if (GP_REGNUM_P (regno))
2063 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
2064 return false;
2065 if (known_le (GET_MODE_SIZE (mode), 8))
2066 return true;
2067 if (known_le (GET_MODE_SIZE (mode), 16))
2068 return (regno & 1) == 0;
2070 else if (FP_REGNUM_P (regno))
2072 if (vec_flags & VEC_STRUCT)
2073 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2074 else
2075 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2078 return false;
2081 /* Return true if a function with type FNTYPE returns its value in
2082 SVE vector or predicate registers. */
2084 static bool
2085 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2087 tree return_type = TREE_TYPE (fntype);
2089 pure_scalable_type_info pst_info;
2090 switch (pst_info.analyze (return_type))
2092 case pure_scalable_type_info::IS_PST:
2093 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2094 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2096 case pure_scalable_type_info::DOESNT_MATTER:
2097 gcc_assert (aarch64_return_in_memory_1 (return_type));
2098 return false;
2100 case pure_scalable_type_info::NO_ABI_IDENTITY:
2101 case pure_scalable_type_info::ISNT_PST:
2102 return false;
2104 gcc_unreachable ();
2107 /* Return true if a function with type FNTYPE takes arguments in
2108 SVE vector or predicate registers. */
2110 static bool
2111 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2113 CUMULATIVE_ARGS args_so_far_v;
2114 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2115 NULL_TREE, 0, true);
2116 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2118 for (tree chain = TYPE_ARG_TYPES (fntype);
2119 chain && chain != void_list_node;
2120 chain = TREE_CHAIN (chain))
2122 tree arg_type = TREE_VALUE (chain);
2123 if (arg_type == error_mark_node)
2124 return false;
2126 function_arg_info arg (arg_type, /*named=*/true);
2127 apply_pass_by_reference_rules (&args_so_far_v, arg);
2128 pure_scalable_type_info pst_info;
2129 if (pst_info.analyze_registers (arg.type))
2131 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2132 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2133 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2134 return true;
2137 targetm.calls.function_arg_advance (args_so_far, arg);
2139 return false;
2142 /* Implement TARGET_FNTYPE_ABI. */
2144 static const predefined_function_abi &
2145 aarch64_fntype_abi (const_tree fntype)
2147 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2148 return aarch64_simd_abi ();
2150 if (aarch64_returns_value_in_sve_regs_p (fntype)
2151 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2152 return aarch64_sve_abi ();
2154 return default_function_abi;
2157 /* Return the state of PSTATE.SM on entry to functions of type FNTYPE. */
2159 static aarch64_feature_flags
2160 aarch64_fntype_pstate_sm (const_tree fntype)
2162 if (lookup_attribute ("arm", "streaming", TYPE_ATTRIBUTES (fntype)))
2163 return AARCH64_FL_SM_ON;
2165 if (lookup_attribute ("arm", "streaming_compatible",
2166 TYPE_ATTRIBUTES (fntype)))
2167 return 0;
2169 return AARCH64_FL_SM_OFF;
2172 /* Return state flags that describe whether and how functions of type
2173 FNTYPE share state STATE_NAME with their callers. */
2175 static unsigned int
2176 aarch64_fntype_shared_flags (const_tree fntype, const char *state_name)
2178 return aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (fntype),
2179 state_name);
2182 /* Return the state of PSTATE.ZA on entry to functions of type FNTYPE. */
2184 static aarch64_feature_flags
2185 aarch64_fntype_pstate_za (const_tree fntype)
2187 if (aarch64_fntype_shared_flags (fntype, "za")
2188 || aarch64_fntype_shared_flags (fntype, "zt0"))
2189 return AARCH64_FL_ZA_ON;
2191 return 0;
2194 /* Return the ISA mode on entry to functions of type FNTYPE. */
2196 static aarch64_feature_flags
2197 aarch64_fntype_isa_mode (const_tree fntype)
2199 return (aarch64_fntype_pstate_sm (fntype)
2200 | aarch64_fntype_pstate_za (fntype));
2203 /* Return true if FNDECL uses streaming mode internally, as an
2204 implementation choice. */
2206 static bool
2207 aarch64_fndecl_is_locally_streaming (const_tree fndecl)
2209 return lookup_attribute ("arm", "locally_streaming",
2210 DECL_ATTRIBUTES (fndecl));
2213 /* Return the state of PSTATE.SM when compiling the body of
2214 function FNDECL. This might be different from the state of
2215 PSTATE.SM on entry. */
2217 static aarch64_feature_flags
2218 aarch64_fndecl_pstate_sm (const_tree fndecl)
2220 if (aarch64_fndecl_is_locally_streaming (fndecl))
2221 return AARCH64_FL_SM_ON;
2223 return aarch64_fntype_pstate_sm (TREE_TYPE (fndecl));
2226 /* Return true if function FNDECL has state STATE_NAME, either by creating
2227 new state itself or by sharing state with callers. */
2229 static bool
2230 aarch64_fndecl_has_state (tree fndecl, const char *state_name)
2232 return (aarch64_fndecl_has_new_state (fndecl, state_name)
2233 || aarch64_fntype_shared_flags (TREE_TYPE (fndecl),
2234 state_name) != 0);
2237 /* Return the state of PSTATE.ZA when compiling the body of function FNDECL.
2238 This might be different from the state of PSTATE.ZA on entry. */
2240 static aarch64_feature_flags
2241 aarch64_fndecl_pstate_za (const_tree fndecl)
2243 if (aarch64_fndecl_has_new_state (fndecl, "za")
2244 || aarch64_fndecl_has_new_state (fndecl, "zt0"))
2245 return AARCH64_FL_ZA_ON;
2247 return aarch64_fntype_pstate_za (TREE_TYPE (fndecl));
2250 /* Return the ISA mode that should be used to compile the body of
2251 function FNDECL. */
2253 static aarch64_feature_flags
2254 aarch64_fndecl_isa_mode (const_tree fndecl)
2256 return (aarch64_fndecl_pstate_sm (fndecl)
2257 | aarch64_fndecl_pstate_za (fndecl));
2260 /* Return the state of PSTATE.SM on entry to the current function.
2261 This might be different from the state of PSTATE.SM in the function
2262 body. */
2264 static aarch64_feature_flags
2265 aarch64_cfun_incoming_pstate_sm ()
2267 return aarch64_fntype_pstate_sm (TREE_TYPE (cfun->decl));
2270 /* Return the state of PSTATE.ZA on entry to the current function.
2271 This might be different from the state of PSTATE.ZA in the function
2272 body. */
2274 static aarch64_feature_flags
2275 aarch64_cfun_incoming_pstate_za ()
2277 return aarch64_fntype_pstate_za (TREE_TYPE (cfun->decl));
2280 /* Return state flags that describe whether and how the current function shares
2281 state STATE_NAME with callers. */
2283 static unsigned int
2284 aarch64_cfun_shared_flags (const char *state_name)
2286 return aarch64_fntype_shared_flags (TREE_TYPE (cfun->decl), state_name);
2289 /* Return true if the current function creates new state of type STATE_NAME
2290 (as opposed to sharing the state with its callers or ignoring the state
2291 altogether). */
2293 static bool
2294 aarch64_cfun_has_new_state (const char *state_name)
2296 return aarch64_fndecl_has_new_state (cfun->decl, state_name);
2299 /* Return true if PSTATE.SM is 1 in the body of the current function,
2300 but is not guaranteed to be 1 on entry. */
2302 static bool
2303 aarch64_cfun_enables_pstate_sm ()
2305 return (aarch64_fndecl_is_locally_streaming (cfun->decl)
2306 && aarch64_cfun_incoming_pstate_sm () != AARCH64_FL_SM_ON);
2309 /* Return true if the current function has state STATE_NAME, either by
2310 creating new state itself or by sharing state with callers. */
2312 static bool
2313 aarch64_cfun_has_state (const char *state_name)
2315 return aarch64_fndecl_has_state (cfun->decl, state_name);
2318 /* Return true if a call from the current function to a function with
2319 ISA mode CALLEE_MODE would involve a change to PSTATE.SM around
2320 the BL instruction. */
2322 static bool
2323 aarch64_call_switches_pstate_sm (aarch64_feature_flags callee_mode)
2325 return (callee_mode & ~AARCH64_ISA_MODE & AARCH64_FL_SM_STATE) != 0;
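/* For example, if the current function's body runs with PSTATE.SM clear
   (AARCH64_ISA_MODE contains AARCH64_FL_SM_OFF) and CALLEE_MODE requires
   AARCH64_FL_SM_ON, the SM bits differ and the call needs PSTATE.SM to be
   switched around the BL (typically with an SMSTART SM/SMSTOP SM pair).
   A streaming-compatible callee sets neither SM bit in CALLEE_MODE and so
   never forces a switch.  */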
2328 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2330 static bool
2331 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2333 return (aarch64_sve::builtin_type_p (type1)
2334 == aarch64_sve::builtin_type_p (type2));
2337 /* Return true if we should emit CFI for register REGNO. */
2339 static bool
2340 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2342 return (GP_REGNUM_P (regno)
2343 || !default_function_abi.clobbers_full_reg_p (regno));
2346 /* Return the mode we should use to save and restore register REGNO. */
2348 static machine_mode
2349 aarch64_reg_save_mode (unsigned int regno)
2351 if (GP_REGNUM_P (regno) || regno == VG_REGNUM)
2352 return DImode;
2354 if (FP_REGNUM_P (regno))
2355 switch (crtl->abi->id ())
2357 case ARM_PCS_AAPCS64:
2358 /* Only the low 64 bits are saved by the base PCS. */
2359 return DFmode;
2361 case ARM_PCS_SIMD:
2362 /* The vector PCS saves the low 128 bits (which is the full
2363 register on non-SVE targets). */
2364 return TFmode;
2366 case ARM_PCS_SVE:
2367 /* Use vectors of DImode for registers that need frame
2368 information, so that the first 64 bytes of the save slot
2369 are always the equivalent of what storing D<n> would give. */
2370 if (aarch64_emit_cfi_for_reg_p (regno))
2371 return VNx2DImode;
2373 /* Use vectors of bytes otherwise, so that the layout is
2374 endian-agnostic, and so that we can use LDR and STR for
2375 big-endian targets. */
2376 return VNx16QImode;
2378 case ARM_PCS_TLSDESC:
2379 case ARM_PCS_UNKNOWN:
2380 break;
2383 if (PR_REGNUM_P (regno))
2384 /* Save the full predicate register. */
2385 return VNx16BImode;
2387 gcc_unreachable ();
2390 /* Given the ISA mode on entry to a callee and the ABI of the callee,
2391 return the CONST_INT that should be placed in an UNSPEC_CALLEE_ABI rtx. */
2394 aarch64_gen_callee_cookie (aarch64_feature_flags isa_mode, arm_pcs pcs_variant)
2396 return gen_int_mode ((unsigned int) isa_mode
2397 | (unsigned int) pcs_variant << AARCH64_NUM_ISA_MODES,
2398 DImode);
2401 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2402 callee's ABI. */
2404 static const predefined_function_abi &
2405 aarch64_callee_abi (rtx cookie)
2407 return function_abis[UINTVAL (cookie) >> AARCH64_NUM_ISA_MODES];
2410 /* COOKIE is a CONST_INT from an UNSPEC_CALLEE_ABI rtx. Return the
2411 required ISA mode on entry to the callee, which is also the ISA
2412 mode on return from the callee. */
2414 static aarch64_feature_flags
2415 aarch64_callee_isa_mode (rtx cookie)
2417 return UINTVAL (cookie) & AARCH64_FL_ISA_MODES;
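/* A sketch of the cookie layout implied by the three functions above:
   the low AARCH64_NUM_ISA_MODES bits hold the ISA mode on entry to the
   callee and the remaining bits hold the arm_pcs value, i.e.

     cookie = isa_mode | (pcs_variant << AARCH64_NUM_ISA_MODES)

   which the decoders undo with "& AARCH64_FL_ISA_MODES" and
   ">> AARCH64_NUM_ISA_MODES" respectively.  */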
2420 /* INSN is a call instruction. Return the CONST_INT stored in its
2421 UNSPEC_CALLEE_ABI rtx. */
2423 static rtx
2424 aarch64_insn_callee_cookie (const rtx_insn *insn)
2426 rtx pat = PATTERN (insn);
2427 gcc_assert (GET_CODE (pat) == PARALLEL);
2428 rtx unspec = XVECEXP (pat, 0, 1);
2429 gcc_assert (GET_CODE (unspec) == UNSPEC
2430 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2431 return XVECEXP (unspec, 0, 0);
2434 /* Implement TARGET_INSN_CALLEE_ABI. */
2436 const predefined_function_abi &
2437 aarch64_insn_callee_abi (const rtx_insn *insn)
2439 return aarch64_callee_abi (aarch64_insn_callee_cookie (insn));
2442 /* INSN is a call instruction. Return the required ISA mode on entry to
2443 the callee, which is also the ISA mode on return from the callee. */
2445 static aarch64_feature_flags
2446 aarch64_insn_callee_isa_mode (const rtx_insn *insn)
2448 return aarch64_callee_isa_mode (aarch64_insn_callee_cookie (insn));
2451 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2452 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2453 clobbers the top 64 bits when restoring the bottom 64 bits. */
2455 static bool
2456 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2457 unsigned int regno,
2458 machine_mode mode)
2460 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2462 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2463 unsigned int nregs = hard_regno_nregs (regno, mode);
2464 if (nregs > 1)
2465 per_register_size = exact_div (per_register_size, nregs);
2466 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2467 return maybe_gt (per_register_size, 16);
2468 return maybe_gt (per_register_size, 8);
2470 return false;
2473 /* Implement REGMODE_NATURAL_SIZE. */
2474 poly_uint64
2475 aarch64_regmode_natural_size (machine_mode mode)
2477 /* The natural size for SVE data modes is one SVE data vector,
2478 and similarly for predicates. We can't independently modify
2479 anything smaller than that. */
2480 /* ??? For now, only do this for variable-width SVE registers.
2481 Doing it for constant-sized registers breaks lower-subreg.cc. */
2482 /* ??? And once that's fixed, we should probably have similar
2483 code for Advanced SIMD. */
2484 if (!aarch64_sve_vg.is_constant ())
2486 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2487 if (vec_flags & VEC_SVE_PRED)
2488 return BYTES_PER_SVE_PRED;
2489 if (vec_flags & VEC_SVE_DATA)
2490 return BYTES_PER_SVE_VECTOR;
2492 return UNITS_PER_WORD;
2495 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2496 machine_mode
2497 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2498 machine_mode mode)
2500 /* The predicate mode determines which bits are significant and
2501 which are "don't care". Decreasing the number of lanes would
2502 lose data while increasing the number of lanes would make bits
2503 unnecessarily significant. */
2504 if (PR_REGNUM_P (regno))
2505 return mode;
2506 if (known_ge (GET_MODE_SIZE (mode), 4))
2507 return mode;
2508 else
2509 return SImode;
2512 /* Return true if I's bits are consecutive ones from the MSB. */
2513 bool
2514 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2516 return exact_log2 (-i) != HOST_WIDE_INT_M1;
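/* Two worked examples: for I == 0xffff000000000000, -I is
   0x0001000000000000 == 1 << 48, so exact_log2 returns 48 and the result
   is true (the top 16 bits form a block of ones).  For
   I == 0xff00ff0000000000, -I is 0x00ff010000000000, which is not a power
   of two, so the result is false.  I == 0 is also rejected, since
   exact_log2 (0) is HOST_WIDE_INT_M1.  */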
2519 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2520 that strcpy from constants will be faster. */
2522 static HOST_WIDE_INT
2523 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2525 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2526 return MAX (align, BITS_PER_WORD);
2527 return align;
2530 /* Return true if calls to DECL should be treated as
2531 long-calls (i.e. called via a register). */
2532 static bool
2533 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2535 return false;
2538 /* Return true if calls to symbol-ref SYM should be treated as
2539 long-calls (i.e. called via a register). */
2540 bool
2541 aarch64_is_long_call_p (rtx sym)
2543 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2546 /* Return true if calls to symbol-ref SYM should not go through
2547 plt stubs. */
2549 bool
2550 aarch64_is_noplt_call_p (rtx sym)
2552 const_tree decl = SYMBOL_REF_DECL (sym);
2554 if (flag_pic
2555 && decl
2556 && (!flag_plt
2557 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2558 && !targetm.binds_local_p (decl))
2559 return true;
2561 return false;
2564 /* Emit an insn that's a simple single-set. Both the operands must be
2565 known to be valid. */
2566 inline static rtx_insn *
2567 emit_set_insn (rtx x, rtx y)
2569 return emit_insn (gen_rtx_SET (x, y));
2572 /* X and Y are two things to compare using CODE. Emit the compare insn and
2573 return the rtx for the CC register in the proper mode. */
2575 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2577 machine_mode cmp_mode = GET_MODE (x);
2578 machine_mode cc_mode;
2579 rtx cc_reg;
2581 if (cmp_mode == TImode)
2583 gcc_assert (code == NE);
2585 cc_mode = CCmode;
2586 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2588 rtx x_lo = operand_subword (x, 0, 0, TImode);
2589 rtx y_lo = operand_subword (y, 0, 0, TImode);
2590 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2592 rtx x_hi = operand_subword (x, 1, 0, TImode);
2593 rtx y_hi = operand_subword (y, 1, 0, TImode);
2594 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2595 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2596 GEN_INT (AARCH64_EQ)));
2598 else
2600 cc_mode = SELECT_CC_MODE (code, x, y);
2601 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2602 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2604 return cc_reg;
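/* A sketch of the TImode sequence emitted above, assuming X lives in
   x0/x1 and Y in x2/x3 (low word first on little-endian):

     cmp  x0, x2          // compare the low halves
     ccmp x1, x3, 0, eq   // if equal, compare the high halves;
                          // otherwise leave the flags indicating "ne"

   so the returned CC register is suitable for the NE test that the
   caller asserted.  */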
2607 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2609 static rtx
2610 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2611 machine_mode y_mode)
2613 if (y_mode == E_QImode || y_mode == E_HImode)
2615 if (CONST_INT_P (y))
2617 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2618 y_mode = SImode;
2620 else
2622 rtx t, cc_reg;
2623 machine_mode cc_mode;
2625 t = gen_rtx_ZERO_EXTEND (SImode, y);
2626 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2627 cc_mode = CC_SWPmode;
2628 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2629 emit_set_insn (cc_reg, t);
2630 return cc_reg;
2634 if (!aarch64_plus_operand (y, y_mode))
2635 y = force_reg (y_mode, y);
2637 return aarch64_gen_compare_reg (code, x, y);
2640 /* Consider the operation:
2642 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
2644 where:
2646 - CODE is [SU]MAX or [SU]MIN
2647 - OPERANDS[2] and OPERANDS[3] are constant integers
2648 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
2649 - all operands have mode MODE
2651 Decide whether it is possible to implement the operation using:
2653 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
2655 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
2657 followed by:
2659 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
2661 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
2662 If GENERATE_P is true, also update OPERANDS as follows:
2664 OPERANDS[4] = -OPERANDS[3]
2665 OPERANDS[5] = the rtl condition representing <cond>
2666 OPERANDS[6] = <tmp>
2667 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
2668 bool
2669 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
2671 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
2672 rtx dst = operands[0];
2673 rtx maxmin_op = operands[2];
2674 rtx add_op = operands[3];
2675 machine_mode mode = GET_MODE (dst);
2677 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
2678 == (x >= y ? x : y) - z
2679 == (x > y ? x : y) - z
2680 == (x > y - 1 ? x : y) - z
2682 min (x, y) - z == (x <= y - 1 ? x : y) - z
2683 == (x <= y ? x : y) - z
2684 == (x < y ? x : y) - z
2685 == (x < y + 1 ? x : y) - z
2687 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
2688 which x is compared with z. Set DIFF to y - z. Thus the supported
2689 combinations are as follows, with DIFF being the value after the ":":
2691 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
2692 == x >= y ? x - y : 0 [z == y]
2693 == x > y ? x - y : 0 [z == y]
2694 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
2696 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
2697 == x <= y ? x - y : 0 [z == y]
2698 == x < y ? x - y : 0 [z == y]
2699 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
2700 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
2701 auto add_val = rtx_mode_t (add_op, mode);
2702 auto sub_val = wi::neg (add_val);
2703 auto diff = wi::sub (maxmin_val, sub_val);
2704 if (!(diff == 0
2705 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
2706 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
2707 return false;
2709 if (!generate_p)
2710 return true;
2712 rtx_code cmp;
2713 switch (code)
2715 case SMAX:
2716 cmp = diff == 1 ? GT : GE;
2717 break;
2718 case UMAX:
2719 cmp = diff == 1 ? GTU : GEU;
2720 break;
2721 case SMIN:
2722 cmp = diff == -1 ? LT : LE;
2723 break;
2724 case UMIN:
2725 cmp = diff == -1 ? LTU : LEU;
2726 break;
2727 default:
2728 gcc_unreachable ();
2730 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
2732 operands[4] = immed_wide_int_const (sub_val, mode);
2733 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
2734 if (can_create_pseudo_p ())
2735 operands[6] = gen_reg_rtx (mode);
2736 else
2737 operands[6] = dst;
2738 operands[7] = immed_wide_int_const (diff, mode);
2740 return true;
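/* A worked example of the DIFF == 0 case: for smax (x, 3) + (-3) in
   SImode, SUB_VAL is 3 and DIFF is 0, so CMP is GE and OPERANDS[7] is 0
   (the CSEL form), giving roughly

     subs w<tmp>, w<x>, #3
     csel w<dst>, w<tmp>, wzr, ge

   which produces x - 3 when x >= 3 and 0 otherwise, i.e. smax (x, 3) - 3.  */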
2744 /* Build the SYMBOL_REF for __tls_get_addr. */
2746 static GTY(()) rtx tls_get_addr_libfunc;
2749 aarch64_tls_get_addr (void)
2751 if (!tls_get_addr_libfunc)
2752 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2753 return tls_get_addr_libfunc;
2756 /* Return the TLS model to use for ADDR. */
2758 static enum tls_model
2759 tls_symbolic_operand_type (rtx addr)
2761 enum tls_model tls_kind = TLS_MODEL_NONE;
2762 poly_int64 offset;
2763 addr = strip_offset_and_salt (addr, &offset);
2764 if (SYMBOL_REF_P (addr))
2765 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2767 return tls_kind;
2770 /* We'll allow lo_sums in addresses in our legitimate addresses,
2771 so that combine can take care of combining addresses where
2772 necessary, but for generation purposes we'll generate the address
2773 as:
2774 RTL Absolute
2775 tmp = hi (symbol_ref); adrp x1, foo
2776 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2779 PIC TLS
2780 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2781 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2782 bl __tls_get_addr
2785 Load TLS symbol, depending on TLS mechanism and TLS access model.
2787 Global Dynamic - Traditional TLS:
2788 adrp tmp, :tlsgd:imm
2789 add dest, tmp, #:tlsgd_lo12:imm
2790 bl __tls_get_addr
2792 Global Dynamic - TLS Descriptors:
2793 adrp dest, :tlsdesc:imm
2794 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2795 add dest, dest, #:tlsdesc_lo12:imm
2796 blr tmp
2797 mrs tp, tpidr_el0
2798 add dest, dest, tp
2800 Initial Exec:
2801 mrs tp, tpidr_el0
2802 adrp tmp, :gottprel:imm
2803 ldr dest, [tmp, #:gottprel_lo12:imm]
2804 add dest, dest, tp
2806 Local Exec:
2807 mrs tp, tpidr_el0
2808 add t0, tp, #:tprel_hi12:imm, lsl #12
2809 add t0, t0, #:tprel_lo12_nc:imm
2812 static void
2813 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2814 enum aarch64_symbol_type type)
2816 switch (type)
2818 case SYMBOL_SMALL_ABSOLUTE:
2820 /* In ILP32, the mode of dest can be either SImode or DImode. */
2821 rtx tmp_reg = dest;
2822 machine_mode mode = GET_MODE (dest);
2824 gcc_assert (mode == Pmode || mode == ptr_mode);
2826 if (can_create_pseudo_p ())
2827 tmp_reg = gen_reg_rtx (mode);
2829 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
2830 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2831 return;
2834 case SYMBOL_TINY_ABSOLUTE:
2835 emit_insn (gen_rtx_SET (dest, imm));
2836 return;
2838 case SYMBOL_SMALL_GOT_28K:
2840 machine_mode mode = GET_MODE (dest);
2841 rtx gp_rtx = pic_offset_table_rtx;
2842 rtx insn;
2843 rtx mem;
2845 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2846 here before RTL expansion. Tree IVOPTS will generate RTL patterns to
2847 decide rtx costs, in which case pic_offset_table_rtx is not
2848 initialized. In that case there is no need to generate the first adrp
2849 instruction, as the final cost for global variable access is
2850 one instruction. */
2851 if (gp_rtx != NULL)
2853 /* -fpic for -mcmodel=small allows a 32K GOT table size (but because we
2854 use the page base as the GOT base, the first page may be wasted; in
2855 the worst case there is only 28K of space for the GOT).
2857 The generated instruction sequence for accessing a global variable is:
2860 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
2862 Only one instruction is needed. But we must initialize
2863 pic_offset_table_rtx properly. We generate an initialization insn for
2864 every global access, and allow CSE to remove all the redundant ones.
2866 The final instruction sequence will look like the following
2867 for multiple global variable accesses.
2869 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
2871 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
2872 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
2873 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
2874 ... */
2876 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
2877 crtl->uses_pic_offset_table = 1;
2878 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
2880 if (mode != GET_MODE (gp_rtx))
2881 gp_rtx = gen_lowpart (mode, gp_rtx);
2885 if (mode == ptr_mode)
2887 if (mode == DImode)
2888 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
2889 else
2890 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
2892 mem = XVECEXP (SET_SRC (insn), 0, 0);
2894 else
2896 gcc_assert (mode == Pmode);
2898 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
2899 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
2902 /* The operand is expected to be a MEM. Whenever the related insn
2903 pattern changes, the code above which calculates MEM should be
2904 updated. */
2905 gcc_assert (MEM_P (mem));
2906 MEM_READONLY_P (mem) = 1;
2907 MEM_NOTRAP_P (mem) = 1;
2908 emit_insn (insn);
2909 return;
2912 case SYMBOL_SMALL_GOT_4G:
2913 emit_insn (gen_rtx_SET (dest, imm));
2914 return;
2916 case SYMBOL_SMALL_TLSGD:
2918 rtx_insn *insns;
2919 /* The return type of __tls_get_addr is the C pointer type
2920 so use ptr_mode. */
2921 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
2922 rtx tmp_reg = dest;
2924 if (GET_MODE (dest) != ptr_mode)
2925 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
2927 start_sequence ();
2928 if (ptr_mode == SImode)
2929 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
2930 else
2931 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
2932 insns = get_insns ();
2933 end_sequence ();
2935 RTL_CONST_CALL_P (insns) = 1;
2936 emit_libcall_block (insns, tmp_reg, result, imm);
2937 /* Convert back to the mode of the dest adding a zero_extend
2938 from SImode (ptr_mode) to DImode (Pmode). */
2939 if (dest != tmp_reg)
2940 convert_move (dest, tmp_reg, true);
2941 return;
2944 case SYMBOL_SMALL_TLSDESC:
2946 machine_mode mode = GET_MODE (dest);
2947 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
2948 rtx tp;
2950 gcc_assert (mode == Pmode || mode == ptr_mode);
2952 /* In ILP32, the got entry is always of SImode size. Unlike
2953 small GOT, the dest is fixed at reg 0. */
2954 if (TARGET_ILP32)
2955 emit_insn (gen_tlsdesc_small_si (imm));
2956 else
2957 emit_insn (gen_tlsdesc_small_di (imm));
2958 tp = aarch64_load_tp (NULL);
2960 if (mode != Pmode)
2961 tp = gen_lowpart (mode, tp);
2963 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
2964 if (REG_P (dest))
2965 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
2966 return;
2969 case SYMBOL_SMALL_TLSIE:
2971 /* In ILP32, the mode of dest can be either SImode or DImode,
2972 while the got entry is always of SImode size. The mode of
2973 dest depends on how dest is used: if dest is assigned to a
2974 pointer (e.g. in the memory), it has SImode; it may have
2975 DImode if dest is dereferenced to access the memory.
2976 This is why we have to handle three different tlsie_small
2977 patterns here (two patterns for ILP32). */
2978 machine_mode mode = GET_MODE (dest);
2979 rtx tmp_reg = gen_reg_rtx (mode);
2980 rtx tp = aarch64_load_tp (NULL);
2982 if (mode == ptr_mode)
2984 if (mode == DImode)
2985 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
2986 else
2988 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
2989 tp = gen_lowpart (mode, tp);
2992 else
2994 gcc_assert (mode == Pmode);
2995 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
2998 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
2999 if (REG_P (dest))
3000 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3001 return;
3004 case SYMBOL_TLSLE12:
3005 case SYMBOL_TLSLE24:
3006 case SYMBOL_TLSLE32:
3007 case SYMBOL_TLSLE48:
3009 machine_mode mode = GET_MODE (dest);
3010 rtx tp = aarch64_load_tp (NULL);
3012 if (mode != Pmode)
3013 tp = gen_lowpart (mode, tp);
3015 switch (type)
3017 case SYMBOL_TLSLE12:
3018 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3019 (dest, tp, imm));
3020 break;
3021 case SYMBOL_TLSLE24:
3022 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3023 (dest, tp, imm));
3024 break;
3025 case SYMBOL_TLSLE32:
3026 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3027 (dest, imm));
3028 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3029 (dest, dest, tp));
3030 break;
3031 case SYMBOL_TLSLE48:
3032 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3033 (dest, imm));
3034 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3035 (dest, dest, tp));
3036 break;
3037 default:
3038 gcc_unreachable ();
3041 if (REG_P (dest))
3042 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3043 return;
3046 case SYMBOL_TINY_GOT:
3048 rtx insn;
3049 machine_mode mode = GET_MODE (dest);
3051 if (mode == ptr_mode)
3052 insn = gen_ldr_got_tiny (mode, dest, imm);
3053 else
3055 gcc_assert (mode == Pmode);
3056 insn = gen_ldr_got_tiny_sidi (dest, imm);
3059 emit_insn (insn);
3060 return;
3063 case SYMBOL_TINY_TLSIE:
3065 machine_mode mode = GET_MODE (dest);
3066 rtx tp = aarch64_load_tp (NULL);
3068 if (mode == ptr_mode)
3070 if (mode == DImode)
3071 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3072 else
3074 tp = gen_lowpart (mode, tp);
3075 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3078 else
3080 gcc_assert (mode == Pmode);
3081 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3084 if (REG_P (dest))
3085 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3086 return;
3089 default:
3090 gcc_unreachable ();
3094 /* Emit a move from SRC to DEST. Assume that the move expanders can
3095 handle all moves if !can_create_pseudo_p (). The distinction is
3096 important because, unlike emit_move_insn, the move expanders know
3097 how to force Pmode objects into the constant pool even when the
3098 constant pool address is not itself legitimate. */
3099 static rtx
3100 aarch64_emit_move (rtx dest, rtx src)
3102 return (can_create_pseudo_p ()
3103 ? emit_move_insn (dest, src)
3104 : emit_move_insn_1 (dest, src));
3107 /* Apply UNOPTAB to OP and store the result in DEST. */
3109 static void
3110 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3112 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3113 if (dest != tmp)
3114 emit_move_insn (dest, tmp);
3117 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3119 static void
3120 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3122 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3123 OPTAB_DIRECT);
3124 if (dest != tmp)
3125 emit_move_insn (dest, tmp);
3128 /* Split a move from SRC to DST into two moves of mode SINGLE_MODE. */
3130 void
3131 aarch64_split_double_move (rtx dst, rtx src, machine_mode single_mode)
3133 machine_mode mode = GET_MODE (dst);
3135 rtx dst0 = simplify_gen_subreg (single_mode, dst, mode, 0);
3136 rtx dst1 = simplify_gen_subreg (single_mode, dst, mode,
3137 GET_MODE_SIZE (single_mode));
3138 rtx src0 = simplify_gen_subreg (single_mode, src, mode, 0);
3139 rtx src1 = simplify_gen_subreg (single_mode, src, mode,
3140 GET_MODE_SIZE (single_mode));
3142 /* At most one pairing may overlap. */
3143 if (reg_overlap_mentioned_p (dst0, src1))
3145 aarch64_emit_move (dst1, src1);
3146 aarch64_emit_move (dst0, src0);
3148 else
3150 aarch64_emit_move (dst0, src0);
3151 aarch64_emit_move (dst1, src1);
3155 /* Split a 128-bit move operation into two 64-bit move operations,
3156 taking care to handle partial overlap of register to register
3157 copies. Special cases are needed when moving between GP regs and
3158 FP regs. SRC can be a register, constant or memory; DST a register
3159 or memory. If either operand is memory it must not have any side
3160 effects. */
3161 void
3162 aarch64_split_128bit_move (rtx dst, rtx src)
3164 machine_mode mode = GET_MODE (dst);
3166 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
3167 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3168 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3170 if (REG_P (dst) && REG_P (src))
3172 int src_regno = REGNO (src);
3173 int dst_regno = REGNO (dst);
3175 /* Handle FP <-> GP regs. */
3176 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3178 rtx src_lo = gen_lowpart (word_mode, src);
3179 rtx src_hi = gen_highpart (word_mode, src);
3181 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3182 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3183 return;
3185 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3187 rtx dst_lo = gen_lowpart (word_mode, dst);
3188 rtx dst_hi = gen_highpart (word_mode, dst);
3190 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3191 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3192 return;
3196 aarch64_split_double_move (dst, src, word_mode);
3199 /* Return true if we should split a move from 128-bit value SRC
3200 to 128-bit register DEST. */
3202 bool
3203 aarch64_split_128bit_move_p (rtx dst, rtx src)
3205 if (FP_REGNUM_P (REGNO (dst)))
3206 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3207 /* All moves to GPRs need to be split. */
3208 return true;
3211 /* Split a complex SIMD move. */
3213 void
3214 aarch64_split_simd_move (rtx dst, rtx src)
3216 machine_mode src_mode = GET_MODE (src);
3217 machine_mode dst_mode = GET_MODE (dst);
3219 gcc_assert (VECTOR_MODE_P (dst_mode));
3221 if (REG_P (dst) && REG_P (src))
3223 gcc_assert (VECTOR_MODE_P (src_mode));
3224 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3228 /* Return a register that contains SVE value X reinterpreted as SVE mode MODE.
3229 The semantics are those of svreinterpret rather than those of subregs;
3230 see the comment at the head of aarch64-sve.md for details about the
3231 difference. */
3234 aarch64_sve_reinterpret (machine_mode mode, rtx x)
3236 if (GET_MODE (x) == mode)
3237 return x;
3239 /* can_change_mode_class must only return true if subregs and svreinterprets
3240 have the same semantics. */
3241 if (targetm.can_change_mode_class (GET_MODE (x), mode, FP_REGS))
3242 return lowpart_subreg (mode, x, GET_MODE (x));
3244 rtx res = gen_reg_rtx (mode);
3245 x = force_reg (GET_MODE (x), x);
3246 emit_insn (gen_aarch64_sve_reinterpret (mode, res, x));
3247 return res;
3250 bool
3251 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3252 machine_mode ymode, rtx y)
3254 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3255 gcc_assert (r != NULL);
3256 return rtx_equal_p (x, r);
3259 /* Return TARGET if it is nonnull and a register of mode MODE.
3260 Otherwise, return a fresh register of mode MODE if we can,
3261 or TARGET reinterpreted as MODE if we can't. */
3263 static rtx
3264 aarch64_target_reg (rtx target, machine_mode mode)
3266 if (target && REG_P (target) && GET_MODE (target) == mode)
3267 return target;
3268 if (!can_create_pseudo_p ())
3270 gcc_assert (target);
3271 return gen_lowpart (mode, target);
3273 return gen_reg_rtx (mode);
3276 /* Return a register that contains the constant in BUILDER, given that
3277 the constant is a legitimate move operand. Use TARGET as the register
3278 if it is nonnull and convenient. */
3280 static rtx
3281 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3283 rtx src = builder.build ();
3284 target = aarch64_target_reg (target, GET_MODE (src));
3285 emit_insn (gen_rtx_SET (target, src));
3286 return target;
3289 static rtx
3290 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3292 if (can_create_pseudo_p ())
3293 return force_reg (mode, value);
3294 else
3296 gcc_assert (x);
3297 aarch64_emit_move (x, value);
3298 return x;
3302 /* Return true if predicate value X is a constant in which every element
3303 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3304 value, i.e. as a predicate in which all bits are significant. */
3306 static bool
3307 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3309 if (!CONST_VECTOR_P (x))
3310 return false;
3312 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3313 GET_MODE_NUNITS (GET_MODE (x)));
3314 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3315 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3316 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3318 unsigned int nelts = const_vector_encoded_nelts (x);
3319 for (unsigned int i = 0; i < nelts; ++i)
3321 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3322 if (!CONST_INT_P (elt))
3323 return false;
3325 builder.quick_push (elt);
3326 for (unsigned int j = 1; j < factor; ++j)
3327 builder.quick_push (const0_rtx);
3329 builder.finalize ();
3330 return true;
3333 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3334 widest predicate element size it can have (that is, the largest size
3335 for which each element would still be 0 or 1). */
3337 unsigned int
3338 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3340 /* Start with the most optimistic assumption: that we only need
3341 one bit per pattern. This is what we will use if only the first
3342 bit in each pattern is ever set. */
3343 unsigned int mask = GET_MODE_SIZE (DImode);
3344 mask |= builder.npatterns ();
3346 /* Look for set bits. */
3347 unsigned int nelts = builder.encoded_nelts ();
3348 for (unsigned int i = 1; i < nelts; ++i)
3349 if (INTVAL (builder.elt (i)) != 0)
3351 if (i & 1)
3352 return 1;
3353 mask |= i;
3355 return mask & -mask;
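/* To see why the final step works: MASK is the OR of the 8-byte upper
   bound, the number of patterns and every index that carries a set bit,
   so "mask & -mask" is the largest power of two dividing all of them.
   For example, with 8 patterns and encoded set bits only at indices 0
   and 8, MASK is 8 and the widest usable element size is 8 bytes; an
   additional set bit at index 4 would reduce the answer to 4.  */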
3358 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3359 return that predicate mode, otherwise return opt_machine_mode (). */
3361 opt_machine_mode
3362 aarch64_ptrue_all_mode (rtx x)
3364 gcc_assert (GET_MODE (x) == VNx16BImode);
3365 if (!CONST_VECTOR_P (x)
3366 || !CONST_VECTOR_DUPLICATE_P (x)
3367 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3368 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3369 return opt_machine_mode ();
3371 unsigned int nelts = const_vector_encoded_nelts (x);
3372 for (unsigned int i = 1; i < nelts; ++i)
3373 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3374 return opt_machine_mode ();
3376 return aarch64_sve_pred_mode (nelts);
3379 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3380 that the constant would have with predicate element size ELT_SIZE
3381 (ignoring the upper bits in each element) and return:
3383 * -1 if all bits are set
3384 * N if the predicate has N leading set bits followed by all clear bits
3385 * 0 if the predicate does not have any of these forms. */
3388 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3389 unsigned int elt_size)
3391 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3392 followed by set bits. */
3393 if (builder.nelts_per_pattern () == 3)
3394 return 0;
3396 /* Skip over leading set bits. */
3397 unsigned int nelts = builder.encoded_nelts ();
3398 unsigned int i = 0;
3399 for (; i < nelts; i += elt_size)
3400 if (INTVAL (builder.elt (i)) == 0)
3401 break;
3402 unsigned int vl = i / elt_size;
3404 /* Check for the all-true case. */
3405 if (i == nelts)
3406 return -1;
3408 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3409 repeating pattern of set bits followed by clear bits. */
3410 if (builder.nelts_per_pattern () != 2)
3411 return 0;
3413 /* We have a "foreground" value and a duplicated "background" value.
3414 If the background might repeat and the last set bit belongs to it,
3415 we might have set bits followed by clear bits followed by set bits. */
3416 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3417 return 0;
3419 /* Make sure that the rest are all clear. */
3420 for (; i < nelts; i += elt_size)
3421 if (INTVAL (builder.elt (i)) != 0)
3422 return 0;
3424 return vl;
3427 /* See if there is an svpattern that encodes an SVE predicate of mode
3428 PRED_MODE in which the first VL bits are set and the rest are clear.
3429 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3430 A VL of -1 indicates an all-true vector. */
3432 aarch64_svpattern
3433 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3435 if (vl < 0)
3436 return AARCH64_SV_ALL;
3438 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3439 return AARCH64_NUM_SVPATTERNS;
3441 if (vl >= 1 && vl <= 8)
3442 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3444 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3445 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3447 int max_vl;
3448 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3450 if (vl == (max_vl / 3) * 3)
3451 return AARCH64_SV_MUL3;
3452 /* These would only trigger for non-power-of-2 lengths. */
3453 if (vl == (max_vl & -4))
3454 return AARCH64_SV_MUL4;
3455 if (vl == (1 << floor_log2 (max_vl)))
3456 return AARCH64_SV_POW2;
3457 if (vl == max_vl)
3458 return AARCH64_SV_ALL;
3460 return AARCH64_NUM_SVPATTERNS;
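/* Some example mappings implied by the code above: VL == -1 gives
   AARCH64_SV_ALL, VL == 5 gives AARCH64_SV_VL1 + 4 (the VL5 pattern) and
   VL == 32 gives AARCH64_SV_VL16 + 1 (the VL32 pattern).  A length such
   as 12 is only representable when the number of predicate elements is a
   compile-time constant, e.g. as AARCH64_SV_MUL3 when MAX_VL is exactly
   12.  */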
3463 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3464 bits has the lowest bit set and the upper bits clear. This is the
3465 VNx16BImode equivalent of a PTRUE for controlling elements of
3466 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3467 all bits are significant, even the upper zeros. */
3470 aarch64_ptrue_all (unsigned int elt_size)
3472 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3473 builder.quick_push (const1_rtx);
3474 for (unsigned int i = 1; i < elt_size; ++i)
3475 builder.quick_push (const0_rtx);
3476 return builder.build ();
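/* For example, aarch64_ptrue_all (4) builds the repeating VNx16BI
   pattern { 1, 0, 0, 0, 1, 0, 0, 0, ... }: one significant bit per
   4-byte element, i.e. the VNx16BImode view of a PTRUE that controls
   .S-sized elements.  */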
3479 /* Return an all-true predicate register of mode MODE. */
3482 aarch64_ptrue_reg (machine_mode mode)
3484 gcc_assert (aarch64_sve_pred_mode_p (mode));
3485 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3486 return gen_lowpart (mode, reg);
3489 /* Return an all-false predicate register of mode MODE. */
3492 aarch64_pfalse_reg (machine_mode mode)
3494 gcc_assert (aarch64_sve_pred_mode_p (mode));
3495 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3496 return gen_lowpart (mode, reg);
3499 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3500 for it. PRED2[0] is the predicate for the instruction whose result
3501 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3502 for it. Return true if we can prove that the two predicates are
3503 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3504 with PRED1[0] without changing behavior. */
3506 bool
3507 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3509 machine_mode mode = GET_MODE (pred1[0]);
3510 gcc_assert (aarch64_sve_pred_mode_p (mode)
3511 && mode == GET_MODE (pred2[0])
3512 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3513 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3515 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3516 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3517 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3518 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3519 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3522 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3523 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3524 Use TARGET as the target register if nonnull and convenient. */
3526 static rtx
3527 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3528 machine_mode data_mode, rtx op1, rtx op2)
3530 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3531 expand_operand ops[5];
3532 create_output_operand (&ops[0], target, pred_mode);
3533 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3534 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3535 create_input_operand (&ops[3], op1, data_mode);
3536 create_input_operand (&ops[4], op2, data_mode);
3537 expand_insn (icode, 5, ops);
3538 return ops[0].value;
3541 /* Use a comparison to convert integer vector SRC into MODE, which is
3542 the corresponding SVE predicate mode. Use TARGET for the result
3543 if it's nonnull and convenient. */
3546 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3548 machine_mode src_mode = GET_MODE (src);
3549 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3550 src, CONST0_RTX (src_mode));
3553 /* Return the assembly token for svprfop value PRFOP. */
3555 static const char *
3556 svprfop_token (enum aarch64_svprfop prfop)
3558 switch (prfop)
3560 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3561 AARCH64_FOR_SVPRFOP (CASE)
3562 #undef CASE
3563 case AARCH64_NUM_SVPRFOPS:
3564 break;
3566 gcc_unreachable ();
3569 /* Return the assembly string for an SVE prefetch operation with
3570 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3571 and that SUFFIX is the format for the remaining operands. */
3573 char *
3574 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3575 const char *suffix)
3577 static char buffer[128];
3578 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3579 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3580 mnemonic, svprfop_token (prfop), suffix);
3581 gcc_assert (written < sizeof (buffer));
3582 return buffer;
3585 /* Check whether we can calculate the number of elements in PATTERN
3586 at compile time, given that there are NELTS_PER_VQ elements per
3587 128-bit block. Return the value if so, otherwise return -1. */
3589 HOST_WIDE_INT
3590 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3592 unsigned int vl, const_vg;
3593 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3594 vl = 1 + (pattern - AARCH64_SV_VL1);
3595 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3596 vl = 16 << (pattern - AARCH64_SV_VL16);
3597 else if (aarch64_sve_vg.is_constant (&const_vg))
3599 /* There are two vector granules per quadword. */
3600 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3601 switch (pattern)
3603 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3604 case AARCH64_SV_MUL4: return nelts & -4;
3605 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3606 case AARCH64_SV_ALL: return nelts;
3607 default: gcc_unreachable ();
3610 else
3611 return -1;
3613 /* There are two vector granules per quadword. */
3614 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3615 if (known_le (vl, nelts_all))
3616 return vl;
3618 /* Requesting more elements than are available results in a PFALSE. */
3619 if (known_gt (vl, nelts_all))
3620 return 0;
3622 return -1;
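/* For example, with length-agnostic SVE, a pattern of AARCH64_SV_VL3 and
   NELTS_PER_VQ == 4 folds to 3, since even the minimum vector length
   provides 4 elements per vector.  With -msve-vector-bits=256 (so that
   aarch64_sve_vg is the constant 4), AARCH64_SV_ALL with NELTS_PER_VQ == 4
   folds to (4 / 2) * 4 == 8.  */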
3625 /* Return true if a single CNT[BHWD] instruction can multiply FACTOR
3626 by the number of 128-bit quadwords in an SVE vector. */
3628 static bool
3629 aarch64_sve_cnt_factor_p (HOST_WIDE_INT factor)
3631 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3632 return (IN_RANGE (factor, 2, 16 * 16)
3633 && (factor & 1) == 0
3634 && factor <= 16 * (factor & -factor));
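/* Examples of the test above: FACTOR == 6 is accepted (2 elements per
   quadword scaled by "mul #3"), FACTOR == 256 is accepted (16 elements
   per quadword scaled by "mul #16"), but FACTOR == 34 is rejected
   because its lowest set bit is 2 and 34 > 16 * 2, so no element size
   gives a multiplier in the range [1, 16].  */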
3637 /* Return true if we can move VALUE into a register using a single
3638 CNT[BHWD] instruction. */
3640 static bool
3641 aarch64_sve_cnt_immediate_p (poly_int64 value)
3643 HOST_WIDE_INT factor = value.coeffs[0];
3644 return value.coeffs[1] == factor && aarch64_sve_cnt_factor_p (factor);
3647 /* Likewise for rtx X. */
3649 bool
3650 aarch64_sve_cnt_immediate_p (rtx x)
3652 poly_int64 value;
3653 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3656 /* Return the asm string for an instruction with a CNT-like vector size
3657 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3658 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3659 first part of the operands template (the part that comes before the
3660 vector size itself). PATTERN is the pattern to use. FACTOR is the
3661 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3662 in each quadword. If it is zero, we can use any element size. */
3664 static char *
3665 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3666 aarch64_svpattern pattern,
3667 unsigned int factor,
3668 unsigned int nelts_per_vq)
3670 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3672 if (nelts_per_vq == 0)
3673 /* There is some overlap in the ranges of the four CNT instructions.
3674 Here we always use the smallest possible element size, so that the
3675 multiplier is 1 wherever possible. */
3676 nelts_per_vq = factor & -factor;
3677 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3678 gcc_assert (IN_RANGE (shift, 1, 4));
3679 char suffix = "dwhb"[shift - 1];
3681 factor >>= shift;
3682 unsigned int written;
3683 if (pattern == AARCH64_SV_ALL && factor == 1)
3684 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3685 prefix, suffix, operands);
3686 else if (factor == 1)
3687 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3688 prefix, suffix, operands, svpattern_token (pattern));
3689 else
3690 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3691 prefix, suffix, operands, svpattern_token (pattern),
3692 factor);
3693 gcc_assert (written < sizeof (buffer));
3694 return buffer;
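/* Illustrative, standalone sketch (not part of the build; the example_*
   name is hypothetical) of the suffix and multiplier choice made above
   when NELTS_PER_VQ is zero.  A factor of 32 per quadword becomes
   "incb ..., all, mul #2", while a factor of 2 becomes a bare "incd".  */

static void
example_cnt_suffix_and_mul (unsigned int factor, char *suffix,
                            unsigned int *mul)
{
  /* Assumes FACTOR is even, as the callers above guarantee.  */
  unsigned int smallest_elt = factor & -factor;
  int shift = smallest_elt >= 16 ? 4 : __builtin_ctz (smallest_elt);
  *suffix = "dwhb"[shift - 1];
  *mul = factor >> shift;
}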
3697 /* Return the asm string for an instruction with a CNT-like vector size
3698 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3699 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3700 first part of the operands template (the part that comes before the
3701 vector size itself). X is the value of the vector size operand,
3702 as a polynomial integer rtx; we need to convert this into an "all"
3703 pattern with a multiplier. */
3705 char *
3706 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3707 rtx x)
3709 poly_int64 value = rtx_to_poly_int64 (x);
3710 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3711 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3712 value.coeffs[1], 0);
3715 /* Return the asm string for an instruction with a CNT-like vector size
3716 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3717 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3718 first part of the operands template (the part that comes before the
3719 vector size itself). CNT_PAT[0..2] are the operands of the
3720 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3722 char *
3723 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3724 const char *operands, rtx *cnt_pat)
3726 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3727 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3728 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3729 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3730 factor, nelts_per_vq);
3733 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3735 bool
3736 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3738 poly_int64 value;
3739 return (poly_int_rtx_p (x, &value)
3740 && (aarch64_sve_cnt_immediate_p (value)
3741 || aarch64_sve_cnt_immediate_p (-value)));
3744 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3745 operand 0. */
3747 char *
3748 aarch64_output_sve_scalar_inc_dec (rtx offset)
3750 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3751 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3752 if (offset_value.coeffs[1] > 0)
3753 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3754 offset_value.coeffs[1], 0);
3755 else
3756 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3757 -offset_value.coeffs[1], 0);
3760 /* Return true if a single RDVL instruction can multiply FACTOR by the
3761 number of 128-bit quadwords in an SVE vector. This is also the
3762 range of ADDVL. */
3764 static bool
3765 aarch64_sve_rdvl_addvl_factor_p (HOST_WIDE_INT factor)
3767 return (multiple_p (factor, 16)
3768 && IN_RANGE (factor, -32 * 16, 31 * 16));
3771 /* Return true if ADDPL can be used to add FACTOR multiplied by the number
3772 of quadwords in an SVE vector. */
3774 static bool
3775 aarch64_sve_addpl_factor_p (HOST_WIDE_INT factor)
3777 return (multiple_p (factor, 2)
3778 && IN_RANGE (factor, -32 * 2, 31 * 2));
3781 /* Return true if we can move VALUE into a register using a single
3782 RDVL instruction. */
3784 static bool
3785 aarch64_sve_rdvl_immediate_p (poly_int64 value)
3787 HOST_WIDE_INT factor = value.coeffs[0];
3788 return value.coeffs[1] == factor && aarch64_sve_rdvl_addvl_factor_p (factor);
3791 /* Likewise for rtx X. */
3793 bool
3794 aarch64_sve_rdvl_immediate_p (rtx x)
3796 poly_int64 value;
3797 return poly_int_rtx_p (x, &value) && aarch64_sve_rdvl_immediate_p (value);
3800 /* Return the asm string for moving RDVL immediate OFFSET into register
3801 operand 0. */
3803 char *
3804 aarch64_output_sve_rdvl (rtx offset)
3806 static char buffer[sizeof ("rdvl\t%x0, #-") + 3 * sizeof (int)];
3807 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3808 gcc_assert (aarch64_sve_rdvl_immediate_p (offset_value));
3810 int factor = offset_value.coeffs[1];
3811 snprintf (buffer, sizeof (buffer), "rdvl\t%%x0, #%d", factor / 16);
3812 return buffer;
3815 /* Return true if we can add VALUE to a register using a single ADDVL
3816 or ADDPL instruction. */
3818 static bool
3819 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3821 HOST_WIDE_INT factor = value.coeffs[0];
3822 if (factor == 0 || value.coeffs[1] != factor)
3823 return false;
3824 return (aarch64_sve_rdvl_addvl_factor_p (factor)
3825 || aarch64_sve_addpl_factor_p (factor));
3828 /* Likewise for rtx X. */
3830 bool
3831 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3833 poly_int64 value;
3834 return (poly_int_rtx_p (x, &value)
3835 && aarch64_sve_addvl_addpl_immediate_p (value));
3838 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3839 to operand 1 and storing the result in operand 0. */
3841 char *
3842 aarch64_output_sve_addvl_addpl (rtx offset)
3844 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3845 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3846 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3848 int factor = offset_value.coeffs[1];
3849 if ((factor & 15) == 0)
3850 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3851 else
3852 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3853 return buffer;
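/* Illustrative, standalone sketch (not part of the build; the example_*
   name is hypothetical) of the choice made above.  ADDVL counts in whole
   vectors (16 bytes per quadword) and ADDPL in predicate-sized units
   (2 bytes per quadword), so a factor of 48 prints "addvl ..., #3" while
   a factor of 6 prints "addpl ..., #3".  */

static int
example_addvl_addpl_split (long long factor, const char **mnemonic)
{
  if ((factor & 15) == 0)
    {
      *mnemonic = "addvl";
      return (int) (factor / 16);
    }
  *mnemonic = "addpl";
  return (int) (factor / 2);
}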
3856 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3857 instruction. If it is, store the number of elements in each vector
3858 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3859 factor in *FACTOR_OUT (if nonnull). */
3861 bool
3862 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3863 unsigned int *nelts_per_vq_out)
3865 rtx elt;
3866 poly_int64 value;
3868 if (!const_vec_duplicate_p (x, &elt)
3869 || !poly_int_rtx_p (elt, &value))
3870 return false;
3872 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3873 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3874 /* There's no vector INCB. */
3875 return false;
3877 HOST_WIDE_INT factor = value.coeffs[0];
3878 if (value.coeffs[1] != factor)
3879 return false;
3881 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
3882 if ((factor % nelts_per_vq) != 0
3883 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
3884 return false;
3886 if (factor_out)
3887 *factor_out = factor;
3888 if (nelts_per_vq_out)
3889 *nelts_per_vq_out = nelts_per_vq;
3890 return true;
3893 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3894 instruction. */
3896 bool
3897 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
3899 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
3902 /* Return the asm template for an SVE vector INC or DEC instruction.
3903 OPERANDS gives the operands before the vector count and X is the
3904 value of the vector count operand itself. */
3906 char *
3907 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
3909 int factor;
3910 unsigned int nelts_per_vq;
3911 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
3912 gcc_unreachable ();
3913 if (factor < 0)
3914 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
3915 -factor, nelts_per_vq);
3916 else
3917 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
3918 factor, nelts_per_vq);
3921 /* Return a constant that represents FACTOR multiplied by the
3922 number of 128-bit quadwords in an SME vector. ISA_MODE is the
3923 ISA mode in which the calculation is being performed. */
3926 aarch64_sme_vq_immediate (machine_mode mode, HOST_WIDE_INT factor,
3927 aarch64_feature_flags isa_mode)
3929 gcc_assert (aarch64_sve_rdvl_addvl_factor_p (factor));
3930 if (isa_mode & AARCH64_FL_SM_ON)
3931 /* We're in streaming mode, so we can use normal poly-int values. */
3932 return gen_int_mode ({ factor, factor }, mode);
3934 rtvec vec = gen_rtvec (1, gen_int_mode (factor, SImode));
3935 rtx unspec = gen_rtx_UNSPEC (mode, vec, UNSPEC_SME_VQ);
3936 return gen_rtx_CONST (mode, unspec);
3939 /* Return true if X is a constant that represents some number Y
3940 multiplied by the number of quadwords in an SME vector. Store this Y
3941 in *FACTOR if so. */
3943 static bool
3944 aarch64_sme_vq_unspec_p (const_rtx x, HOST_WIDE_INT *factor)
3946 if (!TARGET_SME || GET_CODE (x) != CONST)
3947 return false;
3949 x = XEXP (x, 0);
3950 if (GET_CODE (x) != UNSPEC
3951 || XINT (x, 1) != UNSPEC_SME_VQ
3952 || XVECLEN (x, 0) != 1)
3953 return false;
3955 x = XVECEXP (x, 0, 0);
3956 if (!CONST_INT_P (x))
3957 return false;
3959 *factor = INTVAL (x);
3960 return true;
3963 /* Return true if X is a constant that represents some number Y
3964 multiplied by the number of quadwords in an SME vector, and if
3965 that Y is in the range of RDSVL. */
3967 bool
3968 aarch64_rdsvl_immediate_p (const_rtx x)
3970 HOST_WIDE_INT factor;
3971 return (aarch64_sme_vq_unspec_p (x, &factor)
3972 && aarch64_sve_rdvl_addvl_factor_p (factor));
3975 /* Return the asm string for an RDSVL instruction that calculates X,
3976 which is a constant that satisfies aarch64_rdsvl_immediate_p. */
3978 char *
3979 aarch64_output_rdsvl (const_rtx x)
3981 gcc_assert (aarch64_rdsvl_immediate_p (x));
3982 static char buffer[sizeof ("rdsvl\t%x0, #-") + 3 * sizeof (int)];
3983 x = XVECEXP (XEXP (x, 0), 0, 0);
3984 snprintf (buffer, sizeof (buffer), "rdsvl\t%%x0, #%d",
3985 (int) INTVAL (x) / 16);
3986 return buffer;
3989 /* Return true if X is a constant that can be added using ADDSVL or ADDSPL. */
3991 bool
3992 aarch64_addsvl_addspl_immediate_p (const_rtx x)
3994 HOST_WIDE_INT factor;
3995 return (aarch64_sme_vq_unspec_p (x, &factor)
3996 && (aarch64_sve_rdvl_addvl_factor_p (factor)
3997 || aarch64_sve_addpl_factor_p (factor)));
4000 /* X is a constant that satisfies aarch64_addsvl_addspl_immediate_p.
4001 Return the asm string for the associated instruction. */
4003 char *
4004 aarch64_output_addsvl_addspl (rtx x)
4006 static char buffer[sizeof ("addspl\t%x0, %x1, #-") + 3 * sizeof (int)];
4007 HOST_WIDE_INT factor;
4008 if (!aarch64_sme_vq_unspec_p (x, &factor))
4009 gcc_unreachable ();
4010 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4011 snprintf (buffer, sizeof (buffer), "addsvl\t%%x0, %%x1, #%d",
4012 (int) factor / 16);
4013 else if (aarch64_sve_addpl_factor_p (factor))
4014 snprintf (buffer, sizeof (buffer), "addspl\t%%x0, %%x1, #%d",
4015 (int) factor / 2);
4016 else
4017 gcc_unreachable ();
4018 return buffer;
4021 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
4023 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
4025 0x0000000100000001ull,
4026 0x0001000100010001ull,
4027 0x0101010101010101ull,
4028 0x1111111111111111ull,
4029 0x5555555555555555ull,
4034 /* Return true if 64-bit VAL is a valid bitmask immediate. */
4035 static bool
4036 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
4038 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
4039 int bits;
4041 /* Check for a single sequence of one bits and return quickly if so.
4042 The special cases of all ones and all zeroes return false. */
4043 tmp = val + (val & -val);
4045 if (tmp == (tmp & -tmp))
4046 return (val + 1) > 1;
4048 /* Invert if the immediate doesn't start with a zero bit - this means we
4049 only need to search for sequences of one bits. */
4050 if (val & 1)
4051 val = ~val;
4053 /* Find the first set bit and set tmp to val with the first sequence of one
4054 bits removed. Return success if there is a single sequence of ones. */
4055 first_one = val & -val;
4056 tmp = val & (val + first_one);
4058 if (tmp == 0)
4059 return true;
4061 /* Find the next set bit and compute the difference in bit position. */
4062 next_one = tmp & -tmp;
4063 bits = clz_hwi (first_one) - clz_hwi (next_one);
4064 mask = val ^ tmp;
4066 /* Check the bit position difference is a power of 2, and that the first
4067 sequence of one bits fits within 'bits' bits. */
4068 if ((mask >> bits) != 0 || bits != (bits & -bits))
4069 return false;
4071 /* Check the sequence of one bits is repeated 64/bits times. */
4072 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
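/* Illustrative, standalone reference check (not part of the build; the
   example_* name is hypothetical).  A bitmask (logical) immediate is a
   run of ones within an element of 2, 4, 8, 16, 32 or 64 bits, rotated
   by any amount and replicated across the register, with all-zeroes and
   all-ones excluded.  The fast test above is believed to accept exactly
   that set; the slow enumeration below spells the definition out.  */

static bool
example_bitmask_imm_brute_force (unsigned long long val)
{
  for (int size = 2; size <= 64; size *= 2)
    for (int ones = 1; ones < size; ++ones)
      {
        /* Element with ONES low bits set, replicated to 64 bits.  */
        unsigned long long elt = (1ull << ones) - 1;
        unsigned long long rep = 0;
        for (int i = 0; i < 64; i += size)
          rep |= elt << i;
        /* Compare against every rotation of the replicated pattern.  */
        for (int rot = 0; rot < size; ++rot)
          {
            unsigned long long r
              = rot ? (rep << rot) | (rep >> (64 - rot)) : rep;
            if (r == val)
              return true;
          }
      }
  return false;
}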
4076 /* Return true if VAL is a valid bitmask immediate for MODE. */
4077 bool
4078 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4080 if (mode == DImode)
4081 return aarch64_bitmask_imm (val);
4083 if (mode == SImode)
4084 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
4086 /* Replicate small immediates to fit 64 bits. */
4087 int size = GET_MODE_UNIT_PRECISION (mode);
4088 val &= (HOST_WIDE_INT_1U << size) - 1;
4089 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
4091 return aarch64_bitmask_imm (val);
4095 /* Return true if the immediate VAL can be a bitmask immediate
4096 by changing the given MASK bits in VAL to zeroes, ones or bits
4097 from the other half of VAL. Return the new immediate in VAL2. */
4098 static inline bool
4099 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
4100 unsigned HOST_WIDE_INT &val2,
4101 unsigned HOST_WIDE_INT mask)
4103 val2 = val & ~mask;
4104 if (val2 != val && aarch64_bitmask_imm (val2))
4105 return true;
4106 val2 = val | mask;
4107 if (val2 != val && aarch64_bitmask_imm (val2))
4108 return true;
4109 val = val & ~mask;
4110 val2 = val | (((val >> 32) | (val << 32)) & mask);
4111 if (val2 != val && aarch64_bitmask_imm (val2))
4112 return true;
4113 val2 = val | (((val >> 16) | (val << 48)) & mask);
4114 if (val2 != val && aarch64_bitmask_imm (val2))
4115 return true;
4116 return false;
4120 /* Return true if VAL is a valid MOVZ immediate. */
4121 static inline bool
4122 aarch64_is_movz (unsigned HOST_WIDE_INT val)
4124 return (val >> (ctz_hwi (val) & 48)) < 65536;
4128 /* Return true if immediate VAL can be created by a 64-bit MOVI/MOVN/MOVZ. */
4129 bool
4130 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
4132 return aarch64_is_movz (val) || aarch64_is_movz (~val)
4133 || aarch64_bitmask_imm (val);
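/* Illustrative, standalone sketch (not part of the build; the example_*
   name is hypothetical): a value is a MOVZ immediate when all of its set
   bits fall within a single aligned 16-bit field, which is what the
   shift-by-(ctz & 48) trick above tests.  For instance 0x00ab0000 is a
   MOVZ immediate, and 0xffff0000ffffffff is covered by the MOVN case
   because its complement 0x0000ffff00000000 passes the same check.  */

static bool
example_is_movz_naive (unsigned long long val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & ~(0xffffull << shift)) == 0)
      return true;
  return false;
}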
4137 /* Return true if VAL is an immediate that can be created by a single
4138 MOV instruction. */
4139 bool
4140 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
4142 gcc_assert (mode == SImode || mode == DImode);
4144 if (val < 65536)
4145 return true;
4147 unsigned HOST_WIDE_INT mask =
4148 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
4150 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
4151 return true;
4153 val = (val & mask) | ((val << 32) & ~mask);
4154 return aarch64_bitmask_imm (val);
4158 static int
4159 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4160 machine_mode mode)
4162 int i;
4163 unsigned HOST_WIDE_INT val, val2, val3, mask;
4164 int one_match, zero_match;
4165 int num_insns;
4167 gcc_assert (mode == SImode || mode == DImode);
4169 val = INTVAL (imm);
4171 if (aarch64_move_imm (val, mode))
4173 if (generate)
4174 emit_insn (gen_rtx_SET (dest, imm));
4175 return 1;
4178 if ((val >> 32) == 0 || mode == SImode)
4180 if (generate)
4182 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4183 if (mode == SImode)
4184 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4185 GEN_INT ((val >> 16) & 0xffff)));
4186 else
4187 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4188 GEN_INT ((val >> 16) & 0xffff)));
4190 return 2;
4193 /* Remaining cases are all for DImode. */
4195 mask = 0xffff;
4196 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4197 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4198 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4199 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4201 /* Try a bitmask immediate and a movk to generate the immediate
4202 in 2 instructions. */
4204 if (zero_match < 2 && one_match < 2)
4206 for (i = 0; i < 64; i += 16)
4208 if (aarch64_check_bitmask (val, val2, mask << i))
4209 break;
4211 val2 = val & ~(mask << i);
4212 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
4213 break;
4216 if (i != 64)
4218 if (generate)
4220 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4221 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4222 GEN_INT ((val >> i) & 0xffff)));
4224 return 2;
4227 /* Try 2 bitmask immediates which are xor'd together. */
4228 for (i = 0; i < 64; i += 16)
4230 val2 = (val >> i) & mask;
4231 val2 |= val2 << 16;
4232 val2 |= val2 << 32;
4233 if (aarch64_bitmask_imm (val2) && aarch64_bitmask_imm (val ^ val2))
4234 break;
4237 if (i != 64)
4239 if (generate)
4241 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4242 emit_insn (gen_xordi3 (dest, dest, GEN_INT (val ^ val2)));
4244 return 2;
4248 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
4249 if (zero_match + one_match == 0)
4251 for (i = 0; i < 48; i += 16)
4252 for (int j = i + 16; j < 64; j += 16)
4253 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
4255 if (generate)
4257 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4258 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4259 GEN_INT ((val >> i) & 0xffff)));
4260 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
4261 GEN_INT ((val >> j) & 0xffff)));
4263 return 3;
4266 /* Try shifting and inserting the bottom 32-bits into the top bits. */
4267 val2 = val & 0xffffffff;
4268 val3 = 0xffffffff;
4269 val3 = val2 | (val3 << 32);
4270 for (i = 17; i < 48; i++)
4271 if ((val2 | (val2 << i)) == val)
4273 if (generate)
4275 emit_insn (gen_rtx_SET (dest, GEN_INT (val2 & 0xffff)));
4276 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4277 GEN_INT (val2 >> 16)));
4278 emit_insn (gen_ior_ashldi3 (dest, dest, GEN_INT (i), dest));
4280 return 3;
4282 else if ((val3 & ~(val3 << i)) == val)
4284 if (generate)
4286 emit_insn (gen_rtx_SET (dest, GEN_INT (val3 | 0xffff0000)));
4287 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4288 GEN_INT (val2 >> 16)));
4289 emit_insn (gen_and_one_cmpl_ashldi3 (dest, dest, GEN_INT (i),
4290 dest));
4292 return 3;
4296 /* Generate 2-4 instructions, skipping 16-bit chunks that are all zeroes or
4297 all ones, since those are handled by the initial MOV. If one_match >
4298 zero_match, skip set bits, otherwise skip zero bits. */
4300 num_insns = 1;
4301 mask = 0xffff;
4302 val2 = one_match > zero_match ? ~val : val;
4303 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4305 if (generate)
4306 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4307 ? (val | ~(mask << i))
4308 : (val & (mask << i)))));
4309 for (i += 16; i < 64; i += 16)
4311 if ((val2 & (mask << i)) == 0)
4312 continue;
4313 if (generate)
4314 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4315 GEN_INT ((val >> i) & 0xffff)));
4316 num_insns ++;
4319 return num_insns;
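/* Illustrative, standalone sketch of one case handled above (not part of
   the build; the example_* name is hypothetical): detecting a constant
   whose low 32 bits repeat under a left shift.  For 0x1234567812345678
   this returns 32, and the function above then emits roughly
   "mov x0, #0x5678; movk x0, #0x1234, lsl #16; orr x0, x0, x0, lsl #32".  */

static int
example_low32_repeat_shift (unsigned long long val)
{
  unsigned long long lo = val & 0xffffffff;
  for (int i = 17; i < 48; i++)
    if ((lo | (lo << i)) == val)
      return i;
  return -1;
}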
4322 /* Return whether imm is a 128-bit immediate which is simple enough to
4323 expand inline. */
4324 bool
4325 aarch64_mov128_immediate (rtx imm)
4327 if (CONST_INT_P (imm))
4328 return true;
4330 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4332 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4333 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4335 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4336 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4340 /* Return true if val can be encoded as a 12-bit unsigned immediate with
4341 a left shift of 0 or 12 bits. */
4342 bool
4343 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
4345 return val < 4096 || (val & 0xfff000) == val;
4348 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
4349 that can be created with a left shift of 0 or 12. */
4350 static HOST_WIDE_INT
4351 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
4353 /* Check to see if the value fits in 24 bits, as that is the maximum we can
4354 handle correctly. */
4355 gcc_assert (val < 0x1000000);
4357 if (val < 4096)
4358 return val;
4360 return val & 0xfff000;
4364 /* Test whether:
4366 X = (X & AND_VAL) | IOR_VAL;
4368 can be implemented using:
4370 MOVK X, #(IOR_VAL >> shift), LSL #shift
4372 Return the shift if so, otherwise return -1. */
4374 aarch64_movk_shift (const wide_int_ref &and_val,
4375 const wide_int_ref &ior_val)
4377 unsigned int precision = and_val.get_precision ();
4378 unsigned HOST_WIDE_INT mask = 0xffff;
4379 for (unsigned int shift = 0; shift < precision; shift += 16)
4381 if (and_val == ~mask && (ior_val & mask) == ior_val)
4382 return shift;
4383 mask <<= 16;
4385 return -1;
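/* Illustrative, standalone 64-bit version of the test above (not part of
   the build; the example_* name is hypothetical).  For AND_VAL
   0xffffffff0000ffff and IOR_VAL 0x12340000 it returns 16, corresponding
   to "movk x0, #0x1234, lsl #16".  */

static int
example_movk_shift_64 (unsigned long long and_val, unsigned long long ior_val)
{
  unsigned long long mask = 0xffff;
  for (int shift = 0; shift < 64; shift += 16, mask <<= 16)
    if (and_val == ~mask && (ior_val & mask) == ior_val)
      return shift;
  return -1;
}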
4388 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4389 Assumed precondition: VAL_IN is not zero. */
4391 unsigned HOST_WIDE_INT
4392 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4394 int lowest_bit_set = ctz_hwi (val_in);
4395 int highest_bit_set = floor_log2 (val_in);
4396 gcc_assert (val_in != 0);
4398 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4399 (HOST_WIDE_INT_1U << lowest_bit_set));
4402 /* Create constant where bits outside of lowest bit set to highest bit set
4403 are set to 1. */
4405 unsigned HOST_WIDE_INT
4406 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4408 return val_in | ~aarch64_and_split_imm1 (val_in);
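/* Illustrative, standalone sketch of the split performed by the two
   helpers above (not part of the build; the example_* name is
   hypothetical).  For 0x0000ffff00ff0000, which is not itself a valid
   bitmask immediate, IMM1 is 0x0000ffffffff0000 and IMM2 is
   0xffffffff00ffffff; both are valid AND immediates and IMM1 & IMM2
   recreates the original value, so the AND can be done as two
   AND-immediate instructions.  */

static void
example_and_split (unsigned long long val,
                   unsigned long long *imm1, unsigned long long *imm2)
{
  /* Assumes VAL is nonzero, matching the precondition above.  */
  int lowest = __builtin_ctzll (val);
  int highest = 63 - __builtin_clzll (val);
  *imm1 = (2ull << highest) - (1ull << lowest);
  *imm2 = val | ~*imm1;
}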
4411 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4413 bool
4414 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4416 scalar_int_mode int_mode;
4417 if (!is_a <scalar_int_mode> (mode, &int_mode))
4418 return false;
4420 if (aarch64_bitmask_imm (val_in, int_mode))
4421 return false;
4423 if (aarch64_move_imm (val_in, int_mode))
4424 return false;
4426 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4428 return aarch64_bitmask_imm (imm2, int_mode);
4431 /* Return the number of temporary registers that aarch64_add_offset_1
4432 would need to add OFFSET to a register. */
4434 static unsigned int
4435 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4437 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4440 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4441 a non-polynomial OFFSET. MODE is the mode of the addition.
4442 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4443 be set and CFA adjustments added to the generated instructions.
4445 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4446 temporary if register allocation is already complete. This temporary
4447 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4448 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4449 the immediate again.
4451 Since this function may be used to adjust the stack pointer, we must
4452 ensure that it cannot cause transient stack deallocation (for example
4453 by first incrementing SP and then decrementing when adjusting by a
4454 large immediate). */
4456 static void
4457 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4458 rtx src, HOST_WIDE_INT offset, rtx temp1,
4459 bool frame_related_p, bool emit_move_imm)
4461 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4462 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4464 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4465 rtx_insn *insn;
4467 if (!moffset)
4469 if (!rtx_equal_p (dest, src))
4471 insn = emit_insn (gen_rtx_SET (dest, src));
4472 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4474 return;
4477 /* Single instruction adjustment. */
4478 if (aarch64_uimm12_shift (moffset))
4480 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4481 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4482 return;
4485 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4486 and either:
4488 a) the offset cannot be loaded by a 16-bit move or
4489 b) there is no spare register into which we can move it. */
4490 if (moffset < 0x1000000
4491 && ((!temp1 && !can_create_pseudo_p ())
4492 || !aarch64_move_imm (moffset, mode)))
4494 HOST_WIDE_INT low_off = moffset & 0xfff;
4496 low_off = offset < 0 ? -low_off : low_off;
4497 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4498 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4499 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4500 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4501 return;
4504 /* Emit a move immediate if required and an addition/subtraction. */
4505 if (emit_move_imm)
4507 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4508 temp1 = aarch64_force_temporary (mode, temp1,
4509 gen_int_mode (moffset, mode));
4511 insn = emit_insn (offset < 0
4512 ? gen_sub3_insn (dest, src, temp1)
4513 : gen_add3_insn (dest, src, temp1));
4514 if (frame_related_p)
4516 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4517 rtx adj = plus_constant (mode, src, offset);
4518 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
4522 /* Return the number of temporary registers that aarch64_add_offset
4523 would need to move OFFSET into a register or add OFFSET to a register;
4524 ADD_P is true if we want the latter rather than the former. */
4526 static unsigned int
4527 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4529 /* This follows the same structure as aarch64_add_offset. */
4530 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4531 return 0;
4533 unsigned int count = 0;
4534 HOST_WIDE_INT factor = offset.coeffs[1];
4535 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4536 poly_int64 poly_offset (factor, factor);
4537 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4538 /* Need one register for the ADDVL/ADDPL result. */
4539 count += 1;
4540 else if (factor != 0)
4542 factor /= (HOST_WIDE_INT) least_bit_hwi (factor);
4543 if (!IN_RANGE (factor, -32, 31))
4544 /* Need one register for the CNT or RDVL result and one for the
4545 multiplication factor. If necessary, the second temporary
4546 can be reused for the constant part of the offset. */
4547 return 2;
4548 /* Need one register for the CNT or RDVL result (which might then
4549 be shifted). */
4550 count += 1;
4552 return count + aarch64_add_offset_1_temporaries (constant);
4555 /* If X can be represented as a poly_int64, return the number
4556 of temporaries that are required to add it to a register.
4557 Return -1 otherwise. */
4560 aarch64_add_offset_temporaries (rtx x)
4562 poly_int64 offset;
4563 if (!poly_int_rtx_p (x, &offset))
4564 return -1;
4565 return aarch64_offset_temporaries (true, offset);
4568 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4569 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4570 be set and CFA adjustments added to the generated instructions.
4572 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4573 temporary if register allocation is already complete. This temporary
4574 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4575 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4576 false to avoid emitting the immediate again.
4578 TEMP2, if nonnull, is a second temporary register that doesn't
4579 overlap either DEST or SRC.
4581 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of OFFSET
4582 is measured relative to the SME vector length instead of the current
4583 prevailing vector length. It is 0 otherwise.
4585 Since this function may be used to adjust the stack pointer, we must
4586 ensure that it cannot cause transient stack deallocation (for example
4587 by first incrementing SP and then decrementing when adjusting by a
4588 large immediate). */
4590 static void
4591 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4592 poly_int64 offset, rtx temp1, rtx temp2,
4593 aarch64_feature_flags force_isa_mode,
4594 bool frame_related_p, bool emit_move_imm = true)
4596 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4597 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4598 gcc_assert (temp1 == NULL_RTX
4599 || !frame_related_p
4600 || !reg_overlap_mentioned_p (temp1, dest));
4601 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4603 /* Try using ADDVL or ADDPL to add the whole value. */
4604 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4606 gcc_assert (offset.coeffs[0] == offset.coeffs[1]);
4607 rtx offset_rtx;
4608 if (force_isa_mode == 0)
4609 offset_rtx = gen_int_mode (offset, mode);
4610 else
4611 offset_rtx = aarch64_sme_vq_immediate (mode, offset.coeffs[0], 0);
4612 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4613 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4614 if (frame_related_p && (force_isa_mode & AARCH64_FL_SM_ON))
4615 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4616 gen_rtx_SET (dest, plus_constant (Pmode, src,
4617 offset)));
4618 return;
4621 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4622 SVE vector register, over and above the minimum size of 128 bits.
4623 This is equivalent to half the value returned by CNTD with a
4624 vector shape of ALL. */
4625 HOST_WIDE_INT factor = offset.coeffs[1];
4626 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4628 /* Try using ADDVL or ADDPL to add the VG-based part. */
4629 poly_int64 poly_offset (factor, factor);
4630 if (src != const0_rtx
4631 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4633 rtx offset_rtx;
4634 if (force_isa_mode == 0)
4635 offset_rtx = gen_int_mode (poly_offset, mode);
4636 else
4637 offset_rtx = aarch64_sme_vq_immediate (mode, factor, 0);
4638 if (frame_related_p)
4640 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4641 RTX_FRAME_RELATED_P (insn) = true;
4642 if (force_isa_mode & AARCH64_FL_SM_ON)
4643 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4644 gen_rtx_SET (dest, plus_constant (Pmode, src,
4645 poly_offset)));
4646 src = dest;
4648 else
4650 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4651 src = aarch64_force_temporary (mode, temp1, addr);
4652 temp1 = temp2;
4653 temp2 = NULL_RTX;
4656 /* Otherwise use a CNT-based sequence. */
4657 else if (factor != 0)
4659 /* Calculate CNTB * FACTOR / 16 as CNTB * REL_FACTOR * 2**SHIFT,
4660 with negative shifts indicating a shift right. */
4661 HOST_WIDE_INT low_bit = least_bit_hwi (factor);
4662 HOST_WIDE_INT rel_factor = factor / low_bit;
4663 int shift = exact_log2 (low_bit) - 4;
4664 gcc_assert (shift >= -4 && (rel_factor & 1) != 0);
4666 /* Set CODE, VAL and SHIFT so that [+-] VAL * 2**SHIFT is
4667 equal to CNTB * FACTOR / 16, with CODE being the [+-].
4669 We can avoid a multiplication if REL_FACTOR is in the range
4670 of RDVL, although there are then various optimizations that
4671 we can try on top. */
4672 rtx_code code = PLUS;
4673 rtx val;
4674 if (IN_RANGE (rel_factor, -32, 31))
4676 if (force_isa_mode & AARCH64_FL_SM_ON)
4678 /* Try to use an unshifted RDSVL, otherwise fall back on
4679 a shifted RDSVL #1. */
4680 if (aarch64_sve_rdvl_addvl_factor_p (factor))
4681 shift = 0;
4682 else
4683 factor = rel_factor * 16;
4684 val = aarch64_sme_vq_immediate (mode, factor, 0);
4686 /* Try to use an unshifted CNT[BHWD] or RDVL. */
4687 else if (aarch64_sve_cnt_factor_p (factor)
4688 || aarch64_sve_rdvl_addvl_factor_p (factor))
4690 val = gen_int_mode (poly_int64 (factor, factor), mode);
4691 shift = 0;
4693 /* Try to subtract an unshifted CNT[BHWD]. */
4694 else if (aarch64_sve_cnt_factor_p (-factor))
4696 code = MINUS;
4697 val = gen_int_mode (poly_int64 (-factor, -factor), mode);
4698 shift = 0;
4700 /* If subtraction is free, prefer to load a positive constant.
4701 In the best case this will fit a shifted CNTB. */
4702 else if (src != const0_rtx && rel_factor < 0)
4704 code = MINUS;
4705 val = gen_int_mode (-rel_factor * BYTES_PER_SVE_VECTOR, mode);
4707 /* Otherwise use a shifted RDVL or CNT[BHWD]. */
4708 else
4709 val = gen_int_mode (rel_factor * BYTES_PER_SVE_VECTOR, mode);
4711 else
4713 /* If we can calculate CNTB << SHIFT directly, prefer to do that,
4714 since it should increase the chances of being able to use
4715 a shift and add sequence for the multiplication.
4716 If CNTB << SHIFT is out of range, stick with the current
4717 shift factor. */
4718 if (force_isa_mode == 0
4719 && IN_RANGE (low_bit, 2, 16 * 16))
4721 val = gen_int_mode (poly_int64 (low_bit, low_bit), mode);
4722 shift = 0;
4724 else if ((force_isa_mode & AARCH64_FL_SM_ON)
4725 && aarch64_sve_rdvl_addvl_factor_p (low_bit))
4727 val = aarch64_sme_vq_immediate (mode, low_bit, 0);
4728 shift = 0;
4730 else
4731 val = gen_int_mode (BYTES_PER_SVE_VECTOR, mode);
4733 val = aarch64_force_temporary (mode, temp1, val);
4735 /* Prefer to multiply by a positive factor and subtract rather
4736 than multiply by a negative factor and add, since positive
4737 values are usually easier to move. */
4738 if (rel_factor < 0 && src != const0_rtx)
4740 rel_factor = -rel_factor;
4741 code = MINUS;
4744 if (can_create_pseudo_p ())
4746 rtx coeff1 = gen_int_mode (rel_factor, mode);
4747 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
4749 else
4751 rtx coeff1 = gen_int_mode (rel_factor, mode);
4752 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4753 val = gen_rtx_MULT (mode, val, coeff1);
4757 /* Multiply by 2 ** SHIFT. */
4758 if (shift > 0)
4760 val = aarch64_force_temporary (mode, temp1, val);
4761 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4763 else if (shift < 0)
4765 val = aarch64_force_temporary (mode, temp1, val);
4766 val = gen_rtx_ASHIFTRT (mode, val, GEN_INT (-shift));
4769 /* Add the result to SRC or subtract the result from SRC. */
4770 if (src != const0_rtx)
4772 val = aarch64_force_temporary (mode, temp1, val);
4773 val = gen_rtx_fmt_ee (code, mode, src, val);
4775 else if (code == MINUS)
4777 val = aarch64_force_temporary (mode, temp1, val);
4778 val = gen_rtx_NEG (mode, val);
4781 if (constant == 0 || frame_related_p)
4783 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4784 if (frame_related_p)
4786 RTX_FRAME_RELATED_P (insn) = true;
4787 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4788 gen_rtx_SET (dest, plus_constant (Pmode, src,
4789 poly_offset)));
4791 src = dest;
4792 if (constant == 0)
4793 return;
4795 else
4797 src = aarch64_force_temporary (mode, temp1, val);
4798 temp1 = temp2;
4799 temp2 = NULL_RTX;
4802 emit_move_imm = true;
4805 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4806 frame_related_p, emit_move_imm);
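/* Illustrative, standalone sketch of the decomposition used above (not
   part of the build; the example_* name is hypothetical).  A runtime
   offset is carried as (coeffs[0], coeffs[1]); the part that scales with
   the vector length is coeffs[1] and the fixed byte part is
   coeffs[0] - coeffs[1].  For example, two SVE vectors plus 16 bytes is
   (48, 32), which splits into a VL-scaled part of 32 (handled by
   ADDVL #2) and a constant of 16 (handled by a plain ADD #16).  */

static void
example_split_sve_offset (long long coeff0, long long coeff1,
                          long long *vl_factor, long long *byte_constant)
{
  *vl_factor = coeff1;
  *byte_constant = coeff0 - coeff1;
}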
4809 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4810 than a poly_int64. */
4812 void
4813 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4814 rtx offset_rtx, rtx temp1, rtx temp2)
4816 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4817 temp1, temp2, 0, false);
4820 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4821 TEMP1 is available as a temporary if nonnull. FORCE_ISA_MODE is as
4822 for aarch64_add_offset. EMIT_MOVE_IMM is false if TEMP1 already
4823 contains abs (DELTA). */
4825 static inline void
4826 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta,
4827 aarch64_feature_flags force_isa_mode, bool emit_move_imm)
4829 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4830 temp1, temp2, force_isa_mode, true, emit_move_imm);
4833 /* Subtract DELTA from the stack pointer, marking the instructions
4834 frame-related if FRAME_RELATED_P. FORCE_ISA_MODE is as for
4835 aarch64_add_offset. TEMP1 is available as a temporary if nonnull. */
4837 static inline void
4838 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta,
4839 aarch64_feature_flags force_isa_mode,
4840 bool frame_related_p, bool emit_move_imm = true)
4842 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4843 temp1, temp2, force_isa_mode, frame_related_p,
4844 emit_move_imm);
4847 /* A streaming-compatible function needs to switch temporarily to the known
4848 PSTATE.SM mode described by LOCAL_MODE. The low bit of OLD_SVCR contains
4849 the runtime state of PSTATE.SM in the streaming-compatible code, before
4850 the start of the switch to LOCAL_MODE.
4852 Emit instructions to branch around the mode switch if PSTATE.SM already
4853 matches LOCAL_MODE. Return the label that the branch jumps to. */
4855 static rtx_insn *
4856 aarch64_guard_switch_pstate_sm (rtx old_svcr, aarch64_feature_flags local_mode)
4858 local_mode &= AARCH64_FL_SM_STATE;
4859 gcc_assert (local_mode != 0);
4860 auto already_ok_cond = (local_mode & AARCH64_FL_SM_ON ? NE : EQ);
4861 auto *label = gen_label_rtx ();
4862 auto *jump = emit_jump_insn (gen_aarch64_tb (already_ok_cond, DImode, DImode,
4863 old_svcr, const0_rtx, label));
4864 JUMP_LABEL (jump) = label;
4865 return label;
4868 /* Emit code to switch from the PSTATE.SM state in OLD_MODE to the PSTATE.SM
4869 state in NEW_MODE. This is known to involve either an SMSTART SM or
4870 an SMSTOP SM. */
4872 static void
4873 aarch64_switch_pstate_sm (aarch64_feature_flags old_mode,
4874 aarch64_feature_flags new_mode)
4876 old_mode &= AARCH64_FL_SM_STATE;
4877 new_mode &= AARCH64_FL_SM_STATE;
4878 gcc_assert (old_mode != new_mode);
4880 if ((new_mode & AARCH64_FL_SM_ON)
4881 || (new_mode == 0 && (old_mode & AARCH64_FL_SM_OFF)))
4882 emit_insn (gen_aarch64_smstart_sm ());
4883 else
4884 emit_insn (gen_aarch64_smstop_sm ());
4887 /* As a side-effect, SMSTART SM and SMSTOP SM clobber the contents of all
4888 FP and predicate registers. This class emits code to preserve any
4889 necessary registers around the mode switch.
4891 The class uses four approaches to saving and restoring contents, enumerated
4892 by group_type:
4894 - GPR: save and restore the contents of FP registers using GPRs.
4895 This is used if the FP register contains no more than 64 significant
4896 bits. The registers used are FIRST_GPR onwards.
4898 - MEM_128: save and restore 128-bit SIMD registers using memory.
4900 - MEM_SVE_PRED: save and restore full SVE predicate registers using memory.
4902 - MEM_SVE_DATA: save and restore full SVE vector registers using memory.
4904 The save slots within each memory group are consecutive, with the
4905 MEM_SVE_PRED slots occupying a region below the MEM_SVE_DATA slots.
4907 There will only be two mode switches for each use of SME, so they should
4908 not be particularly performance-sensitive. It's also rare for SIMD, SVE
4909 or predicate registers to be live across mode switches. We therefore
4910 don't preallocate the save slots but instead allocate them locally on
4911 demand. This makes the code emitted by the class self-contained. */
4913 class aarch64_sme_mode_switch_regs
4915 public:
4916 static const unsigned int FIRST_GPR = R10_REGNUM;
4918 void add_reg (machine_mode, unsigned int);
4919 void add_call_args (rtx_call_insn *);
4920 void add_call_result (rtx_call_insn *);
4921 void add_call_preserved_reg (unsigned int);
4922 void add_call_preserved_regs (bitmap);
4924 void emit_prologue ();
4925 void emit_epilogue ();
4927 /* The number of GPRs needed to save FP registers, starting from
4928 FIRST_GPR. */
4929 unsigned int num_gprs () { return m_group_count[GPR]; }
4931 private:
4932 enum sequence { PROLOGUE, EPILOGUE };
4933 enum group_type { GPR, MEM_128, MEM_SVE_PRED, MEM_SVE_DATA, NUM_GROUPS };
4935 /* Information about the save location for one FP, SIMD, SVE data, or
4936 SVE predicate register. */
4937 struct save_location {
4938 /* The register to be saved. */
4939 rtx reg;
4941 /* Which group the save location belongs to. */
4942 group_type group;
4944 /* A zero-based index of the register within the group. */
4945 unsigned int index;
4948 unsigned int sve_data_headroom ();
4949 rtx get_slot_mem (machine_mode, poly_int64);
4950 void emit_stack_adjust (sequence, poly_int64);
4951 void emit_mem_move (sequence, const save_location &, poly_int64);
4953 void emit_gpr_moves (sequence);
4954 void emit_mem_128_moves (sequence);
4955 void emit_sve_sp_adjust (sequence);
4956 void emit_sve_pred_moves (sequence);
4957 void emit_sve_data_moves (sequence);
4959 /* All save locations, in no particular order. */
4960 auto_vec<save_location, 12> m_save_locations;
4962 /* The number of registers in each group. */
4963 unsigned int m_group_count[NUM_GROUPS] = {};
4966 /* Record that (reg:MODE REGNO) needs to be preserved around the mode
4967 switch. */
4969 void
4970 aarch64_sme_mode_switch_regs::add_reg (machine_mode mode, unsigned int regno)
4972 if (!FP_REGNUM_P (regno) && !PR_REGNUM_P (regno))
4973 return;
4975 unsigned int end_regno = end_hard_regno (mode, regno);
4976 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4977 gcc_assert ((vec_flags & VEC_STRUCT) || end_regno == regno + 1);
4978 for (; regno < end_regno; regno++)
4980 /* Force the mode of SVE saves and restores even for single registers.
4981 This is necessary because big-endian targets only allow LDR Z and
4982 STR Z to be used with byte modes. */
4983 machine_mode submode = mode;
4984 if (vec_flags & VEC_SVE_PRED)
4985 submode = VNx16BImode;
4986 else if (vec_flags & VEC_SVE_DATA)
4987 submode = SVE_BYTE_MODE;
4988 else if (vec_flags & VEC_STRUCT)
4990 if (vec_flags & VEC_PARTIAL)
4991 submode = V8QImode;
4992 else
4993 submode = V16QImode;
4995 save_location loc;
4996 loc.reg = gen_rtx_REG (submode, regno);
4997 if (vec_flags & VEC_SVE_PRED)
4999 gcc_assert (PR_REGNUM_P (regno));
5000 loc.group = MEM_SVE_PRED;
5002 else
5004 gcc_assert (FP_REGNUM_P (regno));
5005 if (known_le (GET_MODE_SIZE (submode), 8))
5006 loc.group = GPR;
5007 else if (known_eq (GET_MODE_SIZE (submode), 16))
5008 loc.group = MEM_128;
5009 else
5010 loc.group = MEM_SVE_DATA;
5012 loc.index = m_group_count[loc.group]++;
5013 m_save_locations.quick_push (loc);
5017 /* Record that the arguments to CALL_INSN need to be preserved around
5018 the mode switch. */
5020 void
5021 aarch64_sme_mode_switch_regs::add_call_args (rtx_call_insn *call_insn)
5023 for (rtx node = CALL_INSN_FUNCTION_USAGE (call_insn);
5024 node; node = XEXP (node, 1))
5026 rtx item = XEXP (node, 0);
5027 if (GET_CODE (item) != USE)
5028 continue;
5029 item = XEXP (item, 0);
5030 if (!REG_P (item))
5031 continue;
5032 add_reg (GET_MODE (item), REGNO (item));
5036 /* Record that the return value from CALL_INSN (if any) needs to be
5037 preserved around the mode switch. */
5039 void
5040 aarch64_sme_mode_switch_regs::add_call_result (rtx_call_insn *call_insn)
5042 rtx pat = PATTERN (call_insn);
5043 gcc_assert (GET_CODE (pat) == PARALLEL);
5044 pat = XVECEXP (pat, 0, 0);
5045 if (GET_CODE (pat) == CALL)
5046 return;
5047 rtx dest = SET_DEST (pat);
5048 if (GET_CODE (dest) == PARALLEL)
5049 for (int i = 0; i < XVECLEN (dest, 0); ++i)
5051 rtx x = XVECEXP (dest, 0, i);
5052 gcc_assert (GET_CODE (x) == EXPR_LIST);
5053 rtx reg = XEXP (x, 0);
5054 add_reg (GET_MODE (reg), REGNO (reg));
5056 else
5057 add_reg (GET_MODE (dest), REGNO (dest));
5060 /* REGNO is a register that is call-preserved under the current function's ABI.
5061 Record that it must be preserved around the mode switch. */
5063 void
5064 aarch64_sme_mode_switch_regs::add_call_preserved_reg (unsigned int regno)
5066 if (FP_REGNUM_P (regno))
5067 switch (crtl->abi->id ())
5069 case ARM_PCS_SVE:
5070 add_reg (VNx16QImode, regno);
5071 break;
5072 case ARM_PCS_SIMD:
5073 add_reg (V16QImode, regno);
5074 break;
5075 case ARM_PCS_AAPCS64:
5076 add_reg (DImode, regno);
5077 break;
5078 default:
5079 gcc_unreachable ();
5081 else if (PR_REGNUM_P (regno))
5082 add_reg (VNx16BImode, regno);
5085 /* The hard registers in REGS are call-preserved under the current function's
5086 ABI. Record that they must be preserved around the mode switch. */
5088 void
5089 aarch64_sme_mode_switch_regs::add_call_preserved_regs (bitmap regs)
5091 bitmap_iterator bi;
5092 unsigned int regno;
5093 EXECUTE_IF_SET_IN_BITMAP (regs, 0, regno, bi)
5094 if (HARD_REGISTER_NUM_P (regno))
5095 add_call_preserved_reg (regno);
5096 else
5097 break;
5100 /* Emit code to save registers before the mode switch. */
5102 void
5103 aarch64_sme_mode_switch_regs::emit_prologue ()
5105 emit_sve_sp_adjust (PROLOGUE);
5106 emit_sve_pred_moves (PROLOGUE);
5107 emit_sve_data_moves (PROLOGUE);
5108 emit_mem_128_moves (PROLOGUE);
5109 emit_gpr_moves (PROLOGUE);
5112 /* Emit code to restore registers after the mode switch. */
5114 void
5115 aarch64_sme_mode_switch_regs::emit_epilogue ()
5117 emit_gpr_moves (EPILOGUE);
5118 emit_mem_128_moves (EPILOGUE);
5119 emit_sve_pred_moves (EPILOGUE);
5120 emit_sve_data_moves (EPILOGUE);
5121 emit_sve_sp_adjust (EPILOGUE);
5124 /* The SVE predicate registers are stored below the SVE data registers,
5125 with the predicate save area being padded to a data-register-sized
5126 boundary. Return the size of this padded area as a whole number
5127 of data register slots. */
5129 unsigned int
5130 aarch64_sme_mode_switch_regs::sve_data_headroom ()
5132 return CEIL (m_group_count[MEM_SVE_PRED], 8);
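/* Illustrative, standalone sketch (not part of the build; the example_*
   name is hypothetical).  An SVE predicate register holds one bit per
   vector byte, so it is one eighth the size of a data register and eight
   predicate slots round up to one data-register-sized slot: 3 predicate
   saves need 1 slot of headroom, 9 need 2.  */

static unsigned int
example_pred_headroom (unsigned int num_pred_saves)
{
  return (num_pred_saves + 7) / 8;
}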
5135 /* Return a memory reference of mode MODE to OFFSET bytes from the
5136 stack pointer. */
5139 aarch64_sme_mode_switch_regs::get_slot_mem (machine_mode mode,
5140 poly_int64 offset)
5142 rtx addr = plus_constant (Pmode, stack_pointer_rtx, offset);
5143 return gen_rtx_MEM (mode, addr);
5146 /* Allocate or deallocate SIZE bytes of stack space: SEQ decides which. */
5148 void
5149 aarch64_sme_mode_switch_regs::emit_stack_adjust (sequence seq,
5150 poly_int64 size)
5152 if (seq == PROLOGUE)
5153 size = -size;
5154 emit_insn (gen_rtx_SET (stack_pointer_rtx,
5155 plus_constant (Pmode, stack_pointer_rtx, size)));
5158 /* Save or restore the register in LOC, whose slot is OFFSET bytes from
5159 the stack pointer. SEQ chooses between saving and restoring. */
5161 void
5162 aarch64_sme_mode_switch_regs::emit_mem_move (sequence seq,
5163 const save_location &loc,
5164 poly_int64 offset)
5166 rtx mem = get_slot_mem (GET_MODE (loc.reg), offset);
5167 if (seq == PROLOGUE)
5168 emit_move_insn (mem, loc.reg);
5169 else
5170 emit_move_insn (loc.reg, mem);
5173 /* Emit instructions to save or restore the GPR group. SEQ chooses between
5174 saving and restoring. */
5176 void
5177 aarch64_sme_mode_switch_regs::emit_gpr_moves (sequence seq)
5179 for (auto &loc : m_save_locations)
5180 if (loc.group == GPR)
5182 gcc_assert (loc.index < 8);
5183 rtx gpr = gen_rtx_REG (GET_MODE (loc.reg), FIRST_GPR + loc.index);
5184 if (seq == PROLOGUE)
5185 emit_move_insn (gpr, loc.reg);
5186 else
5187 emit_move_insn (loc.reg, gpr);
5191 /* Emit instructions to save or restore the MEM_128 group. SEQ chooses
5192 between saving and restoring. */
5194 void
5195 aarch64_sme_mode_switch_regs::emit_mem_128_moves (sequence seq)
5197 HOST_WIDE_INT count = m_group_count[MEM_128];
5198 if (count == 0)
5199 return;
5201 auto sp = stack_pointer_rtx;
5202 auto sp_adjust = (seq == PROLOGUE ? -count : count) * 16;
5204 /* Pick a common mode that supports LDR & STR with pre/post-modification
5205 and LDP & STP with pre/post-modification. */
5206 auto mode = TFmode;
5208 /* An instruction pattern that should be emitted at the end. */
5209 rtx last_pat = NULL_RTX;
5211 /* A previous MEM_128 location that hasn't been handled yet. */
5212 save_location *prev_loc = nullptr;
5214 /* Look for LDP/STPs and record any leftover LDR/STR in PREV_LOC. */
5215 for (auto &loc : m_save_locations)
5216 if (loc.group == MEM_128)
5218 if (!prev_loc)
5220 prev_loc = &loc;
5221 continue;
5223 gcc_assert (loc.index == prev_loc->index + 1);
5225 /* The offset of the base of the save area from the current
5226 stack pointer. */
5227 HOST_WIDE_INT bias = 0;
5228 if (prev_loc->index == 0 && seq == PROLOGUE)
5229 bias = sp_adjust;
5231 /* Get the two sets in the LDP/STP. */
5232 rtx ops[] = {
5233 gen_rtx_REG (mode, REGNO (prev_loc->reg)),
5234 get_slot_mem (mode, prev_loc->index * 16 + bias),
5235 gen_rtx_REG (mode, REGNO (loc.reg)),
5236 get_slot_mem (mode, loc.index * 16 + bias)
5238 unsigned int lhs = (seq == PROLOGUE);
5239 rtx set1 = gen_rtx_SET (ops[lhs], ops[1 - lhs]);
5240 rtx set2 = gen_rtx_SET (ops[lhs + 2], ops[3 - lhs]);
5242 /* Combine the sets with any stack allocation/deallocation. */
5243 rtx pat;
5244 if (prev_loc->index == 0)
5246 rtx plus_sp = plus_constant (Pmode, sp, sp_adjust);
5247 rtvec vec = gen_rtvec (3, gen_rtx_SET (sp, plus_sp), set1, set2);
5248 pat = gen_rtx_PARALLEL (VOIDmode, vec);
5250 else if (seq == PROLOGUE)
5251 pat = aarch64_gen_store_pair (ops[1], ops[0], ops[2]);
5252 else
5253 pat = aarch64_gen_load_pair (ops[0], ops[2], ops[1]);
5255 /* Queue a deallocation to the end, otherwise emit the
5256 instruction now. */
5257 if (seq == EPILOGUE && prev_loc->index == 0)
5258 last_pat = pat;
5259 else
5260 emit_insn (pat);
5261 prev_loc = nullptr;
5264 /* Handle any leftover LDR/STR. */
5265 if (prev_loc)
5267 rtx reg = gen_rtx_REG (mode, REGNO (prev_loc->reg));
5268 rtx addr;
5269 if (prev_loc->index != 0)
5270 addr = plus_constant (Pmode, sp, prev_loc->index * 16);
5271 else if (seq == PROLOGUE)
5273 rtx allocate = plus_constant (Pmode, sp, -count * 16);
5274 addr = gen_rtx_PRE_MODIFY (Pmode, sp, allocate);
5276 else
5278 rtx deallocate = plus_constant (Pmode, sp, count * 16);
5279 addr = gen_rtx_POST_MODIFY (Pmode, sp, deallocate);
5281 rtx mem = gen_rtx_MEM (mode, addr);
5282 if (seq == PROLOGUE)
5283 emit_move_insn (mem, reg);
5284 else
5285 emit_move_insn (reg, mem);
5288 if (last_pat)
5289 emit_insn (last_pat);
5292 /* Allocate or deallocate the stack space needed by the SVE groups.
5293 SEQ chooses between allocating and deallocating. */
5295 void
5296 aarch64_sme_mode_switch_regs::emit_sve_sp_adjust (sequence seq)
5298 if (unsigned int count = m_group_count[MEM_SVE_DATA] + sve_data_headroom ())
5299 emit_stack_adjust (seq, count * BYTES_PER_SVE_VECTOR);
5302 /* Save or restore the MEM_SVE_DATA group. SEQ chooses between saving
5303 and restoring. */
5305 void
5306 aarch64_sme_mode_switch_regs::emit_sve_data_moves (sequence seq)
5308 for (auto &loc : m_save_locations)
5309 if (loc.group == MEM_SVE_DATA)
5311 auto index = loc.index + sve_data_headroom ();
5312 emit_mem_move (seq, loc, index * BYTES_PER_SVE_VECTOR);
5316 /* Save or restore the MEM_SVE_PRED group. SEQ chooses between saving
5317 and restoring. */
5319 void
5320 aarch64_sme_mode_switch_regs::emit_sve_pred_moves (sequence seq)
5322 for (auto &loc : m_save_locations)
5323 if (loc.group == MEM_SVE_PRED)
5324 emit_mem_move (seq, loc, loc.index * BYTES_PER_SVE_PRED);
5327 /* Set DEST to (vec_series BASE STEP). */
5329 static void
5330 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
5332 machine_mode mode = GET_MODE (dest);
5333 scalar_mode inner = GET_MODE_INNER (mode);
5335 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5336 if (!aarch64_sve_index_immediate_p (base))
5337 base = force_reg (inner, base);
5338 if (!aarch64_sve_index_immediate_p (step))
5339 step = force_reg (inner, step);
5341 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
5344 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5345 register of mode MODE. Use TARGET for the result if it's nonnull
5346 and convenient.
5348 The two vector modes must have the same element mode. The behavior
5349 is to duplicate architectural lane N of SRC into architectural lanes
5350 N + I * STEP of the result. On big-endian targets, architectural
5351 lane 0 of an Advanced SIMD vector is the last element of the vector
5352 in memory layout, so for big-endian targets this operation has the
5353 effect of reversing SRC before duplicating it. Callers need to
5354 account for this. */
5357 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
5359 machine_mode src_mode = GET_MODE (src);
5360 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
5361 insn_code icode = (BYTES_BIG_ENDIAN
5362 ? code_for_aarch64_vec_duplicate_vq_be (mode)
5363 : code_for_aarch64_vec_duplicate_vq_le (mode));
5365 unsigned int i = 0;
5366 expand_operand ops[3];
5367 create_output_operand (&ops[i++], target, mode);
5368 create_output_operand (&ops[i++], src, src_mode);
5369 if (BYTES_BIG_ENDIAN)
5371 /* Create a PARALLEL describing the reversal of SRC. */
5372 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
5373 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
5374 nelts_per_vq - 1, -1);
5375 create_fixed_operand (&ops[i++], sel);
5377 expand_insn (icode, i, ops);
5378 return ops[0].value;
5381 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5382 the memory image into DEST. Return true on success. */
5384 static bool
5385 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
5387 src = force_const_mem (GET_MODE (src), src);
5388 if (!src)
5389 return false;
5391 /* Make sure that the address is legitimate. */
5392 if (!aarch64_sve_ld1rq_operand_p (src))
5394 rtx addr = force_reg (Pmode, XEXP (src, 0));
5395 src = replace_equiv_address (src, addr);
5398 machine_mode mode = GET_MODE (dest);
5399 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5400 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5401 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
5402 return true;
5405 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5406 by N "background" values. Try to move it into TARGET using:
5408 PTRUE PRED.<T>, VL<N>
5409 MOV TRUE.<T>, #<foreground>
5410 MOV FALSE.<T>, #<background>
5411 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5413 The PTRUE is always a single instruction but the MOVs might need a
5414 longer sequence. If the background value is zero (as it often is),
5415 the sequence can sometimes collapse to a PTRUE followed by a
5416 zero-predicated move.
5418 Return the target on success, otherwise return null. */
5420 static rtx
5421 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
5423 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
5425 /* Make sure that the PTRUE is valid. */
5426 machine_mode mode = GET_MODE (src);
5427 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5428 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5429 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
5430 == AARCH64_NUM_SVPATTERNS)
5431 return NULL_RTX;
5433 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
5434 rtx_vector_builder true_builder (mode, npatterns, 1);
5435 rtx_vector_builder false_builder (mode, npatterns, 1);
5436 for (unsigned int i = 0; i < npatterns; ++i)
5438 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5439 pred_builder.quick_push (CONST1_RTX (BImode));
5441 for (unsigned int i = 0; i < npatterns; ++i)
5443 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
5444 pred_builder.quick_push (CONST0_RTX (BImode));
5446 expand_operand ops[4];
5447 create_output_operand (&ops[0], target, mode);
5448 create_input_operand (&ops[1], true_builder.build (), mode);
5449 create_input_operand (&ops[2], false_builder.build (), mode);
5450 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
5451 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
5452 return target;
5455 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5456 SVE data mode and isn't a legitimate constant. Use TARGET for the
5457 result if convenient.
5459 The returned register can have whatever mode seems most natural
5460 given the contents of SRC. */
5462 static rtx
5463 aarch64_expand_sve_const_vector (rtx target, rtx src)
5465 machine_mode mode = GET_MODE (src);
5466 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5467 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
5468 scalar_mode elt_mode = GET_MODE_INNER (mode);
5469 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
5470 unsigned int container_bits = aarch64_sve_container_bits (mode);
5471 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
5473 if (nelts_per_pattern == 1
5474 && encoded_bits <= 128
5475 && container_bits != elt_bits)
5477 /* We have a partial vector mode and a constant whose full-vector
5478 equivalent would occupy a repeating 128-bit sequence. Build that
5479 full-vector equivalent instead, so that we have the option of
5480 using LD1RQ and Advanced SIMD operations. */
5481 unsigned int repeat = container_bits / elt_bits;
5482 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
5483 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
5484 for (unsigned int i = 0; i < npatterns; ++i)
5485 for (unsigned int j = 0; j < repeat; ++j)
5486 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5487 target = aarch64_target_reg (target, full_mode);
5488 return aarch64_expand_sve_const_vector (target, builder.build ());
5491 if (nelts_per_pattern == 1 && encoded_bits == 128)
5493 /* The constant is a duplicated quadword but can't be narrowed
5494 beyond a quadword. Get the memory image of the first quadword
5495 as a 128-bit vector and try using LD1RQ to load it from memory.
5497 The effect for both endiannesses is to load memory lane N into
5498 architectural lanes N + I * STEP of the result. On big-endian
5499 targets, the layout of the 128-bit vector in an Advanced SIMD
5500 register would be different from its layout in an SVE register,
5501 but this 128-bit vector is a memory value only. */
5502 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5503 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
5504 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
5505 return target;
5508 if (nelts_per_pattern == 1 && encoded_bits < 128)
5510 /* The vector is a repeating sequence of 64 bits or fewer.
5511 See if we can load them using an Advanced SIMD move and then
5512 duplicate it to fill a vector. This is better than using a GPR
5513 move because it keeps everything in the same register file. */
5514 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5515 rtx_vector_builder builder (vq_mode, npatterns, 1);
5516 for (unsigned int i = 0; i < npatterns; ++i)
5518 /* We want memory lane N to go into architectural lane N,
5519 so reverse for big-endian targets. The DUP .Q pattern
5520 has a compensating reverse built-in. */
5521 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
5522 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
5524 rtx vq_src = builder.build ();
5525 if (aarch64_simd_valid_immediate (vq_src, NULL))
5527 vq_src = force_reg (vq_mode, vq_src);
5528 return aarch64_expand_sve_dupq (target, mode, vq_src);
5531 /* Get an integer representation of the repeating part of Advanced
5532 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5533 which for big-endian targets is lane-swapped wrt a normal
5534 Advanced SIMD vector. This means that for both endiannesses,
5535 memory lane N of SVE vector SRC corresponds to architectural
5536 lane N of a register holding VQ_SRC. This in turn means that
5537 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5538 as a single 128-bit value) and thus that memory lane 0 of SRC is
5539 in the lsb of the integer. Duplicating the integer therefore
5540 ensures that memory lane N of SRC goes into architectural lane
5541 N + I * INDEX of the SVE register. */
5542 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
5543 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
5544 if (elt_value)
5546 /* Pretend that we had a vector of INT_MODE to start with. */
5547 elt_mode = int_mode;
5548 mode = aarch64_full_sve_mode (int_mode).require ();
5550 /* If the integer can be moved into a general register by a
5551 single instruction, do that and duplicate the result. */
5552 if (CONST_INT_P (elt_value)
5553 && aarch64_move_imm (INTVAL (elt_value),
5554 encoded_bits <= 32 ? SImode : DImode))
5556 elt_value = force_reg (elt_mode, elt_value);
5557 return expand_vector_broadcast (mode, elt_value);
5560 else if (npatterns == 1)
5561 /* We're duplicating a single value, but can't do better than
5562 force it to memory and load from there. This handles things
5563 like symbolic constants. */
5564 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
5566 if (elt_value)
5568 /* Load the element from memory if we can, otherwise move it into
5569 a register and use a DUP. */
5570 rtx op = force_const_mem (elt_mode, elt_value);
5571 if (!op)
5572 op = force_reg (elt_mode, elt_value);
5573 return expand_vector_broadcast (mode, op);
5577 /* Try using INDEX. */
5578 rtx base, step;
5579 if (const_vec_series_p (src, &base, &step))
5581 aarch64_expand_vec_series (target, base, step);
5582 return target;
5585 /* From here on, it's better to force the whole constant to memory
5586 if we can. */
5587 if (GET_MODE_NUNITS (mode).is_constant ())
5588 return NULL_RTX;
5590 if (nelts_per_pattern == 2)
5591 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5592 return res;
5594 /* Expand each pattern individually. */
5595 gcc_assert (npatterns > 1);
5596 rtx_vector_builder builder;
5597 auto_vec<rtx, 16> vectors (npatterns);
5598 for (unsigned int i = 0; i < npatterns; ++i)
5600 builder.new_vector (mode, 1, nelts_per_pattern);
5601 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5602 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5603 vectors.quick_push (force_reg (mode, builder.build ()));
5606 /* Use permutes to interleave the separate vectors. */
5607 while (npatterns > 1)
5609 npatterns /= 2;
5610 for (unsigned int i = 0; i < npatterns; ++i)
5612 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
5613 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5614 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5615 vectors[i] = tmp;
5618 gcc_assert (vectors[0] == target);
5619 return target;
5622 /* Use WHILE to set a predicate register of mode MODE in which the first
5623 VL bits are set and the rest are clear. Use TARGET for the register
5624 if it's nonnull and convenient. */
5626 static rtx
5627 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5628 unsigned int vl)
5630 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
5631 target = aarch64_target_reg (target, mode);
5632 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
5633 target, const0_rtx, limit));
5634 return target;
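/* Hedged example: for MODE == VNx4BI and VL == 3, the function above
   would typically emit

	mov	x0, 3
	whilelo	p0.s, xzr, x0

   leaving the first three .S elements of the predicate set and the rest
   clear.  */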
5637 static rtx
5638 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5640 /* BUILDER is a constant predicate in which the index of every set bit
5641 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5642 by inverting every element at a multiple of ELT_SIZE and EORing the
5643 result with an ELT_SIZE PTRUE.
5645 Return a register that contains the constant on success, otherwise
5646 return null. Use TARGET as the register if it is nonnull and
5647 convenient. */
5649 static rtx
5650 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5651 unsigned int elt_size)
5653 /* Invert every element at a multiple of ELT_SIZE, keeping the
5654 other bits zero. */
5655 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5656 builder.nelts_per_pattern ());
5657 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5658 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5659 inv_builder.quick_push (const1_rtx);
5660 else
5661 inv_builder.quick_push (const0_rtx);
5662 inv_builder.finalize ();
5664 /* See if we can load the constant cheaply. */
5665 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5666 if (!inv)
5667 return NULL_RTX;
5669 /* EOR the result with an ELT_SIZE PTRUE. */
5670 rtx mask = aarch64_ptrue_all (elt_size);
5671 mask = force_reg (VNx16BImode, mask);
5672 inv = gen_lowpart (VNx16BImode, inv);
5673 target = aarch64_target_reg (target, VNx16BImode);
5674 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5675 return target;
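/* Illustrative example (assumed values): to build the .S predicate
   { 0, 1, 1, 1, ... } -- everything active except element 0 -- the
   inverted constant is { 1, 0, 0, 0, ... }, which is simply PTRUE VL1,
   so the function above amounts to roughly

	ptrue	p1.s, vl1		// inverted constant
	ptrue	p2.s			// ELT_SIZE ptrue
	eor	p0.b, p2/z, p1.b, p2.b	// recover the original constant

   with the zeroing governing predicate clearing the bits that do not
   correspond to .S element positions.  */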
5678 /* BUILDER is a constant predicate in which the index of every set bit
5679 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5680 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5681 register on success, otherwise return null. Use TARGET as the register
5682 if nonnull and convenient. */
5684 static rtx
5685 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5686 unsigned int elt_size,
5687 unsigned int permute_size)
5689 /* We're going to split the constant into two new constants A and B,
5690 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5691 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5693 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5694 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5696 where _ indicates elements that will be discarded by the permute.
5698 First calculate the ELT_SIZEs for A and B. */
5699 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
5700 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
5701 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
5702 if (INTVAL (builder.elt (i)) != 0)
5704 if (i & permute_size)
5705 b_elt_size |= i - permute_size;
5706 else
5707 a_elt_size |= i;
5709 a_elt_size &= -a_elt_size;
5710 b_elt_size &= -b_elt_size;
5712 /* Now construct the vectors themselves. */
5713 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5714 builder.nelts_per_pattern ());
5715 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5716 builder.nelts_per_pattern ());
5717 unsigned int nelts = builder.encoded_nelts ();
5718 for (unsigned int i = 0; i < nelts; ++i)
5719 if (i & (elt_size - 1))
5721 a_builder.quick_push (const0_rtx);
5722 b_builder.quick_push (const0_rtx);
5724 else if ((i & permute_size) == 0)
5726 /* The A and B elements are significant. */
5727 a_builder.quick_push (builder.elt (i));
5728 b_builder.quick_push (builder.elt (i + permute_size));
5730 else
5732 /* The A and B elements are going to be discarded, so pick whatever
5733 is likely to give a nice constant. We are targeting element
5734 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5735 with the aim of each being a sequence of ones followed by
5736 a sequence of zeros. So:
5738 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5739 duplicate the last X_ELT_SIZE element, to extend the
5740 current sequence of ones or zeros.
5742 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5743 zero, so that the constant really does have X_ELT_SIZE and
5744 not a smaller size. */
5745 if (a_elt_size > permute_size)
5746 a_builder.quick_push (const0_rtx);
5747 else
5748 a_builder.quick_push (a_builder.elt (i - a_elt_size));
5749 if (b_elt_size > permute_size)
5750 b_builder.quick_push (const0_rtx);
5751 else
5752 b_builder.quick_push (b_builder.elt (i - b_elt_size));
5754 a_builder.finalize ();
5755 b_builder.finalize ();
5757 /* Try loading A into a register. */
5758 rtx_insn *last = get_last_insn ();
5759 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5760 if (!a)
5761 return NULL_RTX;
5763 /* Try loading B into a register. */
5764 rtx b = a;
5765 if (a_builder != b_builder)
5767 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5768 if (!b)
5770 delete_insns_since (last);
5771 return NULL_RTX;
5775 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
5776 operands but permutes them as though they had mode MODE. */
5777 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5778 target = aarch64_target_reg (target, GET_MODE (a));
5779 rtx type_reg = CONST0_RTX (mode);
5780 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
5781 return target;
5784 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5785 constant in BUILDER into an SVE predicate register. Return the register
5786 on success, otherwise return null. Use TARGET for the register if
5787 nonnull and convenient.
5789 ALLOW_RECURSE_P is true if we can use methods that would call this
5790 function recursively. */
5792 static rtx
5793 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5794 bool allow_recurse_p)
5796 if (builder.encoded_nelts () == 1)
5797 /* A PFALSE or a PTRUE .B ALL. */
5798 return aarch64_emit_set_immediate (target, builder);
5800 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5801 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5803 /* If we can load the constant using PTRUE, use it as-is. */
5804 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5805 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5806 return aarch64_emit_set_immediate (target, builder);
5808 /* Otherwise use WHILE to set the first VL bits. */
5809 return aarch64_sve_move_pred_via_while (target, mode, vl);
5812 if (!allow_recurse_p)
5813 return NULL_RTX;
5815 /* Try inverting the vector in element size ELT_SIZE and then EORing
5816 the result with an ELT_SIZE PTRUE. */
5817 if (INTVAL (builder.elt (0)) == 0)
5818 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5819 elt_size))
5820 return res;
5822 /* Try using TRN1 to permute two simpler constants. */
5823 for (unsigned int i = elt_size; i <= 8; i *= 2)
5824 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5825 elt_size, i))
5826 return res;
5828 return NULL_RTX;
5831 /* Return an SVE predicate register that contains the VNx16BImode
5832 constant in BUILDER, without going through the move expanders.
5834 The returned register can have whatever mode seems most natural
5835 given the contents of BUILDER. Use TARGET for the result if
5836 convenient. */
5838 static rtx
5839 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5841 /* Try loading the constant using pure predicate operations. */
5842 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5843 return res;
5845 /* Try forcing the constant to memory. */
5846 if (builder.full_nelts ().is_constant ())
5847 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5849 target = aarch64_target_reg (target, VNx16BImode);
5850 emit_move_insn (target, mem);
5851 return target;
5854 /* The last resort is to load the constant as an integer and then
5855 compare it against zero. Use -1 for set bits in order to increase
5856 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
5857 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5858 builder.nelts_per_pattern ());
5859 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5860 int_builder.quick_push (INTVAL (builder.elt (i))
5861 ? constm1_rtx : const0_rtx);
5862 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5863 int_builder.build ());
5866 /* Set DEST to immediate IMM. */
5868 void
5869 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5871 machine_mode mode = GET_MODE (dest);
5873 /* Check on what type of symbol it is. */
5874 scalar_int_mode int_mode;
5875 if ((SYMBOL_REF_P (imm)
5876 || LABEL_REF_P (imm)
5877 || GET_CODE (imm) == CONST
5878 || GET_CODE (imm) == CONST_POLY_INT)
5879 && is_a <scalar_int_mode> (mode, &int_mode))
5881 rtx mem;
5882 poly_int64 offset;
5883 HOST_WIDE_INT const_offset;
5884 enum aarch64_symbol_type sty;
5886 /* If we have (const (plus symbol offset)), separate out the offset
5887 before we start classifying the symbol. */
5888 rtx base = strip_offset (imm, &offset);
5890 /* We must always add an offset involving VL separately, rather than
5891 folding it into the relocation. */
5892 if (!offset.is_constant (&const_offset))
5894 if (!TARGET_SVE)
5896 aarch64_report_sve_required ();
5897 return;
5899 if (base == const0_rtx
5900 && (aarch64_sve_cnt_immediate_p (offset)
5901 || aarch64_sve_rdvl_immediate_p (offset)))
5902 emit_insn (gen_rtx_SET (dest, imm));
5903 else
5905 /* Do arithmetic on 32-bit values if the result is smaller
5906 than that. */
5907 if (partial_subreg_p (int_mode, SImode))
5909 /* It is invalid to do symbol calculations in modes
5910 narrower than SImode. */
5911 gcc_assert (base == const0_rtx);
5912 dest = gen_lowpart (SImode, dest);
5913 int_mode = SImode;
5915 if (base != const0_rtx)
5917 base = aarch64_force_temporary (int_mode, dest, base);
5918 aarch64_add_offset (int_mode, dest, base, offset,
5919 NULL_RTX, NULL_RTX, 0, false);
5921 else
5922 aarch64_add_offset (int_mode, dest, base, offset,
5923 dest, NULL_RTX, 0, false);
5925 return;
5928 if (aarch64_rdsvl_immediate_p (base))
5930 /* We could handle non-constant offsets if they are ever
5931 generated. */
5932 gcc_assert (const_offset == 0);
5933 emit_insn (gen_rtx_SET (dest, imm));
5934 return;
5937 sty = aarch64_classify_symbol (base, const_offset);
5938 switch (sty)
5940 case SYMBOL_FORCE_TO_MEM:
5941 if (int_mode != ptr_mode)
5942 imm = convert_memory_address (ptr_mode, imm);
5944 if (const_offset != 0
5945 && targetm.cannot_force_const_mem (ptr_mode, imm))
5947 gcc_assert (can_create_pseudo_p ());
5948 base = aarch64_force_temporary (int_mode, dest, base);
5949 aarch64_add_offset (int_mode, dest, base, const_offset,
5950 NULL_RTX, NULL_RTX, 0, false);
5951 return;
5954 mem = force_const_mem (ptr_mode, imm);
5955 gcc_assert (mem);
5957 /* If we aren't generating PC relative literals, then
5958 we need to expand the literal pool access carefully.
5959 This is something that needs to be done in a number
5960 of places, so could well live as a separate function. */
5961 if (!aarch64_pcrelative_literal_loads)
5963 gcc_assert (can_create_pseudo_p ());
5964 base = gen_reg_rtx (ptr_mode);
5965 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5966 if (ptr_mode != Pmode)
5967 base = convert_memory_address (Pmode, base);
5968 mem = gen_rtx_MEM (ptr_mode, base);
5971 if (int_mode != ptr_mode)
5972 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5974 emit_insn (gen_rtx_SET (dest, mem));
5976 return;
5978 case SYMBOL_SMALL_TLSGD:
5979 case SYMBOL_SMALL_TLSDESC:
5980 case SYMBOL_SMALL_TLSIE:
5981 case SYMBOL_SMALL_GOT_28K:
5982 case SYMBOL_SMALL_GOT_4G:
5983 case SYMBOL_TINY_GOT:
5984 case SYMBOL_TINY_TLSIE:
5985 if (const_offset != 0)
5987 gcc_assert (can_create_pseudo_p ());
5988 base = aarch64_force_temporary (int_mode, dest, base);
5989 aarch64_add_offset (int_mode, dest, base, const_offset,
5990 NULL_RTX, NULL_RTX, 0, false);
5991 return;
5993 /* FALLTHRU */
5995 case SYMBOL_SMALL_ABSOLUTE:
5996 case SYMBOL_TINY_ABSOLUTE:
5997 case SYMBOL_TLSLE12:
5998 case SYMBOL_TLSLE24:
5999 case SYMBOL_TLSLE32:
6000 case SYMBOL_TLSLE48:
6001 aarch64_load_symref_appropriately (dest, imm, sty);
6002 return;
6004 default:
6005 gcc_unreachable ();
6009 if (!CONST_INT_P (imm))
6011 if (aarch64_sve_pred_mode_p (mode))
6013 /* Only the low bit of each .H, .S and .D element is defined,
6014 so we can set the upper bits to whatever we like. If the
6015 predicate is all-true in MODE, prefer to set all the undefined
6016 bits as well, so that we can share a single .B predicate for
6017 all modes. */
6018 if (imm == CONSTM1_RTX (mode))
6019 imm = CONSTM1_RTX (VNx16BImode);
6021 /* All methods for constructing predicate modes wider than VNx16BI
6022 will set the upper bits of each element to zero. Expose this
6023 by moving such constants as a VNx16BI, so that all bits are
6024 significant and so that constants for different modes can be
6025 shared. The wider constant will still be available as a
6026 REG_EQUAL note. */
6027 rtx_vector_builder builder;
6028 if (aarch64_get_sve_pred_bits (builder, imm))
6030 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6031 if (dest != res)
6032 emit_move_insn (dest, gen_lowpart (mode, res));
6033 return;
6037 if (GET_CODE (imm) == HIGH
6038 || aarch64_simd_valid_immediate (imm, NULL))
6040 emit_insn (gen_rtx_SET (dest, imm));
6041 return;
6044 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6045 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6047 if (dest != res)
6048 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6049 return;
6052 rtx mem = force_const_mem (mode, imm);
6053 gcc_assert (mem);
6054 emit_move_insn (dest, mem);
6055 return;
6058 aarch64_internal_mov_immediate (dest, imm, true, mode);
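/* Hedged example of the symbolic cases above (symbol name "foo" is
   purely illustrative): for a DImode move of a small-code-model absolute
   symbol, the SYMBOL_SMALL_ABSOLUTE path typically expands to

	adrp	x0, foo
	add	x0, x0, :lo12:foo

   whereas SYMBOL_FORCE_TO_MEM symbols are instead loaded with an LDR
   from the literal-pool entry built by force_const_mem.  */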
6061 /* Return the MEM rtx that provides the canary value that should be used
6062 for stack-smashing protection. MODE is the mode of the memory.
6063 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6064 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6065 indicates whether the caller is performing a SET or a TEST operation. */
6067 rtx
6068 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6069 aarch64_salt_type salt_type)
6071 rtx addr;
6072 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6074 gcc_assert (MEM_P (decl_rtl));
6075 addr = XEXP (decl_rtl, 0);
6076 poly_int64 offset;
6077 rtx base = strip_offset_and_salt (addr, &offset);
6078 if (!SYMBOL_REF_P (base))
6079 return decl_rtl;
6081 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6082 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6083 addr = gen_rtx_CONST (Pmode, addr);
6084 addr = plus_constant (Pmode, addr, offset);
6086 else
6088 /* Calculate the address from the system register. */
6089 rtx salt = GEN_INT (salt_type);
6090 addr = gen_reg_rtx (mode);
6091 if (mode == DImode)
6092 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6093 else
6095 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6096 addr = convert_memory_address (Pmode, addr);
6098 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6100 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
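/* Hedged illustration (assumed option values): with
   -mstack-protector-guard=sysreg -mstack-protector-guard-reg=tpidr_el0
   -mstack-protector-guard-offset=16, the non-SSP_GLOBAL path above reads
   the guard base from the system register and applies the offset, so the
   canary access becomes roughly

	mrs	x0, tpidr_el0
	ldr	x1, [x0, 16]

   where the MRS comes from the reg_stack_protect_address pattern.  */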
6103 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6104 that is known to contain PTRUE. */
6106 void
6107 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6109 expand_operand ops[3];
6110 machine_mode mode = GET_MODE (dest);
6111 create_output_operand (&ops[0], dest, mode);
6112 create_input_operand (&ops[1], pred, GET_MODE (pred));
6113 create_input_operand (&ops[2], src, mode);
6114 temporary_volatile_ok v (true);
6115 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6118 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6119 operand is in memory. In this case we need to use the predicated LD1
6120 and ST1 instead of LDR and STR, both for correctness on big-endian
6121 targets and because LD1 and ST1 support a wider range of addressing modes.
6122 PRED_MODE is the mode of the predicate.
6124 See the comment at the head of aarch64-sve.md for details about the
6125 big-endian handling. */
6127 void
6128 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
6130 machine_mode mode = GET_MODE (dest);
6131 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6132 if (!register_operand (src, mode)
6133 && !register_operand (dest, mode))
6135 rtx tmp = gen_reg_rtx (mode);
6136 if (MEM_P (src))
6137 aarch64_emit_sve_pred_move (tmp, ptrue, src);
6138 else
6139 emit_move_insn (tmp, src);
6140 src = tmp;
6142 aarch64_emit_sve_pred_move (dest, ptrue, src);
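/* Hedged example: a VNx16QI memory-to-memory move handled above would
   typically become

	ptrue	p0.b
	ld1b	{z0.b}, p0/z, [x1]
	st1b	{z0.b}, p0, [x0]

   rather than an LDR/STR pair, for the reasons given in the comment
   before the function.  */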
6145 /* Called only on big-endian targets. See whether an SVE vector move
6146 from SRC to DEST is effectively a REV[BHW] instruction, because at
6147 least one operand is a subreg of an SVE vector that has wider or
6148 narrower elements. Return true and emit the instruction if so.
6150 For example:
6152 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
6154 represents a VIEW_CONVERT between the following vectors, viewed
6155 in memory order:
6157 R2: { [0].high, [0].low, [1].high, [1].low, ... }
6158 R1: { [0], [1], [2], [3], ... }
6160 The high part of lane X in R2 should therefore correspond to lane X*2
6161 of R1, but the register representations are:
6163 msb lsb
6164 R2: ...... [1].high [1].low [0].high [0].low
6165 R1: ...... [3] [2] [1] [0]
6167 where the low part of lane X in R2 corresponds to lane X*2 in R1.
6168 We therefore need a reverse operation to swap the high and low values
6169 around.
6171 This is purely an optimization. Without it we would spill the
6172 subreg operand to the stack in one mode and reload it in the
6173 other mode, which has the same effect as the REV. */
6175 bool
6176 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
6178 gcc_assert (BYTES_BIG_ENDIAN);
6180 /* Do not try to optimize subregs that LRA has created for matched
6181 reloads. These subregs only exist as a temporary measure to make
6182 the RTL well-formed, but they are exempt from the usual
6183 TARGET_CAN_CHANGE_MODE_CLASS rules.
6185 For example, if we have:
6187 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
6189 and the constraints require R1 and R2 to be in the same register,
6190 LRA may need to create RTL such as:
6192 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
6193 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
6194 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
6196 which forces both the input and output of the original instruction
6197 to use the same hard register. But for this to work, the normal
6198 rules have to be suppressed on the subreg input, otherwise LRA
6199 would need to reload that input too, meaning that the process
6200 would never terminate. To compensate for this, the normal rules
6201 are also suppressed for the subreg output of the first move.
6202 Ignoring the special case and handling the first move normally
6203 would therefore generate wrong code: we would reverse the elements
6204 for the first subreg but not reverse them back for the second subreg. */
6205 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
6206 dest = SUBREG_REG (dest);
6207 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
6208 src = SUBREG_REG (src);
6210 /* The optimization handles two single SVE REGs with different element
6211 sizes. */
6212 if (!REG_P (dest)
6213 || !REG_P (src)
6214 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
6215 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
6216 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
6217 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
6218 return false;
6220 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
6221 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
6222 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
6223 UNSPEC_REV_SUBREG);
6224 emit_insn (gen_rtx_SET (dest, unspec));
6225 return true;
6228 /* Return a copy of X with mode MODE, without changing its other
6229 attributes. Unlike gen_lowpart, this doesn't care whether the
6230 mode change is valid. */
6232 rtx
6233 aarch64_replace_reg_mode (rtx x, machine_mode mode)
6235 if (GET_MODE (x) == mode)
6236 return x;
6238 x = shallow_copy_rtx (x);
6239 set_mode_and_regno (x, mode, REGNO (x));
6240 return x;
6243 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
6244 stored in wider integer containers. */
6246 static unsigned int
6247 aarch64_sve_rev_unspec (machine_mode mode)
6249 switch (GET_MODE_UNIT_SIZE (mode))
6251 case 1: return UNSPEC_REVB;
6252 case 2: return UNSPEC_REVH;
6253 case 4: return UNSPEC_REVW;
6255 gcc_unreachable ();
6258 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6259 operands. */
6261 void
6262 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6264 /* Decide which REV operation we need. The mode with wider elements
6265 determines the mode of the operands and the mode with the narrower
6266 elements determines the reverse width. */
6267 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6268 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6269 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6270 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6271 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6273 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6274 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6276 /* Get the operands in the appropriate modes and emit the instruction. */
6277 ptrue = gen_lowpart (pred_mode, ptrue);
6278 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6279 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6280 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6281 dest, ptrue, src));
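/* Hedged example: splitting a big-endian subreg move between VNx8HI and
   VNx4SI values selects UNSPEC_REVH with .S containers, i.e. roughly

	revh	z0.s, p0/m, z1.s

   which swaps the two halfwords within each 32-bit container.  */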
6284 static bool
6285 aarch64_function_ok_for_sibcall (tree, tree exp)
6287 if (crtl->abi->id () != expr_callee_abi (exp).id ())
6288 return false;
6290 tree fntype = TREE_TYPE (TREE_TYPE (CALL_EXPR_FN (exp)));
6291 if (aarch64_fntype_pstate_sm (fntype) & ~aarch64_cfun_incoming_pstate_sm ())
6292 return false;
6293 if (aarch64_fntype_pstate_za (fntype) != aarch64_cfun_incoming_pstate_za ())
6294 return false;
6295 return true;
6298 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6299 passed in SVE registers. */
6301 static bool
6302 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6303 const function_arg_info &arg)
6305 HOST_WIDE_INT size;
6306 machine_mode dummymode;
6307 int nregs;
6309 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6310 if (arg.mode == BLKmode && arg.type)
6311 size = int_size_in_bytes (arg.type);
6312 else
6313 /* No frontends can create types with variable-sized modes, so we
6314 shouldn't be asked to pass or return them. */
6315 size = GET_MODE_SIZE (arg.mode).to_constant ();
6317 /* Aggregates are passed by reference based on their size. */
6318 if (arg.aggregate_type_p ())
6319 size = int_size_in_bytes (arg.type);
6321 /* Variable-sized arguments are always passed by reference. */
6322 if (size < 0)
6323 return true;
6325 /* Can this be a candidate to be passed in fp/simd register(s)? */
6326 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6327 &dummymode, &nregs, NULL,
6328 !pcum || pcum->silent_p))
6329 return false;
6331 /* Arguments which are variable sized or larger than 2 registers are
6332 passed by reference unless they are a homogeneous floating-point
6333 aggregate. */
6334 return size > 2 * UNITS_PER_WORD;
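/* Hedged illustration of the size rule above (assumed-typical AAPCS64
   behaviour):

     struct big { long x[4]; };		// 32 bytes, not an HFA:
					//   passed by reference
     struct hfa { double a, b, c, d; };	// homogeneous FP aggregate:
					//   passed by value in d0-d3
					//   when registers are available
*/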
6337 /* Implement TARGET_PASS_BY_REFERENCE. */
6339 static bool
6340 aarch64_pass_by_reference (cumulative_args_t pcum_v,
6341 const function_arg_info &arg)
6343 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6345 if (!arg.type)
6346 return aarch64_pass_by_reference_1 (pcum, arg);
6348 pure_scalable_type_info pst_info;
6349 switch (pst_info.analyze (arg.type))
6351 case pure_scalable_type_info::IS_PST:
6352 if (pcum && !pcum->silent_p && !TARGET_SVE)
6353 /* We can't gracefully recover at this point, so make this a
6354 fatal error. */
6355 fatal_error (input_location, "arguments of type %qT require"
6356 " the SVE ISA extension", arg.type);
6358 /* Variadic SVE types are passed by reference. Normal non-variadic
6359 arguments are too if we've run out of registers. */
6360 return (!arg.named
6361 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
6362 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
6364 case pure_scalable_type_info::DOESNT_MATTER:
6365 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
6366 return true;
6368 case pure_scalable_type_info::NO_ABI_IDENTITY:
6369 case pure_scalable_type_info::ISNT_PST:
6370 return aarch64_pass_by_reference_1 (pcum, arg);
6372 gcc_unreachable ();
6375 /* Return TRUE if VALTYPE is padded to its least significant bits. */
6376 static bool
6377 aarch64_return_in_msb (const_tree valtype)
6379 machine_mode dummy_mode;
6380 int dummy_int;
6382 /* Never happens in little-endian mode. */
6383 if (!BYTES_BIG_ENDIAN)
6384 return false;
6386 /* Only composite types no larger than 16 bytes can potentially
6387 be returned in registers. */
6388 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
6389 || int_size_in_bytes (valtype) <= 0
6390 || int_size_in_bytes (valtype) > 16)
6391 return false;
6393 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6394 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6395 is always passed/returned in the least significant bits of fp/simd
6396 register(s). */
6397 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
6398 &dummy_mode, &dummy_int, NULL,
6399 false))
6400 return false;
6402 /* Likewise pure scalable types for SVE vector and predicate registers. */
6403 pure_scalable_type_info pst_info;
6404 if (pst_info.analyze_registers (valtype))
6405 return false;
6407 return true;
6410 /* Implement TARGET_FUNCTION_VALUE.
6411 Define how to find the value returned by a function. */
6413 static rtx
6414 aarch64_function_value (const_tree type, const_tree func,
6415 bool outgoing ATTRIBUTE_UNUSED)
6417 machine_mode mode;
6418 int unsignedp;
6420 mode = TYPE_MODE (type);
6421 if (INTEGRAL_TYPE_P (type))
6422 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
6424 pure_scalable_type_info pst_info;
6425 if (type && pst_info.analyze_registers (type))
6426 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
6428 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6429 are returned in memory, not by value. */
6430 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6431 bool sve_p = (vec_flags & VEC_ANY_SVE);
6433 if (aarch64_return_in_msb (type))
6435 HOST_WIDE_INT size = int_size_in_bytes (type);
6437 if (size % UNITS_PER_WORD != 0)
6439 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
6440 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
6444 int count;
6445 machine_mode ag_mode;
6446 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
6447 NULL, false))
6449 gcc_assert (!sve_p);
6450 if (!aarch64_composite_type_p (type, mode))
6452 gcc_assert (count == 1 && mode == ag_mode);
6453 return gen_rtx_REG (mode, V0_REGNUM);
6455 else if (aarch64_advsimd_full_struct_mode_p (mode)
6456 && known_eq (GET_MODE_SIZE (ag_mode), 16))
6457 return gen_rtx_REG (mode, V0_REGNUM);
6458 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6459 && known_eq (GET_MODE_SIZE (ag_mode), 8))
6460 return gen_rtx_REG (mode, V0_REGNUM);
6461 else
6463 int i;
6464 rtx par;
6466 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
6467 for (i = 0; i < count; i++)
6469 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6470 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
6471 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6472 XVECEXP (par, 0, i) = tmp;
6474 return par;
6477 else
6479 if (sve_p)
6481 /* Vector types can acquire a partial SVE mode using things like
6482 __attribute__((vector_size(N))), and this is potentially useful.
6483 However, the choice of mode doesn't affect the type's ABI
6484 identity, so we should treat the types as though they had
6485 the associated integer mode, just like they did before SVE
6486 was introduced.
6488 We know that the vector must be 128 bits or smaller,
6489 otherwise we'd have returned it in memory instead. */
6490 gcc_assert (type
6491 && (aarch64_some_values_include_pst_objects_p (type)
6492 || (vec_flags & VEC_PARTIAL)));
6494 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
6495 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
6496 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
6497 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
6499 return gen_rtx_REG (mode, R0_REGNUM);
6503 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
6504 Return true if REGNO is the number of a hard register in which the value
6505 of a called function may come back. */
6507 static bool
6508 aarch64_function_value_regno_p (const unsigned int regno)
6510 /* A maximum of 16 bytes can be returned in the general registers. Examples
6511 of 16-byte return values are: 128-bit integers and 16-byte small
6512 structures (excluding homogeneous floating-point aggregates). */
6513 if (regno == R0_REGNUM || regno == R1_REGNUM)
6514 return true;
6516 /* Up to four fp/simd registers can return a function value, e.g. a
6517 homogeneous floating-point aggregate having four members. */
6518 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
6519 return TARGET_FLOAT;
6521 if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
6522 return TARGET_SVE;
6524 return false;
6527 /* Subroutine for aarch64_return_in_memory for types that are not returned
6528 in SVE registers. */
6530 static bool
6531 aarch64_return_in_memory_1 (const_tree type)
6533 HOST_WIDE_INT size;
6534 machine_mode ag_mode;
6535 int count;
6537 if (!AGGREGATE_TYPE_P (type)
6538 && TREE_CODE (type) != COMPLEX_TYPE
6539 && TREE_CODE (type) != VECTOR_TYPE)
6540 /* Simple scalar types are always returned in registers. */
6541 return false;
6543 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6544 &ag_mode, &count, NULL, false))
6545 return false;
6547 /* Types larger than 2 registers are returned in memory. */
6548 size = int_size_in_bytes (type);
6549 return (size < 0 || size > 2 * UNITS_PER_WORD);
6552 /* Implement TARGET_RETURN_IN_MEMORY.
6554 If the type T of the result of a function is such that
6555 void func (T arg)
6556 would require that arg be passed as a value in a register (or set of
6557 registers) according to the parameter passing rules, then the result
6558 is returned in the same registers as would be used for such an
6559 argument. */
6561 static bool
6562 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
6564 pure_scalable_type_info pst_info;
6565 switch (pst_info.analyze (type))
6567 case pure_scalable_type_info::IS_PST:
6568 return (pst_info.num_zr () > NUM_FP_ARG_REGS
6569 || pst_info.num_pr () > NUM_PR_ARG_REGS);
6571 case pure_scalable_type_info::DOESNT_MATTER:
6572 gcc_assert (aarch64_return_in_memory_1 (type));
6573 return true;
6575 case pure_scalable_type_info::NO_ABI_IDENTITY:
6576 case pure_scalable_type_info::ISNT_PST:
6577 return aarch64_return_in_memory_1 (type);
6579 gcc_unreachable ();
6582 static bool
6583 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
6584 const_tree type, int *nregs)
6586 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6587 return aarch64_vfp_is_call_or_return_candidate (mode, type,
6588 &pcum->aapcs_vfp_rmode,
6589 nregs, NULL, pcum->silent_p);
6592 /* Given MODE and TYPE of a function argument, return the alignment in
6593 bits. The idea is to suppress any stronger alignment requested by
6594 the user and opt for the natural alignment (specified in AAPCS64 \S
6595 4.1). ABI_BREAK_GCC_9 is set to the old alignment if the alignment
6596 was incorrectly calculated in versions of GCC prior to GCC 9.
6597 ABI_BREAK_GCC_13 is set to the old alignment if it was incorrectly
6598 calculated in versions between GCC 9 and GCC 13. If the alignment
6599 might have changed between GCC 13 and GCC 14, ABI_BREAK_GCC_14
6600 is the old GCC 13 alignment, otherwise it is zero.
6602 This is a helper function for local use only. */
6604 static unsigned int
6605 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6606 unsigned int *abi_break_gcc_9,
6607 unsigned int *abi_break_gcc_13,
6608 unsigned int *abi_break_gcc_14)
6610 *abi_break_gcc_9 = 0;
6611 *abi_break_gcc_13 = 0;
6612 *abi_break_gcc_14 = 0;
6613 if (!type)
6614 return GET_MODE_ALIGNMENT (mode);
6616 if (integer_zerop (TYPE_SIZE (type)))
6617 return 0;
6619 gcc_assert (TYPE_MODE (type) == mode);
6621 if (!AGGREGATE_TYPE_P (type))
6623 /* The ABI alignment is the natural alignment of the type, without
6624 any attributes applied. Normally this is the alignment of the
6625 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
6626 For now we just handle the known exceptions explicitly. */
6627 type = TYPE_MAIN_VARIANT (type);
6628 if (POINTER_TYPE_P (type))
6630 gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
6631 return POINTER_SIZE;
6633 if (TREE_CODE (type) == ENUMERAL_TYPE && TREE_TYPE (type))
6635 *abi_break_gcc_14 = TYPE_ALIGN (type);
6636 type = TYPE_MAIN_VARIANT (TREE_TYPE (type));
6638 gcc_assert (!TYPE_USER_ALIGN (type));
6639 return TYPE_ALIGN (type);
6642 if (TREE_CODE (type) == ARRAY_TYPE)
6643 return TYPE_ALIGN (TREE_TYPE (type));
6645 unsigned int alignment = 0;
6646 unsigned int bitfield_alignment_with_packed = 0;
6647 unsigned int bitfield_alignment = 0;
6648 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6649 if (TREE_CODE (field) == FIELD_DECL)
6651 /* Note that we explicitly consider zero-sized fields here,
6652 even though they don't map to AAPCS64 machine types.
6653 For example, in:
6655 struct __attribute__((aligned(8))) empty {};
6657 struct s {
6658 [[no_unique_address]] empty e;
6659 int x;
6662 "s" contains only one Fundamental Data Type (the int field)
6663 but gains 8-byte alignment and size thanks to "e". */
6664 alignment = std::max (alignment, DECL_ALIGN (field));
6665 if (DECL_BIT_FIELD_TYPE (field))
6667 /* Take the bit-field type's alignment into account only
6668 if the user didn't reduce this field's alignment with
6669 the packed attribute. */
6670 if (!DECL_PACKED (field))
6671 bitfield_alignment
6672 = std::max (bitfield_alignment,
6673 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6675 /* Compute the alignment even if the bit-field is
6676 packed, so that we can emit a warning in case the
6677 alignment changed between GCC versions. */
6678 bitfield_alignment_with_packed
6679 = std::max (bitfield_alignment_with_packed,
6680 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6684 /* Emit a warning if the alignment is different when taking the
6685 'packed' attribute into account. */
6686 if (bitfield_alignment != bitfield_alignment_with_packed
6687 && bitfield_alignment_with_packed > alignment)
6688 *abi_break_gcc_13 = bitfield_alignment_with_packed;
6690 if (bitfield_alignment > alignment)
6692 *abi_break_gcc_9 = alignment;
6693 return bitfield_alignment;
6696 return alignment;
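/* Illustrative example (not from the original source): for

     struct bf { unsigned __int128 x : 1; };

   the bit-field's underlying type is 16-byte aligned, so the AAPCS64
   argument alignment of "bf" is 16 bytes even though the field itself is
   a single bit.  GCC releases before 9.1 ignored the underlying type and
   computed a smaller alignment, which is what *abi_break_gcc_9 reports;
   marking the field packed gives the GCC 9-to-12 over-alignment case
   reported through *abi_break_gcc_13.  */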
6699 /* Layout a function argument according to the AAPCS64 rules. The rule
6700 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
6701 mode that was originally given to us by the target hook, whereas the
6702 mode in ARG might be the result of replacing partial SVE modes with
6703 the equivalent integer mode. */
6705 static void
6706 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6708 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6709 tree type = arg.type;
6710 machine_mode mode = arg.mode;
6711 int ncrn, nvrn, nregs;
6712 bool allocate_ncrn, allocate_nvrn;
6713 HOST_WIDE_INT size;
6714 unsigned int abi_break_gcc_9;
6715 unsigned int abi_break_gcc_13;
6716 unsigned int abi_break_gcc_14;
6718 /* We need to do this once per argument. */
6719 if (pcum->aapcs_arg_processed)
6720 return;
6722 bool warn_pcs_change
6723 = (warn_psabi
6724 && !pcum->silent_p
6725 && (currently_expanding_function_start
6726 || currently_expanding_gimple_stmt));
6728 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
6730 typedef struct foo {
6731 __Int8x16_t foo[2] __attribute__((aligned(32)));
6732 } foo;
6734 is still a HVA despite its larger-than-normal alignment.
6735 However, such over-aligned HFAs and HVAs are guaranteed to have
6736 no padding.
6738 If we exclude HFAs and HVAs from the discussion below, then there
6739 are several things to note:
6741 - Both the C and AAPCS64 interpretations of a type's alignment should
6742 give a value that is no greater than the type's size.
6744 - Types bigger than 16 bytes are passed indirectly.
6746 - If an argument of type T is passed indirectly, TYPE and MODE describe
6747 a pointer to T rather than T itself.
6749 It follows that the AAPCS64 alignment of TYPE must be no greater
6750 than 16 bytes.
6752 Versions prior to GCC 9.1 ignored a bitfield's underlying type
6753 and so could calculate an alignment that was too small. If this
6754 happened for TYPE then ABI_BREAK_GCC_9 is this older, too-small alignment.
6756 Although GCC 9.1 fixed that bug, it introduced a different one:
6757 it would consider the alignment of a bitfield's underlying type even
6758 if the field was packed (which should have the effect of overriding
6759 the alignment of the underlying type). This was fixed in GCC 13.1.
6761 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
6762 that was too big. If this happened for TYPE, ABI_BREAK_GCC_13 is
6763 this older, too-big alignment.
6765 Also, the fact that GCC 9 to GCC 12 considered irrelevant
6766 alignments meant they could calculate type alignments that were
6767 bigger than the type's size, contrary to the assumption above.
6768 The handling of register arguments was nevertheless (and justifiably)
6769 written to follow the assumption that the alignment can never be
6770 greater than the size. The same was not true for stack arguments;
6771 their alignment was instead handled by MIN bounds in
6772 aarch64_function_arg_boundary.
6774 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
6775 an alignment of more than 16 bytes for TYPE then:
6777 - If the argument was passed in registers, these GCC versions
6778 would treat the alignment as though it was *less than* 16 bytes.
6780 - If the argument was passed on the stack, these GCC versions
6781 would treat the alignment as though it was *equal to* 16 bytes.
6783 Both behaviors were wrong, but in different cases. */
6785 pcum->aapcs_arg_processed = true;
6787 pure_scalable_type_info pst_info;
6788 if (type && pst_info.analyze_registers (type))
6790 /* aarch64_function_arg_alignment has never had an effect on
6791 this case. */
6793 /* The PCS says that it is invalid to pass an SVE value to an
6794 unprototyped function. There is no ABI-defined location we
6795 can return in this case, so we have no real choice but to raise
6796 an error immediately, even though this is only a query function. */
6797 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
6799 gcc_assert (!pcum->silent_p);
6800 error ("SVE type %qT cannot be passed to an unprototyped function",
6801 arg.type);
6802 /* Avoid repeating the message, and avoid tripping the assert
6803 below. */
6804 pcum->pcs_variant = ARM_PCS_SVE;
6807 /* We would have converted the argument into pass-by-reference
6808 form if it didn't fit in registers. */
6809 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
6810 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
6811 gcc_assert (arg.named
6812 && pcum->pcs_variant == ARM_PCS_SVE
6813 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
6814 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
6815 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
6816 P0_REGNUM + pcum->aapcs_nprn);
6817 return;
6820 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6821 are passed by reference, not by value. */
6822 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6823 bool sve_p = (vec_flags & VEC_ANY_SVE);
6824 if (sve_p)
6825 /* Vector types can acquire a partial SVE mode using things like
6826 __attribute__((vector_size(N))), and this is potentially useful.
6827 However, the choice of mode doesn't affect the type's ABI
6828 identity, so we should treat the types as though they had
6829 the associated integer mode, just like they did before SVE
6830 was introduced.
6832 We know that the vector must be 128 bits or smaller,
6833 otherwise we'd have passed it in memory instead. */
6834 gcc_assert (type
6835 && (aarch64_some_values_include_pst_objects_p (type)
6836 || (vec_flags & VEC_PARTIAL)));
6838 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
6839 if (type)
6840 size = int_size_in_bytes (type);
6841 else
6842 /* No frontends can create types with variable-sized modes, so we
6843 shouldn't be asked to pass or return them. */
6844 size = GET_MODE_SIZE (mode).to_constant ();
6845 size = ROUND_UP (size, UNITS_PER_WORD);
6847 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6848 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6849 mode,
6850 type,
6851 &nregs);
6852 gcc_assert (!sve_p || !allocate_nvrn);
6854 unsigned int alignment
6855 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
6856 &abi_break_gcc_13, &abi_break_gcc_14);
6858 gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
6859 && (!alignment || abi_break_gcc_9 < alignment)
6860 && (!abi_break_gcc_13 || alignment < abi_break_gcc_13));
6862 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
6863 The following code thus handles passing by SIMD/FP registers first. */
6865 nvrn = pcum->aapcs_nvrn;
6867 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
6868 and homogeneous short-vector aggregates (HVA). */
6869 if (allocate_nvrn)
6871 /* aarch64_function_arg_alignment has never had an effect on
6872 this case. */
6873 if (!pcum->silent_p && !TARGET_FLOAT)
6874 aarch64_err_no_fpadvsimd (mode);
6876 if (nvrn + nregs <= NUM_FP_ARG_REGS)
6878 pcum->aapcs_nextnvrn = nvrn + nregs;
6879 if (!aarch64_composite_type_p (type, mode))
6881 gcc_assert (nregs == 1);
6882 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6884 else if (aarch64_advsimd_full_struct_mode_p (mode)
6885 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
6886 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6887 else if (aarch64_advsimd_partial_struct_mode_p (mode)
6888 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
6889 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6890 else
6892 rtx par;
6893 int i;
6894 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6895 for (i = 0; i < nregs; i++)
6897 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
6898 V0_REGNUM + nvrn + i);
6899 rtx offset = gen_int_mode
6900 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
6901 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6902 XVECEXP (par, 0, i) = tmp;
6904 pcum->aapcs_reg = par;
6906 return;
6908 else
6910 /* C.3 NSRN is set to 8. */
6911 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
6912 goto on_stack;
6916 ncrn = pcum->aapcs_ncrn;
6917 nregs = size / UNITS_PER_WORD;
6919 /* C6 - C9, though the sign and zero extension semantics are
6920 handled elsewhere. This is the case where the argument fits
6921 entirely in general registers. */
6922 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
6924 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
6926 /* C.8 if the argument has an alignment of 16 then the NGRN is
6927 rounded up to the next even number. */
6928 if (nregs == 2
6929 && ncrn % 2)
6931 /* Emit a warning if the alignment changed when taking the
6932 'packed' attribute into account. */
6933 if (warn_pcs_change
6934 && abi_break_gcc_13
6935 && ((abi_break_gcc_13 == 16 * BITS_PER_UNIT)
6936 != (alignment == 16 * BITS_PER_UNIT)))
6937 inform (input_location, "parameter passing for argument of type "
6938 "%qT changed in GCC 13.1", type);
6940 if (warn_pcs_change
6941 && abi_break_gcc_14
6942 && ((abi_break_gcc_14 == 16 * BITS_PER_UNIT)
6943 != (alignment == 16 * BITS_PER_UNIT)))
6944 inform (input_location, "parameter passing for argument of type "
6945 "%qT changed in GCC 14.1", type);
6947 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
6948 comparison is there because for > 16 * BITS_PER_UNIT
6949 alignment nregs should be > 2 and therefore it should be
6950 passed by reference rather than value. */
6951 if (alignment == 16 * BITS_PER_UNIT)
6953 if (warn_pcs_change && abi_break_gcc_9)
6954 inform (input_location, "parameter passing for argument of type "
6955 "%qT changed in GCC 9.1", type);
6956 ++ncrn;
6957 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
6961 /* If an argument with an SVE mode needs to be shifted up to the
6962 high part of the register, treat it as though it had an integer mode.
6963 Using the normal (parallel [...]) would suppress the shifting. */
6964 if (sve_p
6965 && BYTES_BIG_ENDIAN
6966 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
6967 && aarch64_pad_reg_upward (mode, type, false))
6969 mode = int_mode_for_mode (mode).require ();
6970 sve_p = false;
6973 /* NREGS can be 0 when e.g. an empty structure is to be passed.
6974 A reg is still generated for it, but the caller should be smart
6975 enough not to use it. */
6976 if (nregs == 0
6977 || (nregs == 1 && !sve_p)
6978 || GET_MODE_CLASS (mode) == MODE_INT)
6979 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
6980 else
6982 rtx par;
6983 int i;
6985 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6986 for (i = 0; i < nregs; i++)
6988 scalar_int_mode reg_mode = word_mode;
6989 if (nregs == 1)
6990 reg_mode = int_mode_for_mode (mode).require ();
6991 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
6992 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
6993 GEN_INT (i * UNITS_PER_WORD));
6994 XVECEXP (par, 0, i) = tmp;
6996 pcum->aapcs_reg = par;
6999 pcum->aapcs_nextncrn = ncrn + nregs;
7000 return;
7003 /* C.11 */
7004 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7006 /* The argument is passed on stack; record the needed number of words for
7007 this argument and align the total size if necessary. */
7008 on_stack:
7009 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7011 if (warn_pcs_change
7012 && abi_break_gcc_13
7013 && ((abi_break_gcc_13 >= 16 * BITS_PER_UNIT)
7014 != (alignment >= 16 * BITS_PER_UNIT)))
7015 inform (input_location, "parameter passing for argument of type "
7016 "%qT changed in GCC 13.1", type);
7018 if (warn_pcs_change
7019 && abi_break_gcc_14
7020 && ((abi_break_gcc_14 >= 16 * BITS_PER_UNIT)
7021 != (alignment >= 16 * BITS_PER_UNIT)))
7022 inform (input_location, "parameter passing for argument of type "
7023 "%qT changed in GCC 14.1", type);
7025 if (alignment == 16 * BITS_PER_UNIT)
7027 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7028 if (pcum->aapcs_stack_size != new_size)
7030 if (warn_pcs_change && abi_break_gcc_9)
7031 inform (input_location, "parameter passing for argument of type "
7032 "%qT changed in GCC 9.1", type);
7033 pcum->aapcs_stack_size = new_size;
7036 return;
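/* Hedged example of the C.8 handling above: for a call such as

     void f (int a, __int128 b);

   "a" is passed in w0 and "b", whose AAPCS64 alignment is 16 bytes, must
   start at an even-numbered GPR, so it goes in the x2/x3 pair and x1 is
   left unused.  The same 16-byte rule rounds up the stack slot in the
   on_stack path.  */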
7039 /* Add the current argument register to the set of those that need
7040 to be saved and restored around a change to PSTATE.SM. */
7042 static void
7043 aarch64_record_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7045 subrtx_var_iterator::array_type array;
7046 FOR_EACH_SUBRTX_VAR (iter, array, pcum->aapcs_reg, NONCONST)
7048 rtx x = *iter;
7049 if (REG_P (x) && (FP_REGNUM_P (REGNO (x)) || PR_REGNUM_P (REGNO (x))))
7051 unsigned int i = pcum->num_sme_mode_switch_args++;
7052 gcc_assert (i < ARRAY_SIZE (pcum->sme_mode_switch_args));
7053 pcum->sme_mode_switch_args[i] = x;
7058 /* Return a parallel that contains all the registers that need to be
7059 saved around a change to PSTATE.SM. Return const0_rtx if there is
7060 no such mode switch, or if no registers need to be saved. */
7062 static rtx
7063 aarch64_finish_sme_mode_switch_args (CUMULATIVE_ARGS *pcum)
7065 if (!pcum->num_sme_mode_switch_args)
7066 return const0_rtx;
7068 auto argvec = gen_rtvec_v (pcum->num_sme_mode_switch_args,
7069 pcum->sme_mode_switch_args);
7070 return gen_rtx_PARALLEL (VOIDmode, argvec);
7073 /* Implement TARGET_FUNCTION_ARG. */
7075 static rtx
7076 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7078 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7079 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7080 || pcum->pcs_variant == ARM_PCS_SIMD
7081 || pcum->pcs_variant == ARM_PCS_SVE);
7083 if (arg.end_marker_p ())
7085 rtx abi_cookie = aarch64_gen_callee_cookie (pcum->isa_mode,
7086 pcum->pcs_variant);
7087 rtx sme_mode_switch_args = aarch64_finish_sme_mode_switch_args (pcum);
7088 rtx shared_za_flags = gen_int_mode (pcum->shared_za_flags, SImode);
7089 rtx shared_zt0_flags = gen_int_mode (pcum->shared_zt0_flags, SImode);
7090 return gen_rtx_PARALLEL (VOIDmode, gen_rtvec (4, abi_cookie,
7091 sme_mode_switch_args,
7092 shared_za_flags,
7093 shared_zt0_flags));
7096 aarch64_layout_arg (pcum_v, arg);
7097 return pcum->aapcs_reg;
7100 void
7101 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7102 const_tree fntype,
7103 rtx libname ATTRIBUTE_UNUSED,
7104 const_tree fndecl,
7105 unsigned n_named ATTRIBUTE_UNUSED,
7106 bool silent_p)
7108 pcum->aapcs_ncrn = 0;
7109 pcum->aapcs_nvrn = 0;
7110 pcum->aapcs_nprn = 0;
7111 pcum->aapcs_nextncrn = 0;
7112 pcum->aapcs_nextnvrn = 0;
7113 pcum->aapcs_nextnprn = 0;
7114 if (fntype)
7116 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7117 pcum->isa_mode = aarch64_fntype_isa_mode (fntype);
7119 else
7121 pcum->pcs_variant = ARM_PCS_AAPCS64;
7122 pcum->isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
7124 pcum->aapcs_reg = NULL_RTX;
7125 pcum->aapcs_arg_processed = false;
7126 pcum->aapcs_stack_words = 0;
7127 pcum->aapcs_stack_size = 0;
7128 pcum->silent_p = silent_p;
7129 pcum->shared_za_flags
7130 = (fntype ? aarch64_fntype_shared_flags (fntype, "za") : 0U);
7131 pcum->shared_zt0_flags
7132 = (fntype ? aarch64_fntype_shared_flags (fntype, "zt0") : 0U);
7133 pcum->num_sme_mode_switch_args = 0;
7135 if (!silent_p
7136 && !TARGET_FLOAT
7137 && fntype && fntype != error_mark_node)
7139 const_tree type = TREE_TYPE (fntype);
7140 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7141 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7142 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7143 &mode, &nregs, NULL, false))
7144 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7147 if (!silent_p
7148 && !TARGET_SVE
7149 && pcum->pcs_variant == ARM_PCS_SVE)
7151 /* We can't gracefully recover at this point, so make this a
7152 fatal error. */
7153 if (fndecl)
7154 fatal_error (input_location, "%qE requires the SVE ISA extension",
7155 fndecl);
7156 else
7157 fatal_error (input_location, "calls to functions of type %qT require"
7158 " the SVE ISA extension", fntype);
7162 static void
7163 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7164 const function_arg_info &arg)
7166 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7167 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7168 || pcum->pcs_variant == ARM_PCS_SIMD
7169 || pcum->pcs_variant == ARM_PCS_SVE)
7171 aarch64_layout_arg (pcum_v, arg);
7172 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7173 != (pcum->aapcs_stack_words != 0));
7174 if (pcum->aapcs_reg
7175 && aarch64_call_switches_pstate_sm (pcum->isa_mode))
7176 aarch64_record_sme_mode_switch_args (pcum);
7178 pcum->aapcs_arg_processed = false;
7179 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7180 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7181 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7182 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7183 pcum->aapcs_stack_words = 0;
7184 pcum->aapcs_reg = NULL_RTX;
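/* Return true if REGNO is the number of a hard register in which function
   arguments can be passed, i.e. one of the GPR, FP/SIMD or SVE predicate
   argument registers.  */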
7188 bool
7189 aarch64_function_arg_regno_p (unsigned regno)
7191 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7192 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
7193 || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
7196 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7197 PARM_BOUNDARY bits of alignment, but will be given anything up
7198 to STACK_BOUNDARY bits if the type requires it. This makes sure
7199 that both before and after the layout of each argument, the Next
7200 Stacked Argument Address (NSAA) will have a minimum alignment of
7201 8 bytes. */
7203 static unsigned int
7204 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7206 unsigned int abi_break_gcc_9;
7207 unsigned int abi_break_gcc_13;
7208 unsigned int abi_break_gcc_14;
7209 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7210 &abi_break_gcc_9,
7211 &abi_break_gcc_13,
7212 &abi_break_gcc_14);
7213 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
7214 to emit warnings about ABI incompatibility. */
7215 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
7216 return alignment;
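/* For example, given the clamping above and assuming the usual AArch64
   values PARM_BOUNDARY == 64 and STACK_BOUNDARY == 128: a 4-byte-aligned
   int is promoted to 64-bit alignment, a 16-byte-aligned vector keeps its
   128-bit alignment, and any over-alignment beyond 16 bytes is clamped
   back down to 128 bits.  */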
7219 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
7221 static fixed_size_mode
7222 aarch64_get_reg_raw_mode (int regno)
7224 if (TARGET_SVE && FP_REGNUM_P (regno))
7225 /* Don't use the SVE part of the register for __builtin_apply and
7226 __builtin_return. The SVE registers aren't used by the normal PCS,
7227 so using them there would be a waste of time. The PCS extensions
7228 for SVE types are fundamentally incompatible with the
7229 __builtin_return/__builtin_apply interface. */
7230 return as_a <fixed_size_mode> (V16QImode);
7231 if (PR_REGNUM_P (regno))
7232 /* For SVE PR regs, indicate that they should be ignored for
7233 __builtin_apply/__builtin_return. */
7234 return as_a <fixed_size_mode> (VOIDmode);
7235 return default_get_reg_raw_mode (regno);
7238 /* Implement TARGET_FUNCTION_ARG_PADDING.
7240 Small aggregate types are placed at the lowest memory address.
7242 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
7244 static pad_direction
7245 aarch64_function_arg_padding (machine_mode mode, const_tree type)
7247 /* On little-endian targets, the least significant byte of every stack
7248 argument is passed at the lowest byte address of the stack slot. */
7249 if (!BYTES_BIG_ENDIAN)
7250 return PAD_UPWARD;
7252 /* Otherwise, integral, floating-point and pointer types are padded downward:
7253 the least significant byte of a stack argument is passed at the highest
7254 byte address of the stack slot. */
7255 if (type
7256 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
7257 || POINTER_TYPE_P (type))
7258 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
7259 return PAD_DOWNWARD;
7261 /* Everything else is padded upward, i.e. the data starts at the first byte of the stack slot. */
7262 return PAD_UPWARD;
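/* For example, under the rules above, on a big-endian target a 3-byte
   structure passed on the stack occupies the lowest three bytes of its
   stack slot (PAD_UPWARD), whereas a short occupies the highest two
   bytes of its slot (PAD_DOWNWARD).  */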
7265 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
7267 It specifies the padding for the last (possibly the only)
7268 element of a block move between registers and memory. Assuming
7269 the block is in memory, upward padding means that the last
7270 element is padded after its most significant byte, while with
7271 downward padding the last element is padded on its least
7272 significant byte side.
7274 Small aggregates and small complex types are always padded
7275 upwards.
7277 We don't need to worry about homogeneous floating-point or
7278 short-vector aggregates; their move is not affected by the
7279 padding direction determined here. Regardless of endianness,
7280 each element of such an aggregate is put in the least
7281 significant bits of a fp/simd register.
7283 Return !BYTES_BIG_ENDIAN if the least significant byte of the
7284 register has useful data, and return the opposite if the most
7285 significant byte does. */
7287 bool
7288 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
7289 bool first ATTRIBUTE_UNUSED)
7292 /* Aside from pure scalable types, small composite types are always
7293 padded upward. */
7294 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
7296 HOST_WIDE_INT size;
7297 if (type)
7298 size = int_size_in_bytes (type);
7299 else
7300 /* No frontends can create types with variable-sized modes, so we
7301 shouldn't be asked to pass or return them. */
7302 size = GET_MODE_SIZE (mode).to_constant ();
7303 if (size < 2 * UNITS_PER_WORD)
7305 pure_scalable_type_info pst_info;
7306 if (pst_info.analyze_registers (type))
7307 return false;
7308 return true;
7312 /* Otherwise, use the default padding. */
7313 return !BYTES_BIG_ENDIAN;
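/* Return the mode used for the result of libgcc comparison routines
   (the TARGET_LIBGCC_CMP_RETURN_MODE hook): always SImode on AArch64.  */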
7316 static scalar_int_mode
7317 aarch64_libgcc_cmp_return_mode (void)
7319 return SImode;
7322 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
7324 /* We use the 12-bit shifted immediate arithmetic instructions so values
7325 must be multiple of (1 << 12), i.e. 4096. */
7326 #define ARITH_FACTOR 4096
7328 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
7329 #error Cannot use simple address calculation for stack probing
7330 #endif
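/* With the usual default of STACK_CHECK_PROBE_INTERVAL_EXP == 12,
   PROBE_INTERVAL is 4096 bytes and therefore a multiple of ARITH_FACTOR,
   so the check above is satisfied.  */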
7332 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
7333 inclusive. These are offsets from the current stack pointer. */
7335 static void
7336 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
7338 HOST_WIDE_INT size;
7339 if (!poly_size.is_constant (&size))
7341 sorry ("stack probes for SVE frames");
7342 return;
7345 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
7347 /* See the same assertion on PROBE_INTERVAL above. */
7348 gcc_assert ((first % ARITH_FACTOR) == 0);
7350 /* See if we have a constant small number of probes to generate. If so,
7351 that's the easy case. */
7352 if (size <= PROBE_INTERVAL)
7354 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
7356 emit_set_insn (reg1,
7357 plus_constant (Pmode,
7358 stack_pointer_rtx, -(first + base)));
7359 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
7362 /* The run-time loop is made up of 8 insns in the generic case while the
7363 compile-time loop is made up of 4+2*(n-2) insns, where n is the number of intervals. */
7364 else if (size <= 4 * PROBE_INTERVAL)
7366 HOST_WIDE_INT i, rem;
7368 emit_set_insn (reg1,
7369 plus_constant (Pmode,
7370 stack_pointer_rtx,
7371 -(first + PROBE_INTERVAL)));
7372 emit_stack_probe (reg1);
7374 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
7375 it exceeds SIZE. If only two probes are needed, this will not
7376 generate any code. Then probe at FIRST + SIZE. */
7377 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
7379 emit_set_insn (reg1,
7380 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
7381 emit_stack_probe (reg1);
7384 rem = size - (i - PROBE_INTERVAL);
7385 if (rem > 256)
7387 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7389 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
7390 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
7392 else
7393 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
7396 /* Otherwise, do the same as above, but in a loop. Note that we must be
7397 extra careful with variables wrapping around because we might be at
7398 the very top (or the very bottom) of the address space and we have
7399 to be able to handle this case properly; in particular, we use an
7400 equality test for the loop condition. */
7401 else
7403 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
7405 /* Step 1: round SIZE to the previous multiple of the interval. */
7407 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
7410 /* Step 2: compute initial and final value of the loop counter. */
7412 /* TEST_ADDR = SP + FIRST. */
7413 emit_set_insn (reg1,
7414 plus_constant (Pmode, stack_pointer_rtx, -first));
7416 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
7417 HOST_WIDE_INT adjustment = - (first + rounded_size);
7418 if (! aarch64_uimm12_shift (adjustment))
7420 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
7421 true, Pmode);
7422 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
7424 else
7425 emit_set_insn (reg2,
7426 plus_constant (Pmode, stack_pointer_rtx, adjustment));
7428 /* Step 3: the loop
7432      do  {  TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
7433             probe at TEST_ADDR  }
7435      while (TEST_ADDR != LAST_ADDR)
7437    probes at FIRST + N * PROBE_INTERVAL for values of N from 1
7438    until it is equal to ROUNDED_SIZE. */
7440 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
7443 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
7444 that SIZE is equal to ROUNDED_SIZE. */
7446 if (size != rounded_size)
7448 HOST_WIDE_INT rem = size - rounded_size;
7450 if (rem > 256)
7452 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
7454 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
7455 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
7457 else
7458 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
7462 /* Make sure nothing is scheduled before we are done. */
7463 emit_insn (gen_blockage ());
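/* As a concrete example of the middle case in aarch64_emit_probe_stack_range
   above: probing 10000 bytes with FIRST == 0 and PROBE_INTERVAL == 4096
   emits probes at SP - 4096, SP - 8192 and finally, via the rounded
   residual, at SP - 10000.  */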
7466 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
7467 absolute addresses. */
7469 const char *
7470 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
7472 static int labelno = 0;
7473 char loop_lab[32];
7474 rtx xops[2];
7476 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
7478 /* Loop. */
7479 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
7481 HOST_WIDE_INT stack_clash_probe_interval
7482 = 1 << param_stack_clash_protection_guard_size;
7484 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
7485 xops[0] = reg1;
7486 HOST_WIDE_INT interval;
7487 if (flag_stack_clash_protection)
7488 interval = stack_clash_probe_interval;
7489 else
7490 interval = PROBE_INTERVAL;
7492 gcc_assert (aarch64_uimm12_shift (interval));
7493 xops[1] = GEN_INT (interval);
7495 output_asm_insn ("sub\t%0, %0, %1", xops);
7497 /* If doing stack clash protection then we probe up by the ABI specified
7498 amount. We do this because we're dropping full pages at a time in the
7499 loop. But if we're doing non-stack clash probing, probe at SP 0. */
7500 if (flag_stack_clash_protection)
7501 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7502 else
7503 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7505 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7506 by this amount for each iteration. */
7507 output_asm_insn ("str\txzr, [%0, %1]", xops);
7509 /* Test if TEST_ADDR == LAST_ADDR. */
7510 xops[1] = reg2;
7511 output_asm_insn ("cmp\t%0, %1", xops);
7513 /* Branch. */
7514 fputs ("\tb.ne\t", asm_out_file);
7515 assemble_name_raw (asm_out_file, loop_lab);
7516 fputc ('\n', asm_out_file);
7518 return "";
7521 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7522 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7523 of GUARD_SIZE. When a probe is emitted it is done at most
7524 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7525 at most MIN_PROBE_THRESHOLD. By the end of this function
7526 BASE = BASE - ADJUSTMENT. */
7528 const char *
7529 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7530 rtx min_probe_threshold, rtx guard_size)
7532 /* This function is not allowed to use any instruction generation function
7533 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7534 so instead emit the code you want using output_asm_insn. */
7535 gcc_assert (flag_stack_clash_protection);
7536 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7537 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7539 /* The minimum required allocation before the residual requires probing. */
7540 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7542 /* Clamp the value down to the nearest value that can be used with a cmp. */
7543 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7544 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7546 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7547 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7549 static int labelno = 0;
7550 char loop_start_lab[32];
7551 char loop_end_lab[32];
7552 rtx xops[2];
7554 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7555 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7557 /* Emit loop start label. */
7558 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7560 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7561 xops[0] = adjustment;
7562 xops[1] = probe_offset_value_rtx;
7563 output_asm_insn ("cmp\t%0, %1", xops);
7565 /* Branch to end if not enough adjustment to probe. */
7566 fputs ("\tb.lt\t", asm_out_file);
7567 assemble_name_raw (asm_out_file, loop_end_lab);
7568 fputc ('\n', asm_out_file);
7570 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7571 xops[0] = base;
7572 xops[1] = probe_offset_value_rtx;
7573 output_asm_insn ("sub\t%0, %0, %1", xops);
7575 /* Probe at BASE. */
7576 xops[1] = const0_rtx;
7577 output_asm_insn ("str\txzr, [%0, %1]", xops);
7579 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7580 xops[0] = adjustment;
7581 xops[1] = probe_offset_value_rtx;
7582 output_asm_insn ("sub\t%0, %0, %1", xops);
7584 /* Branch to start if still more bytes to allocate. */
7585 fputs ("\tb\t", asm_out_file);
7586 assemble_name_raw (asm_out_file, loop_start_lab);
7587 fputc ('\n', asm_out_file);
7589 /* Too little is left to need a probe; emit the loop end label and fall through. */
7590 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
7592 /* BASE = BASE - ADJUSTMENT. */
7593 xops[0] = base;
7594 xops[1] = adjustment;
7595 output_asm_insn ("sub\t%0, %0, %1", xops);
7596 return "";
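/* Schematically, the sequence emitted by aarch64_output_probe_sve_stack_clash
   above is:

     loop_start:
       cmp   adjustment, residual_probe_guard
       b.lt  loop_end
       sub   base, base, residual_probe_guard
       str   xzr, [base, 0]
       sub   adjustment, adjustment, residual_probe_guard
       b     loop_start
     loop_end:
       sub   base, base, adjustment  */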
7599 /* Determine whether a frame chain needs to be generated. */
7600 static bool
7601 aarch64_needs_frame_chain (void)
7603 if (frame_pointer_needed)
7604 return true;
7606 /* A leaf function cannot have calls or write LR. */
7607 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
7609 /* Don't use a frame chain in leaf functions if leaf frame pointers
7610 are disabled. */
7611 if (flag_omit_leaf_frame_pointer && is_leaf)
7612 return false;
7614 return aarch64_use_frame_pointer;
7617 /* Return true if the current function should save registers above
7618 the locals area, rather than below it. */
7620 static bool
7621 aarch64_save_regs_above_locals_p ()
7623 /* When using stack smash protection, make sure that the canary slot
7624 comes between the locals and the saved registers. Otherwise,
7625 it would be possible for a carefully sized smash attack to change
7626 the saved registers (particularly LR and FP) without reaching the
7627 canary. */
7628 return crtl->stack_protect_guard;
7631 /* Return true if the current function needs to record the incoming
7632 value of PSTATE.SM. */
7633 static bool
7634 aarch64_need_old_pstate_sm ()
7636 /* Exit early if the incoming value of PSTATE.SM is known at
7637 compile time. */
7638 if (aarch64_cfun_incoming_pstate_sm () != 0)
7639 return false;
7641 if (aarch64_cfun_enables_pstate_sm ())
7642 return true;
7644 /* Non-local goto receivers are entered with PSTATE.SM equal to 0,
7645 but the function needs to return with PSTATE.SM unchanged. */
7646 if (nonlocal_goto_handler_labels)
7647 return true;
7649 /* Likewise for exception handlers. */
7650 eh_landing_pad lp;
7651 for (unsigned int i = 1; vec_safe_iterate (cfun->eh->lp_array, i, &lp); ++i)
7652 if (lp && lp->post_landing_pad)
7653 return true;
7655 /* Non-local gotos need to set PSTATE.SM to zero. It's possible to call
7656 streaming-compatible functions without SME being available, so PSTATE.SM
7657 should only be changed if it is currently set to one. */
7658 if (crtl->has_nonlocal_goto)
7659 return true;
7661 if (cfun->machine->call_switches_pstate_sm)
7662 for (auto insn = get_insns (); insn; insn = NEXT_INSN (insn))
7663 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
7664 if (!SIBLING_CALL_P (call))
7666 /* Return true if there is a call to a non-streaming-compatible
7667 function. */
7668 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
7669 if (aarch64_call_switches_pstate_sm (callee_isa_mode))
7670 return true;
7672 return false;
7675 /* Mark the registers that need to be saved by the callee and calculate
7676 the size of the callee-saved registers area and frame record (both FP
7677 and LR may be omitted). */
7678 static void
7679 aarch64_layout_frame (void)
7681 unsigned regno, last_fp_reg = INVALID_REGNUM;
7682 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
7683 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
7684 bool frame_related_fp_reg_p = false;
7685 aarch64_frame &frame = cfun->machine->frame;
7686 poly_int64 top_of_locals = -1;
7687 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
7689 vec_safe_truncate (frame.saved_gprs, 0);
7690 vec_safe_truncate (frame.saved_fprs, 0);
7691 vec_safe_truncate (frame.saved_prs, 0);
7693 frame.emit_frame_chain = aarch64_needs_frame_chain ();
7695 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7696 the mid-end is doing. */
7697 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
7699 #define SLOT_NOT_REQUIRED (-2)
7700 #define SLOT_REQUIRED (-1)
7702 frame.wb_push_candidate1 = INVALID_REGNUM;
7703 frame.wb_push_candidate2 = INVALID_REGNUM;
7704 frame.spare_pred_reg = INVALID_REGNUM;
7706 /* First mark all the registers that really need to be saved... */
7707 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7708 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
7709 frame.old_svcr_offset = SLOT_NOT_REQUIRED;
7711 /* ... that includes the eh data registers (if needed)... */
7712 if (crtl->calls_eh_return)
7713 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
7714 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
7716 /* ... and any callee saved register that dataflow says is live. */
7717 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7718 if (df_regs_ever_live_p (regno)
7719 && !fixed_regs[regno]
7720 && (regno == R30_REGNUM
7721 || !crtl->abi->clobbers_full_reg_p (regno)))
7722 frame.reg_offset[regno] = SLOT_REQUIRED;
7724 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7725 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7726 && !fixed_regs[regno]
7727 && !crtl->abi->clobbers_full_reg_p (regno))
7729 frame.reg_offset[regno] = SLOT_REQUIRED;
7730 last_fp_reg = regno;
7731 if (aarch64_emit_cfi_for_reg_p (regno))
7732 frame_related_fp_reg_p = true;
7735 /* Big-endian SVE frames need a spare predicate register in order
7736 to save Z8-Z15. Decide which register they should use. Prefer
7737 an unused argument register if possible, so that we don't force P4
7738 to be saved unnecessarily. */
7739 if (frame_related_fp_reg_p
7740 && crtl->abi->id () == ARM_PCS_SVE
7741 && BYTES_BIG_ENDIAN)
7743 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7744 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
7745 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
7746 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
7747 break;
7748 gcc_assert (regno <= P7_REGNUM);
7749 frame.spare_pred_reg = regno;
7750 df_set_regs_ever_live (regno, true);
7753 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7754 if ((enables_pstate_sm || df_regs_ever_live_p (regno))
7755 && !fixed_regs[regno]
7756 && !crtl->abi->clobbers_full_reg_p (regno))
7757 frame.reg_offset[regno] = SLOT_REQUIRED;
7759 bool regs_at_top_p = aarch64_save_regs_above_locals_p ();
7761 poly_int64 offset = crtl->outgoing_args_size;
7762 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
7763 if (regs_at_top_p)
7765 offset += get_frame_size ();
7766 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7767 top_of_locals = offset;
7769 frame.bytes_below_saved_regs = offset;
7770 frame.sve_save_and_probe = INVALID_REGNUM;
7772 /* Now assign stack slots for the registers. Start with the predicate
7773 registers, since predicate LDR and STR have a relatively small
7774 offset range. These saves happen below the hard frame pointer. */
7775 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7776 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7778 vec_safe_push (frame.saved_prs, regno);
7779 if (frame.sve_save_and_probe == INVALID_REGNUM)
7780 frame.sve_save_and_probe = regno;
7781 frame.reg_offset[regno] = offset;
7782 offset += BYTES_PER_SVE_PRED;
7785 poly_int64 saved_prs_size = offset - frame.bytes_below_saved_regs;
7786 if (maybe_ne (saved_prs_size, 0))
7788 /* If we have any vector registers to save above the predicate registers,
7789 the offset of the vector register save slots need to be a multiple
7790 of the vector size. This lets us use the immediate forms of LDR/STR
7791 (or LD1/ST1 for big-endian).
7793 A vector register is 8 times the size of a predicate register,
7794 and we need to save a maximum of 12 predicate registers, so the
7795 first vector register will be at either #1, MUL VL or #2, MUL VL.
7797 If we don't have any vector registers to save, and we know how
7798 big the predicate save area is, we can just round it up to the
7799 next 16-byte boundary. */
7800 if (last_fp_reg == INVALID_REGNUM && offset.is_constant ())
7801 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7802 else
7804 if (known_le (saved_prs_size, vector_save_size))
7805 offset = frame.bytes_below_saved_regs + vector_save_size;
7806 else if (known_le (saved_prs_size, vector_save_size * 2))
7807 offset = frame.bytes_below_saved_regs + vector_save_size * 2;
7808 else
7809 gcc_unreachable ();
7813 /* If we need to save any SVE vector registers, add them next. */
7814 if (last_fp_reg != INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
7815 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7816 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7818 vec_safe_push (frame.saved_fprs, regno);
7819 if (frame.sve_save_and_probe == INVALID_REGNUM)
7820 frame.sve_save_and_probe = regno;
7821 frame.reg_offset[regno] = offset;
7822 offset += vector_save_size;
7825 /* OFFSET is now the offset of the hard frame pointer from the bottom
7826 of the callee save area. */
7827 auto below_hard_fp_saved_regs_size = offset - frame.bytes_below_saved_regs;
7828 bool saves_below_hard_fp_p = maybe_ne (below_hard_fp_saved_regs_size, 0);
7829 gcc_assert (!saves_below_hard_fp_p
7830 || (frame.sve_save_and_probe != INVALID_REGNUM
7831 && known_eq (frame.reg_offset[frame.sve_save_and_probe],
7832 frame.bytes_below_saved_regs)));
7834 frame.bytes_below_hard_fp = offset;
7835 frame.hard_fp_save_and_probe = INVALID_REGNUM;
7837 auto allocate_gpr_slot = [&](unsigned int regno)
7839 vec_safe_push (frame.saved_gprs, regno);
7840 frame.reg_offset[regno] = offset;
7841 offset += UNITS_PER_WORD;
7844 if (frame.emit_frame_chain)
7846 /* FP and LR are placed in the linkage record. */
7847 allocate_gpr_slot (R29_REGNUM);
7848 allocate_gpr_slot (R30_REGNUM);
7850 else if ((flag_stack_clash_protection || !frame.is_scs_enabled)
7851 && known_eq (frame.reg_offset[R30_REGNUM], SLOT_REQUIRED))
7852 /* Put the LR save slot first, since it makes a good choice of probe
7853 for stack clash purposes. The idea is that the link register usually
7854 has to be saved before a call anyway, and so we lose little by
7855 stopping it from being individually shrink-wrapped. */
7856 allocate_gpr_slot (R30_REGNUM);
7858 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7859 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7860 allocate_gpr_slot (regno);
7862 if (aarch64_need_old_pstate_sm ())
7864 frame.old_svcr_offset = offset;
7865 offset += UNITS_PER_WORD;
7868 /* If the current function changes the SVE vector length, ensure that the
7869 old value of the DWARF VG register is saved and available in the CFI,
7870 so that outer frames with VL-sized offsets can be processed correctly. */
7871 if (cfun->machine->call_switches_pstate_sm
7872 || aarch64_cfun_enables_pstate_sm ())
7874 frame.reg_offset[VG_REGNUM] = offset;
7875 offset += UNITS_PER_WORD;
7878 poly_int64 max_int_offset = offset;
7879 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7880 bool has_align_gap = maybe_ne (offset, max_int_offset);
7882 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7883 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7885 vec_safe_push (frame.saved_fprs, regno);
7886 /* If there is an alignment gap between integer and fp callee-saves,
7887 allocate the last fp register to it if possible. */
7888 if (regno == last_fp_reg
7889 && has_align_gap
7890 && known_eq (vector_save_size, 8)
7891 && multiple_p (offset, 16))
7893 frame.reg_offset[regno] = max_int_offset;
7894 break;
7897 frame.reg_offset[regno] = offset;
7898 offset += vector_save_size;
7901 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7902 auto saved_regs_size = offset - frame.bytes_below_saved_regs;
7904 array_slice<unsigned int> push_regs = (!vec_safe_is_empty (frame.saved_gprs)
7905 ? frame.saved_gprs
7906 : frame.saved_fprs);
7907 if (!push_regs.empty ()
7908 && known_eq (frame.reg_offset[push_regs[0]], frame.bytes_below_hard_fp))
7910 frame.hard_fp_save_and_probe = push_regs[0];
7911 frame.wb_push_candidate1 = push_regs[0];
7912 if (push_regs.size () > 1)
7913 frame.wb_push_candidate2 = push_regs[1];
7916 /* With stack-clash, a register must be saved in non-leaf functions.
7917 The saving of the bottommost register counts as an implicit probe,
7918 which allows us to maintain the invariant described in the comment
7919 at expand_prologue. */
7920 gcc_assert (crtl->is_leaf || maybe_ne (saved_regs_size, 0));
7922 if (!regs_at_top_p)
7924 offset += get_frame_size ();
7925 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7926 top_of_locals = offset;
7928 offset += frame.saved_varargs_size;
7929 gcc_assert (multiple_p (offset, STACK_BOUNDARY / BITS_PER_UNIT));
7930 frame.frame_size = offset;
7932 frame.bytes_above_hard_fp = frame.frame_size - frame.bytes_below_hard_fp;
7933 gcc_assert (known_ge (top_of_locals, 0));
7934 frame.bytes_above_locals = frame.frame_size - top_of_locals;
7936 frame.initial_adjust = 0;
7937 frame.final_adjust = 0;
7938 frame.callee_adjust = 0;
7939 frame.sve_callee_adjust = 0;
7941 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
7942 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
7944 /* Shadow call stack is only used for functions that push LR onto
7945 the stack and that do not specify the "no_sanitize" attribute
7946 with the argument "shadow-call-stack". */
7947 frame.is_scs_enabled
7948 = (!crtl->calls_eh_return
7949 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
7950 && known_ge (frame.reg_offset[LR_REGNUM], 0));
7952 /* When shadow call stack is enabled, the scs_pop in the epilogue will
7953 restore x30, and we don't need to pop x30 again in the traditional
7954 way. Pop candidates record the registers that need to be popped
7955 eventually. */
7956 if (frame.is_scs_enabled)
7958 if (frame.wb_pop_candidate2 == R30_REGNUM)
7959 frame.wb_pop_candidate2 = INVALID_REGNUM;
7960 else if (frame.wb_pop_candidate1 == R30_REGNUM)
7961 frame.wb_pop_candidate1 = INVALID_REGNUM;
7964 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
7965 256 to ensure that the offset meets the requirements of emit_move_insn.
7966 Similarly, if candidate1 is INVALID_REGNUM, we need to set
7967 max_push_offset to 0, because no registers are popped at this time,
7968 so callee_adjust cannot be adjusted. */
7969 HOST_WIDE_INT max_push_offset = 0;
7970 if (frame.wb_pop_candidate1 != INVALID_REGNUM)
7972 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
7973 max_push_offset = 512;
7974 else
7975 max_push_offset = 256;
7978 HOST_WIDE_INT const_size, const_below_saved_regs, const_above_fp;
7979 HOST_WIDE_INT const_saved_regs_size;
7980 if (known_eq (saved_regs_size, 0))
7981 frame.initial_adjust = frame.frame_size;
7982 else if (frame.frame_size.is_constant (&const_size)
7983 && const_size < max_push_offset
7984 && known_eq (frame.bytes_above_hard_fp, const_size))
7986 /* Simple, small frame with no data below the saved registers.
7988 stp reg1, reg2, [sp, -frame_size]!
7989 stp reg3, reg4, [sp, 16] */
7990 frame.callee_adjust = const_size;
7992 else if (frame.bytes_below_saved_regs.is_constant (&const_below_saved_regs)
7993 && saved_regs_size.is_constant (&const_saved_regs_size)
7994 && const_below_saved_regs + const_saved_regs_size < 512
7995 /* We could handle this case even with data below the saved
7996 registers, provided that that data left us with valid offsets
7997 for all predicate and vector save slots. It's such a rare
7998 case that it hardly seems worth the effort though. */
7999 && (!saves_below_hard_fp_p || const_below_saved_regs == 0)
8000 && !(cfun->calls_alloca
8001 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8002 && const_above_fp < max_push_offset))
8004 /* Frame with small area below the saved registers:
8006 sub sp, sp, frame_size
8007 stp reg1, reg2, [sp, bytes_below_saved_regs]
8008 stp reg3, reg4, [sp, bytes_below_saved_regs + 16] */
8009 frame.initial_adjust = frame.frame_size;
8011 else if (saves_below_hard_fp_p
8012 && known_eq (saved_regs_size, below_hard_fp_saved_regs_size))
8014 /* Frame in which all saves are SVE saves:
8016 sub sp, sp, frame_size - bytes_below_saved_regs
8017 save SVE registers relative to SP
8018 sub sp, sp, bytes_below_saved_regs */
8019 frame.initial_adjust = frame.frame_size - frame.bytes_below_saved_regs;
8020 frame.final_adjust = frame.bytes_below_saved_regs;
8022 else if (frame.wb_push_candidate1 != INVALID_REGNUM
8023 && frame.bytes_above_hard_fp.is_constant (&const_above_fp)
8024 && const_above_fp < max_push_offset)
8026 /* Frame with large area below the saved registers, or with SVE saves,
8027 but with a small area above:
8029 stp reg1, reg2, [sp, -hard_fp_offset]!
8030 stp reg3, reg4, [sp, 16]
8031 [sub sp, sp, below_hard_fp_saved_regs_size]
8032 [save SVE registers relative to SP]
8033 sub sp, sp, bytes_below_saved_regs */
8034 frame.callee_adjust = const_above_fp;
8035 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8036 frame.final_adjust = frame.bytes_below_saved_regs;
8038 else
8040 /* General case:
8042 sub sp, sp, hard_fp_offset
8043 stp x29, x30, [sp, 0]
8044 add x29, sp, 0
8045 stp reg3, reg4, [sp, 16]
8046 [sub sp, sp, below_hard_fp_saved_regs_size]
8047 [save SVE registers relative to SP]
8048 sub sp, sp, bytes_below_saved_regs */
8049 frame.initial_adjust = frame.bytes_above_hard_fp;
8050 frame.sve_callee_adjust = below_hard_fp_saved_regs_size;
8051 frame.final_adjust = frame.bytes_below_saved_regs;
8054 /* The frame is allocated in pieces, with each non-final piece
8055 including a register save at offset 0 that acts as a probe for
8056 the following piece. In addition, the save of the bottommost register
8057 acts as a probe for callees and allocas. Roll back any probes that
8058 aren't needed.
8060 A probe isn't needed if it is associated with the final allocation
8061 (including callees and allocas) that happens before the epilogue is
8062 executed. */
8063 if (crtl->is_leaf
8064 && !cfun->calls_alloca
8065 && known_eq (frame.final_adjust, 0))
8067 if (maybe_ne (frame.sve_callee_adjust, 0))
8068 frame.sve_save_and_probe = INVALID_REGNUM;
8069 else
8070 frame.hard_fp_save_and_probe = INVALID_REGNUM;
8073 /* Make sure the individual adjustments add up to the full frame size. */
8074 gcc_assert (known_eq (frame.initial_adjust
8075 + frame.callee_adjust
8076 + frame.sve_callee_adjust
8077 + frame.final_adjust, frame.frame_size));
8079 if (frame.callee_adjust == 0)
8081 /* We've decided not to do a "real" push and pop. However,
8082 setting up the frame chain is treated as being essentially
8083 a multi-instruction push. */
8084 frame.wb_pop_candidate1 = frame.wb_pop_candidate2 = INVALID_REGNUM;
8085 if (!frame.emit_frame_chain)
8086 frame.wb_push_candidate1 = frame.wb_push_candidate2 = INVALID_REGNUM;
8089 frame.laid_out = true;
8092 /* Return true if the register REGNO is saved on entry to
8093 the current function. */
8095 static bool
8096 aarch64_register_saved_on_entry (int regno)
8098 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8101 /* Push the register number REGNO of mode MODE to the stack with write-back
8102 adjusting the stack by ADJUSTMENT. */
8104 static void
8105 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8106 HOST_WIDE_INT adjustment)
8108 rtx base_rtx = stack_pointer_rtx;
8109 rtx insn, reg, mem;
8111 reg = gen_rtx_REG (mode, regno);
8112 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8113 plus_constant (Pmode, base_rtx, -adjustment));
8114 mem = gen_frame_mem (mode, mem);
8116 insn = emit_move_insn (mem, reg);
8117 RTX_FRAME_RELATED_P (insn) = 1;
8120 /* Generate and return an instruction to store the pair of registers
8121 REG and REG2 of mode MODE to location BASE with write-back adjusting
8122 the stack location BASE by ADJUSTMENT. */
8124 static rtx
8125 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8126 HOST_WIDE_INT adjustment)
8128 rtx new_base = plus_constant (Pmode, base, -adjustment);
8129 rtx mem = gen_frame_mem (mode, new_base);
8130 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8132 return gen_rtx_PARALLEL (VOIDmode,
8133 gen_rtvec (3,
8134 gen_rtx_SET (base, new_base),
8135 gen_rtx_SET (mem, reg),
8136 gen_rtx_SET (mem2, reg2)));
8139 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8140 stack pointer by ADJUSTMENT. */
8142 static void
8143 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8145 rtx_insn *insn;
8146 machine_mode mode = aarch64_reg_save_mode (regno1);
8148 if (regno2 == INVALID_REGNUM)
8149 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8151 rtx reg1 = gen_rtx_REG (mode, regno1);
8152 rtx reg2 = gen_rtx_REG (mode, regno2);
8154 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8155 reg2, adjustment));
8156 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8157 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8158 RTX_FRAME_RELATED_P (insn) = 1;
8161 /* Load the pair of register REG, REG2 of mode MODE from stack location BASE,
8162 adjusting it by ADJUSTMENT afterwards. */
8164 static rtx
8165 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8166 HOST_WIDE_INT adjustment)
8168 rtx mem = gen_frame_mem (mode, base);
8169 rtx mem2 = adjust_address_nv (mem, mode, GET_MODE_SIZE (mode));
8170 rtx new_base = plus_constant (Pmode, base, adjustment);
8172 return gen_rtx_PARALLEL (VOIDmode,
8173 gen_rtvec (3,
8174 gen_rtx_SET (base, new_base),
8175 gen_rtx_SET (reg, mem),
8176 gen_rtx_SET (reg2, mem2)));
8179 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8180 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8181 into CFI_OPS. */
8183 static void
8184 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8185 rtx *cfi_ops)
8187 machine_mode mode = aarch64_reg_save_mode (regno1);
8188 rtx reg1 = gen_rtx_REG (mode, regno1);
8190 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8192 if (regno2 == INVALID_REGNUM)
8194 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8195 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8196 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8198 else
8200 rtx reg2 = gen_rtx_REG (mode, regno2);
8201 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8202 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8203 reg2, adjustment));
8207 /* Given an ldp/stp register operand mode MODE, return a suitable mode to use
8208 for a mem rtx representing the entire pair. */
8210 static machine_mode
8211 aarch64_pair_mode_for_mode (machine_mode mode)
8213 if (known_eq (GET_MODE_SIZE (mode), 4))
8214 return V2x4QImode;
8215 else if (known_eq (GET_MODE_SIZE (mode), 8))
8216 return V2x8QImode;
8217 else if (known_eq (GET_MODE_SIZE (mode), 16))
8218 return V2x16QImode;
8219 else
8220 gcc_unreachable ();
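/* For example, a DImode (8-byte) register operand maps to V2x8QImode, so
   the pair mem built by aarch64_pair_mem_from_base below covers the full
   16 bytes accessed by the LDP/STP.  */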
8223 /* Given a base mem MEM with mode and address suitable for a single ldp/stp
8224 operand, return an rtx like MEM which instead represents the entire pair. */
8226 static rtx
8227 aarch64_pair_mem_from_base (rtx mem)
8229 auto pair_mode = aarch64_pair_mode_for_mode (GET_MODE (mem));
8230 mem = adjust_bitfield_address_nv (mem, pair_mode, 0);
8231 gcc_assert (aarch64_mem_pair_lanes_operand (mem, pair_mode));
8232 return mem;
8235 /* Generate and return a store pair instruction to store REG1 and REG2
8236 into memory starting at BASE_MEM. All three rtxes should have modes of the
8237 same size. */
8239 rtx
8240 aarch64_gen_store_pair (rtx base_mem, rtx reg1, rtx reg2)
8242 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8244 return gen_rtx_SET (pair_mem,
8245 gen_rtx_UNSPEC (GET_MODE (pair_mem),
8246 gen_rtvec (2, reg1, reg2),
8247 UNSPEC_STP));
8250 /* Generate and return a load pair instruction to load a pair of
8251 registers starting at BASE_MEM into REG1 and REG2. If CODE is
8252 UNKNOWN, all three rtxes should have modes of the same size.
8253 Otherwise, CODE is {SIGN,ZERO}_EXTEND, base_mem should be in SImode,
8254 and REG{1,2} should be in DImode. */
8256 rtx
8257 aarch64_gen_load_pair (rtx reg1, rtx reg2, rtx base_mem, enum rtx_code code)
8259 rtx pair_mem = aarch64_pair_mem_from_base (base_mem);
8261 const bool any_extend_p = (code == ZERO_EXTEND || code == SIGN_EXTEND);
8262 if (any_extend_p)
8263 gcc_checking_assert (GET_MODE (base_mem) == SImode
8264 && GET_MODE (reg1) == DImode
8265 && GET_MODE (reg2) == DImode);
8266 else
8267 gcc_assert (code == UNKNOWN);
8269 rtx unspecs[2] = {
8270 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg1),
8271 gen_rtvec (1, pair_mem),
8272 UNSPEC_LDP_FST),
8273 gen_rtx_UNSPEC (any_extend_p ? SImode : GET_MODE (reg2),
8274 gen_rtvec (1, copy_rtx (pair_mem)),
8275 UNSPEC_LDP_SND)
8278 if (any_extend_p)
8279 for (int i = 0; i < 2; i++)
8280 unspecs[i] = gen_rtx_fmt_e (code, DImode, unspecs[i]);
8282 return gen_rtx_PARALLEL (VOIDmode,
8283 gen_rtvec (2,
8284 gen_rtx_SET (reg1, unspecs[0]),
8285 gen_rtx_SET (reg2, unspecs[1])));
8288 /* Return TRUE if return address signing should be enabled for the current
8289 function, otherwise return FALSE. */
8291 bool
8292 aarch64_return_address_signing_enabled (void)
8294 /* This function should only be called after frame laid out. */
8295 gcc_assert (cfun->machine->frame.laid_out);
8297 /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
8298 if its LR is pushed onto stack. */
8299 return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
8300 || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
8301 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8304 /* Only used by the arm backend. */
8305 void aarch_bti_arch_check (void)
8308 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8309 bool
8310 aarch_bti_enabled (void)
8312 return (aarch_enable_bti == 1);
8315 /* Check if INSN is a BTI J insn. */
8316 bool
8317 aarch_bti_j_insn_p (rtx_insn *insn)
8319 if (!insn || !INSN_P (insn))
8320 return false;
8322 rtx pat = PATTERN (insn);
8323 return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
8326 /* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction. */
8327 bool
8328 aarch_pac_insn_p (rtx x)
8330 if (!INSN_P (x))
8331 return false;
8333 subrtx_var_iterator::array_type array;
8334 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
8336 rtx sub = *iter;
8337 if (sub && GET_CODE (sub) == UNSPEC)
8339 int unspec_val = XINT (sub, 1);
8340 switch (unspec_val)
8342 case UNSPEC_PACIASP:
8343 case UNSPEC_PACIBSP:
8344 return true;
8346 default:
8347 return false;
8349 iter.skip_subrtxes ();
8352 return false;
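/* Return an insn pattern for a BTI C landing pad.  */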
8355 rtx aarch_gen_bti_c (void)
8357 return gen_bti_c ();
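/* Return an insn pattern for a BTI J landing pad.  */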
8360 rtx aarch_gen_bti_j (void)
8362 return gen_bti_j ();
8365 /* The caller is going to use ST1D or LD1D to save or restore an SVE
8366 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
8367 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
8369 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
8370 or LD1D address
8372 (2) setting PRED to a valid predicate register for the ST1D or LD1D,
8373 if the variable isn't already nonnull
8375 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
8376 Handle this case using a temporary base register that is suitable for
8377 all offsets in that range. Use ANCHOR_REG as this base register if it
8378 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
8380 static inline void
8381 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
8382 rtx &anchor_reg, poly_int64 &offset,
8383 rtx &ptrue)
8385 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
8387 /* This is the maximum valid offset of the anchor from the base.
8388 Lower values would be valid too. */
8389 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
8390 if (!anchor_reg)
8392 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8393 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8394 gen_int_mode (anchor_offset, Pmode)));
8396 base_rtx = anchor_reg;
8397 offset -= anchor_offset;
8399 if (!ptrue)
8401 int pred_reg = cfun->machine->frame.spare_pred_reg;
8402 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
8403 CONSTM1_RTX (VNx16BImode));
8404 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
8408 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
8409 is saved at BASE + OFFSET. */
8411 static void
8412 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
8413 rtx base, poly_int64 offset)
8415 rtx mem = gen_frame_mem (GET_MODE (reg),
8416 plus_constant (Pmode, base, offset));
8417 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
8420 /* Emit code to save the callee-saved registers in REGS. Skip any
8421 write-back candidates if SKIP_WB is true, otherwise consider only
8422 write-back candidates.
8424 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8425 of the static frame. HARD_FP_VALID_P is true if the hard frame pointer
8426 has been set up. */
8428 static void
8429 aarch64_save_callee_saves (poly_int64 bytes_below_sp,
8430 array_slice<unsigned int> regs, bool skip_wb,
8431 bool hard_fp_valid_p)
8433 aarch64_frame &frame = cfun->machine->frame;
8434 rtx_insn *insn;
8435 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8437 auto skip_save_p = [&](unsigned int regno)
8439 if (cfun->machine->reg_is_wrapped_separately[regno])
8440 return true;
8442 if (skip_wb == (regno == frame.wb_push_candidate1
8443 || regno == frame.wb_push_candidate2))
8444 return true;
8446 return false;
8449 for (unsigned int i = 0; i < regs.size (); ++i)
8451 unsigned int regno = regs[i];
8452 poly_int64 offset;
8453 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8455 if (skip_save_p (regno))
8456 continue;
8458 machine_mode mode = aarch64_reg_save_mode (regno);
8459 rtx reg = gen_rtx_REG (mode, regno);
8460 rtx move_src = reg;
8461 offset = frame.reg_offset[regno] - bytes_below_sp;
8462 if (regno == VG_REGNUM)
8464 move_src = gen_rtx_REG (DImode, IP0_REGNUM);
8465 emit_move_insn (move_src, gen_int_mode (aarch64_sve_vg, DImode));
8467 rtx base_rtx = stack_pointer_rtx;
8468 poly_int64 sp_offset = offset;
8470 HOST_WIDE_INT const_offset;
8471 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8472 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8473 offset, ptrue);
8474 else if (GP_REGNUM_P (REGNO (reg))
8475 && (!offset.is_constant (&const_offset) || const_offset >= 512))
8477 poly_int64 fp_offset = frame.bytes_below_hard_fp - bytes_below_sp;
8478 if (hard_fp_valid_p)
8479 base_rtx = hard_frame_pointer_rtx;
8480 else
8482 if (!anchor_reg)
8484 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8485 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
8486 gen_int_mode (fp_offset, Pmode)));
8488 base_rtx = anchor_reg;
8490 offset -= fp_offset;
8492 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8493 rtx cfi_mem = gen_frame_mem (mode, plus_constant (Pmode,
8494 stack_pointer_rtx,
8495 sp_offset));
8496 rtx cfi_set = gen_rtx_SET (cfi_mem, reg);
8497 bool need_cfi_note_p = (base_rtx != stack_pointer_rtx);
8499 unsigned int regno2;
8500 if (!aarch64_sve_mode_p (mode)
8501 && reg == move_src
8502 && i + 1 < regs.size ()
8503 && (regno2 = regs[i + 1], !skip_save_p (regno2))
8504 && known_eq (GET_MODE_SIZE (mode),
8505 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8507 rtx reg2 = gen_rtx_REG (mode, regno2);
8509 offset += GET_MODE_SIZE (mode);
8510 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
8512 rtx cfi_mem2
8513 = gen_frame_mem (mode,
8514 plus_constant (Pmode,
8515 stack_pointer_rtx,
8516 sp_offset + GET_MODE_SIZE (mode)));
8517 rtx cfi_set2 = gen_rtx_SET (cfi_mem2, reg2);
8519 /* The first part of a frame-related parallel insn is always
8520 assumed to be relevant to the frame calculations;
8521 subsequent parts are only frame-related if
8522 explicitly marked. */
8523 if (aarch64_emit_cfi_for_reg_p (regno2))
8524 RTX_FRAME_RELATED_P (cfi_set2) = 1;
8526 /* Add a REG_FRAME_RELATED_EXPR note since the unspec
8527 representation of stp cannot be understood directly by
8528 dwarf2cfi. */
8529 rtx par = gen_rtx_PARALLEL (VOIDmode,
8530 gen_rtvec (2, cfi_set, cfi_set2));
8531 add_reg_note (insn, REG_FRAME_RELATED_EXPR, par);
8533 regno = regno2;
8534 ++i;
8536 else
8538 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8540 insn = emit_insn (gen_aarch64_pred_mov (mode, mem,
8541 ptrue, move_src));
8542 need_cfi_note_p = true;
8544 else if (aarch64_sve_mode_p (mode))
8545 insn = emit_insn (gen_rtx_SET (mem, move_src));
8546 else
8547 insn = emit_move_insn (mem, move_src);
8549 if (frame_related_p && (need_cfi_note_p || move_src != reg))
8550 add_reg_note (insn, REG_FRAME_RELATED_EXPR, cfi_set);
8553 RTX_FRAME_RELATED_P (insn) = frame_related_p;
8555 /* Emit a fake instruction to indicate that the VG save slot has
8556 been initialized. */
8557 if (regno == VG_REGNUM)
8558 emit_insn (gen_aarch64_old_vg_saved (move_src, mem));
8562 /* Emit code to restore the callee registers in REGS, ignoring pop candidates
8563 and any other registers that are handled separately. Write the appropriate
8564 REG_CFA_RESTORE notes into CFI_OPS.
8566 The stack pointer is currently BYTES_BELOW_SP bytes above the bottom
8567 of the static frame. */
8569 static void
8570 aarch64_restore_callee_saves (poly_int64 bytes_below_sp,
8571 array_slice<unsigned int> regs, rtx *cfi_ops)
8573 aarch64_frame &frame = cfun->machine->frame;
8574 poly_int64 offset;
8575 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
8577 auto skip_restore_p = [&](unsigned int regno)
8579 if (cfun->machine->reg_is_wrapped_separately[regno])
8580 return true;
8582 if (regno == frame.wb_pop_candidate1
8583 || regno == frame.wb_pop_candidate2)
8584 return true;
8586 /* The shadow call stack code restores LR separately. */
8587 if (frame.is_scs_enabled && regno == LR_REGNUM)
8588 return true;
8590 return false;
8593 for (unsigned int i = 0; i < regs.size (); ++i)
8595 unsigned int regno = regs[i];
8596 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8597 if (skip_restore_p (regno))
8598 continue;
8600 machine_mode mode = aarch64_reg_save_mode (regno);
8601 rtx reg = gen_rtx_REG (mode, regno);
8602 offset = frame.reg_offset[regno] - bytes_below_sp;
8603 rtx base_rtx = stack_pointer_rtx;
8604 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8605 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
8606 offset, ptrue);
8607 rtx mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
8609 unsigned int regno2;
8610 if (!aarch64_sve_mode_p (mode)
8611 && i + 1 < regs.size ()
8612 && (regno2 = regs[i + 1], !skip_restore_p (regno2))
8613 && known_eq (GET_MODE_SIZE (mode),
8614 frame.reg_offset[regno2] - frame.reg_offset[regno]))
8616 rtx reg2 = gen_rtx_REG (mode, regno2);
8618 offset += GET_MODE_SIZE (mode);
8619 emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
8621 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8622 regno = regno2;
8623 ++i;
8625 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8626 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
8627 else if (aarch64_sve_mode_p (mode))
8628 emit_insn (gen_rtx_SET (reg, mem));
8629 else
8630 emit_move_insn (reg, mem);
8631 if (frame_related_p)
8632 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
8636 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
8637 of MODE. */
8639 static inline bool
8640 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8642 HOST_WIDE_INT multiple;
8643 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8644 && IN_RANGE (multiple, -8, 7));
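/* For example, with an 8-byte MODE the test above accepts byte offsets
   -64, -56, ..., 48, 56, i.e. multiples of 8 with a multiplier in [-8, 7].  */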
8647 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
8648 of MODE. */
8650 static inline bool
8651 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8653 HOST_WIDE_INT multiple;
8654 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8655 && IN_RANGE (multiple, -32, 31));
8658 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
8659 of MODE. */
8661 static inline bool
8662 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8664 HOST_WIDE_INT multiple;
8665 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8666 && IN_RANGE (multiple, 0, 63));
8669 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
8670 of MODE. */
8672 bool
8673 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8675 HOST_WIDE_INT multiple;
8676 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8677 && IN_RANGE (multiple, -64, 63));
8680 /* Return true if OFFSET is a signed 9-bit value. */
8682 bool
8683 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
8684 poly_int64 offset)
8686 HOST_WIDE_INT const_offset;
8687 return (offset.is_constant (&const_offset)
8688 && IN_RANGE (const_offset, -256, 255));
8691 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
8692 of MODE. */
8694 static inline bool
8695 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
8697 HOST_WIDE_INT multiple;
8698 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8699 && IN_RANGE (multiple, -256, 255));
8702 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
8703 of MODE. */
8705 static inline bool
8706 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
8708 HOST_WIDE_INT multiple;
8709 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
8710 && IN_RANGE (multiple, 0, 4095));
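/* For example, with a 4-byte MODE the test above accepts byte offsets
   0, 4, ..., 16380, i.e. the unsigned scaled immediate range of a
   32-bit LDR/STR.  */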
8713 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
8715 static sbitmap
8716 aarch64_get_separate_components (void)
8718 aarch64_frame &frame = cfun->machine->frame;
8719 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8720 bitmap_clear (components);
8722 /* The registers we need saved to the frame. */
8723 bool enables_pstate_sm = aarch64_cfun_enables_pstate_sm ();
8724 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8725 if (aarch64_register_saved_on_entry (regno))
8727 /* Disallow shrink wrapping for registers that will be clobbered
8728 by an SMSTART SM in the prologue. */
8729 if (enables_pstate_sm
8730 && (FP_REGNUM_P (regno) || PR_REGNUM_P (regno)))
8731 continue;
8733 /* Punt on saves and restores that use ST1D and LD1D. We could
8734 try to be smarter, but it would involve making sure that the
8735 spare predicate register itself is safe to use at the save
8736 and restore points. Also, when a frame pointer is being used,
8737 the slots are often out of reach of ST1D and LD1D anyway. */
8738 machine_mode mode = aarch64_reg_save_mode (regno);
8739 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
8740 continue;
8742 poly_int64 offset = frame.reg_offset[regno];
8744 /* Get the offset relative to the register we'll use. */
8745 if (frame_pointer_needed)
8746 offset -= frame.bytes_below_hard_fp;
8748 /* Check that we can access the stack slot of the register with one
8749 direct load with no adjustments needed. */
8750 if (aarch64_sve_mode_p (mode)
8751 ? offset_9bit_signed_scaled_p (mode, offset)
8752 : offset_12bit_unsigned_scaled_p (mode, offset))
8753 bitmap_set_bit (components, regno);
8756 /* Don't mess with the hard frame pointer. */
8757 if (frame_pointer_needed)
8758 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
8760 /* If the spare predicate register used by big-endian SVE code
8761 is call-preserved, it must be saved in the main prologue
8762 before any saves that use it. */
8763 if (frame.spare_pred_reg != INVALID_REGNUM)
8764 bitmap_clear_bit (components, frame.spare_pred_reg);
8766 unsigned reg1 = frame.wb_push_candidate1;
8767 unsigned reg2 = frame.wb_push_candidate2;
8768 /* If registers have been chosen to be stored/restored with
8769 writeback don't interfere with them to avoid having to output explicit
8770 stack adjustment instructions. */
8771 if (reg2 != INVALID_REGNUM)
8772 bitmap_clear_bit (components, reg2);
8773 if (reg1 != INVALID_REGNUM)
8774 bitmap_clear_bit (components, reg1);
8776 bitmap_clear_bit (components, LR_REGNUM);
8777 bitmap_clear_bit (components, SP_REGNUM);
8778 if (flag_stack_clash_protection)
8780 if (frame.sve_save_and_probe != INVALID_REGNUM)
8781 bitmap_clear_bit (components, frame.sve_save_and_probe);
8782 if (frame.hard_fp_save_and_probe != INVALID_REGNUM)
8783 bitmap_clear_bit (components, frame.hard_fp_save_and_probe);
8786 /* The VG save sequence needs a temporary GPR. Punt for now on trying
8787 to find one. */
8788 bitmap_clear_bit (components, VG_REGNUM);
8790 return components;
8793 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
8795 static sbitmap
8796 aarch64_components_for_bb (basic_block bb)
8798 bitmap in = DF_LIVE_IN (bb);
8799 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
8800 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
8802 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8803 bitmap_clear (components);
8805 /* Clobbered registers don't generate values in any meaningful sense,
8806 since nothing after the clobber can rely on their value. And we can't
8807 say that partially-clobbered registers are unconditionally killed,
8808 because whether they're killed or not depends on the mode of the
8809 value they're holding. Thus partially call-clobbered registers
8810 appear in neither the kill set nor the gen set.
8812 Check manually for any calls that clobber more of a register than the
8813 current function can. */
8814 function_abi_aggregator callee_abis;
8815 rtx_insn *insn;
8816 FOR_BB_INSNS (bb, insn)
8817 if (CALL_P (insn))
8818 callee_abis.note_callee_abi (insn_callee_abi (insn));
8819 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
8821 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
8822 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8823 if (!fixed_regs[regno]
8824 && !crtl->abi->clobbers_full_reg_p (regno)
8825 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
8826 || bitmap_bit_p (in, regno)
8827 || bitmap_bit_p (gen, regno)
8828 || bitmap_bit_p (kill, regno)))
8830 bitmap_set_bit (components, regno);
8832 /* If there is a callee-save at an adjacent offset, add it too
8833 to increase the use of LDP/STP. */
8834 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8835 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
8837 if (regno2 <= LAST_SAVED_REGNUM)
8839 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
8840 if (regno < regno2
8841 ? known_eq (offset + 8, offset2)
8842 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
8843 bitmap_set_bit (components, regno2);
8847 return components;
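
/* Standalone sketch (not used by the compiler) of the pairing rule above:
   a slot whose offset is a multiple of 16 pairs with the next register
   8 bytes above it, otherwise it can only complete a pair that starts
   8 bytes below it.  Plain integers stand in for the poly_int64 offsets
   used in the real code.  */

static inline bool
example_ldp_stp_pair_p (long offset, long offset2, bool second_is_higher)
{
  if (second_is_higher)
    /* OFFSET starts the pair; OFFSET2 must be its upper half.  */
    return offset % 16 == 0 && offset2 == offset + 8;
  /* OFFSET2 starts the pair; OFFSET must be its upper half.  */
  return offset2 % 16 == 0 && offset == offset2 + 8;
}
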
8850 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
8851 Nothing to do for aarch64. */
8853 static void
8854 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
8858 /* Return the next set bit in BMP from START onwards. Return the total number
8859 of bits in BMP if no set bit is found at or after START. */
8861 static unsigned int
8862 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
8864 unsigned int nbits = SBITMAP_SIZE (bmp);
8865 if (start == nbits)
8866 return start;
8868 gcc_assert (start < nbits);
8869 for (unsigned int i = start; i < nbits; i++)
8870 if (bitmap_bit_p (bmp, i))
8871 return i;
8873 return nbits;
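
/* Standalone sketch (not part of the compiler) of how a "next set bit"
   helper of this shape is typically consumed; compare the loop in
   aarch64_process_components below, where SBITMAP_SIZE acts as the
   end-of-iteration sentinel.  A plain bool array stands in for the
   sbitmap.  */

static unsigned int
example_count_set_bits (const bool *bits, unsigned int nbits)
{
  unsigned int count = 0;
  unsigned int i = 0;
  while (i < nbits)
    {
      /* Advance to the next set bit, or to NBITS if there is none.  */
      while (i < nbits && !bits[i])
	i++;
      if (i == nbits)
	break;
      count++;
      i++;
    }
  return count;
}
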
8876 /* Do the work for aarch64_emit_prologue_components and
8877 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
8878 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
8879 for these components or the epilogue sequence. That is, it determines
8880 whether we should emit stores or loads and what kind of CFA notes to attach
8881 to the insns. Otherwise the logic for the two sequences is very
8882 similar. */
8884 static void
8885 aarch64_process_components (sbitmap components, bool prologue_p)
8887 aarch64_frame &frame = cfun->machine->frame;
8888 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
8889 ? HARD_FRAME_POINTER_REGNUM
8890 : STACK_POINTER_REGNUM);
8892 unsigned last_regno = SBITMAP_SIZE (components);
8893 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
8894 rtx_insn *insn = NULL;
8896 while (regno != last_regno)
8898 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8899 machine_mode mode = aarch64_reg_save_mode (regno);
8901 rtx reg = gen_rtx_REG (mode, regno);
8902 poly_int64 offset = frame.reg_offset[regno];
8903 if (frame_pointer_needed)
8904 offset -= frame.bytes_below_hard_fp;
8906 rtx addr = plus_constant (Pmode, ptr_reg, offset);
8907 rtx mem = gen_frame_mem (mode, addr);
8909 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
8910 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
8911 /* No more registers to handle after REGNO.
8912 Emit a single save/restore and exit. */
8913 if (regno2 == last_regno)
8915 insn = emit_insn (set);
8916 if (frame_related_p)
8918 RTX_FRAME_RELATED_P (insn) = 1;
8919 if (prologue_p)
8920 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
8921 else
8922 add_reg_note (insn, REG_CFA_RESTORE, reg);
8924 break;
8927 poly_int64 offset2 = frame.reg_offset[regno2];
8928 /* The next register is not of the same class or its offset is not
8929 mergeable with the current one into a pair. */
8930 if (aarch64_sve_mode_p (mode)
8931 || !satisfies_constraint_Ump (mem)
8932 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
8933 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
8934 || maybe_ne ((offset2 - frame.reg_offset[regno]),
8935 GET_MODE_SIZE (mode)))
8937 insn = emit_insn (set);
8938 if (frame_related_p)
8940 RTX_FRAME_RELATED_P (insn) = 1;
8941 if (prologue_p)
8942 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
8943 else
8944 add_reg_note (insn, REG_CFA_RESTORE, reg);
8947 regno = regno2;
8948 continue;
8951 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
8953 /* REGNO2 can be saved/restored in a pair with REGNO. */
8954 rtx reg2 = gen_rtx_REG (mode, regno2);
8955 if (frame_pointer_needed)
8956 offset2 -= frame.bytes_below_hard_fp;
8957 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
8958 rtx mem2 = gen_frame_mem (mode, addr2);
8959 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
8960 : gen_rtx_SET (reg2, mem2);
8962 if (prologue_p)
8963 insn = emit_insn (aarch64_gen_store_pair (mem, reg, reg2));
8964 else
8965 insn = emit_insn (aarch64_gen_load_pair (reg, reg2, mem));
8967 if (frame_related_p || frame_related2_p)
8969 RTX_FRAME_RELATED_P (insn) = 1;
8970 if (prologue_p)
8972 if (frame_related_p)
8973 add_reg_note (insn, REG_CFA_OFFSET, set);
8974 if (frame_related2_p)
8975 add_reg_note (insn, REG_CFA_OFFSET, set2);
8977 else
8979 if (frame_related_p)
8980 add_reg_note (insn, REG_CFA_RESTORE, reg);
8981 if (frame_related2_p)
8982 add_reg_note (insn, REG_CFA_RESTORE, reg2);
8986 regno = aarch64_get_next_set_bit (components, regno2 + 1);
8990 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
8992 static void
8993 aarch64_emit_prologue_components (sbitmap components)
8995 aarch64_process_components (components, true);
8998 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9000 static void
9001 aarch64_emit_epilogue_components (sbitmap components)
9003 aarch64_process_components (components, false);
9006 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9008 static void
9009 aarch64_set_handled_components (sbitmap components)
9011 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9012 if (bitmap_bit_p (components, regno))
9013 cfun->machine->reg_is_wrapped_separately[regno] = true;
9016 /* On AArch64 we have an ABI defined safe buffer. This constant is used to
9017 determine the probe offset for alloca. */
9019 static HOST_WIDE_INT
9020 aarch64_stack_clash_protection_alloca_probe_range (void)
9022 return STACK_CLASH_CALLER_GUARD;
9025 /* Emit a stack tie that acts as a scheduling barrier for all previous and
9026 subsequent memory accesses and that requires the stack pointer and REG
9027 to have their current values. REG can be stack_pointer_rtx if no
9028 other register's value needs to be fixed. */
9030 static void
9031 aarch64_emit_stack_tie (rtx reg)
9033 emit_insn (gen_stack_tie (reg, gen_int_mode (REGNO (reg), DImode)));
9036 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9037 registers. If POLY_SIZE is not large enough to require a probe this function
9038 will only adjust the stack. When allocating the stack space
9039 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9040 FINAL_ADJUSTMENT_P indicates whether we are allocating the area below
9041 the saved registers. If we are then we ensure that any allocation
9042 larger than the ABI defined buffer needs a probe so that the
9043 invariant of having a 1KB buffer is maintained.
9045 We emit barriers after each stack adjustment to prevent optimizations from
9046 breaking the invariant that we never drop the stack more than a page. This
9047 invariant is needed to make it easier to correctly handle asynchronous
9048 events, e.g. if we were to allow the stack to be dropped by more than a page
9049 and then set up multiple probes, and we took a signal somewhere in between,
9050 then the signal handler would not know the state of the stack and could make
9051 no assumptions about which pages have been probed.
9053 FORCE_ISA_MODE is AARCH64_FL_SM_ON if any variable component of POLY_SIZE
9054 is measured relative to the SME vector length instead of the current
9055 prevailing vector length. It is 0 otherwise. */
9057 static void
9058 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9059 poly_int64 poly_size,
9060 aarch64_feature_flags force_isa_mode,
9061 bool frame_related_p,
9062 bool final_adjustment_p)
9064 aarch64_frame &frame = cfun->machine->frame;
9065 HOST_WIDE_INT guard_size
9066 = 1 << param_stack_clash_protection_guard_size;
9067 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9068 HOST_WIDE_INT byte_sp_alignment = STACK_BOUNDARY / BITS_PER_UNIT;
9069 gcc_assert (multiple_p (poly_size, byte_sp_alignment));
9070 HOST_WIDE_INT min_probe_threshold
9071 = (final_adjustment_p
9072 ? guard_used_by_caller + byte_sp_alignment
9073 : guard_size - guard_used_by_caller);
9074 poly_int64 frame_size = frame.frame_size;
9076 /* We should always have a positive probe threshold. */
9077 gcc_assert (min_probe_threshold > 0);
9079 if (flag_stack_clash_protection && !final_adjustment_p)
9081 poly_int64 initial_adjust = frame.initial_adjust;
9082 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9083 poly_int64 final_adjust = frame.final_adjust;
9085 if (known_eq (frame_size, 0))
9087 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9089 else if (known_lt (initial_adjust + sve_callee_adjust,
9090 guard_size - guard_used_by_caller)
9091 && known_lt (final_adjust, guard_used_by_caller))
9093 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9097 /* If SIZE is not large enough to require probing, just adjust the stack and
9098 exit. */
9099 if (known_lt (poly_size, min_probe_threshold)
9100 || !flag_stack_clash_protection)
9102 aarch64_sub_sp (temp1, temp2, poly_size, force_isa_mode,
9103 frame_related_p);
9104 return;
9107 HOST_WIDE_INT size;
9108 /* Handle the SVE non-constant case first. */
9109 if (!poly_size.is_constant (&size))
9111 if (dump_file)
9113 fprintf (dump_file, "Stack clash SVE prologue: ");
9114 print_dec (poly_size, dump_file);
9115 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9118 /* First calculate the amount of bytes we're actually spilling. */
9119 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9120 poly_size, temp1, temp2, force_isa_mode,
9121 false, true);
9123 rtx_insn *insn = get_last_insn ();
9125 if (frame_related_p)
9127 /* This is done to provide unwinding information for the stack
9128 adjustments we're about to do. However, to prevent the optimizers
9129 from removing the R11 move and leaving the CFA note (which would be
9130 very wrong), we tie the old and new stack pointers together.
9131 The tie will expand to nothing but the optimizers will not touch
9132 the instruction. */
9133 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9134 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9135 aarch64_emit_stack_tie (stack_ptr_copy);
9137 /* We want the CFA independent of the stack pointer for the
9138 duration of the loop. */
9139 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9140 RTX_FRAME_RELATED_P (insn) = 1;
9143 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9144 rtx guard_const = gen_int_mode (guard_size, Pmode);
9146 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9147 stack_pointer_rtx, temp1,
9148 probe_const, guard_const));
9150 /* Now reset the CFA register if needed. */
9151 if (frame_related_p)
9153 add_reg_note (insn, REG_CFA_DEF_CFA,
9154 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9155 gen_int_mode (poly_size, Pmode)));
9156 RTX_FRAME_RELATED_P (insn) = 1;
9159 return;
9162 if (dump_file)
9163 fprintf (dump_file,
9164 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9165 " bytes, probing will be required.\n", size);
9167 /* Round size down to a multiple of guard_size, and calculate the
9168 residual as the difference between the original size and the rounded
9169 size. */
9170 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9171 HOST_WIDE_INT residual = size - rounded_size;
9173 /* We can handle a small number of allocations/probes inline. Otherwise
9174 punt to a loop. */
9175 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9177 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9179 aarch64_sub_sp (NULL, temp2, guard_size, force_isa_mode, true);
9180 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9181 guard_used_by_caller));
9182 emit_insn (gen_blockage ());
9184 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9186 else
9188 /* Compute the ending address. */
9189 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9190 temp1, NULL, force_isa_mode, false, true);
9191 rtx_insn *insn = get_last_insn ();
9193 /* For the initial allocation, we don't have a frame pointer
9194 set up, so we always need CFI notes. If we're doing the
9195 final allocation, then we may have a frame pointer, in which
9196 case it is the CFA, otherwise we need CFI notes.
9198 We can determine which allocation we are doing by looking at
9199 the value of FRAME_RELATED_P since the final allocations are not
9200 frame related. */
9201 if (frame_related_p)
9203 /* We want the CFA independent of the stack pointer for the
9204 duration of the loop. */
9205 add_reg_note (insn, REG_CFA_DEF_CFA,
9206 plus_constant (Pmode, temp1, rounded_size));
9207 RTX_FRAME_RELATED_P (insn) = 1;
9210 /* This allocates and probes the stack. Note that this re-uses some of
9211 the existing Ada stack protection code. However we are guaranteed not
9212 to enter the non-loop or residual branches of that code.
9214 The non-loop part won't be entered because if our allocation amount
9215 doesn't require a loop, the case above would handle it.
9217 The residual amount won't be entered because TEMP1 is a multiple of
9218 the allocation size. The residual will always be 0. As such, the only
9219 part we are actually using from that code is the loop setup. The
9220 actual probing is done in aarch64_output_probe_stack_range. */
9221 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9222 stack_pointer_rtx, temp1));
9224 /* Now reset the CFA register if needed. */
9225 if (frame_related_p)
9227 add_reg_note (insn, REG_CFA_DEF_CFA,
9228 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9229 RTX_FRAME_RELATED_P (insn) = 1;
9232 emit_insn (gen_blockage ());
9233 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9236 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9237 be probed. This maintains the requirement that each page is probed at
9238 least once. For initial probing we probe only if the allocation is
9239 more than GUARD_SIZE - buffer, and below the saved registers we probe
9240 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9241 GUARD_SIZE. This ensures that any allocation large enough to trigger a
9242 probe here gets at least one probe, and any allocation too small for this
9243 code to emit anything will have had its page probed already by the save of
9244 FP/LR, either by this function or by any callees. If
9245 we don't have any callees then we won't have more stack adjustments and so
9246 are still safe. */
9247 if (residual)
9249 gcc_assert (guard_used_by_caller + byte_sp_alignment <= size);
9251 /* If we're doing final adjustments, and we've done any full page
9252 allocations then any residual needs to be probed. */
9253 if (final_adjustment_p && rounded_size != 0)
9254 min_probe_threshold = 0;
9256 aarch64_sub_sp (temp1, temp2, residual, force_isa_mode, frame_related_p);
9257 if (residual >= min_probe_threshold)
9259 if (dump_file)
9260 fprintf (dump_file,
9261 "Stack clash AArch64 prologue residuals: "
9262 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9263 "\n", residual);
9265 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9266 guard_used_by_caller));
9267 emit_insn (gen_blockage ());
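
/* Standalone sketch (not used by the compiler) of the constant-size
   bookkeeping above: the allocation splits into a rounded part that is
   allocated and probed one guard-sized page at a time (inline for a
   small number of pages, otherwise in a loop) and a residual that may
   need one final probe.  GUARD_SIZE is assumed to be a power of two, as
   the surrounding code requires.  */

struct example_probe_plan
{
  long rounded_size;	/* Allocated and probed GUARD_SIZE bytes at a time.  */
  long residual;	/* Allocated afterwards; probed only if large enough.  */
  long full_pages;	/* Number of guard-sized steps.  */
};

static inline struct example_probe_plan
example_plan_probes (long size, long guard_size)
{
  struct example_probe_plan plan;
  plan.rounded_size = size & -guard_size;	/* ROUND_DOWN (size, guard_size).  */
  plan.residual = size - plan.rounded_size;
  plan.full_pages = plan.rounded_size / guard_size;
  return plan;
}
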
9272 /* Implement TARGET_EXTRA_LIVE_ON_ENTRY. */
9274 void
9275 aarch64_extra_live_on_entry (bitmap regs)
9277 if (TARGET_ZA)
9279 bitmap_set_bit (regs, LOWERING_REGNUM);
9280 bitmap_set_bit (regs, SME_STATE_REGNUM);
9281 bitmap_set_bit (regs, TPIDR2_SETUP_REGNUM);
9282 bitmap_set_bit (regs, ZA_FREE_REGNUM);
9283 bitmap_set_bit (regs, ZA_SAVED_REGNUM);
9285 /* The only time ZA can't have live contents on entry is when
9286 the function explicitly treats it as a pure output. */
9287 auto za_flags = aarch64_cfun_shared_flags ("za");
9288 if (za_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9289 bitmap_set_bit (regs, ZA_REGNUM);
9291 /* Since ZT0 is call-clobbered, it is only live on input if
9292 it is explicitly shared, and is not a pure output. */
9293 auto zt0_flags = aarch64_cfun_shared_flags ("zt0");
9294 if (zt0_flags != 0
9295 && zt0_flags != (AARCH64_STATE_SHARED | AARCH64_STATE_OUT))
9296 bitmap_set_bit (regs, ZT0_REGNUM);
9300 /* Return 1 if the register is used by the epilogue. We need to say the
9301 return register is used, but only after epilogue generation is complete.
9302 Note that in the case of sibcalls, the values "used by the epilogue" are
9303 considered live at the start of the called function. */
9306 aarch64_epilogue_uses (int regno)
9308 if (epilogue_completed)
9310 if (regno == LR_REGNUM)
9311 return 1;
9313 if (regno == LOWERING_REGNUM && TARGET_ZA)
9314 return 1;
9315 if (regno == SME_STATE_REGNUM && TARGET_ZA)
9316 return 1;
9317 if (regno == TPIDR2_SETUP_REGNUM && TARGET_ZA)
9318 return 1;
9319 /* If the function shares SME state with its caller, ensure that that
9320 data is not in the lazy save buffer on exit. */
9321 if (regno == ZA_SAVED_REGNUM && aarch64_cfun_incoming_pstate_za () != 0)
9322 return 1;
9323 if (regno == ZA_REGNUM && aarch64_cfun_shared_flags ("za") != 0)
9324 return 1;
9325 if (regno == ZT0_REGNUM && aarch64_cfun_shared_flags ("zt0") != 0)
9326 return 1;
9327 return 0;
9330 /* Implement TARGET_USE_LATE_PROLOGUE_EPILOGUE. */
9332 static bool
9333 aarch64_use_late_prologue_epilogue ()
9335 return aarch64_cfun_enables_pstate_sm ();
9338 /* The current function's frame has a save slot for the incoming state
9339 of SVCR. Return a legitimate memory for the slot, based on the hard
9340 frame pointer. */
9342 static rtx
9343 aarch64_old_svcr_mem ()
9345 gcc_assert (frame_pointer_needed
9346 && known_ge (cfun->machine->frame.old_svcr_offset, 0));
9347 rtx base = hard_frame_pointer_rtx;
9348 poly_int64 offset = (0
9349 /* hard fp -> bottom of frame. */
9350 - cfun->machine->frame.bytes_below_hard_fp
9351 /* bottom of frame -> save slot. */
9352 + cfun->machine->frame.old_svcr_offset);
9353 return gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
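
/* Standalone sketch (not used by the compiler) of the address arithmetic
   above: the slot lies OLD_SVCR_OFFSET bytes above the bottom of the
   frame, while the hard frame pointer lies BYTES_BELOW_HARD_FP bytes
   above it, so the slot's offset from the hard frame pointer is the
   difference of the two.  */

static inline long
example_old_svcr_offset_from_hard_fp (long bytes_below_hard_fp,
				      long old_svcr_offset)
{
  return old_svcr_offset - bytes_below_hard_fp;
}
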
9356 /* The current function's frame has a save slot for the incoming state
9357 of SVCR. Load the slot into register REGNO and return the register. */
9359 static rtx
9360 aarch64_read_old_svcr (unsigned int regno)
9362 rtx svcr = gen_rtx_REG (DImode, regno);
9363 emit_move_insn (svcr, aarch64_old_svcr_mem ());
9364 return svcr;
9367 /* Like the rtx version of aarch64_guard_switch_pstate_sm, but first
9368 load the incoming value of SVCR from its save slot into temporary
9369 register REGNO. */
9371 static rtx_insn *
9372 aarch64_guard_switch_pstate_sm (unsigned int regno,
9373 aarch64_feature_flags local_mode)
9375 rtx old_svcr = aarch64_read_old_svcr (regno);
9376 return aarch64_guard_switch_pstate_sm (old_svcr, local_mode);
9379 /* AArch64 stack frames generated by this compiler look like:
9381 +-------------------------------+
9383 | incoming stack arguments |
9385 +-------------------------------+
9386 | | <-- incoming stack pointer (aligned)
9387 | callee-allocated save area |
9388 | for register varargs |
9390 +-------------------------------+
9391 | local variables (1) | <-- frame_pointer_rtx
9393 +-------------------------------+
9394 | padding (1) |
9395 +-------------------------------+
9396 | callee-saved registers |
9397 +-------------------------------+
9398 | LR' |
9399 +-------------------------------+
9400 | FP' |
9401 +-------------------------------+ <-- hard_frame_pointer_rtx (aligned)
9402 | SVE vector registers |
9403 +-------------------------------+
9404 | SVE predicate registers |
9405 +-------------------------------+
9406 | local variables (2) |
9407 +-------------------------------+
9408 | padding (2) |
9409 +-------------------------------+
9410 | dynamic allocation |
9411 +-------------------------------+
9412 | padding |
9413 +-------------------------------+
9414 | outgoing stack arguments | <-- arg_pointer
9416 +-------------------------------+
9417 | | <-- stack_pointer_rtx (aligned)
9419 The regions marked (1) and (2) are mutually exclusive. (2) is used
9420 when aarch64_save_regs_above_locals_p is true.
9422 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9423 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9424 unchanged.
9426 By default for stack-clash we assume the guard is at least 64KB, but this
9427 value is configurable to either 4KB or 64KB. We also force the guard size to
9428 be the same as the probing interval and both values are kept in sync.
9430 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9431 on the guard size) of stack space without probing.
9433 When probing is needed, we emit a probe at the start of the prologue
9434 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9436 We can also use register saves as probes. These are stored in
9437 sve_save_and_probe and hard_fp_save_and_probe.
9439 For outgoing arguments we probe if the size is larger than 1KB, such that
9440 the ABI specified buffer is maintained for the next callee.
9442 The following registers are reserved during frame layout and should not be
9443 used for any other purpose:
9445 - r11: Used by stack clash protection when SVE is enabled, and also
9446 as an anchor register when saving and restoring registers
9447 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9448 - r14 and r15: Used for speculation tracking.
9449 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9450 - r30(LR), r29(FP): Used by standard frame layout.
9452 These registers must be avoided in frame layout related code unless the
9453 explicit intention is to interact with one of the features listed above. */
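
/* A worked instance (not part of the original commentary) of the probing
   budget described above, assuming the quoted guard sizes and the 1KB
   STACK_CLASH_CALLER_GUARD: with a 64KB guard the callee may allocate
   64KB - 1KB = 63KB without probing, and with a 4KB guard it may
   allocate 3KB, matching the figures given above.  */

static inline long
example_unprobed_allocation_limit (long guard_size, long caller_guard)
{
  /* Anything up to this amount is covered by the pages the caller has
     already probed; a larger allocation needs probes of its own.  */
  return guard_size - caller_guard;
}
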
9455 /* Generate the prologue instructions for entry into a function.
9456 Establish the stack frame by decreasing the stack pointer with a
9457 properly calculated size and, if necessary, create a frame record
9458 filled with the values of LR and previous frame pointer. The
9459 current FP is also set up if it is in use. */
9461 void
9462 aarch64_expand_prologue (void)
9464 aarch64_frame &frame = cfun->machine->frame;
9465 poly_int64 frame_size = frame.frame_size;
9466 poly_int64 initial_adjust = frame.initial_adjust;
9467 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9468 poly_int64 final_adjust = frame.final_adjust;
9469 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9470 unsigned reg1 = frame.wb_push_candidate1;
9471 unsigned reg2 = frame.wb_push_candidate2;
9472 bool emit_frame_chain = frame.emit_frame_chain;
9473 rtx_insn *insn;
9474 aarch64_feature_flags force_isa_mode = 0;
9475 if (aarch64_cfun_enables_pstate_sm ())
9476 force_isa_mode = AARCH64_FL_SM_ON;
9478 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
9480 /* Fold the SVE allocation into the initial allocation.
9481 We don't do this in aarch64_layout_arg to avoid pessimizing
9482 the epilogue code. */
9483 initial_adjust += sve_callee_adjust;
9484 sve_callee_adjust = 0;
9487 /* Sign return address for functions. */
9488 if (aarch64_return_address_signing_enabled ())
9490 switch (aarch_ra_sign_key)
9492 case AARCH_KEY_A:
9493 insn = emit_insn (gen_paciasp ());
9494 break;
9495 case AARCH_KEY_B:
9496 insn = emit_insn (gen_pacibsp ());
9497 break;
9498 default:
9499 gcc_unreachable ();
9501 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9502 RTX_FRAME_RELATED_P (insn) = 1;
9505 /* Push return address to shadow call stack. */
9506 if (frame.is_scs_enabled)
9507 emit_insn (gen_scs_push ());
9509 if (flag_stack_usage_info)
9510 current_function_static_stack_size = constant_lower_bound (frame_size);
9512 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
9514 if (crtl->is_leaf && !cfun->calls_alloca)
9516 if (maybe_gt (frame_size, PROBE_INTERVAL)
9517 && maybe_gt (frame_size, get_stack_check_protect ()))
9518 aarch64_emit_probe_stack_range (get_stack_check_protect (),
9519 (frame_size
9520 - get_stack_check_protect ()));
9522 else if (maybe_gt (frame_size, 0))
9523 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
9526 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9527 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9529 /* In theory we should never have both an initial adjustment
9530 and a callee save adjustment. Verify that is the case since the
9531 code below does not handle it for -fstack-clash-protection. */
9532 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
9534 /* Will only probe if the initial adjustment is larger than the guard
9535 less the amount of the guard reserved for use by the caller's
9536 outgoing args. */
9537 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
9538 force_isa_mode, true, false);
9540 if (callee_adjust != 0)
9541 aarch64_push_regs (reg1, reg2, callee_adjust);
9543 /* The offset of the current SP from the bottom of the static frame. */
9544 poly_int64 bytes_below_sp = frame_size - initial_adjust - callee_adjust;
9546 if (emit_frame_chain)
9548 /* The offset of the frame chain record (if any) from the current SP. */
9549 poly_int64 chain_offset = (initial_adjust + callee_adjust
9550 - frame.bytes_above_hard_fp);
9551 gcc_assert (known_ge (chain_offset, 0));
9553 gcc_assert (reg1 == R29_REGNUM && reg2 == R30_REGNUM);
9554 if (callee_adjust == 0)
9555 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs,
9556 false, false);
9557 else
9558 gcc_assert (known_eq (chain_offset, 0));
9559 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
9560 stack_pointer_rtx, chain_offset,
9561 tmp1_rtx, tmp0_rtx, force_isa_mode,
9562 frame_pointer_needed);
9563 if (frame_pointer_needed && !frame_size.is_constant ())
9565 /* Variable-sized frames need to describe the save slot
9566 address using DW_CFA_expression rather than DW_CFA_offset.
9567 This means that, without taking further action, the
9568 locations of the registers that we've already saved would
9569 remain based on the stack pointer even after we redefine
9570 the CFA based on the frame pointer. We therefore need new
9571 DW_CFA_expressions to re-express the save slots with addresses
9572 based on the frame pointer. */
9573 rtx_insn *insn = get_last_insn ();
9574 gcc_assert (RTX_FRAME_RELATED_P (insn));
9576 /* Add an explicit CFA definition if this was previously
9577 implicit. */
9578 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
9580 rtx src = plus_constant (Pmode, stack_pointer_rtx, chain_offset);
9581 add_reg_note (insn, REG_CFA_ADJUST_CFA,
9582 gen_rtx_SET (hard_frame_pointer_rtx, src));
9585 /* Change the save slot expressions for the registers that
9586 we've already saved. */
9587 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
9588 hard_frame_pointer_rtx, UNITS_PER_WORD);
9589 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
9590 hard_frame_pointer_rtx, 0);
9592 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9595 aarch64_save_callee_saves (bytes_below_sp, frame.saved_gprs, true,
9596 emit_frame_chain);
9597 if (maybe_ge (frame.reg_offset[VG_REGNUM], 0))
9599 unsigned int saved_regs[] = { VG_REGNUM };
9600 aarch64_save_callee_saves (bytes_below_sp, saved_regs, true,
9601 emit_frame_chain);
9603 if (maybe_ne (sve_callee_adjust, 0))
9605 gcc_assert (!flag_stack_clash_protection
9606 || known_eq (initial_adjust, 0));
9607 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
9608 sve_callee_adjust,
9609 force_isa_mode,
9610 !frame_pointer_needed, false);
9611 bytes_below_sp -= sve_callee_adjust;
9613 aarch64_save_callee_saves (bytes_below_sp, frame.saved_prs, true,
9614 emit_frame_chain);
9615 aarch64_save_callee_saves (bytes_below_sp, frame.saved_fprs, true,
9616 emit_frame_chain);
9618 /* We may need to probe the final adjustment if it is larger than the guard
9619 that is assumed by the callee. */
9620 gcc_assert (known_eq (bytes_below_sp, final_adjust));
9621 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
9622 force_isa_mode,
9623 !frame_pointer_needed, true);
9624 if (emit_frame_chain && maybe_ne (final_adjust, 0))
9625 aarch64_emit_stack_tie (hard_frame_pointer_rtx);
9627 /* Save the incoming value of PSTATE.SM, if required. Code further
9628 down does this for locally-streaming functions. */
9629 if (known_ge (frame.old_svcr_offset, 0)
9630 && !aarch64_cfun_enables_pstate_sm ())
9632 rtx mem = aarch64_old_svcr_mem ();
9633 MEM_VOLATILE_P (mem) = 1;
9634 if (TARGET_SME)
9636 rtx reg = gen_rtx_REG (DImode, IP0_REGNUM);
9637 emit_insn (gen_aarch64_read_svcr (reg));
9638 emit_move_insn (mem, reg);
9640 else
9642 rtx old_r0 = NULL_RTX, old_r1 = NULL_RTX;
9643 auto &args = crtl->args.info;
9644 if (args.aapcs_ncrn > 0)
9646 old_r0 = gen_rtx_REG (DImode, PROBE_STACK_FIRST_REGNUM);
9647 emit_move_insn (old_r0, gen_rtx_REG (DImode, R0_REGNUM));
9649 if (args.aapcs_ncrn > 1)
9651 old_r1 = gen_rtx_REG (DImode, PROBE_STACK_SECOND_REGNUM);
9652 emit_move_insn (old_r1, gen_rtx_REG (DImode, R1_REGNUM));
9654 emit_insn (gen_aarch64_get_sme_state ());
9655 emit_move_insn (mem, gen_rtx_REG (DImode, R0_REGNUM));
9656 if (old_r0)
9657 emit_move_insn (gen_rtx_REG (DImode, R0_REGNUM), old_r0);
9658 if (old_r1)
9659 emit_move_insn (gen_rtx_REG (DImode, R1_REGNUM), old_r1);
9663 /* Enable PSTATE.SM, if required. */
9664 if (aarch64_cfun_enables_pstate_sm ())
9666 rtx_insn *guard_label = nullptr;
9667 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9669 /* The current function is streaming-compatible. Save the
9670 original state of PSTATE.SM. */
9671 rtx svcr = gen_rtx_REG (DImode, IP0_REGNUM);
9672 emit_insn (gen_aarch64_read_svcr (svcr));
9673 emit_move_insn (aarch64_old_svcr_mem (), svcr);
9674 guard_label = aarch64_guard_switch_pstate_sm (svcr,
9675 aarch64_isa_flags);
9677 aarch64_sme_mode_switch_regs args_switch;
9678 auto &args = crtl->args.info;
9679 for (unsigned int i = 0; i < args.num_sme_mode_switch_args; ++i)
9681 rtx x = args.sme_mode_switch_args[i];
9682 args_switch.add_reg (GET_MODE (x), REGNO (x));
9684 args_switch.emit_prologue ();
9685 emit_insn (gen_aarch64_smstart_sm ());
9686 args_switch.emit_epilogue ();
9687 if (guard_label)
9688 emit_label (guard_label);
9692 /* Return TRUE if we can use a simple_return insn.
9694 This function checks whether the callee saved stack is empty, which
9695 means no restore actions are needed. The pro_and_epilogue pass will use
9696 this to check whether shrink-wrapping opt is feasible. */
9698 bool
9699 aarch64_use_return_insn_p (void)
9701 if (!reload_completed)
9702 return false;
9704 if (crtl->profile)
9705 return false;
9707 return known_eq (cfun->machine->frame.frame_size, 0);
9710 /* Generate the epilogue instructions for returning from a function.
9711 This is almost exactly the reverse of the prologue sequence, except
9712 that we need to insert barriers to avoid scheduling loads that read
9713 from a deallocated stack, and we optimize the unwind records by
9714 emitting them all together if possible. */
9715 void
9716 aarch64_expand_epilogue (rtx_call_insn *sibcall)
9718 aarch64_frame &frame = cfun->machine->frame;
9719 poly_int64 initial_adjust = frame.initial_adjust;
9720 HOST_WIDE_INT callee_adjust = frame.callee_adjust;
9721 poly_int64 final_adjust = frame.final_adjust;
9722 poly_int64 sve_callee_adjust = frame.sve_callee_adjust;
9723 poly_int64 bytes_below_hard_fp = frame.bytes_below_hard_fp;
9724 unsigned reg1 = frame.wb_pop_candidate1;
9725 unsigned reg2 = frame.wb_pop_candidate2;
9726 rtx cfi_ops = NULL;
9727 rtx_insn *insn;
9728 /* A stack clash protection prologue may not have left EP0_REGNUM or
9729 EP1_REGNUM in a usable state. The same is true for allocations
9730 with an SVE component, since we then need both temporary registers
9731 for each allocation. For stack clash we are in a usable state if
9732 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
9733 HOST_WIDE_INT guard_size
9734 = 1 << param_stack_clash_protection_guard_size;
9735 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9736 aarch64_feature_flags force_isa_mode = 0;
9737 if (aarch64_cfun_enables_pstate_sm ())
9738 force_isa_mode = AARCH64_FL_SM_ON;
9740 /* We can re-use the registers when:
9742 (a) the deallocation amount is the same as the corresponding
9743 allocation amount (which is false if we combine the initial
9744 and SVE callee save allocations in the prologue); and
9746 (b) the allocation amount doesn't need a probe (which is false
9747 if the amount is guard_size - guard_used_by_caller or greater).
9749 In such situations the register should remain live with the correct
9750 value. */
9751 bool can_inherit_p = (initial_adjust.is_constant ()
9752 && final_adjust.is_constant ()
9753 && (!flag_stack_clash_protection
9754 || (known_lt (initial_adjust,
9755 guard_size - guard_used_by_caller)
9756 && known_eq (sve_callee_adjust, 0))));
9758 /* We need to add memory barrier to prevent read from deallocated stack. */
9759 bool need_barrier_p
9760 = maybe_ne (get_frame_size ()
9761 + frame.saved_varargs_size, 0);
9763 /* Reset PSTATE.SM, if required. */
9764 if (aarch64_cfun_enables_pstate_sm ())
9766 rtx_insn *guard_label = nullptr;
9767 if (known_ge (cfun->machine->frame.old_svcr_offset, 0))
9768 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
9769 aarch64_isa_flags);
9770 aarch64_sme_mode_switch_regs return_switch;
9771 if (sibcall)
9772 return_switch.add_call_args (sibcall);
9773 else if (crtl->return_rtx && REG_P (crtl->return_rtx))
9774 return_switch.add_reg (GET_MODE (crtl->return_rtx),
9775 REGNO (crtl->return_rtx));
9776 return_switch.emit_prologue ();
9777 emit_insn (gen_aarch64_smstop_sm ());
9778 return_switch.emit_epilogue ();
9779 if (guard_label)
9780 emit_label (guard_label);
9783 /* Emit a barrier to prevent loads from a deallocated stack. */
9784 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
9785 || cfun->calls_alloca
9786 || crtl->calls_eh_return)
9788 aarch64_emit_stack_tie (stack_pointer_rtx);
9789 need_barrier_p = false;
9792 /* Restore the stack pointer from the frame pointer if it may not
9793 be the same as the stack pointer. */
9794 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
9795 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
9796 if (frame_pointer_needed
9797 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
9798 /* If writeback is used when restoring callee-saves, the CFA
9799 is restored on the instruction doing the writeback. */
9800 aarch64_add_offset (Pmode, stack_pointer_rtx,
9801 hard_frame_pointer_rtx,
9802 -bytes_below_hard_fp + final_adjust,
9803 tmp1_rtx, tmp0_rtx, force_isa_mode,
9804 callee_adjust == 0);
9805 else
9806 /* The case where we need to re-use the register here is very rare, so
9807 avoid the complicated condition and just always emit a move if the
9808 immediate doesn't fit. */
9809 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, force_isa_mode, true);
9811 /* Restore the vector registers before the predicate registers,
9812 so that we can use P4 as a temporary for big-endian SVE frames. */
9813 aarch64_restore_callee_saves (final_adjust, frame.saved_fprs, &cfi_ops);
9814 aarch64_restore_callee_saves (final_adjust, frame.saved_prs, &cfi_ops);
9815 if (maybe_ne (sve_callee_adjust, 0))
9816 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust,
9817 force_isa_mode, true);
9819 /* When shadow call stack is enabled, the scs_pop in the epilogue will
9820 restore x30, so we don't need to restore x30 again in the traditional
9821 way. */
9822 aarch64_restore_callee_saves (final_adjust + sve_callee_adjust,
9823 frame.saved_gprs, &cfi_ops);
9825 if (need_barrier_p)
9826 aarch64_emit_stack_tie (stack_pointer_rtx);
9828 if (callee_adjust != 0)
9829 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
9831 /* If we have no register restore information, the CFA must have been
9832 defined in terms of the stack pointer since the end of the prologue. */
9833 gcc_assert (cfi_ops || !frame_pointer_needed);
9835 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
9837 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
9838 insn = get_last_insn ();
9839 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
9840 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
9841 RTX_FRAME_RELATED_P (insn) = 1;
9842 cfi_ops = NULL;
9845 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
9846 we restrict the emit_move optimization to leaf functions. */
9847 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust, force_isa_mode,
9848 (!can_inherit_p || !crtl->is_leaf
9849 || df_regs_ever_live_p (EP0_REGNUM)));
9851 if (cfi_ops)
9853 /* Emit delayed restores and reset the CFA to be SP. */
9854 insn = get_last_insn ();
9855 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
9856 REG_NOTES (insn) = cfi_ops;
9857 RTX_FRAME_RELATED_P (insn) = 1;
9860 /* Pop return address from shadow call stack. */
9861 if (frame.is_scs_enabled)
9863 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
9864 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
9866 insn = emit_insn (gen_scs_pop ());
9867 add_reg_note (insn, REG_CFA_RESTORE, reg);
9868 RTX_FRAME_RELATED_P (insn) = 1;
9871 /* Stack adjustment for exception handler. */
9872 if (crtl->calls_eh_return && !sibcall)
9874 /* If the EH_RETURN_TAKEN_RTX flag is set then we need
9875 to unwind the stack and jump to the handler, otherwise
9876 skip this eh_return logic and continue with normal
9877 return after the label. We have already reset the CFA
9878 to be SP; letting the CFA move during this adjustment
9879 is just as correct as retaining the CFA from the body
9880 of the function. Therefore, do nothing special. */
9881 rtx label = gen_label_rtx ();
9882 rtx x = gen_rtx_EQ (VOIDmode, EH_RETURN_TAKEN_RTX, const0_rtx);
9883 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
9884 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
9885 rtx jump = emit_jump_insn (gen_rtx_SET (pc_rtx, x));
9886 JUMP_LABEL (jump) = label;
9887 LABEL_NUSES (label)++;
9888 emit_insn (gen_add2_insn (stack_pointer_rtx,
9889 EH_RETURN_STACKADJ_RTX));
9890 emit_jump_insn (gen_indirect_jump (EH_RETURN_HANDLER_RTX));
9891 emit_barrier ();
9892 emit_label (label);
9895 /* We prefer to emit the combined return/authenticate instruction RETAA,
9896 however there are two cases in which we must instead emit an explicit
9897 authentication instruction.
9899 1) Sibcalls don't return in a normal way, so if we're about to call one
9900 we must authenticate.
9902 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
9903 generating code for !TARGET_ARMV8_3 we can't use it and must
9904 explicitly authenticate. */
9906 if (aarch64_return_address_signing_enabled ()
9907 && (sibcall || !TARGET_ARMV8_3))
9909 switch (aarch_ra_sign_key)
9911 case AARCH_KEY_A:
9912 insn = emit_insn (gen_autiasp ());
9913 break;
9914 case AARCH_KEY_B:
9915 insn = emit_insn (gen_autibsp ());
9916 break;
9917 default:
9918 gcc_unreachable ();
9920 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
9921 RTX_FRAME_RELATED_P (insn) = 1;
9924 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
9925 if (!sibcall)
9926 emit_jump_insn (ret_rtx);
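
/* Standalone sketch (not used by the compiler) of the can_inherit_p test
   near the top of aarch64_expand_epilogue: the temporaries left by the
   prologue can only be reused when both adjustments are compile-time
   constants and, under stack clash protection, the initial adjustment
   was small enough not to need a probe and no SVE allocation was folded
   into it.  Plain integers stand in for the poly_int64 values.  */

static inline bool
example_can_reuse_prologue_temps (bool initial_is_constant,
				  bool final_is_constant,
				  bool stack_clash_protection,
				  long initial_adjust, long sve_callee_adjust,
				  long guard_size, long caller_guard)
{
  return (initial_is_constant
	  && final_is_constant
	  && (!stack_clash_protection
	      || (initial_adjust < guard_size - caller_guard
		  && sve_callee_adjust == 0)));
}
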
9929 /* Output code to add DELTA to the first argument, and then jump
9930 to FUNCTION. Used for C++ multiple inheritance. */
9931 static void
9932 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
9933 HOST_WIDE_INT delta,
9934 HOST_WIDE_INT vcall_offset,
9935 tree function)
9937 /* The this pointer is always in x0. Note that this differs from
9938 Arm where the this pointer may be bumped to r1 if r0 is required
9939 to return a pointer to an aggregate. On AArch64 a result value
9940 pointer will be in x8. */
9941 int this_regno = R0_REGNUM;
9942 rtx this_rtx, temp0, temp1, addr, funexp;
9943 rtx_insn *insn;
9944 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
9946 if (aarch_bti_enabled ())
9947 emit_insn (gen_bti_c ());
9949 reload_completed = 1;
9950 emit_note (NOTE_INSN_PROLOGUE_END);
9952 this_rtx = gen_rtx_REG (Pmode, this_regno);
9953 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
9954 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
9956 if (vcall_offset == 0)
9957 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0,
9958 0, false);
9959 else
9961 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
9963 addr = this_rtx;
9964 if (delta != 0)
9966 if (delta >= -256 && delta < 256)
9967 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
9968 plus_constant (Pmode, this_rtx, delta));
9969 else
9970 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
9971 temp1, temp0, 0, false);
9974 if (Pmode == ptr_mode)
9975 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
9976 else
9977 aarch64_emit_move (temp0,
9978 gen_rtx_ZERO_EXTEND (Pmode,
9979 gen_rtx_MEM (ptr_mode, addr)));
9981 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
9982 addr = plus_constant (Pmode, temp0, vcall_offset);
9983 else
9985 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
9986 Pmode);
9987 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
9990 if (Pmode == ptr_mode)
9991 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
9992 else
9993 aarch64_emit_move (temp1,
9994 gen_rtx_SIGN_EXTEND (Pmode,
9995 gen_rtx_MEM (ptr_mode, addr)));
9997 emit_insn (gen_add2_insn (this_rtx, temp1));
10000 /* Generate a tail call to the target function. */
10001 if (!TREE_USED (function))
10003 assemble_external (function);
10004 TREE_USED (function) = 1;
10006 funexp = XEXP (DECL_RTL (function), 0);
10007 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10008 auto isa_mode = aarch64_fntype_isa_mode (TREE_TYPE (function));
10009 auto pcs_variant = arm_pcs (fndecl_abi (function).id ());
10010 rtx callee_abi = aarch64_gen_callee_cookie (isa_mode, pcs_variant);
10011 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10012 SIBLING_CALL_P (insn) = 1;
10014 insn = get_insns ();
10015 shorten_branches (insn);
10017 assemble_start_function (thunk, fnname);
10018 final_start_function (insn, file, 1);
10019 final (insn, file, 1);
10020 final_end_function ();
10021 assemble_end_function (thunk, fnname);
10023 /* Stop pretending to be a post-reload pass. */
10024 reload_completed = 0;
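
/* Standalone sketch (not used by the compiler) of the pointer adjustment
   that the emitted thunk performs at run time: the incoming this pointer
   is bumped by DELTA, and when VCALL_OFFSET is nonzero a further
   adjustment is loaded from the adjusted object's vtable at that byte
   offset and added as well.  */

static inline char *
example_thunk_adjust_this (char *this_ptr, long delta, long vcall_offset)
{
  this_ptr += delta;
  if (vcall_offset != 0)
    {
      /* *this_ptr holds the vtable pointer after the DELTA adjustment.  */
      char *vtable = *(char **) this_ptr;
      this_ptr += *(long *) (vtable + vcall_offset);
    }
  return this_ptr;
}
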
10027 static bool
10028 aarch64_tls_referenced_p (rtx x)
10030 if (!TARGET_HAVE_TLS)
10031 return false;
10032 subrtx_iterator::array_type array;
10033 FOR_EACH_SUBRTX (iter, array, x, ALL)
10035 const_rtx x = *iter;
10036 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10037 return true;
10038 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10039 TLS offsets, not real symbol references. */
10040 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10041 iter.skip_subrtxes ();
10043 return false;
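
/* Standalone sketch (not part of the compiler) of the shape of the scan
   above: walk an expression tree looking for nodes with a given
   property, but refuse to look inside nodes marked opaque, the analogue
   of calling skip_subrtxes for UNSPEC_TLS.  The toy node type is purely
   illustrative.  */

struct example_expr
{
  bool interesting;			/* The property being searched for.  */
  bool opaque;				/* Do not inspect the operands.  */
  struct example_expr *op0, *op1;	/* Subexpressions, possibly null.  */
};

static bool
example_expr_referenced_p (const struct example_expr *x)
{
  if (!x)
    return false;
  if (x->interesting)
    return true;
  if (x->opaque)
    return false;
  return (example_expr_referenced_p (x->op0)
	  || example_expr_referenced_p (x->op1));
}
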
10047 static bool
10048 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10050 if (GET_CODE (x) == HIGH)
10051 return true;
10053 /* There's no way to calculate VL-based values using relocations. */
10054 subrtx_iterator::array_type array;
10055 HOST_WIDE_INT factor;
10056 FOR_EACH_SUBRTX (iter, array, x, ALL)
10057 if (GET_CODE (*iter) == CONST_POLY_INT
10058 || aarch64_sme_vq_unspec_p (x, &factor))
10059 return true;
10061 poly_int64 offset;
10062 rtx base = strip_offset_and_salt (x, &offset);
10063 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10065 /* We checked for POLY_INT_CST offsets above. */
10066 if (aarch64_classify_symbol (base, offset.to_constant ())
10067 != SYMBOL_FORCE_TO_MEM)
10068 return true;
10069 else
10070 /* Avoid generating a 64-bit relocation in ILP32; leave
10071 to aarch64_expand_mov_immediate to handle it properly. */
10072 return mode != ptr_mode;
10075 return aarch64_tls_referenced_p (x);
10078 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10079 The expansion for a table switch is quite expensive due to the number
10080 of instructions, the table lookup and the hard-to-predict indirect jump.
10081 When optimizing for speed, and -O3 enabled, use the per-core tuning if
10082 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10083 performance. When optimizing for size, use 8 for smallest codesize. */
10085 static unsigned int
10086 aarch64_case_values_threshold (void)
10088 /* Use the specified limit for the number of cases before using jump
10089 tables at higher optimization levels. */
10090 if (optimize > 2
10091 && aarch64_tune_params.max_case_values != 0)
10092 return aarch64_tune_params.max_case_values;
10093 else
10094 return optimize_size ? 8 : 11;
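
/* Standalone restatement (not used by the compiler) of the policy above,
   with the inputs made explicit: honour a per-core limit at -O3 and
   above when one is set, otherwise require 11 cases when optimizing for
   speed and 8 when optimizing for size before a jump table is used.  */

static inline unsigned int
example_case_values_threshold (int optimize_level, bool optimize_size_p,
			       unsigned int tuned_max_case_values)
{
  if (optimize_level > 2 && tuned_max_case_values != 0)
    return tuned_max_case_values;
  return optimize_size_p ? 8 : 11;
}
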
10097 /* Return true if register REGNO is a valid index register.
10098 STRICT_P is true if REG_OK_STRICT is in effect. */
10100 bool
10101 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10103 if (!HARD_REGISTER_NUM_P (regno))
10105 if (!strict_p)
10106 return true;
10108 if (!reg_renumber)
10109 return false;
10111 regno = reg_renumber[regno];
10113 return GP_REGNUM_P (regno);
10116 /* Return true if register REGNO is a valid base register for mode MODE.
10117 STRICT_P is true if REG_OK_STRICT is in effect. */
10119 bool
10120 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10122 if (!HARD_REGISTER_NUM_P (regno))
10124 if (!strict_p)
10125 return true;
10127 if (!reg_renumber)
10128 return false;
10130 regno = reg_renumber[regno];
10133 /* The fake registers will be eliminated to either the stack or
10134 hard frame pointer, both of which are usually valid base registers.
10135 Reload deals with the cases where the eliminated form isn't valid. */
10136 return (GP_REGNUM_P (regno)
10137 || regno == SP_REGNUM
10138 || regno == FRAME_POINTER_REGNUM
10139 || regno == ARG_POINTER_REGNUM);
10142 /* Return true if X is a valid base register for mode MODE.
10143 STRICT_P is true if REG_OK_STRICT is in effect. */
10145 static bool
10146 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10148 if (!strict_p
10149 && SUBREG_P (x)
10150 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10151 x = SUBREG_REG (x);
10153 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10156 /* Return true if address offset is a valid index. If it is, fill in INFO
10157 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10159 static bool
10160 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10161 machine_mode mode, bool strict_p)
10163 enum aarch64_address_type type;
10164 rtx index;
10165 int shift;
10167 /* (reg:P) */
10168 if ((REG_P (x) || SUBREG_P (x))
10169 && GET_MODE (x) == Pmode)
10171 type = ADDRESS_REG_REG;
10172 index = x;
10173 shift = 0;
10175 /* (sign_extend:DI (reg:SI)) */
10176 else if ((GET_CODE (x) == SIGN_EXTEND
10177 || GET_CODE (x) == ZERO_EXTEND)
10178 && GET_MODE (x) == DImode
10179 && GET_MODE (XEXP (x, 0)) == SImode)
10181 type = (GET_CODE (x) == SIGN_EXTEND)
10182 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10183 index = XEXP (x, 0);
10184 shift = 0;
10186 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10187 else if (GET_CODE (x) == MULT
10188 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10189 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10190 && GET_MODE (XEXP (x, 0)) == DImode
10191 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10192 && CONST_INT_P (XEXP (x, 1)))
10194 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10195 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10196 index = XEXP (XEXP (x, 0), 0);
10197 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10199 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10200 else if (GET_CODE (x) == ASHIFT
10201 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10202 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10203 && GET_MODE (XEXP (x, 0)) == DImode
10204 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10205 && CONST_INT_P (XEXP (x, 1)))
10207 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10208 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10209 index = XEXP (XEXP (x, 0), 0);
10210 shift = INTVAL (XEXP (x, 1));
10212 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10213 (const_int 0xffffffff<<shift)) */
10214 else if (GET_CODE (x) == AND
10215 && GET_MODE (x) == DImode
10216 && GET_CODE (XEXP (x, 0)) == MULT
10217 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10218 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10219 && CONST_INT_P (XEXP (x, 1)))
10221 type = ADDRESS_REG_UXTW;
10222 index = XEXP (XEXP (x, 0), 0);
10223 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10224 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10225 shift = -1;
10227 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10228 (const_int 0xffffffff<<shift)) */
10229 else if (GET_CODE (x) == AND
10230 && GET_MODE (x) == DImode
10231 && GET_CODE (XEXP (x, 0)) == ASHIFT
10232 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10233 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10234 && CONST_INT_P (XEXP (x, 1)))
10236 type = ADDRESS_REG_UXTW;
10237 index = XEXP (XEXP (x, 0), 0);
10238 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10239 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10240 shift = -1;
10242 /* (mult:P (reg:P) (const_int scale)) */
10243 else if (GET_CODE (x) == MULT
10244 && GET_MODE (x) == Pmode
10245 && GET_MODE (XEXP (x, 0)) == Pmode
10246 && CONST_INT_P (XEXP (x, 1)))
10248 type = ADDRESS_REG_REG;
10249 index = XEXP (x, 0);
10250 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10252 /* (ashift:P (reg:P) (const_int shift)) */
10253 else if (GET_CODE (x) == ASHIFT
10254 && GET_MODE (x) == Pmode
10255 && GET_MODE (XEXP (x, 0)) == Pmode
10256 && CONST_INT_P (XEXP (x, 1)))
10258 type = ADDRESS_REG_REG;
10259 index = XEXP (x, 0);
10260 shift = INTVAL (XEXP (x, 1));
10262 else
10263 return false;
10265 if (!strict_p
10266 && SUBREG_P (index)
10267 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10268 index = SUBREG_REG (index);
10270 if (aarch64_sve_data_mode_p (mode) || mode == VNx1TImode)
10272 if (type != ADDRESS_REG_REG
10273 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10274 return false;
10276 else
10278 if (shift != 0
10279 && !(IN_RANGE (shift, 1, 3)
10280 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10281 return false;
10284 if (REG_P (index)
10285 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10287 info->type = type;
10288 info->offset = index;
10289 info->shift = shift;
10290 return true;
10293 return false;
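
/* Standalone sketch (not used by the compiler) of the shift check applied
   above for non-SVE modes: an index may be unscaled, or scaled left by
   1 to 3 bits provided the scale matches the access size exactly.  */

static inline bool
example_valid_index_shift_p (int shift, long access_size)
{
  if (shift == 0)
    return true;
  return shift >= 1 && shift <= 3 && (1L << shift) == access_size;
}
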
10296 /* Return true if MODE is one of the modes for which we
10297 support LDP/STP operations. */
10299 static bool
10300 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10302 return mode == SImode || mode == DImode
10303 || mode == SFmode || mode == DFmode
10304 || mode == SDmode || mode == DDmode
10305 || (aarch64_vector_mode_supported_p (mode)
10306 && (known_eq (GET_MODE_SIZE (mode), 8)
10307 || (known_eq (GET_MODE_SIZE (mode), 16)
10308 && (aarch64_tune_params.extra_tuning_flags
10309 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
10312 /* Return true if REGNO is a virtual pointer register, or an eliminable
10313 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10314 include stack_pointer or hard_frame_pointer. */
10315 static bool
10316 virt_or_elim_regno_p (unsigned regno)
10318 return ((regno >= FIRST_VIRTUAL_REGISTER
10319 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10320 || regno == FRAME_POINTER_REGNUM
10321 || regno == ARG_POINTER_REGNUM);
10324 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10325 If it is, fill in INFO appropriately. STRICT_P is true if
10326 REG_OK_STRICT is in effect. */
10328 bool
10329 aarch64_classify_address (struct aarch64_address_info *info,
10330 rtx x, machine_mode mode, bool strict_p,
10331 aarch64_addr_query_type type)
10333 enum rtx_code code = GET_CODE (x);
10334 rtx op0, op1;
10335 poly_int64 offset;
10337 HOST_WIDE_INT const_size;
10339 /* Whether a vector mode is partial doesn't affect address legitimacy.
10340 Partial vectors like VNx8QImode allow the same indexed addressing
10341 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10342 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10343 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10344 vec_flags &= ~VEC_PARTIAL;
10346 /* On BE, we use load/store pair for all large int mode load/stores.
10347 TI/TF/TDmode may also use a load/store pair. */
10348 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10349 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10350 || type == ADDR_QUERY_LDP_STP_N
10351 || mode == TImode
10352 || mode == TFmode
10353 || mode == TDmode
10354 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10355 && advsimd_struct_p));
10356 /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
10357 corresponds to the actual size of the memory being loaded/stored and the
10358 mode of the corresponding addressing mode is half of that. */
10359 if (type == ADDR_QUERY_LDP_STP_N)
10361 if (known_eq (GET_MODE_SIZE (mode), 32))
10362 mode = V16QImode;
10363 else if (known_eq (GET_MODE_SIZE (mode), 16))
10364 mode = DFmode;
10365 else if (known_eq (GET_MODE_SIZE (mode), 8))
10366 mode = SFmode;
10367 else
10368 return false;
10370 /* This isn't really an Advanced SIMD struct mode, but a mode
10371 used to represent the complete mem in a load/store pair. */
10372 advsimd_struct_p = false;
10375 bool allow_reg_index_p = (!load_store_pair_p
10376 && ((vec_flags == 0
10377 && known_lt (GET_MODE_SIZE (mode), 16))
10378 || vec_flags == VEC_ADVSIMD
10379 || vec_flags & VEC_SVE_DATA
10380 || mode == VNx1TImode));
10382 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10383 The latter is not valid for SVE predicates, and that's rejected through
10384 allow_reg_index_p above. */
10385 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10386 && (code != REG && code != PLUS))
10387 return false;
10389 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10390 REG addressing. */
10391 if (advsimd_struct_p
10392 && TARGET_SIMD
10393 && !BYTES_BIG_ENDIAN
10394 && (code != POST_INC && code != REG))
10395 return false;
10397 gcc_checking_assert (GET_MODE (x) == VOIDmode
10398 || SCALAR_INT_MODE_P (GET_MODE (x)));
10400 switch (code)
10402 case REG:
10403 case SUBREG:
10404 info->type = ADDRESS_REG_IMM;
10405 info->base = x;
10406 info->offset = const0_rtx;
10407 info->const_offset = 0;
10408 return aarch64_base_register_rtx_p (x, strict_p);
10410 case PLUS:
10411 op0 = XEXP (x, 0);
10412 op1 = XEXP (x, 1);
10414 if (! strict_p
10415 && REG_P (op0)
10416 && virt_or_elim_regno_p (REGNO (op0))
10417 && poly_int_rtx_p (op1, &offset))
10419 info->type = ADDRESS_REG_IMM;
10420 info->base = op0;
10421 info->offset = op1;
10422 info->const_offset = offset;
10424 return true;
10427 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10428 && aarch64_base_register_rtx_p (op0, strict_p)
10429 && poly_int_rtx_p (op1, &offset))
10431 info->type = ADDRESS_REG_IMM;
10432 info->base = op0;
10433 info->offset = op1;
10434 info->const_offset = offset;
10436 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10437 registers and individual Q registers. The available
10438 address modes are:
10439 X,X: 7-bit signed scaled offset
10440 Q: 9-bit signed offset
10441 We conservatively require an offset representable in either mode.
10442 When performing the check for pairs of X registers i.e. LDP/STP
10443 pass down DImode since that is the natural size of the LDP/STP
10444 instruction memory accesses. */
10445 if (mode == TImode || mode == TFmode || mode == TDmode)
10446 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10447 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10448 || offset_12bit_unsigned_scaled_p (mode, offset)));
10450 if (mode == V8DImode)
10451 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10452 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10454 /* A 7bit offset check because OImode will emit a ldp/stp
10455 instruction (only !TARGET_SIMD or big endian will get here).
10456 For ldp/stp instructions, the offset is scaled for the size of a
10457 single element of the pair. */
10458 if (aarch64_advsimd_partial_struct_mode_p (mode)
10459 && known_eq (GET_MODE_SIZE (mode), 16))
10460 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10461 if (aarch64_advsimd_full_struct_mode_p (mode)
10462 && known_eq (GET_MODE_SIZE (mode), 32))
10463 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10465 /* Three 9/12-bit offset checks because CImode will emit three
10466 ldr/str instructions (only !TARGET_SIMD or big endian will
10467 get here). */
10468 if (aarch64_advsimd_partial_struct_mode_p (mode)
10469 && known_eq (GET_MODE_SIZE (mode), 24))
10470 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10471 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10472 offset + 16)
10473 || offset_12bit_unsigned_scaled_p (DImode,
10474 offset + 16)));
10475 if (aarch64_advsimd_full_struct_mode_p (mode)
10476 && known_eq (GET_MODE_SIZE (mode), 48))
10477 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10478 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10479 offset + 32)
10480 || offset_12bit_unsigned_scaled_p (TImode,
10481 offset + 32)));
10483 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10484 instructions (only big endian will get here). */
10485 if (aarch64_advsimd_partial_struct_mode_p (mode)
10486 && known_eq (GET_MODE_SIZE (mode), 32))
10487 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10488 && aarch64_offset_7bit_signed_scaled_p (DImode,
10489 offset + 16));
10490 if (aarch64_advsimd_full_struct_mode_p (mode)
10491 && known_eq (GET_MODE_SIZE (mode), 64))
10492 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10493 && aarch64_offset_7bit_signed_scaled_p (TImode,
10494 offset + 32));
10496 /* Make "m" use the LD1 offset range for SVE data modes, so
10497 that pre-RTL optimizers like ivopts optimize for that range
10498 instead of the wider LDR/STR range. */
10499 if (vec_flags == VEC_SVE_DATA || mode == VNx1TImode)
10500 return (type == ADDR_QUERY_M
10501 ? offset_4bit_signed_scaled_p (mode, offset)
10502 : offset_9bit_signed_scaled_p (mode, offset));
10504 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10506 poly_int64 end_offset = (offset
10507 + GET_MODE_SIZE (mode)
10508 - BYTES_PER_SVE_VECTOR);
10509 return (type == ADDR_QUERY_M
10510 ? offset_4bit_signed_scaled_p (mode, offset)
10511 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10512 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10513 end_offset)));
10516 if (vec_flags == VEC_SVE_PRED)
10517 return offset_9bit_signed_scaled_p (mode, offset);
10519 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
10521 poly_int64 end_offset = (offset
10522 + GET_MODE_SIZE (mode)
10523 - BYTES_PER_SVE_PRED);
10524 return (offset_9bit_signed_scaled_p (VNx16BImode, end_offset)
10525 && offset_9bit_signed_scaled_p (VNx16BImode, offset));
10528 if (load_store_pair_p)
10529 return ((known_eq (GET_MODE_SIZE (mode), 4)
10530 || known_eq (GET_MODE_SIZE (mode), 8)
10531 || known_eq (GET_MODE_SIZE (mode), 16))
10532 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10533 else
10534 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10535 || offset_12bit_unsigned_scaled_p (mode, offset));
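/* Concrete examples of the load/store-pair range above (illustrative):
   for a pair of W registers the offset is a signed 7-bit multiple of 4,
   i.e. -256 .. 252, and for X registers a multiple of 8 in -512 .. 504:

     ldp w0, w1, [sp, #252]    // accepted
     ldp x0, x1, [sp, #-512]   // accepted
     ldp x0, x1, [sp, #512]    // rejected: outside the imm7 range  */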
10538 if (allow_reg_index_p)
10540 /* Look for base + (scaled/extended) index register. */
10541 if (aarch64_base_register_rtx_p (op0, strict_p)
10542 && aarch64_classify_index (info, op1, mode, strict_p))
10544 info->base = op0;
10545 return true;
10547 if (aarch64_base_register_rtx_p (op1, strict_p)
10548 && aarch64_classify_index (info, op0, mode, strict_p))
10550 info->base = op1;
10551 return true;
10555 return false;
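/* A base plus scaled or extended index accepted by the code above
   corresponds to addresses such as (illustrative):

     ldr w0, [x1, x2, lsl #2]    // 64-bit index scaled by the access size
     ldr x0, [x1, w2, sxtw #3]   // sign-extended 32-bit index, scaled
     ldrb w0, [x1, x2]           // plain register index  */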
10557 case POST_INC:
10558 case POST_DEC:
10559 case PRE_INC:
10560 case PRE_DEC:
10561 info->type = ADDRESS_REG_WB;
10562 info->base = XEXP (x, 0);
10563 info->offset = NULL_RTX;
10564 return aarch64_base_register_rtx_p (info->base, strict_p);
10566 case POST_MODIFY:
10567 case PRE_MODIFY:
10568 info->type = ADDRESS_REG_WB;
10569 info->base = XEXP (x, 0);
10570 if (GET_CODE (XEXP (x, 1)) == PLUS
10571 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
10572 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
10573 && aarch64_base_register_rtx_p (info->base, strict_p))
10575 info->offset = XEXP (XEXP (x, 1), 1);
10576 info->const_offset = offset;
10578 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10579 registers and individual Q registers. The available
10580 address modes are:
10581 X,X: 7-bit signed scaled offset
10582 Q: 9-bit signed offset
10583 We conservatively require an offset representable in either mode. */
10585 if (mode == TImode || mode == TFmode || mode == TDmode)
10586 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
10587 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
10589 if (load_store_pair_p)
10590 return ((known_eq (GET_MODE_SIZE (mode), 4)
10591 || known_eq (GET_MODE_SIZE (mode), 8)
10592 || known_eq (GET_MODE_SIZE (mode), 16))
10593 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10594 else
10595 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
10597 return false;
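/* The writeback forms classified above correspond to addresses like
   (illustrative):

     ldr x0, [x1], #16          // post-index, 9-bit signed immediate
     str x0, [x1, #-16]!        // pre-index
     stp x29, x30, [sp, #-32]!  // pre-index pair, 7-bit signed scaled  */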
10599 case CONST:
10600 case SYMBOL_REF:
10601 case LABEL_REF:
10602 /* load literal: pc-relative constant pool entry. Only supported
10603 for SI mode or larger. */
10604 info->type = ADDRESS_SYMBOLIC;
10606 if (!load_store_pair_p
10607 && GET_MODE_SIZE (mode).is_constant (&const_size)
10608 && const_size >= 4)
10610 poly_int64 offset;
10611 rtx sym = strip_offset_and_salt (x, &offset);
10612 return ((LABEL_REF_P (sym)
10613 || (SYMBOL_REF_P (sym)
10614 && CONSTANT_POOL_ADDRESS_P (sym)
10615 && aarch64_pcrelative_literal_loads)));
10617 return false;
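/* A PC-relative literal load accepted here typically looks like
   (illustrative):

     ldr w0, .LC0     // 4-byte constant from the literal pool
     ldr q0, .LC1     // 16-byte constant, still a single literal load

   and requires aarch64_pcrelative_literal_loads plus a pool entry within
   the +/-1MiB LDR (literal) range.  */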
10619 case LO_SUM:
10620 info->type = ADDRESS_LO_SUM;
10621 info->base = XEXP (x, 0);
10622 info->offset = XEXP (x, 1);
10623 if (allow_reg_index_p
10624 && aarch64_base_register_rtx_p (info->base, strict_p))
10626 poly_int64 offset;
10627 HOST_WIDE_INT const_offset;
10628 rtx sym = strip_offset_and_salt (info->offset, &offset);
10629 if (SYMBOL_REF_P (sym)
10630 && offset.is_constant (&const_offset)
10631 && (aarch64_classify_symbol (sym, const_offset)
10632 == SYMBOL_SMALL_ABSOLUTE))
10634 /* The symbol and offset must be aligned to the access size. */
10635 unsigned int align;
10637 if (CONSTANT_POOL_ADDRESS_P (sym))
10638 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
10639 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
10641 tree exp = SYMBOL_REF_DECL (sym);
10642 align = TYPE_ALIGN (TREE_TYPE (exp));
10643 align = aarch64_constant_alignment (exp, align);
10645 else if (SYMBOL_REF_DECL (sym))
10646 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
10647 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
10648 && SYMBOL_REF_BLOCK (sym) != NULL)
10649 align = SYMBOL_REF_BLOCK (sym)->alignment;
10650 else
10651 align = BITS_PER_UNIT;
10653 poly_int64 ref_size = GET_MODE_SIZE (mode);
10654 if (known_eq (ref_size, 0))
10655 ref_size = GET_MODE_SIZE (DImode);
10657 return (multiple_p (const_offset, ref_size)
10658 && multiple_p (align / BITS_PER_UNIT, ref_size));
10661 return false;
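/* The LO_SUM form is the second half of the small-model ADRP sequence,
   e.g. (illustrative, "some_global" being a placeholder symbol):

     adrp x0, some_global
     ldr  w1, [x0, #:lo12:some_global]

   The alignment check above guarantees that the :lo12: part is a multiple
   of the access size, as the scaled immediate encoding requires.  */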
10663 default:
10664 return false;
10668 /* Return true if the address X is valid for a PRFM instruction.
10669 STRICT_P is true if we should do strict checking with
10670 aarch64_classify_address. */
10672 bool
10673 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
10675 struct aarch64_address_info addr;
10677 /* PRFM accepts the same addresses as DImode... */
10678 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
10679 if (!res)
10680 return false;
10682 /* ... except writeback forms. */
10683 return addr.type != ADDRESS_REG_WB;
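/* For instance (illustrative), "prfm pldl1keep, [x0, #256]" is accepted,
   whereas post-indexed or pre-indexed addresses are rejected here because
   PRFM has no writeback forms.  */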
10686 bool
10687 aarch64_symbolic_address_p (rtx x)
10689 poly_int64 offset;
10690 x = strip_offset_and_salt (x, &offset);
10691 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
10694 /* Classify the base of symbolic expression X. */
10696 enum aarch64_symbol_type
10697 aarch64_classify_symbolic_expression (rtx x)
10699 rtx offset;
10701 split_const (x, &x, &offset);
10702 return aarch64_classify_symbol (x, INTVAL (offset));
10706 /* Return TRUE if X is a legitimate address for accessing memory in
10707 mode MODE. */
10708 static bool
10709 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p,
10710 code_helper = ERROR_MARK)
10712 struct aarch64_address_info addr;
10714 return aarch64_classify_address (&addr, x, mode, strict_p);
10717 /* Return TRUE if X is a legitimate address of type TYPE for accessing
10718 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
10719 bool
10720 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
10721 aarch64_addr_query_type type)
10723 struct aarch64_address_info addr;
10725 return aarch64_classify_address (&addr, x, mode, strict_p, type);
10728 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
10730 static bool
10731 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
10732 poly_int64 orig_offset,
10733 machine_mode mode)
10735 HOST_WIDE_INT size;
10736 if (GET_MODE_SIZE (mode).is_constant (&size))
10738 HOST_WIDE_INT const_offset, second_offset;
10740 /* A general SVE offset is A * VQ + B. Remove the A component from
10741 coefficient 0 in order to get the constant B. */
10742 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
10744 /* Split an out-of-range address displacement into a base and
10745 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
10746 range otherwise to increase opportunities for sharing the base
10747 address between accesses of different sizes. Unaligned accesses use the signed
10748 9-bit range, TImode/TFmode/TDmode use the intersection of signed
10749 scaled 7-bit and signed 9-bit offset. */
10750 if (mode == TImode || mode == TFmode || mode == TDmode)
10751 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
10752 else if ((const_offset & (size - 1)) != 0)
10753 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
10754 else
10755 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
10757 if (second_offset == 0 || known_eq (orig_offset, second_offset))
10758 return false;
10760 /* Split the offset into second_offset and the rest. */
10761 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10762 *offset2 = gen_int_mode (second_offset, Pmode);
10763 return true;
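/* Worked example for the constant-size case (illustrative): an SImode
   access at base + 0x10008 is split into an anchor of base + 0x10000 and
   a residual offset of 8, roughly

     add  x1, x0, #0x10, lsl #12   // anchor = base + 0x10000
     ldr  w2, [x1, #8]

   so that neighbouring accesses can reuse the same anchor register.  */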
10765 else
10767 /* Get the mode we should use as the basis of the range. For structure
10768 modes this is the mode of one vector. */
10769 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10770 machine_mode step_mode
10771 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
10773 /* Get the "mul vl" multiplier we'd like to use. */
10774 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
10775 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
10776 if (vec_flags & VEC_SVE_DATA)
10777 /* LDR supports a 9-bit range, but the move patterns for
10778 structure modes require all vectors to be in range of the
10779 same base. The simplest way of accommodating that while still
10780 promoting reuse of anchor points between different modes is
10781 to use an 8-bit range unconditionally. */
10782 vnum = ((vnum + 128) & 255) - 128;
10783 else
10784 /* Predicates are only handled singly, so we might as well use
10785 the full range. */
10786 vnum = ((vnum + 256) & 511) - 256;
10787 if (vnum == 0)
10788 return false;
10790 /* Convert the "mul vl" multiplier into a byte offset. */
10791 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
10792 if (known_eq (second_offset, orig_offset))
10793 return false;
10795 /* Split the offset into second_offset and the rest. */
10796 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10797 *offset2 = gen_int_mode (second_offset, Pmode);
10798 return true;
10802 /* Return the binary representation of floating point constant VALUE in INTVAL.
10803 If the value cannot be converted, return false without setting INTVAL.
10804 The conversion is done in the mode of VALUE. */
10805 bool
10806 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
10809 /* We make a general exception for 0. */
10810 if (aarch64_float_const_zero_rtx_p (value))
10812 *intval = 0;
10813 return true;
10816 scalar_float_mode mode;
10817 if (!CONST_DOUBLE_P (value)
10818 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
10819 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
10820 /* Only support up to DF mode. */
10821 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
10822 return false;
10824 unsigned HOST_WIDE_INT ival = 0;
10826 long res[2];
10827 real_to_target (res,
10828 CONST_DOUBLE_REAL_VALUE (value),
10829 REAL_MODE_FORMAT (mode));
10831 if (mode == DFmode || mode == DDmode)
10833 int order = BYTES_BIG_ENDIAN ? 1 : 0;
10834 ival = zext_hwi (res[order], 32);
10835 ival |= (zext_hwi (res[1 - order], 32) << 32);
10837 else
10838 ival = zext_hwi (res[0], 32);
10840 *intval = ival;
10841 return true;
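/* For example: the DFmode constant 1.0 yields 0x3ff0000000000000 and the
   SFmode constant 1.0f yields 0x3f800000, i.e. the raw IEEE-754 encodings
   of those values.  */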
10844 /* Return TRUE if rtx X is an immediate constant that can be moved using a
10845 single MOV(+MOVK) followed by an FMOV. */
10846 bool
10847 aarch64_float_const_rtx_p (rtx x)
10849 machine_mode mode = GET_MODE (x);
10850 if (mode == VOIDmode)
10851 return false;
10853 /* Determine whether it's cheaper to write float constants as
10854 mov/movk pairs rather than as ldr/adrp pairs. */
10855 unsigned HOST_WIDE_INT ival;
10857 if (CONST_DOUBLE_P (x)
10858 && SCALAR_FLOAT_MODE_P (mode)
10859 && aarch64_reinterpret_float_as_int (x, &ival))
10861 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
10862 int num_instr = aarch64_internal_mov_immediate
10863 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10864 return num_instr < 3;
10867 return false;
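/* For example (illustrative): the DFmode constant 4294967296.0 (2^32) has
   the bit pattern 0x41f0000000000000, which a single MOVZ can build, so it
   can be materialised as

     movz x1, #0x41f0, lsl #48
     fmov d0, x1

   rather than through a literal-pool ADRP/LDR pair.  */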
10870 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
10871 Floating Point). */
10872 bool
10873 aarch64_float_const_zero_rtx_p (rtx x)
10875 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
10876 zr as our callers expect, so no need to check the actual
10877 value if X is of Decimal Floating Point type. */
10878 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
10879 return false;
10881 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
10882 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
10883 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
10886 /* Return true if X is any kind of constant zero rtx. */
10888 bool
10889 aarch64_const_zero_rtx_p (rtx x)
10891 return (x == CONST0_RTX (GET_MODE (x))
10892 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)));
10895 /* Return TRUE if rtx X is an immediate constant that fits in a single
10896 MOVI immediate operation. */
10897 bool
10898 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
10900 if (!TARGET_SIMD)
10901 return false;
10903 machine_mode vmode;
10904 scalar_int_mode imode;
10905 unsigned HOST_WIDE_INT ival;
10907 if (CONST_DOUBLE_P (x)
10908 && SCALAR_FLOAT_MODE_P (mode))
10910 if (!aarch64_reinterpret_float_as_int (x, &ival))
10911 return false;
10913 /* We make a general exception for 0. */
10914 if (aarch64_float_const_zero_rtx_p (x))
10915 return true;
10917 imode = int_mode_for_mode (mode).require ();
10919 else if (CONST_INT_P (x)
10920 && is_a <scalar_int_mode> (mode, &imode))
10921 ival = INTVAL (x);
10922 else
10923 return false;
10925 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we use
10926 a 128-bit vector mode. */
10927 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
10929 vmode = aarch64_simd_container_mode (imode, width);
10930 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
10932 return aarch64_simd_valid_immediate (v_op, NULL);
10936 /* Return the fixed registers used for condition codes. */
10938 static bool
10939 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
10941 *p1 = CC_REGNUM;
10942 *p2 = INVALID_REGNUM;
10943 return true;
10946 /* Return a fresh memory reference to the current function's TPIDR2 block,
10947 creating a block if necessary. */
10949 static rtx
10950 aarch64_get_tpidr2_block ()
10952 if (!cfun->machine->tpidr2_block)
10953 /* The TPIDR2 block is 16 bytes in size and must be aligned to a 128-bit
10954 boundary. */
10955 cfun->machine->tpidr2_block = assign_stack_local (V16QImode, 16, 128);
10956 return copy_rtx (cfun->machine->tpidr2_block);
10959 /* Return a fresh register that points to the current function's
10960 TPIDR2 block, creating a block if necessary. */
10962 static rtx
10963 aarch64_get_tpidr2_ptr ()
10965 rtx block = aarch64_get_tpidr2_block ();
10966 return force_reg (Pmode, XEXP (block, 0));
10969 /* Emit instructions to allocate a ZA lazy save buffer and initialize the
10970 current function's TPIDR2 block. */
10972 static void
10973 aarch64_init_tpidr2_block ()
10975 rtx block = aarch64_get_tpidr2_block ();
10977 /* The ZA save buffer is SVL.B*SVL.B bytes in size. */
10978 rtx svl_bytes = aarch64_sme_vq_immediate (Pmode, 16, AARCH64_ISA_MODE);
10979 rtx svl_bytes_reg = force_reg (DImode, svl_bytes);
10980 rtx za_size = expand_simple_binop (Pmode, MULT, svl_bytes_reg,
10981 svl_bytes_reg, NULL, 0, OPTAB_LIB_WIDEN);
10982 rtx za_save_buffer = allocate_dynamic_stack_space (za_size, 128,
10983 BITS_PER_UNIT, -1, true);
10984 za_save_buffer = force_reg (Pmode, za_save_buffer);
10985 cfun->machine->za_save_buffer = za_save_buffer;
10987 /* The first word of the block points to the save buffer and the second
10988 word is the number of ZA slices to save. */
10989 rtx block_0 = adjust_address (block, DImode, 0);
10990 emit_insn (aarch64_gen_store_pair (block_0, za_save_buffer, svl_bytes_reg));
10992 if (!memory_operand (block, V16QImode))
10993 block = replace_equiv_address (block, force_reg (Pmode, XEXP (block, 0)));
10994 emit_insn (gen_aarch64_setup_local_tpidr2 (block));
10997 /* Restore the contents of ZA from the lazy save buffer, given that
10998 register TPIDR2_BLOCK points to the current function's TPIDR2 block.
10999 PSTATE.ZA is known to be 0 and TPIDR2_EL0 is known to be null. */
11001 void
11002 aarch64_restore_za (rtx tpidr2_block)
11004 emit_insn (gen_aarch64_smstart_za ());
11005 if (REGNO (tpidr2_block) != R0_REGNUM)
11006 emit_move_insn (gen_rtx_REG (Pmode, R0_REGNUM), tpidr2_block);
11007 emit_insn (gen_aarch64_tpidr2_restore ());
11010 /* Return the ZT0 save buffer, creating one if necessary. */
11012 static rtx
11013 aarch64_get_zt0_save_buffer ()
11015 if (!cfun->machine->zt0_save_buffer)
11016 cfun->machine->zt0_save_buffer = assign_stack_local (V8DImode, 64, 128);
11017 return cfun->machine->zt0_save_buffer;
11020 /* Save ZT0 to the current function's save buffer. */
11022 static void
11023 aarch64_save_zt0 ()
11025 rtx mem = aarch64_get_zt0_save_buffer ();
11026 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11027 emit_insn (gen_aarch64_sme_str_zt0 (mem));
11030 /* Restore ZT0 from the current function's save buffer. FROM_LAZY_SAVE_P
11031 is true if the load is happening after a call to a private-ZA function,
11032 false if it can be treated as a normal load. */
11034 static void
11035 aarch64_restore_zt0 (bool from_lazy_save_p)
11037 rtx mem = aarch64_get_zt0_save_buffer ();
11038 mem = replace_equiv_address (mem, force_reg (Pmode, XEXP (mem, 0)));
11039 emit_insn (from_lazy_save_p
11040 ? gen_aarch64_restore_zt0 (mem)
11041 : gen_aarch64_sme_ldr_zt0 (mem));
11044 /* Implement TARGET_START_CALL_ARGS. */
11046 static void
11047 aarch64_start_call_args (cumulative_args_t ca_v)
11049 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11051 if (!TARGET_SME && (ca->isa_mode & AARCH64_FL_SM_ON))
11053 error ("calling a streaming function requires the ISA extension %qs",
11054 "sme");
11055 inform (input_location, "you can enable %qs using the command-line"
11056 " option %<-march%>, or by using the %<target%>"
11057 " attribute or pragma", "sme");
11060 if ((ca->shared_za_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11061 && !aarch64_cfun_has_state ("za"))
11062 error ("call to a function that shares %qs state from a function"
11063 " that has no %qs state", "za", "za");
11064 else if ((ca->shared_zt0_flags & (AARCH64_STATE_IN | AARCH64_STATE_OUT))
11065 && !aarch64_cfun_has_state ("zt0"))
11066 error ("call to a function that shares %qs state from a function"
11067 " that has no %qs state", "zt0", "zt0");
11068 else if (!TARGET_ZA && (ca->isa_mode & AARCH64_FL_ZA_ON))
11069 error ("call to a function that shares SME state from a function"
11070 " that has no SME state");
11072 /* If this is a call to a private ZA function, emit a marker to
11073 indicate where any necessary set-up code could be inserted.
11074 The code itself is inserted by the mode-switching pass. */
11075 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11076 emit_insn (gen_aarch64_start_private_za_call ());
11078 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11079 save and restore ZT0 around the call. */
11080 if (aarch64_cfun_has_state ("zt0")
11081 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11082 && ca->shared_zt0_flags == 0)
11083 aarch64_save_zt0 ();
11086 /* This function is used by the call expanders of the machine description.
11087 RESULT is the register in which the result is returned. It's NULL for
11088 "call" and "sibcall".
11089 MEM is the location of the function call.
11090 COOKIE is either:
11091 - a const_int that gives the argument to the call's UNSPEC_CALLEE_ABI.
11092 - a PARALLEL that contains such a const_int as its first element.
11093 The second element is a PARALLEL that lists all the argument
11094 registers that need to be saved and restored around a change
11095 in PSTATE.SM, or const0_rtx if no such switch is needed.
11096 The third and fourth elements are const_ints that contain the
11097 sharing flags for ZA and ZT0 respectively.
11098 SIBCALL indicates whether this function call is a normal call or a sibling call.
11099 It will generate a different pattern accordingly. */
11101 void
11102 aarch64_expand_call (rtx result, rtx mem, rtx cookie, bool sibcall)
11104 rtx call, callee, tmp;
11105 rtvec vec;
11106 machine_mode mode;
11108 rtx callee_abi = cookie;
11109 rtx sme_mode_switch_args = const0_rtx;
11110 unsigned int shared_za_flags = 0;
11111 unsigned int shared_zt0_flags = 0;
11112 if (GET_CODE (cookie) == PARALLEL)
11114 callee_abi = XVECEXP (cookie, 0, 0);
11115 sme_mode_switch_args = XVECEXP (cookie, 0, 1);
11116 shared_za_flags = INTVAL (XVECEXP (cookie, 0, 2));
11117 shared_zt0_flags = INTVAL (XVECEXP (cookie, 0, 3));
11120 gcc_assert (CONST_INT_P (callee_abi));
11121 auto callee_isa_mode = aarch64_callee_isa_mode (callee_abi);
11123 if (aarch64_cfun_has_state ("za")
11124 && (callee_isa_mode & AARCH64_FL_ZA_ON)
11125 && !shared_za_flags)
11127 sorry ("call to a function that shares state other than %qs"
11128 " from a function that has %qs state", "za", "za");
11129 inform (input_location, "use %<__arm_preserves(\"za\")%> if the"
11130 " callee preserves ZA");
11133 gcc_assert (MEM_P (mem));
11134 callee = XEXP (mem, 0);
11135 mode = GET_MODE (callee);
11136 gcc_assert (mode == Pmode);
11138 /* Decide if we should generate indirect calls by loading the
11139 address of the callee into a register before performing
11140 the branch-and-link. */
11141 if (SYMBOL_REF_P (callee)
11142 ? (aarch64_is_long_call_p (callee)
11143 || aarch64_is_noplt_call_p (callee))
11144 : !REG_P (callee))
11145 XEXP (mem, 0) = force_reg (mode, callee);
11147 /* Accumulate the return values, including state that is shared via
11148 attributes. */
11149 auto_vec<rtx, 8> return_values;
11150 if (result)
11152 if (GET_CODE (result) == PARALLEL)
11153 for (int i = 0; i < XVECLEN (result, 0); ++i)
11154 return_values.safe_push (XVECEXP (result, 0, i));
11155 else
11156 return_values.safe_push (result);
11158 unsigned int orig_num_return_values = return_values.length ();
11159 if (shared_za_flags & AARCH64_STATE_OUT)
11160 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11161 /* When calling private-ZA functions from functions with ZA state,
11162 we want to know whether the call committed a lazy save. */
11163 if (TARGET_ZA && !shared_za_flags)
11164 return_values.safe_push (gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11165 if (shared_zt0_flags & AARCH64_STATE_OUT)
11166 return_values.safe_push (gen_rtx_REG (V8DImode, ZT0_REGNUM));
11168 /* Create the new return value, if necessary. */
11169 if (orig_num_return_values != return_values.length ())
11171 if (return_values.length () == 1)
11172 result = return_values[0];
11173 else
11175 for (rtx &x : return_values)
11176 if (GET_CODE (x) != EXPR_LIST)
11177 x = gen_rtx_EXPR_LIST (VOIDmode, x, const0_rtx);
11178 rtvec v = gen_rtvec_v (return_values.length (),
11179 return_values.address ());
11180 result = gen_rtx_PARALLEL (VOIDmode, v);
11184 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11186 if (result != NULL_RTX)
11187 call = gen_rtx_SET (result, call);
11189 if (sibcall)
11190 tmp = ret_rtx;
11191 else
11192 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11194 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11195 UNSPEC_CALLEE_ABI);
11197 vec = gen_rtvec (3, call, callee_abi, tmp);
11198 call = gen_rtx_PARALLEL (VOIDmode, vec);
11200 auto call_insn = aarch64_emit_call_insn (call);
11202 /* Check whether the call requires a change to PSTATE.SM. We can't
11203 emit the instructions to change PSTATE.SM yet, since they involve
11204 a change in vector length and a change in instruction set, which
11205 cannot be represented in RTL.
11207 For now, just record which registers will be clobbered and used
11208 by the changes to PSTATE.SM. */
11209 if (!sibcall && aarch64_call_switches_pstate_sm (callee_isa_mode))
11211 aarch64_sme_mode_switch_regs args_switch;
11212 if (sme_mode_switch_args != const0_rtx)
11214 unsigned int num_args = XVECLEN (sme_mode_switch_args, 0);
11215 for (unsigned int i = 0; i < num_args; ++i)
11217 rtx x = XVECEXP (sme_mode_switch_args, 0, i);
11218 args_switch.add_reg (GET_MODE (x), REGNO (x));
11222 aarch64_sme_mode_switch_regs result_switch;
11223 if (result)
11224 result_switch.add_call_result (call_insn);
11226 unsigned int num_gprs = MAX (args_switch.num_gprs (),
11227 result_switch.num_gprs ());
11228 for (unsigned int i = 0; i < num_gprs; ++i)
11229 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11230 gen_rtx_REG (DImode, args_switch.FIRST_GPR + i));
11232 for (int regno = V0_REGNUM; regno < V0_REGNUM + 32; regno += 4)
11233 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11234 gen_rtx_REG (V4x16QImode, regno));
11236 for (int regno = P0_REGNUM; regno < P0_REGNUM + 16; regno += 1)
11237 clobber_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11238 gen_rtx_REG (VNx16BImode, regno));
11240 /* Ensure that the VG save slot has been initialized. Also emit
11241 an instruction to model the effect of the temporary clobber
11242 of VG, so that the prologue/epilogue pass sees the need to
11243 save the old value. */
11244 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11245 gen_rtx_REG (DImode, VG_REGNUM));
11246 emit_insn_before (gen_aarch64_update_vg (), call_insn);
11248 cfun->machine->call_switches_pstate_sm = true;
11251 /* Add any ZA-related information.
11253 ZA_REGNUM represents the current function's ZA state, rather than
11254 the contents of the ZA register itself. We ensure that the function's
11255 ZA state is preserved by private-ZA call sequences, so the call itself
11256 does not use or clobber ZA_REGNUM. The same thing applies to
11257 ZT0_REGNUM. */
11258 if (TARGET_ZA)
11260 /* The callee requires ZA to be active if the callee is shared-ZA,
11261 otherwise it requires ZA to be dormant or off. The state of ZA is
11262 captured by a combination of SME_STATE_REGNUM, TPIDR2_SETUP_REGNUM,
11263 and ZA_SAVED_REGNUM. */
11264 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11265 gen_rtx_REG (DImode, SME_STATE_REGNUM));
11266 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11267 gen_rtx_REG (DImode, TPIDR2_SETUP_REGNUM));
11268 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11269 gen_rtx_REG (VNx16BImode, ZA_SAVED_REGNUM));
11271 /* Keep the aarch64_start/end_private_za_call markers live. */
11272 if (!(callee_isa_mode & AARCH64_FL_ZA_ON))
11273 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11274 gen_rtx_REG (VNx16BImode, LOWERING_REGNUM));
11276 /* If the callee is a shared-ZA function, record whether it uses the
11277 current value of ZA and ZT0. */
11278 if (shared_za_flags & AARCH64_STATE_IN)
11279 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11280 gen_rtx_REG (VNx16BImode, ZA_REGNUM));
11282 if (shared_zt0_flags & AARCH64_STATE_IN)
11283 use_reg (&CALL_INSN_FUNCTION_USAGE (call_insn),
11284 gen_rtx_REG (V8DImode, ZT0_REGNUM));
11288 /* Implement TARGET_END_CALL_ARGS. */
11290 static void
11291 aarch64_end_call_args (cumulative_args_t ca_v)
11293 CUMULATIVE_ARGS *ca = get_cumulative_args (ca_v);
11295 /* If this is a call to a private ZA function, emit a marker to
11296 indicate where any necessary restoration code could be inserted.
11297 The code itself is inserted by the mode-switching pass. */
11298 if (TARGET_ZA && !(ca->isa_mode & AARCH64_FL_ZA_ON))
11299 emit_insn (gen_aarch64_end_private_za_call ());
11301 /* If this is a call to a shared-ZA function that doesn't share ZT0,
11302 save and restore ZT0 around the call. */
11303 if (aarch64_cfun_has_state ("zt0")
11304 && (ca->isa_mode & AARCH64_FL_ZA_ON)
11305 && ca->shared_zt0_flags == 0)
11306 aarch64_restore_zt0 (false);
11309 /* Emit call insn with PAT and do aarch64-specific handling. */
11311 rtx_call_insn *
11312 aarch64_emit_call_insn (rtx pat)
11314 auto insn = emit_call_insn (pat);
11316 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11317 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11318 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
11319 return as_a<rtx_call_insn *> (insn);
11322 machine_mode
11323 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11325 machine_mode mode_x = GET_MODE (x);
11326 rtx_code code_x = GET_CODE (x);
11328 /* All floating point compares return CCFP if it is an equality
11329 comparison, and CCFPE otherwise. */
11330 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11332 switch (code)
11334 case EQ:
11335 case NE:
11336 case UNORDERED:
11337 case ORDERED:
11338 case UNLT:
11339 case UNLE:
11340 case UNGT:
11341 case UNGE:
11342 case UNEQ:
11343 return CCFPmode;
11345 case LT:
11346 case LE:
11347 case GT:
11348 case GE:
11349 case LTGT:
11350 return CCFPEmode;
11352 default:
11353 gcc_unreachable ();
11357 /* Equality comparisons of short modes against zero can be performed
11358 using the TST instruction with the appropriate bitmask. */
11359 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11360 && (code == EQ || code == NE)
11361 && (mode_x == HImode || mode_x == QImode))
11362 return CC_Zmode;
11364 /* Similarly, comparisons of zero_extends from shorter modes can
11365 be performed using an ANDS with an immediate mask. */
11366 if (y == const0_rtx && code_x == ZERO_EXTEND
11367 && (mode_x == SImode || mode_x == DImode)
11368 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11369 && (code == EQ || code == NE))
11370 return CC_Zmode;
11372 /* Zero extracts support equality comparisons. */
11373 if ((mode_x == SImode || mode_x == DImode)
11374 && y == const0_rtx
11375 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11376 && CONST_INT_P (XEXP (x, 2)))
11377 && (code == EQ || code == NE))
11378 return CC_Zmode;
11380 /* ANDS/BICS/TST support equality and all signed comparisons. */
11381 if ((mode_x == SImode || mode_x == DImode)
11382 && y == const0_rtx
11383 && (code_x == AND)
11384 && (code == EQ || code == NE || code == LT || code == GE
11385 || code == GT || code == LE))
11386 return CC_NZVmode;
11388 /* ADDS/SUBS correctly set N and Z flags. */
11389 if ((mode_x == SImode || mode_x == DImode)
11390 && y == const0_rtx
11391 && (code == EQ || code == NE || code == LT || code == GE)
11392 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11393 return CC_NZmode;
11395 /* A compare with a shifted operand. Because of canonicalization,
11396 the comparison will have to be swapped when we emit the assembly
11397 code. */
11398 if ((mode_x == SImode || mode_x == DImode)
11399 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11400 && (code_x == ASHIFT || code_x == ASHIFTRT
11401 || code_x == LSHIFTRT
11402 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11403 return CC_SWPmode;
11405 /* Similarly for a negated operand, but we can only do this for
11406 equalities. */
11407 if ((mode_x == SImode || mode_x == DImode)
11408 && (REG_P (y) || SUBREG_P (y))
11409 && (code == EQ || code == NE)
11410 && code_x == NEG)
11411 return CC_Zmode;
11413 /* A test for unsigned overflow from an addition. */
11414 if ((mode_x == DImode || mode_x == TImode)
11415 && (code == LTU || code == GEU)
11416 && code_x == PLUS
11417 && rtx_equal_p (XEXP (x, 0), y))
11418 return CC_Cmode;
11420 /* A test for unsigned overflow from an add with carry. */
11421 if ((mode_x == DImode || mode_x == TImode)
11422 && (code == LTU || code == GEU)
11423 && code_x == PLUS
11424 && CONST_SCALAR_INT_P (y)
11425 && (rtx_mode_t (y, mode_x)
11426 == (wi::shwi (1, mode_x)
11427 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11428 return CC_ADCmode;
11430 /* A test for signed overflow. */
11431 if ((mode_x == DImode || mode_x == TImode)
11432 && code == NE
11433 && code_x == PLUS
11434 && GET_CODE (y) == SIGN_EXTEND)
11435 return CC_Vmode;
11437 /* For everything else, return CCmode. */
11438 return CCmode;
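/* For instance (illustrative), an equality test of a sum against zero such
   as "if (a + b == 0)" on DImode operands selects CC_NZmode above, which
   lets the comparison be implemented as

     cmn  x0, x1
     beq  .L1

   instead of computing the sum into a scratch register first.  */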
11441 static int
11442 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11444 int
11445 aarch64_get_condition_code (rtx x)
11447 machine_mode mode = GET_MODE (XEXP (x, 0));
11448 enum rtx_code comp_code = GET_CODE (x);
11450 if (GET_MODE_CLASS (mode) != MODE_CC)
11451 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11452 return aarch64_get_condition_code_1 (mode, comp_code);
11455 static int
11456 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11458 switch (mode)
11460 case E_CCFPmode:
11461 case E_CCFPEmode:
11462 switch (comp_code)
11464 case GE: return AARCH64_GE;
11465 case GT: return AARCH64_GT;
11466 case LE: return AARCH64_LS;
11467 case LT: return AARCH64_MI;
11468 case NE: return AARCH64_NE;
11469 case EQ: return AARCH64_EQ;
11470 case ORDERED: return AARCH64_VC;
11471 case UNORDERED: return AARCH64_VS;
11472 case UNLT: return AARCH64_LT;
11473 case UNLE: return AARCH64_LE;
11474 case UNGT: return AARCH64_HI;
11475 case UNGE: return AARCH64_PL;
11476 default: return -1;
11478 break;
11480 case E_CCmode:
11481 switch (comp_code)
11483 case NE: return AARCH64_NE;
11484 case EQ: return AARCH64_EQ;
11485 case GE: return AARCH64_GE;
11486 case GT: return AARCH64_GT;
11487 case LE: return AARCH64_LE;
11488 case LT: return AARCH64_LT;
11489 case GEU: return AARCH64_CS;
11490 case GTU: return AARCH64_HI;
11491 case LEU: return AARCH64_LS;
11492 case LTU: return AARCH64_CC;
11493 default: return -1;
11495 break;
11497 case E_CC_SWPmode:
11498 switch (comp_code)
11500 case NE: return AARCH64_NE;
11501 case EQ: return AARCH64_EQ;
11502 case GE: return AARCH64_LE;
11503 case GT: return AARCH64_LT;
11504 case LE: return AARCH64_GE;
11505 case LT: return AARCH64_GT;
11506 case GEU: return AARCH64_LS;
11507 case GTU: return AARCH64_CC;
11508 case LEU: return AARCH64_CS;
11509 case LTU: return AARCH64_HI;
11510 default: return -1;
11512 break;
11514 case E_CC_NZCmode:
11515 switch (comp_code)
11517 case NE: return AARCH64_NE; /* = any */
11518 case EQ: return AARCH64_EQ; /* = none */
11519 case GE: return AARCH64_PL; /* = nfrst */
11520 case LT: return AARCH64_MI; /* = first */
11521 case GEU: return AARCH64_CS; /* = nlast */
11522 case GTU: return AARCH64_HI; /* = pmore */
11523 case LEU: return AARCH64_LS; /* = plast */
11524 case LTU: return AARCH64_CC; /* = last */
11525 default: return -1;
11527 break;
11529 case E_CC_NZVmode:
11530 switch (comp_code)
11532 case NE: return AARCH64_NE;
11533 case EQ: return AARCH64_EQ;
11534 case GE: return AARCH64_PL;
11535 case LT: return AARCH64_MI;
11536 case GT: return AARCH64_GT;
11537 case LE: return AARCH64_LE;
11538 default: return -1;
11540 break;
11542 case E_CC_NZmode:
11543 switch (comp_code)
11545 case NE: return AARCH64_NE;
11546 case EQ: return AARCH64_EQ;
11547 case GE: return AARCH64_PL;
11548 case LT: return AARCH64_MI;
11549 default: return -1;
11551 break;
11553 case E_CC_Zmode:
11554 switch (comp_code)
11556 case NE: return AARCH64_NE;
11557 case EQ: return AARCH64_EQ;
11558 default: return -1;
11560 break;
11562 case E_CC_Cmode:
11563 switch (comp_code)
11565 case LTU: return AARCH64_CS;
11566 case GEU: return AARCH64_CC;
11567 default: return -1;
11569 break;
11571 case E_CC_ADCmode:
11572 switch (comp_code)
11574 case GEU: return AARCH64_CS;
11575 case LTU: return AARCH64_CC;
11576 default: return -1;
11578 break;
11580 case E_CC_Vmode:
11581 switch (comp_code)
11583 case NE: return AARCH64_VS;
11584 case EQ: return AARCH64_VC;
11585 default: return -1;
11587 break;
11589 default:
11590 return -1;
11593 return -1;
11596 /* Return true if X is a CONST_INT, CONST_WIDE_INT or a constant vector
11597 duplicate of such constants. If so, store in RET_WI the wide_int
11598 representation of the constant paired with the inner mode of the vector mode
11599 or MODE for scalar X constants. If MODE is not provided then TImode is
11600 used. */
11602 static bool
11603 aarch64_extract_vec_duplicate_wide_int (rtx x, wide_int *ret_wi,
11604 scalar_mode mode = TImode)
11606 rtx elt = unwrap_const_vec_duplicate (x);
11607 if (!CONST_SCALAR_INT_P (elt))
11608 return false;
11609 scalar_mode smode
11610 = CONST_SCALAR_INT_P (x) ? mode : GET_MODE_INNER (GET_MODE (x));
11611 *ret_wi = rtx_mode_t (elt, smode);
11612 return true;
11615 /* Return true if X is a scalar or a constant vector of integer
11616 immediates that represent the rounding constant used in the fixed-point
11617 arithmetic instructions.
11618 The accepted form of the constant is (1 << (C - 1)) where C is in the range
11619 [1, MODE_WIDTH/2]. */
11621 bool
11622 aarch64_rnd_imm_p (rtx x)
11624 wide_int rnd_cst;
11625 if (!aarch64_extract_vec_duplicate_wide_int (x, &rnd_cst))
11626 return false;
11627 int log2 = wi::exact_log2 (rnd_cst);
11628 if (log2 < 0)
11629 return false;
11630 return IN_RANGE (log2, 0, rnd_cst.get_precision () / 2 - 1);
11633 /* Return true if RND is a constant vector of integer rounding constants
11634 corresponding to a constant vector of shifts, SHIFT.
11635 The relationship should be RND == (1 << (SHIFT - 1)). */
11637 bool
11638 aarch64_const_vec_rnd_cst_p (rtx rnd, rtx shift)
11640 wide_int rnd_cst, shft_cst;
11641 if (!aarch64_extract_vec_duplicate_wide_int (rnd, &rnd_cst)
11642 || !aarch64_extract_vec_duplicate_wide_int (shift, &shft_cst))
11643 return false;
11645 return rnd_cst == (wi::shwi (1, rnd_cst.get_precision ()) << (shft_cst - 1));
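/* For example: a rounding right shift by 3 pairs with the rounding
   constant 1 << 2 == 4, i.e. the operation computes (x + 4) >> 3, which
   rounds the result instead of truncating it.  */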
11648 bool
11649 aarch64_const_vec_all_same_in_range_p (rtx x,
11650 HOST_WIDE_INT minval,
11651 HOST_WIDE_INT maxval)
11653 rtx elt;
11654 return (const_vec_duplicate_p (x, &elt)
11655 && CONST_INT_P (elt)
11656 && IN_RANGE (INTVAL (elt), minval, maxval));
11659 /* Some constants can't be made using normal mov instructions in Advanced SIMD
11660 but we can still create them in various ways. If the constant in VAL can be
11661 created using such an alternative method, return true and additionally
11662 set TARGET to the rtx for the sequence if TARGET is not NULL.
11663 Otherwise return false. */
11665 bool
11666 aarch64_maybe_generate_simd_constant (rtx target, rtx val, machine_mode mode)
11668 wide_int wval;
11669 auto smode = GET_MODE_INNER (mode);
11670 if (!aarch64_extract_vec_duplicate_wide_int (val, &wval, smode))
11671 return false;
11673 /* For Advanced SIMD we can create an integer with only the top bit set
11674 using fneg (0.0f). */
11675 if (TARGET_SIMD
11676 && !TARGET_SVE
11677 && smode == DImode
11678 && wi::only_sign_bit_p (wval))
11680 if (!target)
11681 return true;
11683 /* Use the same base type as aarch64_gen_shareable_zero. */
11684 rtx zero = CONST0_RTX (V4SImode);
11685 emit_move_insn (lowpart_subreg (V4SImode, target, mode), zero);
11686 rtx neg = lowpart_subreg (V2DFmode, target, mode);
11687 emit_insn (gen_negv2df2 (neg, copy_rtx (neg)));
11688 return true;
11691 return false;
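/* The special sequence above builds the DImode value with only the sign
   bit set (0x8000000000000000) roughly as (illustrative):

     movi v0.4s, #0        // zero the vector register
     fneg v0.2d, v0.2d     // negating +0.0 sets only bit 63 of each lane  */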
11694 /* Check if the value in VAL with mode MODE can be created using special
11695 instruction sequences. */
11697 bool aarch64_simd_special_constant_p (rtx val, machine_mode mode)
11699 return aarch64_maybe_generate_simd_constant (NULL_RTX, val, mode);
11702 bool
11703 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11705 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11708 /* Return true if VEC is a constant in which every element is in the range
11709 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11711 static bool
11712 aarch64_const_vec_all_in_range_p (rtx vec,
11713 HOST_WIDE_INT minval,
11714 HOST_WIDE_INT maxval)
11716 if (!CONST_VECTOR_P (vec)
11717 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11718 return false;
11720 int nunits;
11721 if (!CONST_VECTOR_STEPPED_P (vec))
11722 nunits = const_vector_encoded_nelts (vec);
11723 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11724 return false;
11726 for (int i = 0; i < nunits; i++)
11728 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11729 if (!CONST_INT_P (vec_elem)
11730 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11731 return false;
11733 return true;
11736 /* N Z C V. */
11737 #define AARCH64_CC_V 1
11738 #define AARCH64_CC_C (1 << 1)
11739 #define AARCH64_CC_Z (1 << 2)
11740 #define AARCH64_CC_N (1 << 3)
11742 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11743 static const int aarch64_nzcv_codes[] =
11745 0, /* EQ, Z == 1. */
11746 AARCH64_CC_Z, /* NE, Z == 0. */
11747 0, /* CS, C == 1. */
11748 AARCH64_CC_C, /* CC, C == 0. */
11749 0, /* MI, N == 1. */
11750 AARCH64_CC_N, /* PL, N == 0. */
11751 0, /* VS, V == 1. */
11752 AARCH64_CC_V, /* VC, V == 0. */
11753 0, /* HI, C == 1 && Z == 0. */
11754 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11755 AARCH64_CC_V, /* GE, N == V. */
11756 0, /* LT, N != V. */
11757 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11758 0, /* LE, !(Z == 0 && N == V). */
11759 0, /* AL, Any. */
11760 0 /* NV, Any. */
11763 /* Print floating-point vector immediate operand X to F, negating it
11764 first if NEGATE is true. Return true on success, false if it isn't
11765 a constant we can handle. */
11767 static bool
11768 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11770 rtx elt;
11772 if (!const_vec_duplicate_p (x, &elt))
11773 return false;
11775 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11776 if (negate)
11777 r = real_value_negate (&r);
11779 /* Handle the SVE single-bit immediates specially, since they have a
11780 fixed form in the assembly syntax. */
11781 if (real_equal (&r, &dconst0))
11782 asm_fprintf (f, "0.0");
11783 else if (real_equal (&r, &dconst2))
11784 asm_fprintf (f, "2.0");
11785 else if (real_equal (&r, &dconst1))
11786 asm_fprintf (f, "1.0");
11787 else if (real_equal (&r, &dconsthalf))
11788 asm_fprintf (f, "0.5");
11789 else
11791 const int buf_size = 20;
11792 char float_buf[buf_size] = {'\0'};
11793 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11794 1, GET_MODE (elt));
11795 asm_fprintf (f, "%s", float_buf);
11798 return true;
11801 /* Return the equivalent letter for size. */
11802 static char
11803 sizetochar (int size)
11805 switch (size)
11807 case 64: return 'd';
11808 case 32: return 's';
11809 case 16: return 'h';
11810 case 8 : return 'b';
11811 default: gcc_unreachable ();
11815 /* Print operand X to file F in a target specific manner according to CODE.
11816 The acceptable formatting commands given by CODE are:
11817 'c': An integer or symbol address without a preceding #
11818 sign.
11819 'C': Take the duplicated element in a vector constant
11820 and print it in hex.
11821 'D': Take the duplicated element in a vector constant
11822 and print it as an unsigned integer, in decimal.
11823 'e': Print the sign/zero-extend size as a character 8->b,
11824 16->h, 32->w. Can also be used for masks:
11825 0xff->b, 0xffff->h, 0xffffffff->w.
11826 'I': If the operand is a duplicated vector constant,
11827 replace it with the duplicated scalar. If the
11828 operand is then a floating-point constant, replace
11829 it with the integer bit representation. Print the
11830 transformed constant as a signed decimal number.
11831 'p': Prints N such that 2^N == X (X must be a power of 2 and
11832 a const_int).
11833 'P': Print the number of non-zero bits in X (a const_int).
11834 'H': Print the higher numbered register of a pair (TImode)
11835 of regs.
11836 'm': Print a condition (eq, ne, etc).
11837 'M': Same as 'm', but invert condition.
11838 'N': Take the duplicated element in a vector constant
11839 and print the negative of it in decimal.
11840 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11841 'Z': Same for SVE registers. ('z' was already taken.)
11842 Note that it is not necessary to use %Z for operands
11843 that have SVE modes. The convention is to use %Z
11844 only for non-SVE (or potentially non-SVE) modes.
11845 'S/T/U/V': Print a FP/SIMD register name for a register list.
11846 The register printed is the FP/SIMD register name
11847 of X + 0/1/2/3 for S/T/U/V.
11848 'R': Print a scalar Integer/FP/SIMD register name + 1.
11849 'X': Print bottom 16 bits of integer constant in hex.
11850 'w/x': Print a general register name or the zero register
11851 (32-bit or 64-bit).
11852 '0': Print a normal operand, if it's a general register,
11853 then we assume DImode.
11854 'k': Print NZCV for conditional compare instructions.
11855 'K': Print a predicate register as pn<N> rather than p<N>.
11856 'A': Output address constant representing the first
11857 argument of X, specifying a relocation offset
11858 if appropriate.
11859 'L': Output constant address specified by X
11860 with a relocation offset if appropriate.
11861 'G': Prints address of X, specifying a PC relative
11862 relocation mode if appropriate.
11863 'y': Output address of LDP or STP - this is used for
11864 some LDP/STPs which don't use a PARALLEL in their
11865 pattern (so the mode needs to be adjusted).
11866 'z': Output address of a typical LDP or STP. */
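/* Illustration (not exhaustive): with operand 0 in general register x3,
   "%w0" in an output template prints "w3", "%x0" prints "x3", and a plain
   "%0" also prints "x3", since general registers are printed with their
   DImode names by default.  */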
11868 static void
11869 aarch64_print_operand (FILE *f, rtx x, int code)
11871 rtx elt;
11872 switch (code)
11874 case 'c':
11875 if (CONST_INT_P (x))
11876 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11877 else
11879 poly_int64 offset;
11880 rtx base = strip_offset_and_salt (x, &offset);
11881 if (SYMBOL_REF_P (base))
11882 output_addr_const (f, x);
11883 else
11884 output_operand_lossage ("unsupported operand for code '%c'", code);
11886 break;
11888 case 'e':
11890 x = unwrap_const_vec_duplicate (x);
11891 if (!CONST_INT_P (x))
11893 output_operand_lossage ("invalid operand for '%%%c'", code);
11894 return;
11897 HOST_WIDE_INT val = INTVAL (x);
11898 if ((val & ~7) == 8 || val == 0xff)
11899 fputc ('b', f);
11900 else if ((val & ~7) == 16 || val == 0xffff)
11901 fputc ('h', f);
11902 else if ((val & ~7) == 32 || val == 0xffffffff)
11903 fputc ('w', f);
11904 else
11906 output_operand_lossage ("invalid operand for '%%%c'", code);
11907 return;
11910 break;
11912 case 'p':
11914 int n;
11916 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
11918 output_operand_lossage ("invalid operand for '%%%c'", code);
11919 return;
11922 asm_fprintf (f, "%d", n);
11924 break;
11926 case 'P':
11927 if (!CONST_INT_P (x))
11929 output_operand_lossage ("invalid operand for '%%%c'", code);
11930 return;
11933 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
11934 break;
11936 case 'H':
11937 if (x == const0_rtx)
11939 asm_fprintf (f, "xzr");
11940 break;
11943 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
11945 output_operand_lossage ("invalid operand for '%%%c'", code);
11946 return;
11949 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
11950 break;
11952 case 'I':
11954 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11955 if (CONST_INT_P (x))
11956 asm_fprintf (f, "%wd", INTVAL (x));
11957 else
11959 output_operand_lossage ("invalid operand for '%%%c'", code);
11960 return;
11962 break;
11965 case 'M':
11966 case 'm':
11968 int cond_code;
11969 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11970 if (x == const_true_rtx)
11972 if (code == 'M')
11973 fputs ("nv", f);
11974 return;
11977 if (!COMPARISON_P (x))
11979 output_operand_lossage ("invalid operand for '%%%c'", code);
11980 return;
11983 cond_code = aarch64_get_condition_code (x);
11984 gcc_assert (cond_code >= 0);
11985 if (code == 'M')
11986 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
11987 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11988 fputs (aarch64_sve_condition_codes[cond_code], f);
11989 else
11990 fputs (aarch64_condition_codes[cond_code], f);
11992 break;
11994 case 'N':
11995 if (!const_vec_duplicate_p (x, &elt))
11997 output_operand_lossage ("invalid vector constant");
11998 return;
12001 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12002 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12003 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12004 && aarch64_print_vector_float_operand (f, x, true))
12006 else
12008 output_operand_lossage ("invalid vector constant");
12009 return;
12011 break;
12013 case 'b':
12014 case 'h':
12015 case 's':
12016 case 'd':
12017 case 'q':
12018 case 'Z':
12019 code = TOLOWER (code);
12020 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12022 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12023 return;
12025 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12026 break;
12028 case 'S':
12029 case 'T':
12030 case 'U':
12031 case 'V':
12032 if (!REG_P (x) || (!FP_REGNUM_P (REGNO (x)) && !PR_REGNUM_P (REGNO (x))))
12034 output_operand_lossage ("incompatible operand for '%%%c'", code);
12035 return;
12037 if (PR_REGNUM_P (REGNO (x)))
12038 asm_fprintf (f, "p%d", REGNO (x) - P0_REGNUM + (code - 'S'));
12039 else
12040 asm_fprintf (f, "%c%d",
12041 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12042 REGNO (x) - V0_REGNUM + (code - 'S'));
12043 break;
12045 case 'R':
12046 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12047 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12048 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12049 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12050 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12051 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12052 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12053 else
12054 output_operand_lossage ("incompatible register operand for '%%%c'",
12055 code);
12056 break;
12058 case 'X':
12059 if (!CONST_INT_P (x))
12061 output_operand_lossage ("invalid operand for '%%%c'", code);
12062 return;
12064 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12065 break;
12067 case 'C':
12069 /* Print a replicated constant in hex. */
12070 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12072 output_operand_lossage ("invalid operand for '%%%c'", code);
12073 return;
12075 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12076 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12078 break;
12080 case 'D':
12082 /* Print a replicated constant in decimal, treating it as
12083 unsigned. */
12084 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12086 output_operand_lossage ("invalid operand for '%%%c'", code);
12087 return;
12089 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12090 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12092 break;
12094 case 'w':
12095 case 'x':
12096 if (aarch64_const_zero_rtx_p (x))
12098 asm_fprintf (f, "%czr", code);
12099 break;
12102 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12104 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12105 break;
12108 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12110 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12111 break;
12114 /* Fall through */
12116 case 0:
12117 if (x == NULL)
12119 output_operand_lossage ("missing operand");
12120 return;
12123 switch (GET_CODE (x))
12125 case CONST_STRING:
12127 asm_fprintf (f, "%s", XSTR (x, 0));
12128 break;
12130 case REG:
12131 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12133 if (REG_NREGS (x) == 1)
12134 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12135 else
12137 char suffix
12138 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12139 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12140 REGNO (x) - V0_REGNUM, suffix,
12141 END_REGNO (x) - V0_REGNUM - 1, suffix);
12144 else
12145 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12146 break;
12148 case MEM:
12149 output_address (GET_MODE (x), XEXP (x, 0));
12150 break;
12152 case LABEL_REF:
12153 case SYMBOL_REF:
12154 output_addr_const (asm_out_file, x);
12155 break;
12157 case CONST_INT:
12158 asm_fprintf (f, "%wd", INTVAL (x));
12159 break;
12161 case CONST:
12162 if (!VECTOR_MODE_P (GET_MODE (x)))
12164 output_addr_const (asm_out_file, x);
12165 break;
12167 /* fall through */
12169 case CONST_VECTOR:
12170 if (!const_vec_duplicate_p (x, &elt))
12172 output_operand_lossage ("invalid vector constant");
12173 return;
12176 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12177 asm_fprintf (f, "%wd", INTVAL (elt));
12178 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12179 && aarch64_print_vector_float_operand (f, x, false))
12181 else
12183 output_operand_lossage ("invalid vector constant");
12184 return;
12186 break;
12188 case CONST_DOUBLE:
12189 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12190 be getting CONST_DOUBLEs holding integers. */
12191 gcc_assert (GET_MODE (x) != VOIDmode);
12192 if (aarch64_float_const_zero_rtx_p (x))
12194 fputc ('0', f);
12195 break;
12197 else if (aarch64_float_const_representable_p (x))
12199 #define buf_size 20
12200 char float_buf[buf_size] = {'\0'};
12201 real_to_decimal_for_mode (float_buf,
12202 CONST_DOUBLE_REAL_VALUE (x),
12203 buf_size, buf_size,
12204 1, GET_MODE (x));
12205 asm_fprintf (asm_out_file, "%s", float_buf);
12206 break;
12207 #undef buf_size
12209 output_operand_lossage ("invalid constant");
12210 return;
12211 default:
12212 output_operand_lossage ("invalid operand");
12213 return;
12215 break;
12217 case 'A':
12218 if (GET_CODE (x) == HIGH)
12219 x = XEXP (x, 0);
12221 switch (aarch64_classify_symbolic_expression (x))
12223 case SYMBOL_SMALL_GOT_4G:
12224 asm_fprintf (asm_out_file, ":got:");
12225 break;
12227 case SYMBOL_SMALL_TLSGD:
12228 asm_fprintf (asm_out_file, ":tlsgd:");
12229 break;
12231 case SYMBOL_SMALL_TLSDESC:
12232 asm_fprintf (asm_out_file, ":tlsdesc:");
12233 break;
12235 case SYMBOL_SMALL_TLSIE:
12236 asm_fprintf (asm_out_file, ":gottprel:");
12237 break;
12239 case SYMBOL_TLSLE24:
12240 asm_fprintf (asm_out_file, ":tprel:");
12241 break;
12243 case SYMBOL_TINY_GOT:
12244 gcc_unreachable ();
12245 break;
12247 default:
12248 break;
12250 output_addr_const (asm_out_file, x);
12251 break;
12253 case 'L':
12254 switch (aarch64_classify_symbolic_expression (x))
12256 case SYMBOL_SMALL_GOT_4G:
12257 asm_fprintf (asm_out_file, ":got_lo12:");
12258 break;
12260 case SYMBOL_SMALL_TLSGD:
12261 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12262 break;
12264 case SYMBOL_SMALL_TLSDESC:
12265 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12266 break;
12268 case SYMBOL_SMALL_TLSIE:
12269 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12270 break;
12272 case SYMBOL_TLSLE12:
12273 asm_fprintf (asm_out_file, ":tprel_lo12:");
12274 break;
12276 case SYMBOL_TLSLE24:
12277 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12278 break;
12280 case SYMBOL_TINY_GOT:
12281 asm_fprintf (asm_out_file, ":got:");
12282 break;
12284 case SYMBOL_TINY_TLSIE:
12285 asm_fprintf (asm_out_file, ":gottprel:");
12286 break;
12288 default:
12289 break;
12291 output_addr_const (asm_out_file, x);
12292 break;
12294 case 'G':
12295 switch (aarch64_classify_symbolic_expression (x))
12297 case SYMBOL_TLSLE24:
12298 asm_fprintf (asm_out_file, ":tprel_hi12:");
12299 break;
12300 default:
12301 break;
12303 output_addr_const (asm_out_file, x);
12304 break;
12306 case 'k':
12308 HOST_WIDE_INT cond_code;
12310 if (!CONST_INT_P (x))
12312 output_operand_lossage ("invalid operand for '%%%c'", code);
12313 return;
12316 cond_code = INTVAL (x);
12317 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12318 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12320 break;
12322 case 'K':
12323 if (!REG_P (x) || !PR_REGNUM_P (REGNO (x)))
12325 output_operand_lossage ("invalid operand for '%%%c'", code);
12326 return;
12328 asm_fprintf (f, "pn%d", REGNO (x) - P0_REGNUM);
12329 break;
12331 case 'y':
12332 case 'z':
12334 machine_mode mode = GET_MODE (x);
12336 if (!MEM_P (x)
12337 || (code == 'y'
12338 && maybe_ne (GET_MODE_SIZE (mode), 8)
12339 && maybe_ne (GET_MODE_SIZE (mode), 16)
12340 && maybe_ne (GET_MODE_SIZE (mode), 32)))
12342 output_operand_lossage ("invalid operand for '%%%c'", code);
12343 return;
12346 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12347 code == 'y'
12348 ? ADDR_QUERY_LDP_STP_N
12349 : ADDR_QUERY_LDP_STP))
12350 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12352 break;
12354 default:
12355 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12356 return;
12360 /* Print address 'x' of a memory access with mode 'mode'.
12361 TYPE is the context required by aarch64_classify_address (e.g. ADDR_QUERY_M
12362 for a normal memory access, or ADDR_QUERY_LDP_STP{,_N} for LDP/STP). */
12363 static bool
12364 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12365 aarch64_addr_query_type type)
12367 struct aarch64_address_info addr;
12368 unsigned int size, vec_flags;
12370 /* Check all addresses are Pmode - including ILP32. */
12371 if (GET_MODE (x) != Pmode
12372 && (!CONST_INT_P (x)
12373 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12375 output_operand_lossage ("invalid address mode");
12376 return false;
12379 const bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
12380 || type == ADDR_QUERY_LDP_STP_N);
12382 if (aarch64_classify_address (&addr, x, mode, true, type))
12383 switch (addr.type)
12385 case ADDRESS_REG_IMM:
12386 if (known_eq (addr.const_offset, 0))
12388 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12389 return true;
12392 vec_flags = aarch64_classify_vector_mode (mode);
12393 if ((vec_flags & VEC_ANY_SVE) && !load_store_pair_p)
12395 HOST_WIDE_INT vnum
12396 = exact_div (addr.const_offset,
12397 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12398 asm_fprintf (f, "[%s, #%wd, mul vl]",
12399 reg_names[REGNO (addr.base)], vnum);
12400 return true;
12403 if (!CONST_INT_P (addr.offset))
12404 return false;
12406 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12407 INTVAL (addr.offset));
12408 return true;
12410 case ADDRESS_REG_REG:
12411 if (addr.shift == 0)
12412 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12413 reg_names [REGNO (addr.offset)]);
12414 else
12415 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12416 reg_names [REGNO (addr.offset)], addr.shift);
12417 return true;
12419 case ADDRESS_REG_UXTW:
12420 if (addr.shift == 0)
12421 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12422 REGNO (addr.offset) - R0_REGNUM);
12423 else
12424 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12425 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12426 return true;
12428 case ADDRESS_REG_SXTW:
12429 if (addr.shift == 0)
12430 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12431 REGNO (addr.offset) - R0_REGNUM);
12432 else
12433 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12434 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12435 return true;
12437 case ADDRESS_REG_WB:
12438 /* Writeback is only supported for fixed-width modes. */
12439 size = GET_MODE_SIZE (mode).to_constant ();
12440 switch (GET_CODE (x))
12442 case PRE_INC:
12443 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12444 return true;
12445 case POST_INC:
12446 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12447 return true;
12448 case PRE_DEC:
12449 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12450 return true;
12451 case POST_DEC:
12452 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12453 return true;
12454 case PRE_MODIFY:
12455 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12456 INTVAL (addr.offset));
12457 return true;
12458 case POST_MODIFY:
12459 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12460 INTVAL (addr.offset));
12461 return true;
12462 default:
12463 break;
12465 break;
12467 case ADDRESS_LO_SUM:
12468 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12469 output_addr_const (f, addr.offset);
12470 asm_fprintf (f, "]");
12471 return true;
12473 case ADDRESS_SYMBOLIC:
12474 output_addr_const (f, x);
12475 return true;
12478 return false;
12481 /* Print address 'x' of a memory access with mode 'mode'. */
12482 static void
12483 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12485 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12486 output_addr_const (f, x);
12489 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12491 static bool
12492 aarch64_output_addr_const_extra (FILE *file, rtx x)
12494 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12496 output_addr_const (file, XVECEXP (x, 0, 0));
12497 return true;
12499 return false;
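/* Return true if X mentions a label, i.e. if a LABEL_REF (other than the
   one inside a TLS UNSPEC) appears anywhere in X.  */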
12502 bool
12503 aarch64_label_mentioned_p (rtx x)
12505 const char *fmt;
12506 int i;
12508 if (LABEL_REF_P (x))
12509 return true;
12511 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12512 referencing instruction, but they are constant offsets, not
12513 symbols. */
12514 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12515 return false;
12517 fmt = GET_RTX_FORMAT (GET_CODE (x));
12518 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12520 if (fmt[i] == 'E')
12522 int j;
12524 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12525 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12526 return 1;
12528 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12529 return 1;
12532 return 0;
12535 /* Implement REGNO_REG_CLASS. */
12537 enum reg_class
12538 aarch64_regno_regclass (unsigned regno)
12540 if (W8_W11_REGNUM_P (regno))
12541 return W8_W11_REGS;
12543 if (W12_W15_REGNUM_P (regno))
12544 return W12_W15_REGS;
12546 if (STUB_REGNUM_P (regno))
12547 return STUB_REGS;
12549 if (GP_REGNUM_P (regno))
12550 return GENERAL_REGS;
12552 if (regno == SP_REGNUM)
12553 return STACK_REG;
12555 if (regno == FRAME_POINTER_REGNUM
12556 || regno == ARG_POINTER_REGNUM)
12557 return POINTER_REGS;
12559 if (FP_REGNUM_P (regno))
12560 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12561 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12563 if (PR_REGNUM_P (regno))
12564 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12566 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12567 return FFR_REGS;
12569 if (FAKE_REGNUM_P (regno))
12570 return FAKE_REGS;
12572 return NO_REGS;
12575 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12576 If OFFSET is out of range, return an offset of an anchor point
12577 that is in range. Return 0 otherwise. */
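/* For example, for a 4-byte access at offset 0x12340 (too big for an
   immediate), the anchor returned below is 0x12340 & ~0x3fff == 0x10000;
   the residual offset 0x2340 then fits the scaled 12-bit immediate form.  */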
12579 static HOST_WIDE_INT
12580 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12581 machine_mode mode)
12583 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12584 if (size > 16)
12585 return (offset + 0x400) & ~0x7f0;
12587 /* For offsets that aren't a multiple of the access size, the limit is
12588 -256...255. */
12589 if (offset & (size - 1))
12591 /* BLKmode typically uses LDP of X-registers. */
12592 if (mode == BLKmode)
12593 return (offset + 512) & ~0x3ff;
12594 return (offset + 0x100) & ~0x1ff;
12597 /* Small negative offsets are supported. */
12598 if (IN_RANGE (offset, -256, 0))
12599 return 0;
12601 if (mode == TImode || mode == TFmode || mode == TDmode)
12602 return (offset + 0x100) & ~0x1ff;
12604 /* Otherwise use a 12-bit immediate offset, scaled by the access size. */
12605 return offset & (~0xfff * size);
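/* Implement TARGET_LEGITIMIZE_ADDRESS.  */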
12608 static rtx
12609 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12611 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12612 where mask is selected by alignment and size of the offset.
12613 We try to pick as large a range for the offset as possible to
12614 maximize the chance of a CSE. However, for aligned addresses
12615 we limit the range to 4k so that structures with different sized
12616 elements are likely to use the same base. We need to be careful
12617 not to split a CONST for some forms of address expression, otherwise
12618 it will generate sub-optimal code. */
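/* For example, an SImode access at X + 100000 (0x186a0) is emitted as
   TMP = X + 0x18000 followed by [TMP, #0x6a0], so that other accesses
   near the same anchor can reuse TMP.  */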
12620 /* First split X + CONST (base, offset) into (base + X) + offset. */
12621 if (GET_CODE (x) == PLUS && GET_CODE (XEXP (x, 1)) == CONST)
12623 poly_int64 offset;
12624 rtx base = strip_offset (XEXP (x, 1), &offset);
12626 base = expand_binop (Pmode, add_optab, base, XEXP (x, 0),
12627 NULL_RTX, true, OPTAB_DIRECT);
12628 x = plus_constant (Pmode, base, offset);
12631 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12633 rtx base = XEXP (x, 0);
12634 rtx offset_rtx = XEXP (x, 1);
12635 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12637 if (GET_CODE (base) == PLUS)
12639 rtx op0 = XEXP (base, 0);
12640 rtx op1 = XEXP (base, 1);
12642 /* Force any scaling into a temp for CSE. */
12643 op0 = force_reg (Pmode, op0);
12644 op1 = force_reg (Pmode, op1);
12646 /* Let the pointer register be in op0. */
12647 if (REG_POINTER (op1))
12648 std::swap (op0, op1);
12650 /* If the pointer is virtual or frame related, then we know that
12651 virtual register instantiation or register elimination is going
12652 to apply a second constant. We want the two constants folded
12653 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
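/* For example, (virtual_stack_vars + T) + 16 is emitted as
   (virtual_stack_vars + 16) + T, so that the 16 folds with the
   constant added when the virtual register is instantiated.  */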
12654 if (virt_or_elim_regno_p (REGNO (op0)))
12656 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12657 NULL_RTX, true, OPTAB_DIRECT);
12658 return gen_rtx_PLUS (Pmode, base, op1);
12661 /* Otherwise, in order to encourage CSE (and thence loop strength
12662 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
12663 base = expand_binop (Pmode, add_optab, op0, op1,
12664 NULL_RTX, true, OPTAB_DIRECT);
12665 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12668 HOST_WIDE_INT size;
12669 if (GET_MODE_SIZE (mode).is_constant (&size))
12671 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12672 mode);
12673 if (base_offset != 0)
12675 base = plus_constant (Pmode, base, base_offset);
12676 base = force_operand (base, NULL_RTX);
12677 return plus_constant (Pmode, base, offset - base_offset);
12682 return x;
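/* Implement TARGET_SECONDARY_RELOAD.  */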
12685 static reg_class_t
12686 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12687 reg_class_t rclass,
12688 machine_mode mode,
12689 secondary_reload_info *sri)
12691 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12692 LDR and STR. See the comment at the head of aarch64-sve.md for
12693 more details about the big-endian handling. */
12694 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12695 if (reg_class_subset_p (rclass, FP_REGS)
12696 && !((REG_P (x) && HARD_REGISTER_P (x))
12697 || aarch64_simd_valid_immediate (x, NULL))
12698 && mode != VNx16QImode
12699 && (vec_flags & VEC_SVE_DATA)
12700 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12702 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12703 return NO_REGS;
12706 /* If we have to disable direct literal pool loads and stores because the
12707 function is too big, then we need a scratch register. */
12708 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12709 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12710 || targetm.vector_mode_supported_p (GET_MODE (x)))
12711 && !aarch64_pcrelative_literal_loads)
12713 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12714 return NO_REGS;
12717 /* Without the TARGET_SIMD or TARGET_SVE instructions we cannot move a
12718 Q register to a Q register directly. We need a scratch. */
12719 if (REG_P (x)
12720 && (mode == TFmode
12721 || mode == TImode
12722 || mode == TDmode
12723 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12724 && mode == GET_MODE (x)
12725 && !TARGET_SIMD
12726 && FP_REGNUM_P (REGNO (x))
12727 && reg_class_subset_p (rclass, FP_REGS))
12729 sri->icode = code_for_aarch64_reload_mov (mode);
12730 return NO_REGS;
12733 /* A TFmode, TImode or TDmode memory access should be handled via an FP_REGS
12734 because AArch64 has richer addressing modes for LDR/STR instructions
12735 than LDP/STP instructions. */
12736 if (TARGET_FLOAT && rclass == GENERAL_REGS
12737 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12738 return FP_REGS;
12740 if (rclass == FP_REGS
12741 && (mode == TImode || mode == TFmode || mode == TDmode)
12742 && CONSTANT_P(x))
12743 return GENERAL_REGS;
12745 return NO_REGS;
12748 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12750 static bool
12751 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12752 reg_class_t class2)
12754 if (!TARGET_SIMD
12755 && reg_classes_intersect_p (class1, FP_REGS)
12756 && reg_classes_intersect_p (class2, FP_REGS))
12758 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12759 so we can't easily split a move involving tuples of 128-bit
12760 vectors. Force the copy through memory instead.
12762 (Tuples of 64-bit vectors are fine.) */
12763 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12764 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12765 return true;
12767 return false;
12770 /* Implement TARGET_FRAME_POINTER_REQUIRED. */
12772 static bool
12773 aarch64_frame_pointer_required ()
12775 /* If the function needs to record the incoming value of PSTATE.SM,
12776 make sure that the slot is accessible from the frame pointer. */
12777 return aarch64_need_old_pstate_sm ();
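/* Implement TARGET_CAN_ELIMINATE.  */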
12780 static bool
12781 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12783 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12785 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12786 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12787 if (frame_pointer_needed)
12788 return to == HARD_FRAME_POINTER_REGNUM;
12789 return true;
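/* Implement INITIAL_ELIMINATION_OFFSET.  Return the offset between
   eliminable register FROM and its replacement register TO, given the
   current frame layout.  */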
12792 poly_int64
12793 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12795 aarch64_frame &frame = cfun->machine->frame;
12797 if (to == HARD_FRAME_POINTER_REGNUM)
12799 if (from == ARG_POINTER_REGNUM)
12800 return frame.bytes_above_hard_fp;
12802 if (from == FRAME_POINTER_REGNUM)
12803 return frame.bytes_above_hard_fp - frame.bytes_above_locals;
12806 if (to == STACK_POINTER_REGNUM)
12808 if (from == FRAME_POINTER_REGNUM)
12809 return frame.frame_size - frame.bytes_above_locals;
12812 return frame.frame_size;
12816 /* Get return address without mangling. */
12819 aarch64_return_addr_rtx (void)
12821 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12822 /* Note: aarch64_return_address_signing_enabled only
12823 works after cfun->machine->frame.laid_out is set,
12824 so here we don't know if the return address will
12825 be signed or not. */
12826 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12827 emit_move_insn (lr, val);
12828 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12829 return lr;
12833 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12834 previous frame. */
12837 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12839 if (count != 0)
12840 return const0_rtx;
12841 return aarch64_return_addr_rtx ();
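/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE.  */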
12844 static void
12845 aarch64_asm_trampoline_template (FILE *f)
12847 /* Even if the current function doesn't have branch protection, some
12848 later function might, so since this template is only generated once
12849 we have to add a BTI just in case. */
12850 asm_fprintf (f, "\thint\t34 // bti c\n");
12852 if (TARGET_ILP32)
12854 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12855 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12857 else
12859 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12860 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12862 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12864 /* We always emit a speculation barrier.
12865 This is because the same trampoline template is used for every nested
12866 function. Since nested functions are not particularly common or
12867 performant we don't worry too much about the extra instructions to copy
12868 around.
12869 This is not yet a problem, since we have not yet implemented function
12870 specific attributes to choose between hardening against straight line
12871 speculation or not, but such function specific attributes are likely to
12872 happen in the future. */
12873 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12875 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12876 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
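/* Implement TARGET_TRAMPOLINE_INIT.  */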
12879 static void
12880 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12882 rtx fnaddr, mem, a_tramp;
12883 const int tramp_code_sz = 24;
12885 /* We don't need to copy the trailing D-words; we fill those in below. */
12886 /* We create our own memory address in Pmode so that `emit_block_move` can
12887 use parts of the backend which expect Pmode addresses. */
12888 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12889 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12890 assemble_trampoline_template (),
12891 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12892 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12893 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12894 if (GET_MODE (fnaddr) != ptr_mode)
12895 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12896 emit_move_insn (mem, fnaddr);
12898 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12899 emit_move_insn (mem, chain_value);
12901 /* XXX We should really define a "clear_cache" pattern and use
12902 gen_clear_cache(). */
12903 a_tramp = XEXP (m_tramp, 0);
12904 maybe_emit_call_builtin___clear_cache (a_tramp,
12905 plus_constant (ptr_mode,
12906 a_tramp,
12907 TRAMPOLINE_SIZE));
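/* Implement TARGET_CLASS_MAX_NREGS.  */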
12910 static unsigned char
12911 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
12913 /* ??? Logically we should only need to provide a value when
12914 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12915 can hold MODE, but at the moment we need to handle all modes.
12916 Just ignore any runtime parts for registers that can't store them. */
12917 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
12918 unsigned int nregs, vec_flags;
12919 switch (regclass)
12921 case W8_W11_REGS:
12922 case W12_W15_REGS:
12923 case STUB_REGS:
12924 case TAILCALL_ADDR_REGS:
12925 case POINTER_REGS:
12926 case GENERAL_REGS:
12927 case ALL_REGS:
12928 case POINTER_AND_FP_REGS:
12929 case FP_REGS:
12930 case FP_LO_REGS:
12931 case FP_LO8_REGS:
12932 vec_flags = aarch64_classify_vector_mode (mode);
12933 if ((vec_flags & VEC_SVE_DATA)
12934 && constant_multiple_p (GET_MODE_SIZE (mode),
12935 aarch64_vl_bytes (mode, vec_flags), &nregs))
12936 return nregs;
12937 return (vec_flags & VEC_ADVSIMD
12938 ? CEIL (lowest_size, UNITS_PER_VREG)
12939 : CEIL (lowest_size, UNITS_PER_WORD));
12941 case PR_REGS:
12942 case PR_LO_REGS:
12943 case PR_HI_REGS:
12944 return mode == VNx32BImode ? 2 : 1;
12946 case STACK_REG:
12947 case FFR_REGS:
12948 case PR_AND_FFR_REGS:
12949 case FAKE_REGS:
12950 return 1;
12952 case NO_REGS:
12953 return 0;
12955 default:
12956 break;
12958 gcc_unreachable ();
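/* Implement TARGET_PREFERRED_RELOAD_CLASS.  */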
12961 static reg_class_t
12962 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
12964 if (regclass == POINTER_REGS)
12965 return GENERAL_REGS;
12967 if (regclass == STACK_REG)
12969 if (REG_P(x)
12970 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12971 return regclass;
12973 return NO_REGS;
12976 /* Register elimination can result in a request for
12977 SP+constant->FP_REGS. We cannot support such operations, which
12978 use SP as the source and an FP_REG as the destination, so reject
12979 them outright. */
12980 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12982 rtx lhs = XEXP (x, 0);
12984 /* Look through a possible SUBREG introduced by ILP32. */
12985 if (SUBREG_P (lhs))
12986 lhs = SUBREG_REG (lhs);
12988 gcc_assert (REG_P (lhs));
12989 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12990 POINTER_REGS));
12991 return NO_REGS;
12994 return regclass;
12997 void
12998 aarch64_asm_output_labelref (FILE* f, const char *name)
13000 asm_fprintf (f, "%U%s", name);
13003 static void
13004 aarch64_elf_asm_constructor (rtx symbol, int priority)
13006 if (priority == DEFAULT_INIT_PRIORITY)
13007 default_ctor_section_asm_out_constructor (symbol, priority);
13008 else
13010 section *s;
13011 /* Although the priority is known to be in the range [0, 65535], so
13012 that 18 bytes would be enough, the compiler might not know that. To
13013 avoid a -Wformat-truncation false positive, use a larger size. */
13014 char buf[23];
13015 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
13016 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13017 switch_to_section (s);
13018 assemble_align (POINTER_SIZE);
13019 assemble_aligned_integer (POINTER_BYTES, symbol);
13023 static void
13024 aarch64_elf_asm_destructor (rtx symbol, int priority)
13026 if (priority == DEFAULT_INIT_PRIORITY)
13027 default_dtor_section_asm_out_destructor (symbol, priority);
13028 else
13030 section *s;
13031 /* Although the priority is known to be in the range [0, 65535], so
13032 that 18 bytes would be enough, the compiler might not know that. To
13033 avoid a -Wformat-truncation false positive, use a larger size. */
13034 char buf[23];
13035 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
13036 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
13037 switch_to_section (s);
13038 assemble_align (POINTER_SIZE);
13039 assemble_aligned_integer (POINTER_BYTES, symbol);
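/* Output the assembly for a switch-table dispatch sequence; this is used
   by the casesi expansion in aarch64.md.  */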
13043 const char*
13044 aarch64_output_casesi (rtx *operands)
13046 char buf[100];
13047 char label[100];
13048 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
13049 int index;
13050 static const char *const patterns[4][2] =
13053 "ldrb\t%w3, [%0,%w1,uxtw]",
13054 "add\t%3, %4, %w3, sxtb #2"
13057 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13058 "add\t%3, %4, %w3, sxth #2"
13061 "ldr\t%w3, [%0,%w1,uxtw #2]",
13062 "add\t%3, %4, %w3, sxtw #2"
13064 /* We assume that DImode is only generated when not optimizing and
13065 that we don't really need 64-bit address offsets. That would
13066 imply an object file with 8GB of code in a single function! */
13068 "ldr\t%w3, [%0,%w1,uxtw #2]",
13069 "add\t%3, %4, %w3, sxtw #2"
13073 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13075 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13076 index = exact_log2 (GET_MODE_SIZE (mode));
13078 gcc_assert (index >= 0 && index <= 3);
13080 /* Need to implement table size reduction, by changing the code below. */
13081 output_asm_insn (patterns[index][0], operands);
13082 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13083 snprintf (buf, sizeof (buf),
13084 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13085 output_asm_insn (buf, operands);
13086 output_asm_insn (patterns[index][1], operands);
13087 output_asm_insn ("br\t%3", operands);
13088 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13089 operands);
13090 assemble_label (asm_out_file, label);
13091 return "";
13094 /* Return the asm string for an SME ZERO instruction whose 8-bit mask
13095 operand is MASK. */
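/* For example, a MASK of 0x33 (ZA0.S and ZA1.S) produces
   "zero\t{ za0.s, za1.s }".  */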
13096 const char *
13097 aarch64_output_sme_zero_za (rtx mask)
13099 auto mask_val = UINTVAL (mask);
13100 if (mask_val == 0)
13101 return "zero\t{}";
13103 if (mask_val == 0xff)
13104 return "zero\t{ za }";
13106 static constexpr std::pair<unsigned int, char> tiles[] = {
13107 { 0xff, 'b' },
13108 { 0x55, 'h' },
13109 { 0x11, 's' },
13110 { 0x01, 'd' }
13112 /* The last entry in the list has the form "za7.d }", but that's the
13113 same length as "za7.d, ". */
13114 static char buffer[sizeof("zero\t{ ") + sizeof ("za7.d, ") * 8 + 1];
13115 unsigned int i = 0;
13116 i += snprintf (buffer + i, sizeof (buffer) - i, "zero\t");
13117 const char *prefix = "{ ";
13118 for (auto &tile : tiles)
13120 auto tile_mask = tile.first;
13121 unsigned int tile_index = 0;
13122 while (tile_mask < 0x100)
13124 if ((mask_val & tile_mask) == tile_mask)
13126 i += snprintf (buffer + i, sizeof (buffer) - i, "%sza%d.%c",
13127 prefix, tile_index, tile.second);
13128 prefix = ", ";
13129 mask_val &= ~tile_mask;
13131 tile_mask <<= 1;
13132 tile_index += 1;
13135 gcc_assert (mask_val == 0 && i + 3 <= sizeof (buffer));
13136 snprintf (buffer + i, sizeof (buffer) - i, " }");
13137 return buffer;
13140 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13141 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13142 operator. */
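/* For example, aarch64_uxt_size (2, 0x3fc) is 8, since 0x3fc is 0xff
   shifted left by 2 (a UXTB combined with an LSL #2), while
   aarch64_uxt_size (0, 0xffff) is 16 (a plain UXTH).  */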
13145 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13147 if (shift >= 0 && shift <= 4)
13149 int size;
13150 for (size = 8; size <= 32; size *= 2)
13152 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13153 if (mask == bits << shift)
13154 return size;
13157 return 0;
13160 /* Constant pools are per-function only when PC-relative
13161 literal loads are enabled or we are in the large memory
13162 model. */
13164 static inline bool
13165 aarch64_can_use_per_function_literal_pools_p (void)
13167 return (aarch64_pcrelative_literal_loads
13168 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
13171 static bool
13172 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13174 /* We can't use blocks for constants when we're using a per-function
13175 constant pool. */
13176 return !aarch64_can_use_per_function_literal_pools_p ();
13179 /* Select appropriate section for constants depending
13180 on where we place literal pools. */
13182 static section *
13183 aarch64_select_rtx_section (machine_mode mode,
13184 rtx x,
13185 unsigned HOST_WIDE_INT align)
13187 if (aarch64_can_use_per_function_literal_pools_p ())
13188 return function_section (current_function_decl);
13190 return default_elf_select_rtx_section (mode, x, align);
13193 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13194 void
13195 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13196 HOST_WIDE_INT offset)
13198 /* When using per-function literal pools, we must ensure that any code
13199 section is aligned to the minimal instruction length, lest we get
13200 errors from the assembler re "unaligned instructions". */
13201 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13202 ASM_OUTPUT_ALIGN (f, 2);
13205 /* Costs. */
13207 /* Helper function for rtx cost calculation. Strip a shift expression
13208 from X. Returns the inner operand if successful, or the original
13209 expression on failure. */
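/* For example, both (ashift X (const_int 3)) and (mult X (const_int 8))
   strip to X, since a multiply by a power of two is really a shift.  */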
13210 static rtx
13211 aarch64_strip_shift (rtx x)
13213 rtx op = x;
13215 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13216 we can convert both to ROR during final output. */
13217 if ((GET_CODE (op) == ASHIFT
13218 || GET_CODE (op) == ASHIFTRT
13219 || GET_CODE (op) == LSHIFTRT
13220 || GET_CODE (op) == ROTATERT
13221 || GET_CODE (op) == ROTATE)
13222 && CONST_INT_P (XEXP (op, 1)))
13223 return XEXP (op, 0);
13225 if (GET_CODE (op) == MULT
13226 && CONST_INT_P (XEXP (op, 1))
13227 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13228 return XEXP (op, 0);
13230 return x;
13233 /* Helper function for rtx cost calculation. Strip an extend
13234 expression from X. Returns the inner operand if successful, or the
13235 original expression on failure. We deal with a number of possible
13236 canonicalization variations here. If STRIP_SHIFT is true, then
13237 we can strip off a shift also. */
13238 static rtx
13239 aarch64_strip_extend (rtx x, bool strip_shift)
13241 scalar_int_mode mode;
13242 rtx op = x;
13244 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13245 return op;
13247 if (GET_CODE (op) == AND
13248 && GET_CODE (XEXP (op, 0)) == MULT
13249 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13250 && CONST_INT_P (XEXP (op, 1))
13251 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13252 INTVAL (XEXP (op, 1))) != 0)
13253 return XEXP (XEXP (op, 0), 0);
13255 /* Now handle extended register, as this may also have an optional
13256 left shift by 1..4. */
13257 if (strip_shift
13258 && GET_CODE (op) == ASHIFT
13259 && CONST_INT_P (XEXP (op, 1))
13260 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13261 op = XEXP (op, 0);
13263 if (GET_CODE (op) == ZERO_EXTEND
13264 || GET_CODE (op) == SIGN_EXTEND)
13265 op = XEXP (op, 0);
13267 if (op != x)
13268 return op;
13270 return x;
13273 /* Helper function for rtx cost calculation. Strip extension as well as any
13274 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13275 successful, or the original expression on failure. */
13276 static rtx
13277 aarch64_strip_extend_vec_half (rtx x)
13279 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13281 x = XEXP (x, 0);
13282 if (GET_CODE (x) == VEC_SELECT
13283 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13284 XEXP (x, 1)))
13285 x = XEXP (x, 0);
13287 return x;
13290 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13291 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13292 operand if successful, or the original expression on failure. */
13293 static rtx
13294 aarch64_strip_duplicate_vec_elt (rtx x)
13296 if (GET_CODE (x) == VEC_DUPLICATE
13297 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13299 x = XEXP (x, 0);
13300 if (GET_CODE (x) == VEC_SELECT)
13301 x = XEXP (x, 0);
13302 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13303 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13304 x = XEXP (XEXP (x, 0), 0);
13306 return x;
13309 /* Return true iff CODE is a shift supported in combination
13310 with arithmetic instructions. */
13312 static bool
13313 aarch64_shift_p (enum rtx_code code)
13315 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13319 /* Return true iff X is a cheap shift without a sign extend. */
13321 static bool
13322 aarch64_cheap_mult_shift_p (rtx x)
13324 rtx op0, op1;
13326 op0 = XEXP (x, 0);
13327 op1 = XEXP (x, 1);
13329 if (!(aarch64_tune_params.extra_tuning_flags
13330 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13331 return false;
13333 if (GET_CODE (op0) == SIGN_EXTEND)
13334 return false;
13336 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13337 && UINTVAL (op1) <= 4)
13338 return true;
13340 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13341 return false;
13343 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13345 if (l2 > 0 && l2 <= 4)
13346 return true;
13348 return false;
13351 /* Helper function for rtx cost calculation. Calculate the cost of
13352 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13353 Return the calculated cost of the expression, recursing manually in to
13354 operands where needed. */
13356 static int
13357 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13359 rtx op0, op1;
13360 const struct cpu_cost_table *extra_cost
13361 = aarch64_tune_params.insn_extra_cost;
13362 int cost = 0;
13363 bool compound_p = (outer == PLUS || outer == MINUS);
13364 machine_mode mode = GET_MODE (x);
13366 gcc_checking_assert (code == MULT);
13368 op0 = XEXP (x, 0);
13369 op1 = XEXP (x, 1);
13371 if (VECTOR_MODE_P (mode))
13373 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13374 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13376 /* The select-operand-high-half versions of the instruction have the
13377 same cost as the three vector version - don't add the costs of the
13378 extension or selection into the costs of the multiply. */
13379 op0 = aarch64_strip_extend_vec_half (op0);
13380 op1 = aarch64_strip_extend_vec_half (op1);
13381 /* The by-element versions of the instruction have the same costs as
13382 the normal 3-vector version. We make an assumption that the input
13383 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13384 costing of a MUL by element pre RA is a bit optimistic. */
13385 op0 = aarch64_strip_duplicate_vec_elt (op0);
13386 op1 = aarch64_strip_duplicate_vec_elt (op1);
13388 cost += rtx_cost (op0, mode, MULT, 0, speed);
13389 cost += rtx_cost (op1, mode, MULT, 1, speed);
13390 if (speed)
13392 if (GET_CODE (x) == MULT)
13393 cost += extra_cost->vect.mult;
13394 /* This is to catch the SSRA costing currently flowing here. */
13395 else
13396 cost += extra_cost->vect.alu;
13398 return cost;
13401 /* Integer multiply/fma. */
13402 if (GET_MODE_CLASS (mode) == MODE_INT)
13404 /* The multiply will be canonicalized as a shift, cost it as such. */
13405 if (aarch64_shift_p (GET_CODE (x))
13406 || (CONST_INT_P (op1)
13407 && exact_log2 (INTVAL (op1)) > 0))
13409 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13410 || GET_CODE (op0) == SIGN_EXTEND;
13411 if (speed)
13413 if (compound_p)
13415 /* If the shift is considered cheap,
13416 then don't add any cost. */
13417 if (aarch64_cheap_mult_shift_p (x))
13419 else if (REG_P (op1))
13420 /* ARITH + shift-by-register. */
13421 cost += extra_cost->alu.arith_shift_reg;
13422 else if (is_extend)
13423 /* ARITH + extended register. We don't have a cost field
13424 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13425 cost += extra_cost->alu.extend_arith;
13426 else
13427 /* ARITH + shift-by-immediate. */
13428 cost += extra_cost->alu.arith_shift;
13430 else
13431 /* LSL (immediate). */
13432 cost += extra_cost->alu.shift;
13435 /* Strip extends as we will have costed them in the case above. */
13436 if (is_extend)
13437 op0 = aarch64_strip_extend (op0, true);
13439 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13441 return cost;
13444 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
13445 compound and let the below cases handle it. After all, MNEG is a
13446 special-case alias of MSUB. */
13447 if (GET_CODE (op0) == NEG)
13449 op0 = XEXP (op0, 0);
13450 compound_p = true;
13453 /* Integer multiplies or FMAs have zero/sign extending variants. */
13454 if ((GET_CODE (op0) == ZERO_EXTEND
13455 && GET_CODE (op1) == ZERO_EXTEND)
13456 || (GET_CODE (op0) == SIGN_EXTEND
13457 && GET_CODE (op1) == SIGN_EXTEND))
13459 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13460 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13462 if (speed)
13464 if (compound_p)
13465 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13466 cost += extra_cost->mult[0].extend_add;
13467 else
13468 /* MUL/SMULL/UMULL. */
13469 cost += extra_cost->mult[0].extend;
13472 return cost;
13475 /* This is either an integer multiply or a MADD. In both cases
13476 we want to recurse and cost the operands. */
13477 cost += rtx_cost (op0, mode, MULT, 0, speed);
13478 cost += rtx_cost (op1, mode, MULT, 1, speed);
13480 if (speed)
13482 if (compound_p)
13483 /* MADD/MSUB. */
13484 cost += extra_cost->mult[mode == DImode].add;
13485 else
13486 /* MUL. */
13487 cost += extra_cost->mult[mode == DImode].simple;
13490 return cost;
13492 else
13494 if (speed)
13496 /* Floating-point FMA/FMUL can also support negations of the
13497 operands, unless the rounding mode is upward or downward in
13498 which case FNMUL is different than FMUL with operand negation. */
13499 bool neg0 = GET_CODE (op0) == NEG;
13500 bool neg1 = GET_CODE (op1) == NEG;
13501 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13503 if (neg0)
13504 op0 = XEXP (op0, 0);
13505 if (neg1)
13506 op1 = XEXP (op1, 0);
13509 if (compound_p)
13510 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13511 cost += extra_cost->fp[mode == DFmode].fma;
13512 else
13513 /* FMUL/FNMUL. */
13514 cost += extra_cost->fp[mode == DFmode].mult;
13517 cost += rtx_cost (op0, mode, MULT, 0, speed);
13518 cost += rtx_cost (op1, mode, MULT, 1, speed);
13519 return cost;
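/* Implement TARGET_ADDRESS_COST.  */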
13523 static int
13524 aarch64_address_cost (rtx x,
13525 machine_mode mode,
13526 addr_space_t as ATTRIBUTE_UNUSED,
13527 bool speed)
13529 enum rtx_code c = GET_CODE (x);
13530 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13531 struct aarch64_address_info info;
13532 int cost = 0;
13533 info.shift = 0;
13535 if (!aarch64_classify_address (&info, x, mode, false))
13537 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13539 /* This is a CONST or SYMBOL ref which will be split
13540 in a different way depending on the code model in use.
13541 Cost it through the generic infrastructure. */
13542 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13543 /* Divide through by the cost of one instruction to
13544 bring it to the same units as the address costs. */
13545 cost_symbol_ref /= COSTS_N_INSNS (1);
13546 /* The cost is then the cost of preparing the address,
13547 followed by an immediate (possibly 0) offset. */
13548 return cost_symbol_ref + addr_cost->imm_offset;
13550 else
13552 /* This is most likely a jump table from a case
13553 statement. */
13554 return addr_cost->register_offset;
13558 switch (info.type)
13560 case ADDRESS_LO_SUM:
13561 case ADDRESS_SYMBOLIC:
13562 case ADDRESS_REG_IMM:
13563 cost += addr_cost->imm_offset;
13564 break;
13566 case ADDRESS_REG_WB:
13567 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13568 cost += addr_cost->pre_modify;
13569 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13571 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13572 if (nvectors == 3)
13573 cost += addr_cost->post_modify_ld3_st3;
13574 else if (nvectors == 4)
13575 cost += addr_cost->post_modify_ld4_st4;
13576 else
13577 cost += addr_cost->post_modify;
13579 else
13580 gcc_unreachable ();
13582 break;
13584 case ADDRESS_REG_REG:
13585 cost += addr_cost->register_offset;
13586 break;
13588 case ADDRESS_REG_SXTW:
13589 cost += addr_cost->register_sextend;
13590 break;
13592 case ADDRESS_REG_UXTW:
13593 cost += addr_cost->register_zextend;
13594 break;
13596 default:
13597 gcc_unreachable ();
13601 if (info.shift > 0)
13603 /* For the sake of calculating the cost of the shifted register
13604 component, we can treat same sized modes in the same way. */
13605 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13606 cost += addr_cost->addr_scale_costs.hi;
13607 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13608 cost += addr_cost->addr_scale_costs.si;
13609 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13610 cost += addr_cost->addr_scale_costs.di;
13611 else
13612 /* We can't tell, or this is a 128-bit vector. */
13613 cost += addr_cost->addr_scale_costs.ti;
13616 return cost;
13619 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13620 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13621 to be taken. */
13624 aarch64_branch_cost (bool speed_p, bool predictable_p)
13626 /* When optimizing for speed, use the cost of unpredictable branches. */
13627 const struct cpu_branch_cost *branch_costs =
13628 aarch64_tune_params.branch_costs;
13630 if (!speed_p || predictable_p)
13631 return branch_costs->predictable;
13632 else
13633 return branch_costs->unpredictable;
13636 /* Return true if X is a zero or sign extract
13637 usable in an ADD or SUB (extended register) instruction. */
13638 static bool
13639 aarch64_rtx_arith_op_extract_p (rtx x)
13641 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13642 No shift. */
13643 if (GET_CODE (x) == SIGN_EXTEND
13644 || GET_CODE (x) == ZERO_EXTEND)
13645 return REG_P (XEXP (x, 0));
13647 return false;
13650 static bool
13651 aarch64_frint_unspec_p (unsigned int u)
13653 switch (u)
13655 case UNSPEC_FRINTZ:
13656 case UNSPEC_FRINTP:
13657 case UNSPEC_FRINTM:
13658 case UNSPEC_FRINTA:
13659 case UNSPEC_FRINTN:
13660 case UNSPEC_FRINTX:
13661 case UNSPEC_FRINTI:
13662 return true;
13664 default:
13665 return false;
13669 /* Return true iff X is an rtx that will match an extr instruction
13670 i.e. as described in the *extr<mode>5_insn family of patterns.
13671 OP0 and OP1 will be set to the operands of the shifts involved
13672 on success and will be NULL_RTX otherwise. */
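/* For example, in DImode (ior (ashift X (const_int 10))
   (lshiftrt Y (const_int 54))) matches, since 10 + 54 == 64;
   *RES_OP0 is set to X and *RES_OP1 to Y.  */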
13674 static bool
13675 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13677 rtx op0, op1;
13678 scalar_int_mode mode;
13679 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13680 return false;
13682 *res_op0 = NULL_RTX;
13683 *res_op1 = NULL_RTX;
13685 if (GET_CODE (x) != IOR)
13686 return false;
13688 op0 = XEXP (x, 0);
13689 op1 = XEXP (x, 1);
13691 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13692 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13694 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13695 if (GET_CODE (op1) == ASHIFT)
13696 std::swap (op0, op1);
13698 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13699 return false;
13701 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13702 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13704 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13705 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13707 *res_op0 = XEXP (op0, 0);
13708 *res_op1 = XEXP (op1, 0);
13709 return true;
13713 return false;
13716 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13717 storing it in *COST. Result is true if the total cost of the operation
13718 has now been calculated. */
13719 static bool
13720 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13722 rtx inner;
13723 rtx comparator;
13724 enum rtx_code cmpcode;
13725 const struct cpu_cost_table *extra_cost
13726 = aarch64_tune_params.insn_extra_cost;
13728 if (COMPARISON_P (op0))
13730 inner = XEXP (op0, 0);
13731 comparator = XEXP (op0, 1);
13732 cmpcode = GET_CODE (op0);
13734 else
13736 inner = op0;
13737 comparator = const0_rtx;
13738 cmpcode = NE;
13741 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13743 /* Conditional branch. */
13744 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13745 return true;
13746 else
13748 if (cmpcode == NE || cmpcode == EQ)
13750 if (comparator == const0_rtx)
13752 /* TBZ/TBNZ/CBZ/CBNZ. */
13753 if (GET_CODE (inner) == ZERO_EXTRACT)
13754 /* TBZ/TBNZ. */
13755 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13756 ZERO_EXTRACT, 0, speed);
13757 else
13758 /* CBZ/CBNZ. */
13759 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13761 return true;
13763 if (register_operand (inner, VOIDmode)
13764 && aarch64_imm24 (comparator, VOIDmode))
13766 /* SUB and SUBS. */
13767 *cost += COSTS_N_INSNS (2);
13768 if (speed)
13769 *cost += extra_cost->alu.arith * 2;
13770 return true;
13773 else if (cmpcode == LT || cmpcode == GE)
13775 /* TBZ/TBNZ. */
13776 if (comparator == const0_rtx)
13777 return true;
13781 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13783 /* CCMP. */
13784 if (GET_CODE (op1) == COMPARE)
13786 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13787 if (XEXP (op1, 1) == const0_rtx)
13788 *cost += 1;
13789 if (speed)
13791 machine_mode mode = GET_MODE (XEXP (op1, 0));
13793 if (GET_MODE_CLASS (mode) == MODE_INT)
13794 *cost += extra_cost->alu.arith;
13795 else
13796 *cost += extra_cost->fp[mode == DFmode].compare;
13798 return true;
13801 /* It's a conditional operation based on the status flags,
13802 so it must be some flavor of CSEL. */
13804 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13805 if (GET_CODE (op1) == NEG
13806 || GET_CODE (op1) == NOT
13807 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13808 op1 = XEXP (op1, 0);
13809 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13811 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13812 op1 = XEXP (op1, 0);
13813 op2 = XEXP (op2, 0);
13815 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13817 inner = XEXP (op1, 0);
13818 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13819 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13820 op1 = XEXP (inner, 0);
13822 else if (op1 == constm1_rtx || op1 == const1_rtx)
13824 /* Use CSINV or CSINC. */
13825 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13826 return true;
13828 else if (op2 == constm1_rtx || op2 == const1_rtx)
13830 /* Use CSINV or CSINC. */
13831 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13832 return true;
13835 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13836 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13837 return true;
13840 /* We don't know what this is, cost all operands. */
13841 return false;
13844 /* Check whether X is a bitfield operation of the form shift + extend that
13845 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13846 operand to which the bitfield operation is applied. Otherwise return
13847 NULL_RTX. */
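/* For example, (zero_extend:SI (lshiftrt:QI X (const_int 3))) is such a
   pattern (a UBFX of X), so X is returned.  */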
13849 static rtx
13850 aarch64_extend_bitfield_pattern_p (rtx x)
13852 rtx_code outer_code = GET_CODE (x);
13853 machine_mode outer_mode = GET_MODE (x);
13855 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13856 && outer_mode != SImode && outer_mode != DImode)
13857 return NULL_RTX;
13859 rtx inner = XEXP (x, 0);
13860 rtx_code inner_code = GET_CODE (inner);
13861 machine_mode inner_mode = GET_MODE (inner);
13862 rtx op = NULL_RTX;
13864 switch (inner_code)
13866 case ASHIFT:
13867 if (CONST_INT_P (XEXP (inner, 1))
13868 && (inner_mode == QImode || inner_mode == HImode))
13869 op = XEXP (inner, 0);
13870 break;
13871 case LSHIFTRT:
13872 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13873 && (inner_mode == QImode || inner_mode == HImode))
13874 op = XEXP (inner, 0);
13875 break;
13876 case ASHIFTRT:
13877 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13878 && (inner_mode == QImode || inner_mode == HImode))
13879 op = XEXP (inner, 0);
13880 break;
13881 default:
13882 break;
13885 return op;
13888 /* Return true if the mask and a shift amount from an RTX of the form
13889 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13890 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
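/* For example, in SImode a MASK of 0xff0 with a SHFT_AMNT of 4 is valid:
   (X << 4) & 0xff0 is UBFIZ Wd, Ws, #4, #8.  */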
13892 bool
13893 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13894 rtx shft_amnt)
13896 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
13897 && INTVAL (mask) > 0
13898 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13899 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13900 && (UINTVAL (mask)
13901 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
13904 /* Return true if the masks and a shift amount from an RTX of the form
13905 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13906 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
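/* For example, in DImode MASK1 == 0xffffffff0000ffff, SHFT_AMNT == 16 and
   MASK2 == 0xffff0000 are valid: the pattern inserts 16 bits of y at bit
   position 16 of x.  */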
13908 bool
13909 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13910 unsigned HOST_WIDE_INT mask1,
13911 unsigned HOST_WIDE_INT shft_amnt,
13912 unsigned HOST_WIDE_INT mask2)
13914 unsigned HOST_WIDE_INT t;
13916 /* Verify that there is no overlap in what bits are set in the two masks. */
13917 if (mask1 != ~mask2)
13918 return false;
13920 /* Verify that mask2 is not all zeros or ones. */
13921 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13922 return false;
13924 /* The shift amount should always be less than the mode size. */
13925 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13927 /* Verify that the mask being shifted is contiguous and would be in the
13928 least significant bits after shifting by shft_amnt. */
13929 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13930 return (t == (t & -t));
13933 /* Return true if X is an RTX representing an operation in the ABD family
13934 of instructions. */
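/* For example, (minus (smax A B) (smin A B)) is an SABD and
   (minus (umax A B) (umin A B)) is a UABD, provided the max and min
   have matching operands.  */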
13936 static bool
13937 aarch64_abd_rtx_p (rtx x)
13939 if (GET_CODE (x) != MINUS)
13940 return false;
13941 rtx max_arm = XEXP (x, 0);
13942 rtx min_arm = XEXP (x, 1);
13943 if (GET_CODE (max_arm) != SMAX && GET_CODE (max_arm) != UMAX)
13944 return false;
13945 bool signed_p = GET_CODE (max_arm) == SMAX;
13946 if (signed_p && GET_CODE (min_arm) != SMIN)
13947 return false;
13948 else if (!signed_p && GET_CODE (min_arm) != UMIN)
13949 return false;
13951 rtx maxop0 = XEXP (max_arm, 0);
13952 rtx maxop1 = XEXP (max_arm, 1);
13953 rtx minop0 = XEXP (min_arm, 0);
13954 rtx minop1 = XEXP (min_arm, 1);
13955 return rtx_equal_p (maxop0, minop0) && rtx_equal_p (maxop1, minop1);
13958 /* Calculate the cost of calculating X, storing it in *COST. Result
13959 is true if the total cost of the operation has now been calculated. */
13960 static bool
13961 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
13962 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13964 rtx op0, op1, op2;
13965 const struct cpu_cost_table *extra_cost
13966 = aarch64_tune_params.insn_extra_cost;
13967 rtx_code code = GET_CODE (x);
13968 scalar_int_mode int_mode;
13970 /* By default, assume that everything has equivalent cost to the
13971 cheapest instruction. Any additional costs are applied as a delta
13972 above this default. */
13973 *cost = COSTS_N_INSNS (1);
13975 switch (code)
13977 case SET:
13978 /* The cost depends entirely on the operands to SET. */
13979 *cost = 0;
13980 op0 = SET_DEST (x);
13981 op1 = SET_SRC (x);
13983 switch (GET_CODE (op0))
13985 case MEM:
13986 if (speed)
13988 rtx address = XEXP (op0, 0);
13989 if (VECTOR_MODE_P (mode))
13990 *cost += extra_cost->ldst.storev;
13991 else if (GET_MODE_CLASS (mode) == MODE_INT)
13992 *cost += extra_cost->ldst.store;
13993 else if (mode == SFmode || mode == SDmode)
13994 *cost += extra_cost->ldst.storef;
13995 else if (mode == DFmode || mode == DDmode)
13996 *cost += extra_cost->ldst.stored;
13998 *cost +=
13999 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14000 0, speed));
14003 *cost += rtx_cost (op1, mode, SET, 1, speed);
14004 return true;
14006 case SUBREG:
14007 if (! REG_P (SUBREG_REG (op0)))
14008 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
14010 /* Fall through. */
14011 case REG:
14012 /* The cost is one per vector-register copied. */
14013 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
14015 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
14016 *cost = COSTS_N_INSNS (nregs);
14018 /* const0_rtx is in general free, but we will use an
14019 instruction to set a register to 0. */
14020 else if (REG_P (op1) || op1 == const0_rtx)
14022 /* The cost is 1 per register copied. */
14023 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
14024 *cost = COSTS_N_INSNS (nregs);
14026 else
14027 /* Cost is just the cost of the RHS of the set. */
14028 *cost += rtx_cost (op1, mode, SET, 1, speed);
14029 return true;
14031 case ZERO_EXTRACT:
14032 case SIGN_EXTRACT:
14033 /* Bit-field insertion. Strip any redundant widening of
14034 the RHS to meet the width of the target. */
14035 if (SUBREG_P (op1))
14036 op1 = SUBREG_REG (op1);
14037 if ((GET_CODE (op1) == ZERO_EXTEND
14038 || GET_CODE (op1) == SIGN_EXTEND)
14039 && CONST_INT_P (XEXP (op0, 1))
14040 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
14041 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
14042 op1 = XEXP (op1, 0);
14044 if (CONST_INT_P (op1))
14046 /* MOV immediate is assumed to always be cheap. */
14047 *cost = COSTS_N_INSNS (1);
14049 else
14051 /* BFM. */
14052 if (speed)
14053 *cost += extra_cost->alu.bfi;
14054 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
14057 return true;
14059 default:
14060 /* We can't make sense of this, assume default cost. */
14061 *cost = COSTS_N_INSNS (1);
14062 return false;
14064 return false;
14066 case CONST_INT:
14067 /* If an instruction can incorporate a constant within the
14068 instruction, the instruction's expression avoids calling
14069 rtx_cost() on the constant. If rtx_cost() is called on a
14070 constant, then it is usually because the constant must be
14071 moved into a register by one or more instructions.
14073 The exception is constant 0, which can be expressed
14074 as XZR/WZR and is therefore free. The one case where it is not
14075 free is (set (reg) (const0_rtx)), in which case we must cost
14076 the move. However, we can catch that when we cost the SET, so
14077 we don't need to consider it here. */
14078 if (x == const0_rtx)
14079 *cost = 0;
14080 else
14082 /* To an approximation, building any other constant is
14083 proportionally expensive to the number of instructions
14084 required to build that constant. This is true whether we
14085 are compiling for SPEED or otherwise. */
14086 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
14087 ? SImode : DImode;
14088 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
14089 (NULL_RTX, x, false, imode));
14091 return true;
14093 case CONST_DOUBLE:
14095 /* First determine number of instructions to do the move
14096 as an integer constant. */
14097 if (!aarch64_float_const_representable_p (x)
14098 && !aarch64_can_const_movi_rtx_p (x, mode)
14099 && aarch64_float_const_rtx_p (x))
14101 unsigned HOST_WIDE_INT ival;
14102 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
14103 gcc_assert (succeed);
14105 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
14106 ? DImode : SImode;
14107 int ncost = aarch64_internal_mov_immediate
14108 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
14109 *cost += COSTS_N_INSNS (ncost);
14110 return true;
14113 if (speed)
14115 /* mov[df,sf]_aarch64. */
14116 if (aarch64_float_const_representable_p (x))
14117 /* FMOV (scalar immediate). */
14118 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
14119 else if (!aarch64_float_const_zero_rtx_p (x))
14121 /* This will be a load from memory. */
14122 if (mode == DFmode || mode == DDmode)
14123 *cost += extra_cost->ldst.loadd;
14124 else
14125 *cost += extra_cost->ldst.loadf;
14127 else
14128 /* Otherwise this is +0.0. We get this using MOVI d0, #0
14129 or MOV v0.s[0], wzr - neither of which is modeled by the
14130 cost tables. Just use the default cost. */
14135 return true;
14137 case MEM:
14138 if (speed)
14140 /* For loads we want the base cost of a load, plus an
14141 approximation for the additional cost of the addressing
14142 mode. */
14143 rtx address = XEXP (x, 0);
14144 if (VECTOR_MODE_P (mode))
14145 *cost += extra_cost->ldst.loadv;
14146 else if (GET_MODE_CLASS (mode) == MODE_INT)
14147 *cost += extra_cost->ldst.load;
14148 else if (mode == SFmode || mode == SDmode)
14149 *cost += extra_cost->ldst.loadf;
14150 else if (mode == DFmode || mode == DDmode)
14151 *cost += extra_cost->ldst.loadd;
14153 *cost +=
14154 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14155 0, speed));
14158 return true;
14160 case NEG:
14161 op0 = XEXP (x, 0);
14163 if (VECTOR_MODE_P (mode))
14165 /* Many vector comparison operations are represented as NEG
14166 of a comparison. */
14167 if (COMPARISON_P (op0))
14169 rtx op00 = XEXP (op0, 0);
14170 rtx op01 = XEXP (op0, 1);
14171 machine_mode inner_mode = GET_MODE (op00);
14172 /* FACGE/FACGT. */
14173 if (GET_MODE_CLASS (inner_mode) == MODE_VECTOR_FLOAT
14174 && GET_CODE (op00) == ABS
14175 && GET_CODE (op01) == ABS)
14177 op00 = XEXP (op00, 0);
14178 op01 = XEXP (op01, 0);
14180 *cost += rtx_cost (op00, inner_mode, GET_CODE (op0), 0, speed);
14181 *cost += rtx_cost (op01, inner_mode, GET_CODE (op0), 1, speed);
14182 if (speed)
14183 *cost += extra_cost->vect.alu;
14184 return true;
14186 if (speed)
14188 /* FNEG. */
14189 *cost += extra_cost->vect.alu;
14191 return false;
14194 if (GET_MODE_CLASS (mode) == MODE_INT)
14196 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14197 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14199 /* CSETM. */
14200 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14201 return true;
14204 /* Cost this as SUB wzr, X. */
14205 op0 = CONST0_RTX (mode);
14206 op1 = XEXP (x, 0);
14207 goto cost_minus;
14210 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14212 /* Support (neg(fma...)) as a single instruction only if
14213 sign of zeros is unimportant. This matches the decision
14214 making in aarch64.md. */
14215 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14217 /* FNMADD. */
14218 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14219 return true;
14221 if (GET_CODE (op0) == MULT)
14223 /* FNMUL. */
14224 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14225 return true;
14227 if (speed)
14228 /* FNEG. */
14229 *cost += extra_cost->fp[mode == DFmode].neg;
14230 return false;
14233 return false;
14235 case CLRSB:
14236 case CLZ:
14237 if (speed)
14239 if (VECTOR_MODE_P (mode))
14240 *cost += extra_cost->vect.alu;
14241 else
14242 *cost += extra_cost->alu.clz;
14245 return false;
14247 case CTZ:
14248 *cost = COSTS_N_INSNS (2);
14250 if (speed)
14251 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14252 return false;
14254 case COMPARE:
14255 op0 = XEXP (x, 0);
14256 op1 = XEXP (x, 1);
14258 if (op1 == const0_rtx
14259 && GET_CODE (op0) == AND)
14261 x = op0;
14262 mode = GET_MODE (op0);
14263 goto cost_logic;
14266 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14268 /* TODO: A write to the CC flags possibly costs extra, this
14269 needs encoding in the cost tables. */
14271 mode = GET_MODE (op0);
14272 /* ANDS. */
14273 if (GET_CODE (op0) == AND)
14275 x = op0;
14276 goto cost_logic;
14279 if (GET_CODE (op0) == PLUS)
14281 /* ADDS (and CMN alias). */
14282 x = op0;
14283 goto cost_plus;
14286 if (GET_CODE (op0) == MINUS)
14288 /* SUBS. */
14289 x = op0;
14290 goto cost_minus;
14293 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14294 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14295 && CONST_INT_P (XEXP (op0, 2)))
14297 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14298 Handle it here directly rather than going to cost_logic
14299 since we know the immediate generated for the TST is valid
14300 so we can avoid creating an intermediate rtx for it only
14301 for costing purposes. */
14302 if (speed)
14303 *cost += extra_cost->alu.logical;
14305 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14306 ZERO_EXTRACT, 0, speed);
14307 return true;
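/* Illustration (made-up operands): a COMPARE such as
     (compare:CC_NZ (zero_extract:SI (reg:SI R) (const_int 4) (const_int 8))
                    (const_int 0))
   corresponds to "tst w0, #0xf00", i.e. a 4-bit field starting at bit 8,
   hence the single logical-instruction cost above.  */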
14310 if (GET_CODE (op1) == NEG)
14312 /* CMN. */
14313 if (speed)
14314 *cost += extra_cost->alu.arith;
14316 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14317 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14318 return true;
14321 /* CMP.
14323 Compare can freely swap the order of operands, and
14324 canonicalization puts the more complex operation first.
14325 But the integer MINUS logic expects the shift/extend
14326 operation in op1. */
14327 if (! (REG_P (op0)
14328 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14330 op0 = XEXP (x, 1);
14331 op1 = XEXP (x, 0);
14333 goto cost_minus;
14336 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14338 /* FCMP. */
14339 if (speed)
14340 *cost += extra_cost->fp[mode == DFmode].compare;
14342 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14344 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14345 /* FCMP supports constant 0.0 for no extra cost. */
14346 return true;
14348 return false;
14351 if (VECTOR_MODE_P (mode))
14353 /* Vector compare. */
14354 if (speed)
14355 *cost += extra_cost->vect.alu;
14357 if (aarch64_float_const_zero_rtx_p (op1))
14359 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14360 cost. */
14361 return true;
14363 return false;
14365 return false;
14367 case MINUS:
14369 op0 = XEXP (x, 0);
14370 op1 = XEXP (x, 1);
14372 cost_minus:
14373 if (VECTOR_MODE_P (mode))
14375 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14376 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14378 /* Recognise the SABD and UABD operation here.
14379 Recursion from the PLUS case will catch the accumulating
14380 forms. */
14381 if (aarch64_abd_rtx_p (x))
14383 if (speed)
14384 *cost += extra_cost->vect.alu;
14385 return true;
14387 /* SUBL2 and SUBW2.
14388 The select-operand-high-half versions of the sub instruction
14389 have the same cost as the regular three vector version -
14390 don't add the costs of the select into the costs of the sub.  */
14392 op0 = aarch64_strip_extend_vec_half (op0);
14393 op1 = aarch64_strip_extend_vec_half (op1);
14397 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14399 /* Detect valid immediates. */
14400 if ((GET_MODE_CLASS (mode) == MODE_INT
14401 || (GET_MODE_CLASS (mode) == MODE_CC
14402 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14403 && CONST_INT_P (op1)
14404 && aarch64_uimm12_shift (INTVAL (op1)))
14406 if (speed)
14407 /* SUB(S) (immediate). */
14408 *cost += extra_cost->alu.arith;
14409 return true;
14412 /* Look for SUB (extended register). */
14413 if (is_a <scalar_int_mode> (mode)
14414 && aarch64_rtx_arith_op_extract_p (op1))
14416 if (speed)
14417 *cost += extra_cost->alu.extend_arith;
14419 op1 = aarch64_strip_extend (op1, true);
14420 *cost += rtx_cost (op1, VOIDmode,
14421 (enum rtx_code) GET_CODE (op1), 0, speed);
14422 return true;
14425 rtx new_op1 = aarch64_strip_extend (op1, false);
14427 /* Cost this as an FMA-alike operation. */
14428 if ((GET_CODE (new_op1) == MULT
14429 || aarch64_shift_p (GET_CODE (new_op1)))
14430 && code != COMPARE)
14432 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14433 (enum rtx_code) code,
14434 speed);
14435 return true;
14438 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14440 if (speed)
14442 if (VECTOR_MODE_P (mode))
14444 /* Vector SUB. */
14445 *cost += extra_cost->vect.alu;
14447 else if (GET_MODE_CLASS (mode) == MODE_INT)
14449 /* SUB(S). */
14450 *cost += extra_cost->alu.arith;
14452 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14454 /* FSUB. */
14455 *cost += extra_cost->fp[mode == DFmode].addsub;
14458 return true;
14461 case PLUS:
14463 rtx new_op0;
14465 op0 = XEXP (x, 0);
14466 op1 = XEXP (x, 1);
14468 cost_plus:
14469 if (VECTOR_MODE_P (mode))
14471 /* ADDL2 and ADDW2. */
14472 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14473 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14475 /* The select-operand-high-half versions of the add instruction
14476 have the same cost as the regular three vector version -
14477 don't add the costs of the select into the costs of the add.  */
14479 op0 = aarch64_strip_extend_vec_half (op0);
14480 op1 = aarch64_strip_extend_vec_half (op1);
14484 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14485 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14487 /* CSINC. */
14488 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14489 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14490 return true;
14493 if (GET_MODE_CLASS (mode) == MODE_INT
14494 && (aarch64_plus_immediate (op1, mode)
14495 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14497 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14499 if (speed)
14501 /* ADD (immediate). */
14502 *cost += extra_cost->alu.arith;
14504 /* Some tunings prefer to not use the VL-based scalar ops.
14505 Increase the cost of the poly immediate to prevent their
14506 formation. */
14507 if (GET_CODE (op1) == CONST_POLY_INT
14508 && (aarch64_tune_params.extra_tuning_flags
14509 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14510 *cost += COSTS_N_INSNS (1);
14512 return true;
14515 if (aarch64_pluslong_immediate (op1, mode))
14517 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14518 if ((INTVAL (op1) & 0xfff) != 0)
14519 *cost += COSTS_N_INSNS (1);
14521 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14522 return true;
14525 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14527 /* Look for ADD (extended register). */
14528 if (is_a <scalar_int_mode> (mode)
14529 && aarch64_rtx_arith_op_extract_p (op0))
14531 if (speed)
14532 *cost += extra_cost->alu.extend_arith;
14534 op0 = aarch64_strip_extend (op0, true);
14535 *cost += rtx_cost (op0, VOIDmode,
14536 (enum rtx_code) GET_CODE (op0), 0, speed);
14537 return true;
14540 /* Strip any extend, leave shifts behind as we will
14541 cost them through mult_cost. */
14542 new_op0 = aarch64_strip_extend (op0, false);
14544 if (GET_CODE (new_op0) == MULT
14545 || aarch64_shift_p (GET_CODE (new_op0)))
14547 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14548 speed);
14549 return true;
14552 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14554 if (speed)
14556 if (VECTOR_MODE_P (mode))
14558 /* Vector ADD. */
14559 *cost += extra_cost->vect.alu;
14561 else if (GET_MODE_CLASS (mode) == MODE_INT)
14563 /* ADD. */
14564 *cost += extra_cost->alu.arith;
14566 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14568 /* FADD. */
14569 *cost += extra_cost->fp[mode == DFmode].addsub;
14572 return true;
14575 case BSWAP:
14576 *cost = COSTS_N_INSNS (1);
14578 if (speed)
14580 if (VECTOR_MODE_P (mode))
14581 *cost += extra_cost->vect.alu;
14582 else
14583 *cost += extra_cost->alu.rev;
14585 return false;
14587 case IOR:
14588 if (aarch_rev16_p (x))
14590 *cost = COSTS_N_INSNS (1);
14592 if (speed)
14594 if (VECTOR_MODE_P (mode))
14595 *cost += extra_cost->vect.alu;
14596 else
14597 *cost += extra_cost->alu.rev;
14599 return true;
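/* Illustration: aarch_rev16_p matches the usual byte-swap-within-halfwords
   idiom, roughly the C expression
     ((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8)
   which maps to a single REV16 instruction.  */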
14602 if (aarch64_extr_rtx_p (x, &op0, &op1))
14604 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14605 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14606 if (speed)
14607 *cost += extra_cost->alu.shift;
14609 return true;
14611 /* Fall through. */
14612 case XOR:
14613 case AND:
14614 cost_logic:
14615 op0 = XEXP (x, 0);
14616 op1 = XEXP (x, 1);
14618 if (VECTOR_MODE_P (mode))
14620 if (speed)
14621 *cost += extra_cost->vect.alu;
14622 return true;
14625 if (code == AND
14626 && GET_CODE (op0) == MULT
14627 && CONST_INT_P (XEXP (op0, 1))
14628 && CONST_INT_P (op1)
14629 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14630 INTVAL (op1)) != 0)
14632 /* This is a UBFM/SBFM. */
14633 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14634 if (speed)
14635 *cost += extra_cost->alu.bfx;
14636 return true;
14639 if (is_int_mode (mode, &int_mode))
14641 if (CONST_INT_P (op1))
14643 /* We have a mask + shift version of a UBFIZ
14644 i.e. the *andim_ashift<mode>_bfiz pattern. */
14645 if (GET_CODE (op0) == ASHIFT
14646 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14647 XEXP (op0, 1)))
14649 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14650 (enum rtx_code) code, 0, speed);
14651 if (speed)
14652 *cost += extra_cost->alu.bfx;
14654 return true;
14656 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14658 /* We possibly get the immediate for free; this is not
14659 modelled. */
14660 *cost += rtx_cost (op0, int_mode,
14661 (enum rtx_code) code, 0, speed);
14662 if (speed)
14663 *cost += extra_cost->alu.logical;
14665 return true;
14668 else
14670 rtx new_op0 = op0;
14672 /* Handle ORN, EON, or BIC. */
14673 if (GET_CODE (op0) == NOT)
14674 op0 = XEXP (op0, 0);
14676 new_op0 = aarch64_strip_shift (op0);
14678 /* If we had a shift on op0 then this is a logical-shift-
14679 by-register/immediate operation. Otherwise, this is just
14680 a logical operation. */
14681 if (speed)
14683 if (new_op0 != op0)
14685 /* Shift by immediate. */
14686 if (CONST_INT_P (XEXP (op0, 1)))
14687 *cost += extra_cost->alu.log_shift;
14688 else
14689 *cost += extra_cost->alu.log_shift_reg;
14691 else
14692 *cost += extra_cost->alu.logical;
14695 /* In both cases we want to cost both operands. */
14696 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14697 0, speed);
14698 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14699 1, speed);
14701 return true;
14704 return false;
14706 case NOT:
14707 x = XEXP (x, 0);
14708 op0 = aarch64_strip_shift (x);
14710 if (VECTOR_MODE_P (mode))
14712 /* Vector NOT. */
14713 *cost += extra_cost->vect.alu;
14714 return false;
14717 /* MVN-shifted-reg. */
14718 if (op0 != x)
14720 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14722 if (speed)
14723 *cost += extra_cost->alu.log_shift;
14725 return true;
14727 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14728 Handle the second form here taking care that 'a' in the above can
14729 be a shift. */
14730 else if (GET_CODE (op0) == XOR)
14732 rtx newop0 = XEXP (op0, 0);
14733 rtx newop1 = XEXP (op0, 1);
14734 rtx op0_stripped = aarch64_strip_shift (newop0);
14736 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14737 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14739 if (speed)
14741 if (op0_stripped != newop0)
14742 *cost += extra_cost->alu.log_shift;
14743 else
14744 *cost += extra_cost->alu.logical;
14747 return true;
14749 /* MVN. */
14750 if (speed)
14751 *cost += extra_cost->alu.logical;
14753 return false;
14755 case ZERO_EXTEND:
14757 op0 = XEXP (x, 0);
14758 /* If a value is written in SI mode, then zero extended to DI
14759 mode, the operation will in general be free as a write to
14760 a 'w' register implicitly zeroes the upper bits of an 'x'
14761 register. However, if this is
14763 (set (reg) (zero_extend (reg)))
14765 we must cost the explicit register move. */
14766 if (mode == DImode
14767 && GET_MODE (op0) == SImode)
14769 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14771 /* If OP_COST is non-zero, then the cost of the zero extend
14772 is effectively the cost of the inner operation. Otherwise
14773 we have a MOV instruction and we take the cost from the MOV
14774 itself. This is true independently of whether we are
14775 optimizing for space or time. */
14776 if (op_cost)
14777 *cost = op_cost;
14779 return true;
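/* Illustration: (zero_extend:DI (plus:SI ...)) costs no more than the
   SImode addition itself, since e.g. "add w0, w1, w2" already zeroes
   bits 63:32 of x0.  A bare (set (reg:DI) (zero_extend:DI (reg:SI)))
   still needs an explicit "mov w0, w1", whose cost is retained as
   described above.  */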
14781 else if (MEM_P (op0))
14783 /* All loads can zero extend to any size for free. */
14784 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14785 return true;
14788 op0 = aarch64_extend_bitfield_pattern_p (x);
14789 if (op0)
14791 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14792 if (speed)
14793 *cost += extra_cost->alu.bfx;
14794 return true;
14797 if (speed)
14799 if (VECTOR_MODE_P (mode))
14801 /* UMOV. */
14802 *cost += extra_cost->vect.alu;
14804 else
14806 /* We generate an AND instead of UXTB/UXTH. */
14807 *cost += extra_cost->alu.logical;
14810 return false;
14812 case SIGN_EXTEND:
14813 if (MEM_P (XEXP (x, 0)))
14815 /* LDRSH. */
14816 if (speed)
14818 rtx address = XEXP (XEXP (x, 0), 0);
14819 *cost += extra_cost->ldst.load_sign_extend;
14821 *cost +=
14822 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14823 0, speed));
14825 return true;
14828 op0 = aarch64_extend_bitfield_pattern_p (x);
14829 if (op0)
14831 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14832 if (speed)
14833 *cost += extra_cost->alu.bfx;
14834 return true;
14837 if (speed)
14839 if (VECTOR_MODE_P (mode))
14840 *cost += extra_cost->vect.alu;
14841 else
14842 *cost += extra_cost->alu.extend;
14844 return false;
14846 case ROTATE:
14847 case ROTATERT:
14848 case LSHIFTRT:
14849 case ASHIFTRT:
14850 case ASHIFT:
14851 op0 = XEXP (x, 0);
14852 op1 = XEXP (x, 1);
14854 if (CONST_INT_P (op1))
14856 if (speed)
14858 if (VECTOR_MODE_P (mode))
14860 /* Vector shift (immediate). */
14861 *cost += extra_cost->vect.alu;
14863 else
14865 /* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
14866 These are all aliases. */
14867 *cost += extra_cost->alu.shift;
14871 /* We can incorporate zero/sign extend for free. */
14872 if (GET_CODE (op0) == ZERO_EXTEND
14873 || GET_CODE (op0) == SIGN_EXTEND)
14874 op0 = XEXP (op0, 0);
14876 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14877 return true;
14879 else
14881 if (VECTOR_MODE_P (mode))
14883 if (speed)
14884 /* Vector shift (register). */
14885 *cost += extra_cost->vect.alu;
14887 else
14889 if (speed)
14890 /* LSLV, ASRV. */
14891 *cost += extra_cost->alu.shift_reg;
14893 /* The register shift amount may be in a shorter mode expressed
14894 as a lowpart SUBREG. For costing purposes just look inside. */
14895 if (SUBREG_P (op1) && subreg_lowpart_p (op1))
14896 op1 = SUBREG_REG (op1);
14897 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14898 && CONST_INT_P (XEXP (op1, 1))
14899 && known_eq (INTVAL (XEXP (op1, 1)),
14900 GET_MODE_BITSIZE (mode) - 1))
14902 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14903 /* We already demanded XEXP (op1, 0) to be REG_P, so
14904 don't recurse into it. */
14905 return true;
14908 return false; /* All arguments need to be in registers. */
14911 case SYMBOL_REF:
14913 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14914 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
14916 /* LDR. */
14917 if (speed)
14918 *cost += extra_cost->ldst.load;
14920 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14921 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14923 /* ADRP, followed by ADD. */
14924 *cost += COSTS_N_INSNS (1);
14925 if (speed)
14926 *cost += 2 * extra_cost->alu.arith;
14928 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14929 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14931 /* ADR. */
14932 if (speed)
14933 *cost += extra_cost->alu.arith;
14936 if (flag_pic)
14938 /* One extra load instruction, after accessing the GOT. */
14939 *cost += COSTS_N_INSNS (1);
14940 if (speed)
14941 *cost += extra_cost->ldst.load;
14943 return true;
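/* Illustration for the small code model above (hypothetical symbol):

       adrp  x0, sym
       add   x0, x0, :lo12:sym

   The tiny model needs only a single ADR, while the large and small-PIC
   models load the address with an LDR, as the comments above note.  */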
14945 case HIGH:
14946 case LO_SUM:
14947 /* ADRP/ADD (immediate). */
14948 if (speed)
14949 *cost += extra_cost->alu.arith;
14950 return true;
14952 case ZERO_EXTRACT:
14953 case SIGN_EXTRACT:
14954 /* UBFX/SBFX. */
14955 if (speed)
14957 if (VECTOR_MODE_P (mode))
14958 *cost += extra_cost->vect.alu;
14959 else
14960 *cost += extra_cost->alu.bfx;
14963 /* We can trust that the immediates used will be correct (there
14964 are no by-register forms), so we need only cost op0. */
14965 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
14966 return true;
14968 case MULT:
14969 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14970 /* aarch64_rtx_mult_cost always handles recursion to its
14971 operands. */
14972 return true;
14974 case MOD:
14975 /* We can expand signed mod by power of 2 using a NEGS, two parallel
14976 ANDs and a CSNEG.  Assume here that CSNEG costs the same as
14977 an unconditional negate. This case should only ever be reached through
14978 the set_smod_pow2_cheap check in expmed.cc. */
14979 if (CONST_INT_P (XEXP (x, 1))
14980 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14981 && (mode == SImode || mode == DImode))
14983 /* We expand to 4 instructions. Reset the baseline. */
14984 *cost = COSTS_N_INSNS (4);
14986 if (speed)
14987 *cost += 2 * extra_cost->alu.logical
14988 + 2 * extra_cost->alu.arith;
14990 return true;
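/* Rough sketch of the expansion described above, for a signed x % 4:

       negs   w1, w0
       and    w0, w0, 3
       and    w1, w1, 3
       csneg  w0, w0, w1, mi

   i.e. two logical operations plus two arithmetic-class operations,
   matching the extra costs added for the speed case.  */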
14993 /* Fall-through. */
14994 case UMOD:
14995 if (speed)
14997 /* Slightly prefer UMOD over SMOD. */
14998 if (VECTOR_MODE_P (mode))
14999 *cost += extra_cost->vect.alu;
15000 else if (GET_MODE_CLASS (mode) == MODE_INT)
15001 *cost += (extra_cost->mult[mode == DImode].add
15002 + extra_cost->mult[mode == DImode].idiv
15003 + (code == MOD ? 1 : 0));
15005 return false; /* All arguments need to be in registers. */
15007 case DIV:
15008 case UDIV:
15009 case SQRT:
15010 if (speed)
15012 if (VECTOR_MODE_P (mode))
15013 *cost += extra_cost->vect.alu;
15014 else if (GET_MODE_CLASS (mode) == MODE_INT)
15015 /* There is no integer SQRT, so only DIV and UDIV can get
15016 here. */
15017 *cost += (extra_cost->mult[mode == DImode].idiv
15018 /* Slightly prefer UDIV over SDIV. */
15019 + (code == DIV ? 1 : 0));
15020 else
15021 *cost += extra_cost->fp[mode == DFmode].div;
15023 return false; /* All arguments need to be in registers. */
15025 case IF_THEN_ELSE:
15026 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
15027 XEXP (x, 2), cost, speed);
15029 case EQ:
15030 case NE:
15031 case GT:
15032 case GTU:
15033 case LT:
15034 case LTU:
15035 case GE:
15036 case GEU:
15037 case LE:
15038 case LEU:
15040 return false; /* All arguments must be in registers. */
15042 case FMA:
15043 op0 = XEXP (x, 0);
15044 op1 = XEXP (x, 1);
15045 op2 = XEXP (x, 2);
15047 if (speed)
15049 if (VECTOR_MODE_P (mode))
15050 *cost += extra_cost->vect.alu;
15051 else
15052 *cost += extra_cost->fp[mode == DFmode].fma;
15055 /* FMSUB, FNMADD, and FNMSUB are free. */
15056 if (GET_CODE (op0) == NEG)
15057 op0 = XEXP (op0, 0);
15059 if (GET_CODE (op2) == NEG)
15060 op2 = XEXP (op2, 0);
15062 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
15063 and the by-element operand as operand 0. */
15064 if (GET_CODE (op1) == NEG)
15065 op1 = XEXP (op1, 0);
15067 /* Catch vector-by-element operations. The by-element operand can
15068 either be (vec_duplicate (vec_select (x))) or just
15069 (vec_select (x)), depending on whether we are multiplying by
15070 a vector or a scalar.
15072 Canonicalization is not very good in these cases, FMA4 will put the
15073 by-element operand as operand 0, FNMA4 will have it as operand 1. */
15074 if (GET_CODE (op0) == VEC_DUPLICATE)
15075 op0 = XEXP (op0, 0);
15076 else if (GET_CODE (op1) == VEC_DUPLICATE)
15077 op1 = XEXP (op1, 0);
15079 if (GET_CODE (op0) == VEC_SELECT)
15080 op0 = XEXP (op0, 0);
15081 else if (GET_CODE (op1) == VEC_SELECT)
15082 op1 = XEXP (op1, 0);
15084 /* If the remaining parameters are not registers,
15085 get the cost to put them into registers. */
15086 *cost += rtx_cost (op0, mode, FMA, 0, speed);
15087 *cost += rtx_cost (op1, mode, FMA, 1, speed);
15088 *cost += rtx_cost (op2, mode, FMA, 2, speed);
15089 return true;
15091 case FLOAT:
15092 case UNSIGNED_FLOAT:
15093 if (speed)
15094 *cost += extra_cost->fp[mode == DFmode].fromint;
15095 return false;
15097 case FLOAT_EXTEND:
15098 if (speed)
15100 if (VECTOR_MODE_P (mode))
15102 /* Vector widening conversion.  */
15103 *cost += extra_cost->vect.alu;
15105 else
15106 *cost += extra_cost->fp[mode == DFmode].widen;
15108 return false;
15110 case FLOAT_TRUNCATE:
15111 if (speed)
15113 if (VECTOR_MODE_P (mode))
15115 /* Vector narrowing conversion.  */
15116 *cost += extra_cost->vect.alu;
15118 else
15119 *cost += extra_cost->fp[mode == DFmode].narrow;
15121 return false;
15123 case FIX:
15124 case UNSIGNED_FIX:
15125 x = XEXP (x, 0);
15126 /* Strip the rounding part. They will all be implemented
15127 by the fcvt* family of instructions anyway. */
15128 if (GET_CODE (x) == UNSPEC)
15130 unsigned int uns_code = XINT (x, 1);
15132 if (uns_code == UNSPEC_FRINTA
15133 || uns_code == UNSPEC_FRINTM
15134 || uns_code == UNSPEC_FRINTN
15135 || uns_code == UNSPEC_FRINTP
15136 || uns_code == UNSPEC_FRINTZ)
15137 x = XVECEXP (x, 0, 0);
15140 if (speed)
15142 if (VECTOR_MODE_P (mode))
15143 *cost += extra_cost->vect.alu;
15144 else
15145 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
15148 /* We can combine fmul by a power of 2 followed by a fcvt into a single
15149 fixed-point fcvt. */
15150 if (GET_CODE (x) == MULT
15151 && ((VECTOR_MODE_P (mode)
15152 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
15153 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
15155 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
15156 0, speed);
15157 return true;
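/* Illustration: (int) (f * 16.0f) can become a single fixed-point
   convert such as "fcvtzs w0, s0, #4", which is why only the inner
   multiplicand is costed here.  */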
15160 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
15161 return true;
15163 case ABS:
15164 if (VECTOR_MODE_P (mode))
15166 /* ABS (vector). */
15167 if (speed)
15168 *cost += extra_cost->vect.alu;
15170 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15172 op0 = XEXP (x, 0);
15174 /* FABD, which is analogous to FADD. */
15175 if (GET_CODE (op0) == MINUS)
15177 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15178 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15179 if (speed)
15180 *cost += extra_cost->fp[mode == DFmode].addsub;
15182 return true;
15184 /* Simple FABS is analogous to FNEG. */
15185 if (speed)
15186 *cost += extra_cost->fp[mode == DFmode].neg;
15188 else
15190 /* Integer ABS will either be split into
15191 two arithmetic instructions, or will be an ABS
15192 (scalar), which we don't model. */
15193 *cost = COSTS_N_INSNS (2);
15194 if (speed)
15195 *cost += 2 * extra_cost->alu.arith;
15197 return false;
15199 case SMAX:
15200 case SMIN:
15201 if (speed)
15203 if (VECTOR_MODE_P (mode))
15204 *cost += extra_cost->vect.alu;
15205 else
15207 /* FMAXNM/FMINNM/FMAX/FMIN.
15208 TODO: This may not be accurate for all implementations, but
15209 we do not model this in the cost tables. */
15210 *cost += extra_cost->fp[mode == DFmode].addsub;
15213 return false;
15215 case UNSPEC:
15216 /* The floating point round to integer frint* instructions. */
15217 if (aarch64_frint_unspec_p (XINT (x, 1)))
15219 if (speed)
15220 *cost += extra_cost->fp[mode == DFmode].roundint;
15222 return false;
15225 if (XINT (x, 1) == UNSPEC_RBIT)
15227 if (speed)
15228 *cost += extra_cost->alu.rev;
15230 return false;
15232 break;
15234 case TRUNCATE:
15236 /* Decompose <su>muldi3_highpart. */
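/* Schematically, the shape matched below is

     (truncate:DI
       (lshiftrt:TI
         (mult:TI (ANY_EXTEND:TI (reg:DI)) (ANY_EXTEND:TI (reg:DI)))
         (const_int 64)))

   which is a single UMULH or SMULH.  */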
15237 if (/* (truncate:DI */
15238 mode == DImode
15239 /* (lshiftrt:TI */
15240 && GET_MODE (XEXP (x, 0)) == TImode
15241 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15242 /* (mult:TI */
15243 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15244 /* (ANY_EXTEND:TI (reg:DI))
15245 (ANY_EXTEND:TI (reg:DI))) */
15246 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15247 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15248 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15249 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15250 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15251 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15252 /* (const_int 64) */
15253 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15254 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15256 /* UMULH/SMULH. */
15257 if (speed)
15258 *cost += extra_cost->mult[mode == DImode].extend;
15259 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15260 mode, MULT, 0, speed);
15261 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15262 mode, MULT, 1, speed);
15263 return true;
15265 break;
15266 case CONST_VECTOR:
15268 /* Load using MOVI/MVNI. */
15269 if (aarch64_simd_valid_immediate (x, NULL))
15270 *cost = extra_cost->vect.movi;
15271 else /* Load using constant pool. */
15272 *cost = extra_cost->ldst.load;
15273 break;
15275 case VEC_CONCAT:
15276 /* Depending on the operation, this is either a DUP or an INS.
15277 For now, keep the default costing. */
15278 break;
15279 case VEC_DUPLICATE:
15280 /* Load using a DUP. */
15281 *cost = extra_cost->vect.dup;
15282 return false;
15283 case VEC_SELECT:
15285 rtx op0 = XEXP (x, 0);
15286 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15288 /* Selecting the low half (lane 0) is free; the high half costs a DUP and anything else an extract. */
15289 rtx op1 = XEXP (x, 1);
15290 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15292 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15293 *cost = extra_cost->vect.dup;
15294 else
15295 *cost = extra_cost->vect.extract;
15296 return true;
15298 default:
15299 break;
15302 if (dump_file
15303 && flag_aarch64_verbose_cost)
15304 fprintf (dump_file,
15305 "\nFailed to cost RTX. Assuming default cost.\n");
15307 return true;
15310 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15311 calculated for X. This cost is stored in *COST. Returns true
15312 if the total cost of X was calculated. */
15313 static bool
15314 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15315 int param, int *cost, bool speed)
15317 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15319 if (dump_file
15320 && flag_aarch64_verbose_cost)
15322 print_rtl_single (dump_file, x);
15323 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15324 speed ? "Hot" : "Cold",
15325 *cost, result ? "final" : "partial");
15328 return result;
15331 static int
15332 aarch64_register_move_cost (machine_mode mode,
15333 reg_class_t from_i, reg_class_t to_i)
15335 enum reg_class from = (enum reg_class) from_i;
15336 enum reg_class to = (enum reg_class) to_i;
15337 const struct cpu_regmove_cost *regmove_cost
15338 = aarch64_tune_params.regmove_cost;
15340 /* Treat any subset of POINTER_REGS as though it were GENERAL_REGS. */
15341 if (reg_class_subset_p (to, POINTER_REGS))
15342 to = GENERAL_REGS;
15344 if (reg_class_subset_p (from, POINTER_REGS))
15345 from = GENERAL_REGS;
15347 /* Make RDFFR very expensive. In particular, if we know that the FFR
15348 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15349 as a way of obtaining a PTRUE. */
15350 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15351 && hard_reg_set_subset_p (reg_class_contents[from_i],
15352 reg_class_contents[FFR_REGS]))
15353 return 80;
15355 /* Moving between a GPR and the stack register costs the same as GP2GP. */
15356 if ((from == GENERAL_REGS && to == STACK_REG)
15357 || (to == GENERAL_REGS && from == STACK_REG))
15358 return regmove_cost->GP2GP;
15360 /* To/From the stack register, we move via the gprs. */
15361 if (to == STACK_REG || from == STACK_REG)
15362 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15363 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15365 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15366 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15367 && known_eq (GET_MODE_SIZE (mode), 16))
15369 /* 128-bit operations on general registers require 2 instructions. */
15370 if (from == GENERAL_REGS && to == GENERAL_REGS)
15371 return regmove_cost->GP2GP * 2;
15372 else if (from == GENERAL_REGS)
15373 return regmove_cost->GP2FP * 2;
15374 else if (to == GENERAL_REGS)
15375 return regmove_cost->FP2GP * 2;
15377 /* When AdvSIMD instructions are disabled it is not possible to move
15378 a 128-bit value directly between Q registers. This is handled in
15379 secondary reload. A general register is used as a scratch to move
15380 the upper DI value and the lower DI value is moved directly,
15381 hence the cost is the sum of three moves. */
15382 if (!TARGET_SIMD && !TARGET_SVE)
15383 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15385 return regmove_cost->FP2FP;
15388 if (from == GENERAL_REGS && to == GENERAL_REGS)
15389 return regmove_cost->GP2GP;
15390 else if (from == GENERAL_REGS)
15391 return regmove_cost->GP2FP;
15392 else if (to == GENERAL_REGS)
15393 return regmove_cost->FP2GP;
15395 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15397 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15398 The cost must be greater than 2 units to indicate that direct
15399 moves aren't possible. */
15400 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15401 + aarch64_tune_params.memmov_cost.store_fp);
15402 return MIN (CEIL (per_vector, 2), 4);
15405 return regmove_cost->FP2FP;
15408 /* Implements TARGET_MEMORY_MOVE_COST. */
15409 static int
15410 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15412 enum reg_class rclass = (enum reg_class) rclass_i;
15413 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15414 ? reg_classes_intersect_p (rclass, PR_REGS)
15415 : reg_class_subset_p (rclass, PR_REGS))
15416 return (in
15417 ? aarch64_tune_params.memmov_cost.load_pred
15418 : aarch64_tune_params.memmov_cost.store_pred);
15420 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15421 ? reg_classes_intersect_p (rclass, FP_REGS)
15422 : reg_class_subset_p (rclass, FP_REGS))
15423 return (in
15424 ? aarch64_tune_params.memmov_cost.load_fp
15425 : aarch64_tune_params.memmov_cost.store_fp);
15427 return (in
15428 ? aarch64_tune_params.memmov_cost.load_int
15429 : aarch64_tune_params.memmov_cost.store_int);
15432 /* Implement TARGET_INSN_COST. We have the opportunity to do something
15433 much more productive here, such as using insn attributes to cost things.
15434 But we don't, not yet.
15436 The main point of this current definition is to make calling insn_cost
15437 on one instruction equivalent to calling seq_cost on a sequence that
15438 contains only that instruction. The default definition would instead
15439 only look at SET_SRCs, ignoring SET_DESTs.
15441 This ensures that, for example, storing a 128-bit zero vector is more
15442 expensive than storing a 128-bit vector register. A move of zero
15443 into a 128-bit vector register followed by multiple stores of that
15444 register is then cheaper than multiple stores of zero (which would
15445 use STP of XZR). This in turn allows STP Qs to be formed. */
15446 static int
15447 aarch64_insn_cost (rtx_insn *insn, bool speed)
15449 if (rtx set = single_set (insn))
15450 return set_rtx_cost (set, speed);
15451 return pattern_cost (PATTERN (insn), speed);
15454 /* Implement TARGET_INIT_BUILTINS. */
15455 static void
15456 aarch64_init_builtins ()
15458 aarch64_general_init_builtins ();
15459 aarch64_sve::init_builtins ();
15460 #ifdef SUBTARGET_INIT_BUILTINS
15461 SUBTARGET_INIT_BUILTINS;
15462 #endif
15465 /* Implement TARGET_FOLD_BUILTIN. */
15466 static tree
15467 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15469 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15470 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15471 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15472 switch (code & AARCH64_BUILTIN_CLASS)
15474 case AARCH64_BUILTIN_GENERAL:
15475 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15477 case AARCH64_BUILTIN_SVE:
15478 return NULL_TREE;
15480 gcc_unreachable ();
15483 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15484 static bool
15485 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15487 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15488 tree fndecl = gimple_call_fndecl (stmt);
15489 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15490 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15491 gimple *new_stmt = NULL;
15492 switch (code & AARCH64_BUILTIN_CLASS)
15494 case AARCH64_BUILTIN_GENERAL:
15495 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15496 break;
15498 case AARCH64_BUILTIN_SVE:
15499 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15500 break;
15503 if (!new_stmt)
15504 return false;
15506 gsi_replace (gsi, new_stmt, false);
15507 return true;
15510 /* Implement TARGET_EXPAND_BUILTIN. */
15511 static rtx
15512 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15514 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15515 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15516 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15517 switch (code & AARCH64_BUILTIN_CLASS)
15519 case AARCH64_BUILTIN_GENERAL:
15520 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15522 case AARCH64_BUILTIN_SVE:
15523 return aarch64_sve::expand_builtin (subcode, exp, target);
15525 gcc_unreachable ();
15528 /* Implement TARGET_BUILTIN_DECL. */
15529 static tree
15530 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15532 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15533 switch (code & AARCH64_BUILTIN_CLASS)
15535 case AARCH64_BUILTIN_GENERAL:
15536 return aarch64_general_builtin_decl (subcode, initialize_p);
15538 case AARCH64_BUILTIN_SVE:
15539 return aarch64_sve::builtin_decl (subcode, initialize_p);
15541 gcc_unreachable ();
15544 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15545 to optimize 1.0/sqrt. */
15547 static bool
15548 use_rsqrt_p (machine_mode mode)
15550 return (!flag_trapping_math
15551 && flag_unsafe_math_optimizations
15552 && ((aarch64_tune_params.approx_modes->recip_sqrt
15553 & AARCH64_APPROX_MODE (mode))
15554 || flag_mrecip_low_precision_sqrt));
15557 /* Function to decide when to use the approximate reciprocal square root
15558 builtin. */
15560 static tree
15561 aarch64_builtin_reciprocal (tree fndecl)
15563 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15565 if (!use_rsqrt_p (mode))
15566 return NULL_TREE;
15567 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15568 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15569 switch (code & AARCH64_BUILTIN_CLASS)
15571 case AARCH64_BUILTIN_GENERAL:
15572 return aarch64_general_builtin_rsqrt (subcode);
15574 case AARCH64_BUILTIN_SVE:
15575 return NULL_TREE;
15577 gcc_unreachable ();
15580 /* Emit code to perform the floating-point operation:
15582 DST = SRC1 * SRC2
15584 where all three operands are already known to be registers.
15585 If the operation is an SVE one, PTRUE is a suitable all-true
15586 predicate. */
15588 static void
15589 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15591 if (ptrue)
15592 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15593 dst, ptrue, src1, src2,
15594 gen_int_mode (SVE_RELAXED_GP, SImode)));
15595 else
15596 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15599 /* Emit instruction sequence to compute either the approximate square root
15600 or its approximate reciprocal, depending on the flag RECP, and return
15601 whether the sequence was emitted or not. */
15603 bool
15604 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15606 machine_mode mode = GET_MODE (dst);
15608 if (GET_MODE_INNER (mode) == HFmode)
15610 gcc_assert (!recp);
15611 return false;
15614 if (!recp)
15616 if (!(flag_mlow_precision_sqrt
15617 || (aarch64_tune_params.approx_modes->sqrt
15618 & AARCH64_APPROX_MODE (mode))))
15619 return false;
15621 if (!flag_finite_math_only
15622 || flag_trapping_math
15623 || !flag_unsafe_math_optimizations
15624 || optimize_function_for_size_p (cfun))
15625 return false;
15627 else
15628 /* Caller assumes we cannot fail. */
15629 gcc_assert (use_rsqrt_p (mode));
15631 rtx pg = NULL_RTX;
15632 if (aarch64_sve_mode_p (mode))
15633 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15634 machine_mode mmsk = (VECTOR_MODE_P (mode)
15635 ? related_int_vector_mode (mode).require ()
15636 : int_mode_for_mode (mode).require ());
15637 rtx xmsk = NULL_RTX;
15638 if (!recp)
15640 /* When calculating the approximate square root, compare the
15641 argument with 0.0 and create a mask. */
15642 rtx zero = CONST0_RTX (mode);
15643 if (pg)
15645 xmsk = gen_reg_rtx (GET_MODE (pg));
15646 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15647 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15648 xmsk, pg, hint, src, zero));
15650 else
15652 xmsk = gen_reg_rtx (mmsk);
15653 emit_insn (gen_rtx_SET (xmsk,
15654 gen_rtx_NEG (mmsk,
15655 gen_rtx_EQ (mmsk, src, zero))));
15659 /* Estimate the approximate reciprocal square root. */
15660 rtx xdst = gen_reg_rtx (mode);
15661 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15663 /* Iterate over the series twice for SF and thrice for DF. */
15664 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15666 /* Optionally iterate over the series once less for faster performance
15667 while sacrificing some accuracy.
15668 if ((recp && flag_mrecip_low_precision_sqrt)
15669 || (!recp && flag_mlow_precision_sqrt))
15670 iterations--;
15672 /* Iterate over the series to calculate the approximate reciprocal square
15673 root. */
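/* Each FRSQRTS step in the loop below computes (3 - a * b) / 2, so the
   loop implements the Newton-Raphson recurrence
     x_{n+1} = x_n * (3 - d * x_n * x_n) / 2
   for approximating 1 / sqrt (d).  */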
15674 rtx x1 = gen_reg_rtx (mode);
15675 while (iterations--)
15677 rtx x2 = gen_reg_rtx (mode);
15678 aarch64_emit_mult (x2, pg, xdst, xdst);
15680 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15682 if (iterations > 0)
15683 aarch64_emit_mult (xdst, pg, xdst, x1);
15686 if (!recp)
15688 if (pg)
15689 /* Multiply nonzero source values by the corresponding intermediate
15690 result elements, so that the final calculation is the approximate
15691 square root rather than its reciprocal. Select a zero result for
15692 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15693 otherwise. */
15694 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15695 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15696 else
15698 /* Qualify the approximate reciprocal square root when the
15699 argument is 0.0 by squashing the intermediary result to 0.0. */
15700 rtx xtmp = gen_reg_rtx (mmsk);
15701 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15702 gen_rtx_SUBREG (mmsk, xdst, 0)));
15703 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15705 /* Calculate the approximate square root. */
15706 aarch64_emit_mult (xdst, pg, xdst, src);
15710 /* Finalize the approximation. */
15711 aarch64_emit_mult (dst, pg, xdst, x1);
15713 return true;
15716 /* Emit the instruction sequence to compute the approximation for the division
15717 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
15719 bool
15720 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15722 machine_mode mode = GET_MODE (quo);
15724 if (GET_MODE_INNER (mode) == HFmode)
15725 return false;
15727 bool use_approx_division_p = (flag_mlow_precision_div
15728 || (aarch64_tune_params.approx_modes->division
15729 & AARCH64_APPROX_MODE (mode)));
15731 if (!flag_finite_math_only
15732 || flag_trapping_math
15733 || !flag_unsafe_math_optimizations
15734 || optimize_function_for_size_p (cfun)
15735 || !use_approx_division_p)
15736 return false;
15738 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15739 return false;
15741 rtx pg = NULL_RTX;
15742 if (aarch64_sve_mode_p (mode))
15743 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15745 /* Estimate the approximate reciprocal. */
15746 rtx xrcp = gen_reg_rtx (mode);
15747 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15749 /* Iterate over the series twice for SF and thrice for DF. */
15750 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15752 /* Optionally iterate over the series less for faster performance,
15753 while sacrificing some accuracy.  The default is 2 for DF and 1 for SF. */
15754 if (flag_mlow_precision_div)
15755 iterations = (GET_MODE_INNER (mode) == DFmode
15756 ? aarch64_double_recp_precision
15757 : aarch64_float_recp_precision);
15759 /* Iterate over the series to calculate the approximate reciprocal. */
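/* Each FRECPS step computes 2 - a * b, so the loop below implements the
   Newton-Raphson recurrence x_{n+1} = x_n * (2 - d * x_n) for
   approximating 1 / d.  */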
15760 rtx xtmp = gen_reg_rtx (mode);
15761 while (iterations--)
15763 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15765 if (iterations > 0)
15766 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15769 if (num != CONST1_RTX (mode))
15771 /* As the approximate reciprocal of DEN is already calculated, only
15772 calculate the approximate division when NUM is not 1.0. */
15773 rtx xnum = force_reg (mode, num);
15774 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15777 /* Finalize the approximation. */
15778 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15779 return true;
15782 /* Return the number of instructions that can be issued per cycle. */
15783 static int
15784 aarch64_sched_issue_rate (void)
15786 return aarch64_tune_params.issue_rate;
15789 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15790 static int
15791 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15793 if (DEBUG_INSN_P (insn))
15794 return more;
15796 rtx_code code = GET_CODE (PATTERN (insn));
15797 if (code == USE || code == CLOBBER)
15798 return more;
15800 if (get_attr_type (insn) == TYPE_NO_INSN)
15801 return more;
15803 return more - 1;
15806 static int
15807 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15809 int issue_rate = aarch64_sched_issue_rate ();
15811 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15815 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15816 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15817 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15819 static int
15820 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15821 int ready_index)
15823 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15827 /* Vectorizer cost model target hooks. */
15829 /* If a vld1 from address ADDR should be recorded in vector_load_decls,
15830 return the decl that should be recorded. Return null otherwise. */
15831 tree
15832 aarch64_vector_load_decl (tree addr)
15834 if (TREE_CODE (addr) != ADDR_EXPR)
15835 return NULL_TREE;
15836 tree base = get_base_address (TREE_OPERAND (addr, 0));
15837 if (TREE_CODE (base) != VAR_DECL)
15838 return NULL_TREE;
15839 return base;
15842 /* Return true if STMT_INFO accesses a decl that is known to be the
15843 argument to a vld1 in the same function. */
15844 static bool
15845 aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
15847 if (!cfun->machine->vector_load_decls)
15848 return false;
15849 auto dr = STMT_VINFO_DATA_REF (stmt_info);
15850 if (!dr)
15851 return false;
15852 tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
15853 return decl && cfun->machine->vector_load_decls->contains (decl);
15856 /* Information about how the CPU would issue the scalar, Advanced SIMD
15857 or SVE version of a vector loop, using the scheme defined by the
15858 aarch64_base_vec_issue_info hierarchy of structures. */
15859 class aarch64_vec_op_count
15861 public:
15862 aarch64_vec_op_count () = default;
15863 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15864 unsigned int = 1);
15866 unsigned int vec_flags () const { return m_vec_flags; }
15867 unsigned int vf_factor () const { return m_vf_factor; }
15869 const aarch64_base_vec_issue_info *base_issue_info () const;
15870 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15871 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15873 fractional_cost rename_cycles_per_iter () const;
15874 fractional_cost min_nonpred_cycles_per_iter () const;
15875 fractional_cost min_pred_cycles_per_iter () const;
15876 fractional_cost min_cycles_per_iter () const;
15878 void dump () const;
15880 /* The number of individual "general" operations. See the comments
15881 in aarch64_base_vec_issue_info for details. */
15882 unsigned int general_ops = 0;
15884 /* The number of load and store operations, under the same scheme
15885 as above. */
15886 unsigned int loads = 0;
15887 unsigned int stores = 0;
15889 /* The minimum number of cycles needed to execute all loop-carried
15890 operations, which in the vector code become associated with
15891 reductions. */
15892 unsigned int reduction_latency = 0;
15894 /* The number of individual predicate operations. See the comments
15895 in aarch64_sve_vec_issue_info for details. */
15896 unsigned int pred_ops = 0;
15898 private:
15899 /* The issue information for the core. */
15900 const aarch64_vec_issue_info *m_issue_info = nullptr;
15902 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
15903 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15904 Advanced SIMD code.
15905 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15906 SVE code. */
15907 unsigned int m_vec_flags = 0;
15909 /* Assume that, when the code is executing on the core described
15910 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15911 times more data than the vectorizer anticipates.
15913 This is only ever different from 1 for SVE. It allows us to consider
15914 what would happen on a 256-bit SVE target even when the -mtune
15915 parameters say that the “likely” SVE length is 128 bits. */
15916 unsigned int m_vf_factor = 1;
15919 aarch64_vec_op_count::
15920 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
15921 unsigned int vec_flags, unsigned int vf_factor)
15922 : m_issue_info (issue_info),
15923 m_vec_flags (vec_flags),
15924 m_vf_factor (vf_factor)
15928 /* Return the base issue information (i.e. the parts that make sense
15929 for both scalar and vector code). Return null if we have no issue
15930 information. */
15931 const aarch64_base_vec_issue_info *
15932 aarch64_vec_op_count::base_issue_info () const
15934 if (auto *ret = simd_issue_info ())
15935 return ret;
15936 return m_issue_info->scalar;
15939 /* If the structure describes vector code and we have associated issue
15940 information, return that issue information, otherwise return null. */
15941 const aarch64_simd_vec_issue_info *
15942 aarch64_vec_op_count::simd_issue_info () const
15944 if (auto *ret = sve_issue_info ())
15945 return ret;
15946 if (m_vec_flags)
15947 return m_issue_info->advsimd;
15948 return nullptr;
15951 /* If the structure describes SVE code and we have associated issue
15952 information, return that issue information, otherwise return null. */
15953 const aarch64_sve_vec_issue_info *
15954 aarch64_vec_op_count::sve_issue_info () const
15956 if (m_vec_flags & VEC_ANY_SVE)
15957 return m_issue_info->sve;
15958 return nullptr;
15961 /* Estimate the minimum number of cycles per iteration needed to rename
15962 the instructions.
15964 ??? For now this is done inline rather than via cost tables, since it
15965 isn't clear how it should be parameterized for the general case. */
15966 fractional_cost
15967 aarch64_vec_op_count::rename_cycles_per_iter () const
15969 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
15970 || sve_issue_info () == &neoversen2_sve_issue_info
15971 || sve_issue_info () == &neoversev2_sve_issue_info)
15972 /* + 1 for an addition. We've already counted a general op for each
15973 store, so we don't need to account for stores separately. The branch
15974 reads no registers and so does not need to be counted either.
15976 ??? This value is very much on the pessimistic side, but seems to work
15977 pretty well in practice. */
15978 return { general_ops + loads + pred_ops + 1, 5 };
15980 return 0;
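/* The fractional cost above is (general_ops + loads + pred_ops + 1) / 5,
   i.e. it effectively assumes that these cores can rename roughly five
   such operations per cycle.  */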
15983 /* Like min_cycles_per_iter, but excluding predicate operations. */
15984 fractional_cost
15985 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15987 auto *issue_info = base_issue_info ();
15989 fractional_cost cycles = MAX (reduction_latency, 1);
15990 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15991 cycles = std::max (cycles, { loads + stores,
15992 issue_info->loads_stores_per_cycle });
15993 cycles = std::max (cycles, { general_ops,
15994 issue_info->general_ops_per_cycle });
15995 cycles = std::max (cycles, rename_cycles_per_iter ());
15996 return cycles;
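/* Worked example (illustrative numbers only): with 6 general ops, 2 loads,
   1 store, a reduction latency of 2, and an issue profile of 2 general ops,
   1 store and 2 loads/stores per cycle, the bound is
   max (2, 1/1, 3/2, 6/2) = 3 cycles per iteration.  */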
15999 /* Like min_cycles_per_iter, but including only the predicate operations. */
16000 fractional_cost
16001 aarch64_vec_op_count::min_pred_cycles_per_iter () const
16003 if (auto *issue_info = sve_issue_info ())
16004 return { pred_ops, issue_info->pred_ops_per_cycle };
16005 return 0;
16008 /* Estimate the minimum number of cycles needed to issue the operations.
16009 This is a very simplistic model! */
16010 fractional_cost
16011 aarch64_vec_op_count::min_cycles_per_iter () const
16013 return std::max (min_nonpred_cycles_per_iter (),
16014 min_pred_cycles_per_iter ());
16017 /* Dump information about the structure. */
16018 void
16019 aarch64_vec_op_count::dump () const
16021 dump_printf_loc (MSG_NOTE, vect_location,
16022 " load operations = %d\n", loads);
16023 dump_printf_loc (MSG_NOTE, vect_location,
16024 " store operations = %d\n", stores);
16025 dump_printf_loc (MSG_NOTE, vect_location,
16026 " general operations = %d\n", general_ops);
16027 if (sve_issue_info ())
16028 dump_printf_loc (MSG_NOTE, vect_location,
16029 " predicate operations = %d\n", pred_ops);
16030 dump_printf_loc (MSG_NOTE, vect_location,
16031 " reduction latency = %d\n", reduction_latency);
16032 if (auto rcpi = rename_cycles_per_iter ())
16033 dump_printf_loc (MSG_NOTE, vect_location,
16034 " estimated cycles per iteration to rename = %f\n",
16035 rcpi.as_double ());
16036 if (auto pred_cpi = min_pred_cycles_per_iter ())
16038 dump_printf_loc (MSG_NOTE, vect_location,
16039 " estimated min cycles per iteration"
16040 " without predication = %f\n",
16041 min_nonpred_cycles_per_iter ().as_double ());
16042 dump_printf_loc (MSG_NOTE, vect_location,
16043 " estimated min cycles per iteration"
16044 " for predication = %f\n", pred_cpi.as_double ());
16046 if (auto cpi = min_cycles_per_iter ())
16047 dump_printf_loc (MSG_NOTE, vect_location,
16048 " estimated min cycles per iteration = %f\n",
16049 cpi.as_double ());
16052 /* Information about vector code that we're in the process of costing. */
16053 class aarch64_vector_costs : public vector_costs
16055 public:
16056 aarch64_vector_costs (vec_info *, bool);
16058 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
16059 stmt_vec_info stmt_info, slp_tree, tree vectype,
16060 int misalign,
16061 vect_cost_model_location where) override;
16062 void finish_cost (const vector_costs *) override;
16063 bool better_main_loop_than_p (const vector_costs *other) const override;
16065 private:
16066 void record_potential_advsimd_unrolling (loop_vec_info);
16067 void analyze_loop_vinfo (loop_vec_info);
16068 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
16069 aarch64_vec_op_count *);
16070 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
16071 fractional_cost, unsigned int,
16072 unsigned int *, bool *);
16073 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
16074 unsigned int);
16075 bool prefer_unrolled_loop () const;
16076 unsigned int determine_suggested_unroll_factor ();
16078 /* True if we have performed one-time initialization based on the
16079 vec_info. */
16080 bool m_analyzed_vinfo = false;
16082 /* This loop uses an average operation that is not supported by SVE, but is
16083 supported by Advanced SIMD and SVE2. */
16084 bool m_has_avg = false;
16086 /* True if the vector body contains a store to a decl and if the
16087 function is known to have a vld1 from the same decl.
16089 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
16090 initializing a vector is:
16092 float f[4] = { elts };
16093 float32x4_t x = vld1q_f32(f);
16095 We should strongly prefer vectorization of the initialization of f,
16096 so that the store to f and the load back can be optimized away,
16097 leaving a vectorization of { elts }. */
16098 bool m_stores_to_vector_load_decl = false;
16100 /* Non-zero if the last operation we costed is a vector promotion or demotion.
16101 In this case the value is the number of insns in the last operation.
16103 On AArch64 vector promotion and demotions require us to first widen or
16104 narrow the input and only after that emit conversion instructions. For
16105 costing this means we need to emit the cost of the final conversions as
16106 well. */
16107 unsigned int m_num_last_promote_demote = 0;
16109 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
16110 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
16111 SIMD code.
16112 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
16113 unsigned int m_vec_flags = 0;
16115 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
16116 This means that code such as:
16118 a[0] = x;
16119 a[1] = x;
16121 will be costed as two scalar instructions and two vector instructions
16122 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
16123 wins if the costs are equal, because of the fact that the vector costs
16124 include constant initializations whereas the scalar costs don't.
16125 We would therefore tend to vectorize the code above, even though
16126 the scalar version can use a single STP.
16128 We should eventually fix this and model LDP and STP in the main costs;
16129 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
16130 Until then, we look specifically for code that does nothing more than
16131 STP-like operations. We cost them on that basis in addition to the
16132 normal latency-based costs.
16134 If the scalar or vector code could be a sequence of STPs +
16135 initialization, this variable counts the cost of the sequence,
16136 with 2 units per instruction. The variable is ~0U for other
16137 kinds of code. */
16138 unsigned int m_stp_sequence_cost = 0;
16140 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
16141 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
16142 situations, we try to predict whether an Advanced SIMD implementation
16143 of the loop could be completely unrolled and become straight-line code.
16144 If so, it is generally better to use the Advanced SIMD version rather
16145 than length-agnostic SVE, since the SVE loop would execute an unknown
16146 number of times and so could not be completely unrolled in the same way.
16148 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
16149 number of Advanced SIMD loop iterations that would be unrolled and
16150 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
16151 in the unrolled loop. Both values are zero if we're not applying
16152 the heuristic. */
16153 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
16154 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
16156 /* If we're vectorizing a loop that executes a constant number of times,
16157 this variable gives the number of times that the vector loop would
16158 iterate, otherwise it is zero. */
16159 uint64_t m_num_vector_iterations = 0;
16161 /* Used only when vectorizing loops. Estimates the number and kind of
16162 operations that would be needed by one iteration of the scalar
16163 or vector loop. There is one entry for each tuning option of
16164 interest. */
16165 auto_vec<aarch64_vec_op_count, 2> m_ops;
16168 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
16169 bool costing_for_scalar)
16170 : vector_costs (vinfo, costing_for_scalar),
16171 m_vec_flags (costing_for_scalar ? 0
16172 : aarch64_classify_vector_mode (vinfo->vector_mode))
16174 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
16176 m_ops.quick_push ({ issue_info, m_vec_flags });
16177 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
16179 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
16180 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
16181 vf_factor });
16186 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
16187 vector_costs *
16188 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
16190 return new aarch64_vector_costs (vinfo, costing_for_scalar);
16193 /* Return true if the current CPU should use the new costs defined
16194 in GCC 11. This should be removed for GCC 12 and above, with the
16195 costs applying to all CPUs instead. */
16196 static bool
16197 aarch64_use_new_vector_costs_p ()
16199 return (aarch64_tune_params.extra_tuning_flags
16200 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16203 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16204 static const simd_vec_cost *
16205 aarch64_simd_vec_costs (tree vectype)
16207 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16208 if (vectype != NULL
16209 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16210 && costs->sve != NULL)
16211 return costs->sve;
16212 return costs->advsimd;
16215 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16216 static const simd_vec_cost *
16217 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16219 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16220 if ((flags & VEC_ANY_SVE) && costs->sve)
16221 return costs->sve;
16222 return costs->advsimd;
16225 /* If STMT_INFO is a memory reference, return the scalar memory type,
16226 otherwise return null. */
16227 static tree
16228 aarch64_dr_type (stmt_vec_info stmt_info)
16230 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16231 return TREE_TYPE (DR_REF (dr));
16232 return NULL_TREE;
16235 /* Decide whether to use the unrolling heuristic described above
16236 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16237 describes the loop that we're vectorizing. */
16238 void
16239 aarch64_vector_costs::
16240 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16242 /* The heuristic only makes sense on targets that have the same
16243 vector throughput for SVE and Advanced SIMD. */
16244 if (!(aarch64_tune_params.extra_tuning_flags
16245 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16246 return;
16248 /* We only want to apply the heuristic if LOOP_VINFO is being
16249 vectorized for SVE. */
16250 if (!(m_vec_flags & VEC_ANY_SVE))
16251 return;
16253 /* Check whether it is possible in principle to use Advanced SIMD
16254 instead. */
16255 if (aarch64_autovec_preference == 2)
16256 return;
16258 /* We don't want to apply the heuristic to outer loops, since it's
16259 harder to track two levels of unrolling. */
16260 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16261 return;
16263 /* Only handle cases in which the number of Advanced SIMD iterations
16264 would be known at compile time but the number of SVE iterations
16265 would not. */
16266 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16267 || aarch64_sve_vg.is_constant ())
16268 return;
16270 /* Guess how many times the Advanced SIMD loop would iterate and make
16271 sure that it is within the complete unrolling limit. Even if the
16272 number of iterations is small enough, the number of statements might
16273 not be, which is why we need to estimate the number of statements too. */
16274 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16275 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
16276 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16277 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16278 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16279 return;
16281 /* Record that we're applying the heuristic and should try to estimate
16282 the number of statements in the Advanced SIMD loop. */
16283 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
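/* An illustrative sketch of the arithmetic above, using made-up numbers
(hypothetical names, not compiled as part of this file): assume an SVE
costing VF of 8, an estimated vector quadword count of 2 (256-bit SVE),
64 known scalar iterations and a completely-peel limit of 16 standing in
for param_max_completely_peel_times. */
#if 0
#include <cassert>

static void
example_advsimd_unroll_estimate ()
{
  unsigned int vf_for_cost = 8;    /* hypothetical vect_vf_for_cost result */
  unsigned int estimated_vq = 2;   /* hypothetical aarch64_estimated_sve_vq */
  unsigned int niters = 64;        /* hypothetical LOOP_VINFO_INT_NITERS */
  unsigned int peel_limit = 16;    /* hypothetical peel-times limit */

  unsigned int advsimd_vf = (vf_for_cost + estimated_vq - 1) / estimated_vq;
  unsigned int unrolled_niters = niters / advsimd_vf;

  /* The Advanced SIMD VF would be 8 / 2 = 4, so the Advanced SIMD loop
     would run 64 / 4 = 16 times, which is within the limit, and the
     heuristic would be applied.  */
  assert (advsimd_vf == 4 && unrolled_niters == 16);
  assert (!(unrolled_niters > peel_limit));
}
#endif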
16286 /* Do one-time initialization of the aarch64_vector_costs given that we're
16287 costing the loop vectorization described by LOOP_VINFO. */
16288 void
16289 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16291 /* Record the number of times that the vector loop would execute,
16292 if known. */
16293 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16294 auto scalar_niters = max_stmt_executions_int (loop);
16295 if (scalar_niters >= 0)
16297 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16298 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16299 m_num_vector_iterations = scalar_niters / vf;
16300 else
16301 m_num_vector_iterations = CEIL (scalar_niters, vf);
16304 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16305 heuristic described above m_unrolled_advsimd_niters. */
16306 record_potential_advsimd_unrolling (loop_vinfo);
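/* An illustrative note with hypothetical numbers: for a scalar iteration
count of 100 and a VF of 8, the unmasked case above records 100 / 8 = 12
vector iterations, while the masked case records CEIL (100, 8) = 13,
since the masked loop also handles the partial final iteration. */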
16309 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16310 static int
16311 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16312 tree vectype,
16313 int misalign ATTRIBUTE_UNUSED)
16315 unsigned elements;
16316 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16317 bool fp = false;
16319 if (vectype != NULL)
16320 fp = FLOAT_TYPE_P (vectype);
16322 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16324 switch (type_of_cost)
16326 case scalar_stmt:
16327 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16329 case scalar_load:
16330 return costs->scalar_load_cost;
16332 case scalar_store:
16333 return costs->scalar_store_cost;
16335 case vector_stmt:
16336 return fp ? simd_costs->fp_stmt_cost
16337 : simd_costs->int_stmt_cost;
16339 case vector_load:
16340 return simd_costs->align_load_cost;
16342 case vector_store:
16343 return simd_costs->store_cost;
16345 case vec_to_scalar:
16346 return simd_costs->vec_to_scalar_cost;
16348 case scalar_to_vec:
16349 return simd_costs->scalar_to_vec_cost;
16351 case unaligned_load:
16352 case vector_gather_load:
16353 return simd_costs->unalign_load_cost;
16355 case unaligned_store:
16356 case vector_scatter_store:
16357 return simd_costs->unalign_store_cost;
16359 case cond_branch_taken:
16360 return costs->cond_taken_branch_cost;
16362 case cond_branch_not_taken:
16363 return costs->cond_not_taken_branch_cost;
16365 case vec_perm:
16366 return simd_costs->permute_cost;
16368 case vec_promote_demote:
16369 return fp ? simd_costs->fp_stmt_cost
16370 : simd_costs->int_stmt_cost;
16372 case vec_construct:
16373 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16374 return elements / 2 + 1;
16376 default:
16377 gcc_unreachable ();
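/* An illustrative note with a hypothetical subpart count: for the
vec_construct case above, a vector with an estimated 4 subparts is
costed as 4 / 2 + 1 = 3, i.e. roughly one unit per pair of elements
plus one extra. */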
16381 /* If an access of kind KIND for STMT_INFO represents one vector of an
16382 LD[234] or ST[234] operation, return the total number of vectors
16383 involved (2, 3 or 4), otherwise return a value outside that range. */
16384 static int
16385 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16387 if ((kind == vector_load
16388 || kind == unaligned_load
16389 || kind == vector_store
16390 || kind == unaligned_store)
16391 && STMT_VINFO_DATA_REF (stmt_info))
16393 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16394 if (stmt_info
16395 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16396 return DR_GROUP_SIZE (stmt_info);
16398 return 0;
16401 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16402 vectors would produce a series of LDP or STP operations. KIND is the
16403 kind of statement that STMT_INFO represents. */
16404 static bool
16405 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16406 stmt_vec_info stmt_info)
16408 switch (kind)
16410 case vector_load:
16411 case vector_store:
16412 case unaligned_load:
16413 case unaligned_store:
16414 break;
16416 default:
16417 return false;
16420 if (aarch64_tune_params.extra_tuning_flags
16421 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16422 return false;
16424 return is_gimple_assign (stmt_info->stmt);
16427 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16428 or multiply-subtract sequence that might be suitable for fusing into a
16429 single instruction. If VEC_FLAGS is zero, analyze the operation as
16430 a scalar one, otherwise analyze it as an operation on vectors with those
16431 VEC_* flags. */
16432 static bool
16433 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16434 unsigned int vec_flags)
16436 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16437 if (!assign)
16438 return false;
16439 tree_code code = gimple_assign_rhs_code (assign);
16440 if (code != PLUS_EXPR && code != MINUS_EXPR)
16441 return false;
16443 auto is_mul_result = [&](int i)
16445 tree rhs = gimple_op (assign, i);
16446 /* ??? Should we try to check for a single use as well? */
16447 if (TREE_CODE (rhs) != SSA_NAME)
16448 return false;
16450 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16451 if (!def_stmt_info
16452 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16453 return false;
16454 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16455 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16456 return false;
16458 if (vec_flags & VEC_ADVSIMD)
16460 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16461 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16462 only supports MLA forms, so will require a move if the result
16463 cannot be tied to the accumulator. The most important case in
16464 which this is true is when the accumulator input is invariant. */
16465 rhs = gimple_op (assign, 3 - i);
16466 if (TREE_CODE (rhs) != SSA_NAME)
16467 return false;
16468 def_stmt_info = vinfo->lookup_def (rhs);
16469 if (!def_stmt_info
16470 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def
16471 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_constant_def)
16472 return false;
16475 return true;
16478 if (code == MINUS_EXPR && (vec_flags & VEC_ADVSIMD))
16479 /* Advanced SIMD doesn't have FNMADD/FNMSUB/FNMLA/FNMLS, so the
16480 multiplication must be on the second operand (to form an FMLS).
16481 But if both operands are multiplications and the second operand
16482 is used more than once, we'll instead negate the second operand
16483 and use it as an accumulator for the first operand. */
16484 return (is_mul_result (2)
16485 && (has_single_use (gimple_assign_rhs2 (assign))
16486 || !is_mul_result (1)));
16488 return is_mul_result (1) || is_mul_result (2);
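/* An illustrative sketch (hypothetical names, not compiled as part of this
file): C-level loop bodies that produce the two-statement shapes the
function above looks for. */
#if 0
static void
example_fmla (float *acc, const float *x, const float *y, int n)
{
  for (int i = 0; i < n; ++i)
    /* MULT_EXPR feeding PLUS_EXPR: a candidate for FMLA/FMADD.  */
    acc[i] = acc[i] + x[i] * y[i];
}

static void
example_fmls (float *acc, const float *x, const float *y, int n)
{
  for (int i = 0; i < n; ++i)
    /* MULT_EXPR feeding the second operand of MINUS_EXPR: a candidate
       for FMLS, the only subtraction form that Advanced SIMD supports
       directly, as noted above.  */
    acc[i] = acc[i] - x[i] * y[i];
}
#endif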
16491 /* Return true if STMT_INFO is the second part of a two-statement boolean AND
16492 expression sequence that might be suitable for fusing into a
16493 single instruction. If VEC_FLAGS is zero, analyze the operation as
16494 a scalar one, otherwise analyze it as an operation on vectors with those
16495 VEC_* flags. */
16497 static bool
16498 aarch64_bool_compound_p (vec_info *vinfo, stmt_vec_info stmt_info,
16499 unsigned int vec_flags)
16501 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16502 if (!assign
16503 || gimple_assign_rhs_code (assign) != BIT_AND_EXPR
16504 || !STMT_VINFO_VECTYPE (stmt_info)
16505 || !VECTOR_BOOLEAN_TYPE_P (STMT_VINFO_VECTYPE (stmt_info)))
16506 return false;
16508 for (int i = 1; i < 3; ++i)
16510 tree rhs = gimple_op (assign, i);
16512 if (TREE_CODE (rhs) != SSA_NAME)
16513 continue;
16515 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16516 if (!def_stmt_info
16517 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16518 continue;
16520 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16521 if (!rhs_assign
16522 || TREE_CODE_CLASS (gimple_assign_rhs_code (rhs_assign))
16523 != tcc_comparison)
16524 continue;
16526 if (vec_flags & VEC_ADVSIMD)
16527 return false;
16529 return true;
16531 return false;
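/* An illustrative sketch (hypothetical names, not compiled as part of this
file): a C-level shape whose if-converted vector form is roughly a
BIT_AND_EXPR of two comparison results on a vector boolean type, the
case that the function above treats as a single SVE instruction. */
#if 0
static void
example_bool_compound (int *out, const int *a, const int *b,
		       const int *c, const int *d, int n)
{
  for (int i = 0; i < n; ++i)
    if (a[i] < b[i] && c[i] < d[i])
      out[i] = 0;
}
#endif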
16534 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16535 in-loop reduction that SVE supports directly, return its latency in cycles,
16536 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16537 instructions. */
16538 static unsigned int
16539 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16540 stmt_vec_info stmt_info,
16541 const sve_vec_cost *sve_costs)
16543 switch (vect_reduc_type (vinfo, stmt_info))
16545 case EXTRACT_LAST_REDUCTION:
16546 return sve_costs->clast_cost;
16548 case FOLD_LEFT_REDUCTION:
16549 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16551 case E_HFmode:
16552 case E_BFmode:
16553 return sve_costs->fadda_f16_cost;
16555 case E_SFmode:
16556 return sve_costs->fadda_f32_cost;
16558 case E_DFmode:
16559 return sve_costs->fadda_f64_cost;
16561 default:
16562 break;
16564 break;
16567 return 0;
16570 /* STMT_INFO describes a loop-carried operation in the original scalar code
16571 that we are considering implementing as a reduction. Return one of the
16572 following values, depending on VEC_FLAGS:
16574 - If VEC_FLAGS is zero, return the loop carry latency of the original
16575 scalar operation.
16577 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16578 Advanced SIMD implementation.
16580 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16581 SVE implementation. */
16582 static unsigned int
16583 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16584 unsigned int vec_flags)
16586 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16587 const sve_vec_cost *sve_costs = nullptr;
16588 if (vec_flags & VEC_ANY_SVE)
16589 sve_costs = aarch64_tune_params.vec_costs->sve;
16591 /* If the caller is asking for the SVE latency, check for forms of reduction
16592 that only SVE can handle directly. */
16593 if (sve_costs)
16595 unsigned int latency
16596 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16597 if (latency)
16598 return latency;
16601 /* Handle scalar costs. */
16602 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16603 if (vec_flags == 0)
16605 if (is_float)
16606 return vec_costs->scalar_fp_stmt_cost;
16607 return vec_costs->scalar_int_stmt_cost;
16610 /* Otherwise, the loop body just contains normal integer or FP operations,
16611 with a vector reduction outside the loop. */
16612 const simd_vec_cost *simd_costs
16613 = aarch64_simd_vec_costs_for_flags (vec_flags);
16614 if (is_float)
16615 return simd_costs->fp_stmt_cost;
16616 return simd_costs->int_stmt_cost;
16619 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16620 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16621 try to subdivide the target-independent categorization provided by KIND
16622 to get a more accurate cost. */
16623 static fractional_cost
16624 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16625 stmt_vec_info stmt_info,
16626 fractional_cost stmt_cost)
16628 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16629 the extension with the load. */
16630 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16631 return 0;
16633 return stmt_cost;
16636 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16637 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16638 when vectorized would operate on vector type VECTYPE. Try to subdivide
16639 the target-independent categorization provided by KIND to get a more
16640 accurate cost. WHERE specifies where the cost associated with KIND
16641 occurs. */
16642 static fractional_cost
16643 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16644 stmt_vec_info stmt_info, tree vectype,
16645 enum vect_cost_model_location where,
16646 fractional_cost stmt_cost)
16648 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16649 const sve_vec_cost *sve_costs = nullptr;
16650 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16651 sve_costs = aarch64_tune_params.vec_costs->sve;
16653 /* It's generally better to avoid costing inductions, since the induction
16654 will usually be hidden by other operations. This is particularly true
16655 for things like COND_REDUCTIONS. */
16656 if (is_a<gphi *> (stmt_info->stmt))
16657 return 0;
16659 /* Detect cases in which vec_to_scalar is describing the extraction of a
16660 vector element in preparation for a scalar store. The store itself is
16661 costed separately. */
16662 if (vect_is_store_elt_extraction (kind, stmt_info))
16663 return simd_costs->store_elt_extra_cost;
16665 /* Detect SVE gather loads, which are costed as a single scalar_load
16666 for each element. We therefore need to divide the full-instruction
16667 cost by the number of elements in the vector. */
16668 if (kind == scalar_load
16669 && sve_costs
16670 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16672 unsigned int nunits = vect_nunits_for_cost (vectype);
16673 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16674 return { sve_costs->gather_load_x64_cost, nunits };
16675 return { sve_costs->gather_load_x32_cost, nunits };
16678 /* Detect cases in which a scalar_store is really storing one element
16679 in a scatter operation. */
16680 if (kind == scalar_store
16681 && sve_costs
16682 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16683 return sve_costs->scatter_store_elt_cost;
16685 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16686 if (kind == vec_to_scalar
16687 && where == vect_body
16688 && sve_costs)
16690 unsigned int latency
16691 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16692 if (latency)
16693 return latency;
16696 /* Detect cases in which vec_to_scalar represents a single reduction
16697 instruction like FADDP or MAXV. */
16698 if (kind == vec_to_scalar
16699 && where == vect_epilogue
16700 && vect_is_reduction (stmt_info))
16701 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16703 case E_QImode:
16704 return simd_costs->reduc_i8_cost;
16706 case E_HImode:
16707 return simd_costs->reduc_i16_cost;
16709 case E_SImode:
16710 return simd_costs->reduc_i32_cost;
16712 case E_DImode:
16713 return simd_costs->reduc_i64_cost;
16715 case E_HFmode:
16716 case E_BFmode:
16717 return simd_costs->reduc_f16_cost;
16719 case E_SFmode:
16720 return simd_costs->reduc_f32_cost;
16722 case E_DFmode:
16723 return simd_costs->reduc_f64_cost;
16725 default:
16726 break;
16729 /* Otherwise stick with the original categorization. */
16730 return stmt_cost;
16733 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16734 for STMT_INFO, which has cost kind KIND and which when vectorized would
16735 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16736 targets. */
16737 static fractional_cost
16738 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16739 stmt_vec_info stmt_info, tree vectype,
16740 fractional_cost stmt_cost)
16742 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16743 vector register size or number of units. Integer promotions of this
16744 type therefore map to SXT[BHW] or UXT[BHW].
16746 Most loads have extending forms that can do the sign or zero extension
16747 on the fly. Optimistically assume that a load followed by an extension
16748 will fold to this form during combine, and that the extension therefore
16749 comes for free. */
16750 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16751 stmt_cost = 0;
16753 /* For similar reasons, vector_stmt integer truncations are a no-op,
16754 because we can just ignore the unused upper bits of the source. */
16755 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16756 stmt_cost = 0;
16758 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16759 but there are no equivalent instructions for SVE. This means that
16760 (all other things being equal) 128-bit SVE needs twice as many load
16761 and store instructions as Advanced SIMD in order to process vector pairs.
16763 Also, scalar code can often use LDP and STP to access pairs of values,
16764 so it is too simplistic to say that one SVE load or store replaces
16765 VF scalar loads and stores.
16767 Ideally we would account for this in the scalar and Advanced SIMD
16768 costs by making suitable load/store pairs as cheap as a single
16769 load/store. However, that would be a very invasive change and in
16770 practice it tends to stress other parts of the cost model too much.
16771 E.g. stores of scalar constants currently count just a store,
16772 whereas stores of vector constants count a store and a vec_init.
16773 This is an artificial distinction for AArch64, where stores of
16774 nonzero scalar constants need the same kind of register invariant
16775 as vector stores.
16777 An alternative would be to double the cost of any SVE loads and stores
16778 that could be paired in Advanced SIMD (and possibly also paired in
16779 scalar code). But this tends to stress other parts of the cost model
16780 in the same way. It also means that we can fall back to Advanced SIMD
16781 even if full-loop predication would have been useful.
16783 Here we go for a more conservative version: double the costs of SVE
16784 loads and stores if one iteration of the scalar loop processes enough
16785 elements for it to use a whole number of Advanced SIMD LDP or STP
16786 instructions. This makes it very likely that the VF would be 1 for
16787 Advanced SIMD, and so no epilogue should be needed. */
16788 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16790 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16791 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16792 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16793 if (multiple_p (count * elt_bits, 256)
16794 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16795 stmt_cost *= 2;
16798 return stmt_cost;
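/* An illustrative note with hypothetical group sizes: in the check above,
a grouped access of four 64-bit elements per scalar iteration covers
4 * 64 = 256 bits, i.e. exactly the two 128-bit registers of one Advanced
SIMD LDP/STP, so (assuming aarch64_advsimd_ldp_stp_p also holds) the SVE
load/store cost is doubled; a group of three 32-bit elements (96 bits)
is not a multiple of 256 bits and is left unchanged. */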
16801 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16802 and which when vectorized would operate on vector type VECTYPE. Add the
16803 cost of any embedded operations. */
16804 static fractional_cost
16805 aarch64_adjust_stmt_cost (vec_info *vinfo, vect_cost_for_stmt kind,
16806 stmt_vec_info stmt_info, tree vectype,
16807 unsigned vec_flags, fractional_cost stmt_cost)
16809 if (vectype)
16811 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16813 /* Detect cases in which a vector load or store represents an
16814 LD[234] or ST[234] instruction. */
16815 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16817 case 2:
16818 stmt_cost += simd_costs->ld2_st2_permute_cost;
16819 break;
16821 case 3:
16822 stmt_cost += simd_costs->ld3_st3_permute_cost;
16823 break;
16825 case 4:
16826 stmt_cost += simd_costs->ld4_st4_permute_cost;
16827 break;
16830 gassign *assign = dyn_cast<gassign *> (STMT_VINFO_STMT (stmt_info));
16831 if ((kind == scalar_stmt || kind == vector_stmt) && assign)
16833 /* For MLA we need to reduce the cost since MLA is 1 instruction. */
16834 if (!vect_is_reduction (stmt_info)
16835 && aarch64_multiply_add_p (vinfo, stmt_info, vec_flags))
16836 return 0;
16838 /* For vector boolean ANDs with a compare operand we just need
16839 one insn. */
16840 if (aarch64_bool_compound_p (vinfo, stmt_info, vec_flags))
16841 return 0;
16844 if (kind == vector_stmt || kind == vec_to_scalar)
16845 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16847 if (FLOAT_TYPE_P (cmp_type))
16848 stmt_cost += simd_costs->fp_stmt_cost;
16849 else
16850 stmt_cost += simd_costs->int_stmt_cost;
16854 if (kind == scalar_stmt)
16855 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16857 if (FLOAT_TYPE_P (cmp_type))
16858 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16859 else
16860 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16863 return stmt_cost;
16866 /* Return true if STMT_INFO is part of a reduction that has the form:
16868 r = r op ...;
16869 r = r op ...;
16871 with the single accumulator being read and written multiple times. */
16872 static bool
16873 aarch64_force_single_cycle (vec_info *vinfo, stmt_vec_info stmt_info)
16875 if (!STMT_VINFO_REDUC_DEF (stmt_info))
16876 return false;
16878 auto reduc_info = info_for_reduction (vinfo, stmt_info);
16879 return STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
16882 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16883 and they describe an operation in the body of a vector loop. Record issue
16884 information relating to the vector operation in OPS. */
16885 void
16886 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16887 stmt_vec_info stmt_info,
16888 aarch64_vec_op_count *ops)
16890 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16891 if (!base_issue)
16892 return;
16893 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16894 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
16896 /* Calculate the minimum cycles per iteration imposed by a reduction
16897 operation. */
16898 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16899 && vect_is_reduction (stmt_info))
16901 unsigned int base
16902 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
16903 if (aarch64_force_single_cycle (m_vinfo, stmt_info))
16904 /* ??? Ideally we'd use a tree to reduce the copies down to 1 vector,
16905 and then accumulate that, but at the moment the loop-carried
16906 dependency includes all copies. */
16907 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
16908 else
16909 ops->reduction_latency = MAX (ops->reduction_latency, base);
16912 if (stmt_info && (kind == scalar_stmt || kind == vector_stmt))
16914 /* Assume that multiply-adds will become a single operation. */
16915 if (aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
16916 return;
16918 /* Assume that bool AND with compare operands will become a single
16919 operation. */
16920 if (aarch64_bool_compound_p (m_vinfo, stmt_info, m_vec_flags))
16921 return;
16925 /* Count the basic operation cost associated with KIND. */
16926 switch (kind)
16928 case cond_branch_taken:
16929 case cond_branch_not_taken:
16930 case vector_gather_load:
16931 case vector_scatter_store:
16932 /* We currently don't expect these to be used in a loop body. */
16933 break;
16935 case vec_perm:
16936 case vec_promote_demote:
16937 case vec_construct:
16938 case vec_to_scalar:
16939 case scalar_to_vec:
16940 case vector_stmt:
16941 case scalar_stmt:
16942 ops->general_ops += count;
16943 break;
16945 case scalar_load:
16946 case vector_load:
16947 case unaligned_load:
16948 ops->loads += count;
16949 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16950 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
16951 break;
16953 case vector_store:
16954 case unaligned_store:
16955 case scalar_store:
16956 ops->stores += count;
16957 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16958 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
16959 break;
16962 /* Add any embedded comparison operations. */
16963 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16964 && vect_embedded_comparison_type (stmt_info))
16965 ops->general_ops += count;
16967 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16968 have only accounted for one. */
16969 if ((kind == vector_stmt || kind == vec_to_scalar)
16970 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16971 ops->general_ops += count;
16973 /* Count the predicate operations needed by an SVE comparison. */
16974 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
16975 if (tree type = vect_comparison_type (stmt_info))
16977 unsigned int base = (FLOAT_TYPE_P (type)
16978 ? sve_issue->fp_cmp_pred_ops
16979 : sve_issue->int_cmp_pred_ops);
16980 ops->pred_ops += base * count;
16983 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16984 if (simd_issue)
16985 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16987 case 2:
16988 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
16989 break;
16991 case 3:
16992 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
16993 break;
16995 case 4:
16996 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
16997 break;
17000 /* Add any overhead associated with gather loads and scatter stores. */
17001 if (sve_issue
17002 && (kind == scalar_load || kind == scalar_store)
17003 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
17005 unsigned int pairs = CEIL (count, 2);
17006 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
17007 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
17011 /* Return true if STMT_INFO contains a memory access and if the constant
17012 component of the memory address is aligned to SIZE bytes. */
17013 static bool
17014 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
17015 poly_uint64 size)
17017 if (!STMT_VINFO_DATA_REF (stmt_info))
17018 return false;
17020 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
17021 stmt_info = first_stmt;
17022 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
17023 /* Needed for gathers & scatters, for example. */
17024 if (!constant_offset)
17025 return false;
17027 return multiple_p (wi::to_poly_offset (constant_offset), size);
17030 /* Check if a scalar or vector stmt could be part of a region of code
17031 that does nothing more than store values to memory, in the scalar
17032 case using STP. Return the cost of the stmt if so, counting 2 for
17033 one instruction. Return ~0U otherwise.
17035 The arguments are a subset of those passed to add_stmt_cost. */
17036 unsigned int
17037 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
17038 stmt_vec_info stmt_info, tree vectype)
17040 /* Code that stores vector constants uses a vector_load to create
17041 the constant. We don't apply the heuristic to that case for two
17042 main reasons:
17044 - At the moment, STPs are only formed via peephole2, and the
17045 constant scalar moves would often come between STRs and so
17046 prevent STP formation.
17048 - The scalar code also has to load the constant somehow, and that
17049 isn't costed. */
17050 switch (kind)
17052 case scalar_to_vec:
17053 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
17054 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
17056 case vec_construct:
17057 if (FLOAT_TYPE_P (vectype))
17058 /* Count 1 insn for the maximum number of FP->SIMD INS
17059 instructions. */
17060 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
17062 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
17063 maximum number of GPR->SIMD INS instructions. */
17064 return vect_nunits_for_cost (vectype) * 4 * count;
17066 case vector_store:
17067 case unaligned_store:
17068 /* Count 1 insn per vector if we can't form STP Q pairs. */
17069 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
17070 return count * 2;
17071 if (aarch64_tune_params.extra_tuning_flags
17072 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
17073 return count * 2;
17075 if (stmt_info)
17077 /* Assume we won't be able to use STP if the constant offset
17078 component of the address is misaligned. ??? This could be
17079 removed if we formed STP pairs earlier, rather than relying
17080 on peephole2. */
17081 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
17082 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17083 return count * 2;
17085 return CEIL (count, 2) * 2;
17087 case scalar_store:
17088 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
17090 /* Check for a mode in which STP pairs can be formed. */
17091 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
17092 if (maybe_ne (size, 4) && maybe_ne (size, 8))
17093 return ~0U;
17095 /* Assume we won't be able to use STP if the constant offset
17096 component of the address is misaligned. ??? This could be
17097 removed if we formed STP pairs earlier, rather than relying
17098 on peephole2. */
17099 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
17100 return ~0U;
17102 return count;
17104 default:
17105 return ~0U;
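/* An illustrative worked example using the unit values above (hypothetical
element counts): a 4-element vector built from FP scalars is costed as
(4 - 1) * 2 = 6 units, the same vector built from GPRs as 4 * 4 = 16
units, and a pair of 128-bit vector stores that can form an STP Q pair
as CEIL (2, 2) * 2 = 2 units, i.e. one instruction. */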
17109 unsigned
17110 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
17111 stmt_vec_info stmt_info, slp_tree,
17112 tree vectype, int misalign,
17113 vect_cost_model_location where)
17115 fractional_cost stmt_cost
17116 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
17118 bool in_inner_loop_p = (where == vect_body
17119 && stmt_info
17120 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
17122 /* Do one-time initialization based on the vinfo. */
17123 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17124 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
17126 if (loop_vinfo)
17127 analyze_loop_vinfo (loop_vinfo);
17129 m_analyzed_vinfo = true;
17132 /* Apply the heuristic described above m_stp_sequence_cost. */
17133 if (m_stp_sequence_cost != ~0U)
17135 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
17136 stmt_info, vectype);
17137 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
17140 /* Try to get a more accurate cost by looking at STMT_INFO instead
17141 of just looking at KIND. */
17142 if (stmt_info && aarch64_use_new_vector_costs_p ())
17144 /* If we scalarize a strided store, the vectorizer costs one
17145 vec_to_scalar for each element. However, we can store the first
17146 element using an FP store without a separate extract step. */
17147 if (vect_is_store_elt_extraction (kind, stmt_info))
17148 count -= 1;
17150 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
17151 stmt_info, stmt_cost);
17153 if (vectype && m_vec_flags)
17154 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
17155 stmt_info, vectype,
17156 where, stmt_cost);
17159 /* Do any SVE-specific adjustments to the cost. */
17160 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
17161 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17162 vectype, stmt_cost);
17164 /* Vector promotion and demotion requires us to widen the operation first
17165 and only after that perform the conversion. Unfortunately the mid-end
17166 expects this to be doable as a single operation and doesn't pass on
17167 enough context here for us to tell which operation is happening. To
17168 account for this we count every promote-demote operation twice and if
17169 the previously costed operation was also a promote-demote we reduce
17170 the cost of the currently being costed operation to simulate the final
17171 conversion cost. Note that for SVE we can do better here if the converted
17172 value comes from a load since the widening load would consume the widening
17173 operations. However since we're in stage 3 we can't change the helper
17174 vect_is_extending_load, and duplicating the code does not seem worthwhile. */
17175 gassign *assign = NULL;
17176 if (kind == vec_promote_demote
17177 && (assign = dyn_cast <gassign *> (STMT_VINFO_STMT (stmt_info)))
17178 && gimple_assign_rhs_code (assign) == FLOAT_EXPR)
17180 auto new_count = count * 2 - m_num_last_promote_demote;
17181 m_num_last_promote_demote = count;
17182 count = new_count;
17184 else
17185 m_num_last_promote_demote = 0;
17187 if (stmt_info && aarch64_use_new_vector_costs_p ())
17189 /* Account for any extra "embedded" costs that apply additively
17190 to the base cost calculated above. */
17191 stmt_cost = aarch64_adjust_stmt_cost (m_vinfo, kind, stmt_info,
17192 vectype, m_vec_flags, stmt_cost);
17194 /* If we're recording a nonzero vector loop body cost for the
17195 innermost loop, also estimate the operations that would need
17196 to be issued by all relevant implementations of the loop. */
17197 if (loop_vinfo
17198 && (m_costing_for_scalar || where == vect_body)
17199 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
17200 && stmt_cost != 0)
17201 for (auto &ops : m_ops)
17202 count_ops (count, kind, stmt_info, &ops);
17204 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
17205 estimate the number of statements in the unrolled Advanced SIMD
17206 loop. For simplicity, we assume that one iteration of the
17207 Advanced SIMD loop would need the same number of statements
17208 as one iteration of the SVE loop. */
17209 if (where == vect_body && m_unrolled_advsimd_niters)
17210 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
17212 /* Detect the use of an averaging operation. */
17213 gimple *stmt = stmt_info->stmt;
17214 if (is_gimple_call (stmt)
17215 && gimple_call_internal_p (stmt))
17217 switch (gimple_call_internal_fn (stmt))
17219 case IFN_AVG_FLOOR:
17220 case IFN_AVG_CEIL:
17221 m_has_avg = true;
17222 default:
17223 break;
17228 /* If the statement stores to a decl that is known to be the argument
17229 to a vld1 in the same function, ignore the store for costing purposes.
17230 See the comment above m_stores_to_vector_load_decl for more details. */
17231 if (stmt_info
17232 && (kind == vector_store || kind == unaligned_store)
17233 && aarch64_accesses_vector_load_decl_p (stmt_info))
17235 stmt_cost = 0;
17236 m_stores_to_vector_load_decl = true;
17239 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
17242 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
17243 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
17244 says that we should prefer the Advanced SIMD loop. */
17245 bool
17246 aarch64_vector_costs::prefer_unrolled_loop () const
17248 if (!m_unrolled_advsimd_stmts)
17249 return false;
17251 if (dump_enabled_p ())
17252 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
17253 " unrolled Advanced SIMD loop = "
17254 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
17255 m_unrolled_advsimd_stmts);
17257 /* The balance here is tricky. On the one hand, we can't be sure whether
17258 the code is vectorizable with Advanced SIMD or not. However, even if
17259 it isn't vectorizable with Advanced SIMD, there's a possibility that
17260 the scalar code could also be unrolled. Some of the code might then
17261 benefit from SLP, or from using LDP and STP. We therefore apply
17262 the heuristic regardless of can_use_advsimd_p. */
17263 return (m_unrolled_advsimd_stmts
17264 && (m_unrolled_advsimd_stmts
17265 <= (unsigned int) param_max_completely_peeled_insns));
17268 /* Subroutine of adjust_body_cost for handling SVE. Use the issue
17269 information in OPS to work out how fast the SVE code can be issued
17270 and compare it to the equivalent value for scalar code
17271 (SCALAR_CYCLES_PER_ITER).
17274 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
17275 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
17276 is true if we think the loop body is too expensive. */
17278 fractional_cost
17279 aarch64_vector_costs::
17280 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
17281 fractional_cost scalar_cycles_per_iter,
17282 unsigned int orig_body_cost, unsigned int *body_cost,
17283 bool *should_disparage)
17285 if (dump_enabled_p ())
17286 ops->dump ();
17288 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
17289 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
17291 /* If the scalar version of the loop could issue at least as
17292 quickly as the predicate parts of the SVE loop, make the SVE loop
17293 prohibitively expensive. In this case vectorization is adding an
17294 overhead that the original scalar code didn't have.
17296 This is mostly intended to detect cases in which WHILELOs dominate
17297 for very tight loops, which is something that normal latency-based
17298 costs would not model. Adding this kind of cliffedge would be
17299 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17300 code in the caller handles that case in a more conservative way. */
17301 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
17302 if (scalar_cycles_per_iter < sve_estimate)
17304 unsigned int min_cost
17305 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17306 if (*body_cost < min_cost)
17308 if (dump_enabled_p ())
17309 dump_printf_loc (MSG_NOTE, vect_location,
17310 "Increasing body cost to %d because the"
17311 " scalar code could issue within the limit"
17312 " imposed by predicate operations\n",
17313 min_cost);
17314 *body_cost = min_cost;
17315 *should_disparage = true;
17319 return sve_cycles_per_iter;
17322 unsigned int
17323 aarch64_vector_costs::determine_suggested_unroll_factor ()
17325 bool sve = m_vec_flags & VEC_ANY_SVE;
17326 /* If we are trying to unroll an Advanced SIMD main loop that contains
17327 an averaging operation that we do not support with SVE and we might use a
17328 predicated epilogue, we need to be conservative and block unrolling as
17329 this might lead to a less optimal loop for the first and only epilogue
17330 using the original loop's vectorization factor.
17331 TODO: Remove this constraint when we add support for multiple epilogue
17332 vectorization. */
17333 if (!sve && !TARGET_SVE2 && m_has_avg)
17334 return 1;
17336 unsigned int max_unroll_factor = 1;
17337 for (auto vec_ops : m_ops)
17339 aarch64_simd_vec_issue_info const *vec_issue
17340 = vec_ops.simd_issue_info ();
17341 if (!vec_issue)
17342 return 1;
17343 /* Limit unroll factor to a value adjustable by the user, the default
17344 value is 4. */
17345 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17346 unsigned int factor
17347 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17348 unsigned int temp;
17350 /* Sanity check; this should never happen. */
17351 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17352 return 1;
17354 /* Check stores. */
17355 if (vec_ops.stores > 0)
17357 temp = CEIL (factor * vec_issue->stores_per_cycle,
17358 vec_ops.stores);
17359 unroll_factor = MIN (unroll_factor, temp);
17362 /* Check loads + stores. */
17363 if (vec_ops.loads > 0)
17365 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17366 vec_ops.loads + vec_ops.stores);
17367 unroll_factor = MIN (unroll_factor, temp);
17370 /* Check general ops. */
17371 if (vec_ops.general_ops > 0)
17373 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17374 vec_ops.general_ops);
17375 unroll_factor = MIN (unroll_factor, temp);
17377 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17380 /* Make sure unroll factor is power of 2. */
17381 return 1 << ceil_log2 (max_unroll_factor);
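/* An illustrative sketch of the unroll-factor arithmetic above with made-up
issue parameters (hypothetical names, not compiled as part of this file):
assume a reduction latency of 4, one store, one load and eight general ops
per iteration, issue rates of 2 stores, 3 loads/stores and 4 general ops
per cycle, and an unroll limit of 4. */
#if 0
#include <algorithm>
#include <cassert>

static unsigned int
example_unroll_factor ()
{
  unsigned int factor = 4;          /* hypothetical reduction latency */
  unsigned int stores = 1, loads = 1, general_ops = 8;
  unsigned int stores_per_cycle = 2, loads_stores_per_cycle = 3;
  unsigned int general_ops_per_cycle = 4;
  unsigned int unroll_factor = 4;   /* hypothetical aarch64_vect_unroll_limit */

  auto ceil_div = [](unsigned int a, unsigned int b) { return (a + b - 1) / b; };
  unroll_factor = std::min (unroll_factor,
			    ceil_div (factor * stores_per_cycle, stores));
  unroll_factor = std::min (unroll_factor,
			    ceil_div (factor * loads_stores_per_cycle,
				      loads + stores));
  unroll_factor = std::min (unroll_factor,
			    ceil_div (factor * general_ops_per_cycle,
				      general_ops));

  /* Stores would allow 8 and loads + stores 6, but the general ops allow
     only CEIL (16, 8) = 2, so the suggested factor is 2 (already a power
     of two, so the final rounding changes nothing).  */
  assert (unroll_factor == 2);
  return unroll_factor;
}
#endif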
17384 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17385 and return the new cost. */
17386 unsigned int
17387 aarch64_vector_costs::
17388 adjust_body_cost (loop_vec_info loop_vinfo,
17389 const aarch64_vector_costs *scalar_costs,
17390 unsigned int body_cost)
17392 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17393 return body_cost;
17395 const auto &scalar_ops = scalar_costs->m_ops[0];
17396 const auto &vector_ops = m_ops[0];
17397 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17398 unsigned int orig_body_cost = body_cost;
17399 bool should_disparage = false;
17401 if (dump_enabled_p ())
17402 dump_printf_loc (MSG_NOTE, vect_location,
17403 "Original vector body cost = %d\n", body_cost);
17405 fractional_cost scalar_cycles_per_iter
17406 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17408 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17410 if (dump_enabled_p ())
17412 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17413 dump_printf_loc (MSG_NOTE, vect_location,
17414 "Vector loop iterates at most %wd times\n",
17415 m_num_vector_iterations);
17416 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17417 scalar_ops.dump ();
17418 dump_printf_loc (MSG_NOTE, vect_location,
17419 " estimated cycles per vector iteration"
17420 " (for VF %d) = %f\n",
17421 estimated_vf, scalar_cycles_per_iter.as_double ());
17424 if (vector_ops.sve_issue_info ())
17426 if (dump_enabled_p ())
17427 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17428 vector_cycles_per_iter
17429 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17430 orig_body_cost, &body_cost, &should_disparage);
17432 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17434 /* Also take Neoverse V1 tuning into account, doubling the
17435 scalar and Advanced SIMD estimates to account for the
17436 doubling in SVE vector length. */
17437 if (dump_enabled_p ())
17438 dump_printf_loc (MSG_NOTE, vect_location,
17439 "Neoverse V1 estimate:\n");
17440 auto vf_factor = m_ops[1].vf_factor ();
17441 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17442 orig_body_cost, &body_cost, &should_disparage);
17445 else
17447 if (dump_enabled_p ())
17449 dump_printf_loc (MSG_NOTE, vect_location,
17450 "Vector issue estimate:\n");
17451 vector_ops.dump ();
17455 /* Decide whether to stick to latency-based costs or whether to try to
17456 take issue rates into account. */
17457 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17458 if (m_vec_flags & VEC_ANY_SVE)
17459 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17461 if (m_num_vector_iterations >= 1
17462 && m_num_vector_iterations < threshold)
17464 if (dump_enabled_p ())
17465 dump_printf_loc (MSG_NOTE, vect_location,
17466 "Low iteration count, so using pure latency"
17467 " costs\n");
17469 /* Increase the cost of the vector code if it looks like the scalar code
17470 could issue more quickly. These values are only rough estimates,
17471 so minor differences should only result in minor changes. */
17472 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17474 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17475 scalar_cycles_per_iter);
17476 if (dump_enabled_p ())
17477 dump_printf_loc (MSG_NOTE, vect_location,
17478 "Increasing body cost to %d because scalar code"
17479 " would issue more quickly\n", body_cost);
17481 /* In general, it's expected that the proposed vector code would be able
17482 to issue more quickly than the original scalar code. This should
17483 already be reflected to some extent in the latency-based costs.
17485 However, the latency-based costs effectively assume that the scalar
17486 code and the vector code execute serially, which tends to underplay
17487 one important case: if the real (non-serialized) execution time of
17488 a scalar iteration is dominated by loop-carried dependencies,
17489 and if the vector code is able to reduce both the length of
17490 the loop-carried dependencies *and* the number of cycles needed
17491 to issue the code in general, we can be more confident that the
17492 vector code is an improvement, even if adding the other (non-loop-carried)
17493 latencies tends to hide this saving. We therefore reduce the cost of the
17494 vector loop body in proportion to the saving. */
17495 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17496 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17497 && scalar_cycles_per_iter > vector_cycles_per_iter
17498 && !should_disparage)
17500 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17501 scalar_cycles_per_iter);
17502 if (dump_enabled_p ())
17503 dump_printf_loc (MSG_NOTE, vect_location,
17504 "Decreasing body cost to %d account for smaller"
17505 " reduction latency\n", body_cost);
17508 return body_cost;
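/* An illustrative worked example with hypothetical numbers: if the scalar
code is estimated at 2 cycles per scalar iteration and the costing VF is
4, scalar_cycles_per_iter is 8; if the vector loop needs 12 cycles per
iteration, the vector body cost above is scaled by 12 / 8, e.g. from 100
to 150, because the scalar code would issue more quickly. */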
17511 void
17512 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17514 /* Record the issue information for any SVE WHILE instructions that the
17515 loop needs. */
17516 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17517 if (!m_ops.is_empty ()
17518 && loop_vinfo
17519 && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
17521 unsigned int num_masks = 0;
17522 rgroup_controls *rgm;
17523 unsigned int num_vectors_m1;
17524 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
17525 num_vectors_m1, rgm)
17526 if (rgm->type)
17527 num_masks += num_vectors_m1 + 1;
17528 for (auto &ops : m_ops)
17529 if (auto *issue = ops.sve_issue_info ())
17530 ops.pred_ops += num_masks * issue->while_pred_ops;
17533 auto *scalar_costs
17534 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17535 if (loop_vinfo
17536 && m_vec_flags
17537 && aarch64_use_new_vector_costs_p ())
17539 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17540 m_costs[vect_body]);
17541 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17544 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17545 the scalar code in the event of a tie, since there is more chance
17546 of scalar code being optimized with surrounding operations.
17548 In addition, if the vector body is a simple store to a decl that
17549 is elsewhere loaded using vld1, strongly prefer the vector form,
17550 to the extent of giving the prologue a zero cost. See the comment
17551 above m_stores_to_vector_load_decl for details. */
17552 if (!loop_vinfo
17553 && scalar_costs
17554 && m_stp_sequence_cost != ~0U)
17556 if (m_stores_to_vector_load_decl)
17557 m_costs[vect_prologue] = 0;
17558 else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17559 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17562 vector_costs::finish_cost (scalar_costs);
17565 bool
17566 aarch64_vector_costs::
17567 better_main_loop_than_p (const vector_costs *uncast_other) const
17569 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17571 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17572 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17574 if (dump_enabled_p ())
17575 dump_printf_loc (MSG_NOTE, vect_location,
17576 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17577 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17578 vect_vf_for_cost (this_loop_vinfo),
17579 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17580 vect_vf_for_cost (other_loop_vinfo));
17582 /* Apply the unrolling heuristic described above
17583 m_unrolled_advsimd_niters. */
17584 if (bool (m_unrolled_advsimd_stmts)
17585 != bool (other->m_unrolled_advsimd_stmts))
17587 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17588 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17589 if (this_prefer_unrolled != other_prefer_unrolled)
17591 if (dump_enabled_p ())
17592 dump_printf_loc (MSG_NOTE, vect_location,
17593 "Preferring Advanced SIMD loop because"
17594 " it can be unrolled\n");
17595 return other_prefer_unrolled;
17599 for (unsigned int i = 0; i < m_ops.length (); ++i)
17601 if (dump_enabled_p ())
17603 if (i)
17604 dump_printf_loc (MSG_NOTE, vect_location,
17605 "Reconsidering with subtuning %d\n", i);
17606 dump_printf_loc (MSG_NOTE, vect_location,
17607 "Issue info for %s loop:\n",
17608 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17609 this->m_ops[i].dump ();
17610 dump_printf_loc (MSG_NOTE, vect_location,
17611 "Issue info for %s loop:\n",
17612 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17613 other->m_ops[i].dump ();
17616 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17617 * this->m_ops[i].vf_factor ());
17618 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17619 * other->m_ops[i].vf_factor ());
17621 /* If it appears that one loop could process the same amount of data
17622 in fewer cycles, prefer that loop over the other one. */
17623 fractional_cost this_cost
17624 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17625 fractional_cost other_cost
17626 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17627 if (dump_enabled_p ())
17629 dump_printf_loc (MSG_NOTE, vect_location,
17630 "Weighted cycles per iteration of %s loop ~= %f\n",
17631 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17632 this_cost.as_double ());
17633 dump_printf_loc (MSG_NOTE, vect_location,
17634 "Weighted cycles per iteration of %s loop ~= %f\n",
17635 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17636 other_cost.as_double ());
17638 if (this_cost != other_cost)
17640 if (dump_enabled_p ())
17641 dump_printf_loc (MSG_NOTE, vect_location,
17642 "Preferring loop with lower cycles"
17643 " per iteration\n");
17644 return this_cost < other_cost;
17647 /* If the issue rate of SVE code is limited by predicate operations
17648 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17649 and if Advanced SIMD code could issue within the limit imposed
17650 by the predicate operations, the predicate operations are adding an
17651 overhead that the original code didn't have and so we should prefer
17652 the Advanced SIMD version. */
17653 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17654 const aarch64_vec_op_count &b) -> bool
17656 if (a.pred_ops == 0
17657 && (b.min_pred_cycles_per_iter ()
17658 > b.min_nonpred_cycles_per_iter ()))
17660 if (dump_enabled_p ())
17661 dump_printf_loc (MSG_NOTE, vect_location,
17662 "Preferring Advanced SIMD loop since"
17663 " SVE loop is predicate-limited\n");
17664 return true;
17666 return false;
17668 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17669 return true;
17670 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17671 return false;
17674 return vector_costs::better_main_loop_than_p (other);
17677 static void initialize_aarch64_code_model (struct gcc_options *);
17679 /* Parse the TO_PARSE string and put the architecture struct that it
17680 selects into RES and the architectural features into ISA_FLAGS.
17681 Return an aarch_parse_opt_result describing the parse result.
17682 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17683 When the TO_PARSE string contains an invalid extension,
17684 a copy of the string is created and stored to INVALID_EXTENSION. */
17686 static enum aarch_parse_opt_result
17687 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17688 aarch64_feature_flags *isa_flags,
17689 std::string *invalid_extension)
17691 const char *ext;
17692 const struct processor *arch;
17693 size_t len;
17695 ext = strchr (to_parse, '+');
17697 if (ext != NULL)
17698 len = ext - to_parse;
17699 else
17700 len = strlen (to_parse);
17702 if (len == 0)
17703 return AARCH_PARSE_MISSING_ARG;
17706 /* Loop through the list of supported ARCHes to find a match. */
17707 for (arch = all_architectures; arch->name != NULL; arch++)
17709 if (strlen (arch->name) == len
17710 && strncmp (arch->name, to_parse, len) == 0)
17712 auto isa_temp = arch->flags;
17714 if (ext != NULL)
17716 /* TO_PARSE string contains at least one extension. */
17717 enum aarch_parse_opt_result ext_res
17718 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17720 if (ext_res != AARCH_PARSE_OK)
17721 return ext_res;
17723 /* Extension parsing was successful. Confirm the result
17724 arch and ISA flags. */
17725 *res = arch;
17726 *isa_flags = isa_temp;
17727 return AARCH_PARSE_OK;
17731 /* ARCH name not found in list. */
17732 return AARCH_PARSE_INVALID_ARG;
17735 /* Parse the TO_PARSE string and put the processor that it selects into RES
17736 and the architectural features into ISA_FLAGS. Return an aarch_parse_opt_result
17737 describing the parse result. If there is an error parsing, RES and
17738 ISA_FLAGS are left unchanged.
17739 When the TO_PARSE string contains an invalid extension,
17740 a copy of the string is created and stored to INVALID_EXTENSION. */
17742 static enum aarch_parse_opt_result
17743 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17744 aarch64_feature_flags *isa_flags,
17745 std::string *invalid_extension)
17747 const char *ext;
17748 const struct processor *cpu;
17749 size_t len;
17751 ext = strchr (to_parse, '+');
17753 if (ext != NULL)
17754 len = ext - to_parse;
17755 else
17756 len = strlen (to_parse);
17758 if (len == 0)
17759 return AARCH_PARSE_MISSING_ARG;
17762 /* Loop through the list of supported CPUs to find a match. */
17763 for (cpu = all_cores; cpu->name != NULL; cpu++)
17765 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17767 auto isa_temp = cpu->flags;
17769 if (ext != NULL)
17771 /* TO_PARSE string contains at least one extension. */
17772 enum aarch_parse_opt_result ext_res
17773 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17775 if (ext_res != AARCH_PARSE_OK)
17776 return ext_res;
17778 /* Extension parsing was successful. Confirm the result
17779 cpu and ISA flags. */
17780 *res = cpu;
17781 *isa_flags = isa_temp;
17782 return AARCH_PARSE_OK;
17786 /* CPU name not found in list. */
17787 return AARCH_PARSE_INVALID_ARG;
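/* An illustrative example (hypothetical option string): for a string such
as "cortex-a76+crypto+nofp16", the code above takes the text up to the
first '+' ("cortex-a76") as the CPU name and hands the remainder
("+crypto+nofp16") to aarch64_parse_extension; whether those particular
names are accepted depends on the CPU and extension tables. */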
17790 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17791 Return an aarch_parse_opt_result describing the parse result.
17792 If the parsing fails, RES is left unchanged. */
17794 static enum aarch_parse_opt_result
17795 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17797 const struct processor *cpu;
17799 /* Loop through the list of supported CPUs to find a match. */
17800 for (cpu = all_cores; cpu->name != NULL; cpu++)
17802 if (strcmp (cpu->name, to_parse) == 0)
17804 *res = cpu;
17805 return AARCH_PARSE_OK;
17809 /* CPU name not found in list. */
17810 return AARCH_PARSE_INVALID_ARG;
17813 /* Parse TOKEN, which has length LENGTH to see if it is an option
17814 described in FLAG. If it is, return the index bit for that fusion type.
17815 If not, error (printing OPTION_NAME) and return zero. */
17817 static unsigned int
17818 aarch64_parse_one_option_token (const char *token,
17819 size_t length,
17820 const struct aarch64_flag_desc *flag,
17821 const char *option_name)
17823 for (; flag->name != NULL; flag++)
17825 if (length == strlen (flag->name)
17826 && !strncmp (flag->name, token, length))
17827 return flag->flag;
17830 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17831 return 0;
17834 /* Parse OPTION which is a comma-separated list of flags to enable.
17835 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17836 default state we inherit from the CPU tuning structures. OPTION_NAME
17837 gives the top-level option we are parsing in the -moverride string,
17838 for use in error messages. */
17840 static unsigned int
17841 aarch64_parse_boolean_options (const char *option,
17842 const struct aarch64_flag_desc *flags,
17843 unsigned int initial_state,
17844 const char *option_name)
17846 const char separator = '.';
17847 const char* specs = option;
17848 const char* ntoken = option;
17849 unsigned int found_flags = initial_state;
17851 while ((ntoken = strchr (specs, separator)))
17853 size_t token_length = ntoken - specs;
17854 unsigned token_ops = aarch64_parse_one_option_token (specs,
17855 token_length,
17856 flags,
17857 option_name);
17858 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17859 in the token stream, reset the supported operations. So:
17861 adrp+add.cmp+branch.none.adrp+add
17863 would have the result of turning on only adrp+add fusion. */
17864 if (!token_ops)
17865 found_flags = 0;
17867 found_flags |= token_ops;
17868 specs = ++ntoken;
17871 /* We ended with a trailing separator, so the string is ill-formed. */
17872 if (!(*specs))
17874 error ("%qs string ill-formed", option_name);
17875 return 0;
17878 /* We still have one more token to parse. */
17879 size_t token_length = strlen (specs);
17880 unsigned token_ops = aarch64_parse_one_option_token (specs,
17881 token_length,
17882 flags,
17883 option_name);
17884 if (!token_ops)
17885 found_flags = 0;
17887 found_flags |= token_ops;
17888 return found_flags;
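/* Illustrative sketch, not part of the sources: a stand-alone version of
   the '.'-separated parsing above, with a made-up two-entry flag table
   standing in for aarch64_fusible_pairs/aarch64_tuning_flags.  It shows why
   "adrp+add.cmp+branch.none.adrp+add" ends up enabling only adrp+add.  */
#if 0
struct sketch_flag_desc { const char *name; unsigned int flag; };
static const struct sketch_flag_desc sketch_flags[] =
  { { "adrp+add", 1u << 0 }, { "cmp+branch", 1u << 1 }, { NULL, 0 } };

static unsigned int
sketch_lookup_token (const char *token, size_t length)
{
  for (const sketch_flag_desc *f = sketch_flags; f->name; f++)
    if (length == strlen (f->name) && strncmp (f->name, token, length) == 0)
      return f->flag;
  return 0;	/* Unknown token or "none": the caller resets the set.  */
}

static unsigned int
sketch_parse_boolean_options (const char *option, unsigned int initial_state)
{
  unsigned int found = initial_state;
  const char *specs = option;
  const char *dot;
  while ((dot = strchr (specs, '.')))
    {
      unsigned int ops = sketch_lookup_token (specs, (size_t) (dot - specs));
      if (!ops)
	found = 0;
      found |= ops;
      specs = dot + 1;
    }
  unsigned int ops = sketch_lookup_token (specs, strlen (specs));
  if (!ops)
    found = 0;
  return found | ops;
}
/* sketch_parse_boolean_options ("adrp+add.cmp+branch.none.adrp+add", 0)
   == 1, i.e. only adrp+add fusion remains enabled.  */
#endif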
17891 /* Support for overriding instruction fusion. */
17893 static void
17894 aarch64_parse_fuse_string (const char *fuse_string,
17895 struct tune_params *tune)
17897 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17898 aarch64_fusible_pairs,
17899 tune->fusible_ops,
17900 "fuse=");
17903 /* Support for overriding other tuning flags. */
17905 static void
17906 aarch64_parse_tune_string (const char *tune_string,
17907 struct tune_params *tune)
17909 tune->extra_tuning_flags
17910 = aarch64_parse_boolean_options (tune_string,
17911 aarch64_tuning_flags,
17912 tune->extra_tuning_flags,
17913 "tune=");
17916 /* Parse the sve_width -moverride tuning string in TUNE_STRING.
17917 Accept the valid SVE vector widths allowed by
17918 aarch64_sve_vector_bits_enum and use it to override sve_width
17919 in TUNE. */
17921 static void
17922 aarch64_parse_sve_width_string (const char *tune_string,
17923 struct tune_params *tune)
17925 int width = -1;
17927 int n = sscanf (tune_string, "%d", &width);
17928 if (n == EOF)
17930 error ("invalid format for %<sve_width%>");
17931 return;
17933 switch (width)
17935 case SVE_128:
17936 case SVE_256:
17937 case SVE_512:
17938 case SVE_1024:
17939 case SVE_2048:
17940 break;
17941 default:
17942 error ("invalid %<sve_width%> value: %d", width);
17944 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17947 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
17948 we understand. If it is, extract the option string and hand off to
17949 the appropriate function. */
17951 void
17952 aarch64_parse_one_override_token (const char* token,
17953 size_t length,
17954 struct tune_params *tune)
17956 const struct aarch64_tuning_override_function *fn
17957 = aarch64_tuning_override_functions;
17959 const char *option_part = strchr (token, '=');
17960 if (!option_part)
17962 error ("tuning string missing in option (%s)", token);
17963 return;
17966 /* Get the length of the option name. */
17967 length = option_part - token;
17968 /* Skip the '=' to get to the option string. */
17969 option_part++;
17971 for (; fn->name != NULL; fn++)
17973 if (!strncmp (fn->name, token, length))
17975 fn->parse_override (option_part, tune);
17976 return;
17980 error ("unknown tuning option (%s)",token);
17981 return;
17984 /* Set the default -mtls-size and clamp it to what the code model supports. */
17986 static void
17987 initialize_aarch64_tls_size (struct gcc_options *opts)
17989 if (aarch64_tls_size == 0)
17990 aarch64_tls_size = 24;
17992 switch (opts->x_aarch64_cmodel_var)
17994 case AARCH64_CMODEL_TINY:
17995 /* Both the default and maximum TLS size allowed under tiny are 1M, which
17996 needs two instructions to address, so we clamp the size to 24. */
17997 if (aarch64_tls_size > 24)
17998 aarch64_tls_size = 24;
17999 break;
18000 case AARCH64_CMODEL_SMALL:
18001 /* The maximum TLS size allowed under small is 4G. */
18002 if (aarch64_tls_size > 32)
18003 aarch64_tls_size = 32;
18004 break;
18005 case AARCH64_CMODEL_LARGE:
18006 /* The maximum TLS size allowed under large is 16E.
18007 FIXME: 16E would require a 64-bit offset, but we only support 48-bit offsets for now. */
18008 if (aarch64_tls_size > 48)
18009 aarch64_tls_size = 48;
18010 break;
18011 default:
18012 gcc_unreachable ();
18015 return;
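/* Illustrative sketch, not part of the sources: the clamping above reduced
   to a pure function from code model and requested -mtls-size to the value
   actually used.  The enum values are reused purely as labels.  */
#if 0
static int
sketch_clamp_tls_size (enum aarch64_code_model model, int requested)
{
  if (requested == 0)
    requested = 24;	/* Default, as above.  */
  int max_bits = 48;
  switch (model)
    {
    case AARCH64_CMODEL_TINY:  max_bits = 24; break;  /* 1M, two insns.  */
    case AARCH64_CMODEL_SMALL: max_bits = 32; break;  /* Up to 4G.  */
    case AARCH64_CMODEL_LARGE: max_bits = 48; break;  /* 48-bit offsets.  */
    default: break;
    }
  return requested > max_bits ? max_bits : requested;
}
/* E.g. -mcmodel=tiny -mtls-size=32 is silently reduced to 24.  */
#endif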
18018 /* Return the CPU corresponding to the enum CPU. */
18020 static const struct processor *
18021 aarch64_get_tune_cpu (enum aarch64_processor cpu)
18023 gcc_assert (cpu != aarch64_none);
18025 return &all_cores[cpu];
18028 /* Return the architecture corresponding to the enum ARCH. */
18030 static const struct processor *
18031 aarch64_get_arch (enum aarch64_arch arch)
18033 gcc_assert (arch != aarch64_no_arch);
18035 return &all_architectures[arch];
18038 /* Parse STRING looking for options in the format:
18039 string :: option:string
18040 option :: name=substring
18041 name :: {a-z}
18042 substring :: defined by option. */
18044 static void
18045 aarch64_parse_override_string (const char* input_string,
18046 struct tune_params* tune)
18048 const char separator = ':';
18049 size_t string_length = strlen (input_string) + 1;
18050 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
18051 char *string = string_root;
18052 strncpy (string, input_string, string_length);
18053 string[string_length - 1] = '\0';
18055 char* ntoken = string;
18057 while ((ntoken = strchr (string, separator)))
18059 size_t token_length = ntoken - string;
18060 /* Make this substring look like a string. */
18061 *ntoken = '\0';
18062 aarch64_parse_one_override_token (string, token_length, tune);
18063 string = ++ntoken;
18066 /* One last option to parse. */
18067 aarch64_parse_one_override_token (string, strlen (string), tune);
18068 free (string_root);
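/* Illustrative sketch, not part of the sources: the ':' and '=' splitting
   of a -moverride string, using strtok_r instead of the in-place strchr
   walk above.  "some_flag" below is a placeholder value; sve_width=256 is
   a real override handled by aarch64_parse_sve_width_string.  */
#if 0
static void
sketch_parse_override (const char *input)
{
  char *copy = xstrdup (input);
  char *save = NULL;
  for (char *tok = strtok_r (copy, ":", &save); tok;
       tok = strtok_r (NULL, ":", &save))
    {
      char *eq = strchr (tok, '=');
      if (!eq)
	continue;	/* The real code diagnoses a missing '='.  */
      *eq = '\0';
      printf ("option %s = %s\n", tok, eq + 1);
    }
  free (copy);
}
/* sketch_parse_override ("tune=some_flag:sve_width=256") reports two
   name/value pairs; the real code dispatches each name through
   aarch64_tuning_override_functions.  */
#endif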
18071 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
18072 are best for a generic target with the currently-enabled architecture
18073 extensions. */
18074 static void
18075 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
18077 /* Neoverse V1 is the only core that is known to benefit from
18078 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
18079 point enabling it for SVE2 and above. */
18080 if (TARGET_SVE2)
18081 current_tune.extra_tuning_flags
18082 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
18085 static void
18086 aarch64_override_options_after_change_1 (struct gcc_options *opts)
18088 /* PR 70044: We have to be careful about being called multiple times for the
18089 same function. This means all changes should be repeatable. */
18091 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
18092 Disable the frame pointer flag so the mid-end will not use a frame
18093 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
18094 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
18095 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
18096 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
18097 if (opts->x_flag_omit_frame_pointer == 0)
18098 opts->x_flag_omit_frame_pointer = 2;
18100 /* If not optimizing for size, set the default
18101 alignment to what the target wants. */
18102 if (!opts->x_optimize_size)
18104 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
18105 opts->x_str_align_loops = aarch64_tune_params.loop_align;
18106 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
18107 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
18108 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
18109 opts->x_str_align_functions = aarch64_tune_params.function_align;
18112 /* We default to no pc-relative literal loads. */
18114 aarch64_pcrelative_literal_loads = false;
18116 /* If -mpc-relative-literal-loads is set on the command line, this
18117 implies that the user asked for PC relative literal loads. */
18118 if (opts->x_pcrelative_literal_loads == 1)
18119 aarch64_pcrelative_literal_loads = true;
18121 /* In the tiny memory model it makes no sense to disallow PC relative
18122 literal pool loads. */
18123 if (aarch64_cmodel == AARCH64_CMODEL_TINY
18124 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
18125 aarch64_pcrelative_literal_loads = true;
18127 /* When enabling the lower precision Newton series for the square root, also
18128 enable it for the reciprocal square root, since the latter is an
18129 intermediary step for the former. */
18130 if (flag_mlow_precision_sqrt)
18131 flag_mrecip_low_precision_sqrt = true;
18134 /* 'Unpack' the internal tuning structs and update the options
18135 in OPTS. The caller must have set up selected_tune and selected_arch
18136 as all the other target-specific codegen decisions are
18137 derived from them. */
18139 void
18140 aarch64_override_options_internal (struct gcc_options *opts)
18142 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
18143 aarch64_tune_flags = tune->flags;
18144 aarch64_tune = tune->sched_core;
18145 /* Make a copy of the tuning parameters attached to the core, which
18146 we may later overwrite. */
18147 aarch64_tune_params = *(tune->tune);
18148 if (tune->tune == &generic_tunings)
18149 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
18151 if (opts->x_aarch64_override_tune_string)
18152 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
18153 &aarch64_tune_params);
18155 if (opts->x_aarch64_ldp_policy_param)
18156 aarch64_tune_params.ldp_policy_model = opts->x_aarch64_ldp_policy_param;
18158 if (opts->x_aarch64_stp_policy_param)
18159 aarch64_tune_params.stp_policy_model = opts->x_aarch64_stp_policy_param;
18161 /* This target defaults to strict volatile bitfields. */
18162 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
18163 opts->x_flag_strict_volatile_bitfields = 1;
18165 if (aarch64_stack_protector_guard == SSP_GLOBAL
18166 && opts->x_aarch64_stack_protector_guard_offset_str)
18168 error ("incompatible options %<-mstack-protector-guard=global%> and "
18169 "%<-mstack-protector-guard-offset=%s%>",
18170 aarch64_stack_protector_guard_offset_str);
18173 if (aarch64_stack_protector_guard == SSP_SYSREG
18174 && !(opts->x_aarch64_stack_protector_guard_offset_str
18175 && opts->x_aarch64_stack_protector_guard_reg_str))
18177 error ("both %<-mstack-protector-guard-offset%> and "
18178 "%<-mstack-protector-guard-reg%> must be used "
18179 "with %<-mstack-protector-guard=sysreg%>");
18182 if (opts->x_aarch64_stack_protector_guard_reg_str)
18184 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
18185 error ("specify a system register with a small string length");
18188 if (opts->x_aarch64_stack_protector_guard_offset_str)
18190 char *end;
18191 const char *str = aarch64_stack_protector_guard_offset_str;
18192 errno = 0;
18193 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
18194 if (!*str || *end || errno)
18195 error ("%qs is not a valid offset in %qs", str,
18196 "-mstack-protector-guard-offset=");
18197 aarch64_stack_protector_guard_offset = offs;
18200 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
18201 && !fixed_regs[R18_REGNUM])
18202 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
18204 if ((opts->x_aarch64_isa_flags & (AARCH64_FL_SM_ON | AARCH64_FL_ZA_ON))
18205 && !(opts->x_aarch64_isa_flags & AARCH64_FL_SME))
18207 if (opts->x_aarch64_isa_flags & AARCH64_FL_SM_ON)
18208 error ("streaming functions require the ISA extension %qs", "sme");
18209 else
18210 error ("functions with SME state require the ISA extension %qs",
18211 "sme");
18212 inform (input_location, "you can enable %qs using the command-line"
18213 " option %<-march%>, or by using the %<target%>"
18214 " attribute or pragma", "sme");
18215 opts->x_target_flags &= ~MASK_GENERAL_REGS_ONLY;
18216 auto new_flags = (opts->x_aarch64_asm_isa_flags
18217 | feature_deps::SME ().enable);
18218 aarch64_set_asm_isa_flags (opts, new_flags);
18221 initialize_aarch64_code_model (opts);
18222 initialize_aarch64_tls_size (opts);
18223 aarch64_tpidr_register = opts->x_aarch64_tpidr_reg;
18225 int queue_depth = 0;
18226 switch (aarch64_tune_params.autoprefetcher_model)
18228 case tune_params::AUTOPREFETCHER_OFF:
18229 queue_depth = -1;
18230 break;
18231 case tune_params::AUTOPREFETCHER_WEAK:
18232 queue_depth = 0;
18233 break;
18234 case tune_params::AUTOPREFETCHER_STRONG:
18235 queue_depth = max_insn_queue_index + 1;
18236 break;
18237 default:
18238 gcc_unreachable ();
18241 /* We don't mind passing in global_options_set here as we don't use
18242 the *options_set structs anyway. */
18243 SET_OPTION_IF_UNSET (opts, &global_options_set,
18244 param_sched_autopref_queue_depth, queue_depth);
18246 /* Set up parameters to be used in prefetching algorithm. Do not
18247 override the defaults unless we are tuning for a core we have
18248 researched values for. */
18249 if (aarch64_tune_params.prefetch->num_slots > 0)
18250 SET_OPTION_IF_UNSET (opts, &global_options_set,
18251 param_simultaneous_prefetches,
18252 aarch64_tune_params.prefetch->num_slots);
18253 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
18254 SET_OPTION_IF_UNSET (opts, &global_options_set,
18255 param_l1_cache_size,
18256 aarch64_tune_params.prefetch->l1_cache_size);
18257 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18258 SET_OPTION_IF_UNSET (opts, &global_options_set,
18259 param_l1_cache_line_size,
18260 aarch64_tune_params.prefetch->l1_cache_line_size);
18262 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
18264 SET_OPTION_IF_UNSET (opts, &global_options_set,
18265 param_destruct_interfere_size,
18266 aarch64_tune_params.prefetch->l1_cache_line_size);
18267 SET_OPTION_IF_UNSET (opts, &global_options_set,
18268 param_construct_interfere_size,
18269 aarch64_tune_params.prefetch->l1_cache_line_size);
18271 else
18273 /* For a generic AArch64 target, cover the current range of cache line
18274 sizes. */
18275 SET_OPTION_IF_UNSET (opts, &global_options_set,
18276 param_destruct_interfere_size,
18277 256);
18278 SET_OPTION_IF_UNSET (opts, &global_options_set,
18279 param_construct_interfere_size,
18280 64);
18283 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
18284 SET_OPTION_IF_UNSET (opts, &global_options_set,
18285 param_l2_cache_size,
18286 aarch64_tune_params.prefetch->l2_cache_size);
18287 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
18288 SET_OPTION_IF_UNSET (opts, &global_options_set,
18289 param_prefetch_dynamic_strides, 0);
18290 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
18291 SET_OPTION_IF_UNSET (opts, &global_options_set,
18292 param_prefetch_minimum_stride,
18293 aarch64_tune_params.prefetch->minimum_stride);
18295 /* Use the alternative scheduling-pressure algorithm by default. */
18296 SET_OPTION_IF_UNSET (opts, &global_options_set,
18297 param_sched_pressure_algorithm,
18298 SCHED_PRESSURE_MODEL);
18300 /* Validate the guard size. */
18301 int guard_size = param_stack_clash_protection_guard_size;
18303 if (guard_size != 12 && guard_size != 16)
18304 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
18305 "size. Given value %d (%llu KB) is out of range",
18306 guard_size, (1ULL << guard_size) / 1024ULL);
18308 /* Enforce that interval is the same size as size so the mid-end does the
18309 right thing. */
18310 SET_OPTION_IF_UNSET (opts, &global_options_set,
18311 param_stack_clash_protection_probe_interval,
18312 guard_size);
18314 /* The maybe_set calls won't update the value if the user has explicitly set
18315 one. Which means we need to validate that probing interval and guard size
18316 are equal. */
18317 int probe_interval
18318 = param_stack_clash_protection_probe_interval;
18319 if (guard_size != probe_interval)
18320 error ("stack clash guard size %<%d%> must be equal to probing interval "
18321 "%<%d%>", guard_size, probe_interval);
18323 /* Enable sw prefetching at specified optimization level for
18324 CPUS that have prefetch. Lower optimization level threshold by 1
18325 when profiling is enabled. */
18326 if (opts->x_flag_prefetch_loop_arrays < 0
18327 && !opts->x_optimize_size
18328 && aarch64_tune_params.prefetch->default_opt_level >= 0
18329 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
18330 opts->x_flag_prefetch_loop_arrays = 1;
18332 /* Avoid loop-dependent FMA chains. */
18333 if (aarch64_tune_params.extra_tuning_flags
18334 & AARCH64_EXTRA_TUNE_AVOID_CROSS_LOOP_FMA)
18335 SET_OPTION_IF_UNSET (opts, &global_options_set, param_avoid_fma_max_bits,
18336 512);
18338 /* Consider fully pipelined FMA in reassociation. */
18339 if (aarch64_tune_params.extra_tuning_flags
18340 & AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA)
18341 SET_OPTION_IF_UNSET (opts, &global_options_set, param_fully_pipelined_fma,
18344 aarch64_override_options_after_change_1 (opts);
18347 /* Print a hint with a suggestion for a core or architecture name that
18348 most closely resembles what the user passed in STR. ARCH is true if
18349 the user is asking for an architecture name. ARCH is false if the user
18350 is asking for a core name. */
18352 static void
18353 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
18355 auto_vec<const char *> candidates;
18356 const struct processor *entry = arch ? all_architectures : all_cores;
18357 for (; entry->name != NULL; entry++)
18358 candidates.safe_push (entry->name);
18360 #ifdef HAVE_LOCAL_CPU_DETECT
18361 /* Add also "native" as possible value. */
18362 if (arch)
18363 candidates.safe_push ("native");
18364 #endif
18366 char *s;
18367 const char *hint = candidates_list_and_hint (str, s, candidates);
18368 if (hint)
18369 inform (input_location, "valid arguments are: %s;"
18370 " did you mean %qs?", s, hint);
18371 else
18372 inform (input_location, "valid arguments are: %s", s);
18374 XDELETEVEC (s);
18377 /* Print a hint with a suggestion for a core name that most closely resembles
18378 what the user passed in STR. */
18380 inline static void
18381 aarch64_print_hint_for_core (const char *str)
18383 aarch64_print_hint_for_core_or_arch (str, false);
18386 /* Print a hint with a suggestion for an architecture name that most closely
18387 resembles what the user passed in STR. */
18389 inline static void
18390 aarch64_print_hint_for_arch (const char *str)
18392 aarch64_print_hint_for_core_or_arch (str, true);
18396 /* Print a hint with a suggestion for an extension name
18397 that most closely resembles what the user passed in STR. */
18399 void
18400 aarch64_print_hint_for_extensions (const std::string &str)
18402 auto_vec<const char *> candidates;
18403 aarch64_get_all_extension_candidates (&candidates);
18404 char *s;
18405 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
18406 if (hint)
18407 inform (input_location, "valid arguments are: %s;"
18408 " did you mean %qs?", s, hint);
18409 else
18410 inform (input_location, "valid arguments are: %s", s);
18412 XDELETEVEC (s);
18415 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18416 specified in STR and throw errors if appropriate. Put the results if
18417 they are valid in RES and ISA_FLAGS. Return whether the option is
18418 valid. */
18420 static bool
18421 aarch64_validate_mcpu (const char *str, const struct processor **res,
18422 aarch64_feature_flags *isa_flags)
18424 std::string invalid_extension;
18425 enum aarch_parse_opt_result parse_res
18426 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18428 if (parse_res == AARCH_PARSE_OK)
18429 return true;
18431 switch (parse_res)
18433 case AARCH_PARSE_MISSING_ARG:
18434 error ("missing cpu name in %<-mcpu=%s%>", str);
18435 break;
18436 case AARCH_PARSE_INVALID_ARG:
18437 error ("unknown value %qs for %<-mcpu%>", str);
18438 aarch64_print_hint_for_core (str);
18439 /* A common user error is confusing -march and -mcpu.
18440 If the -mcpu string matches a known architecture then suggest
18441 -march=. */
18442 parse_res = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18443 if (parse_res == AARCH_PARSE_OK)
18444 inform (input_location, "did you mean %<-march=%s%>?", str);
18445 break;
18446 case AARCH_PARSE_INVALID_FEATURE:
18447 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18448 invalid_extension.c_str (), str);
18449 aarch64_print_hint_for_extensions (invalid_extension);
18450 break;
18451 default:
18452 gcc_unreachable ();
18455 return false;
18458 /* Straight line speculation indicators. */
18459 enum aarch64_sls_hardening_type
18461 SLS_NONE = 0,
18462 SLS_RETBR = 1,
18463 SLS_BLR = 2,
18464 SLS_ALL = 3,
18466 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18468 /* Return whether we should mitigate Straight Line Speculation for the RET
18469 and BR instructions. */
18470 bool
18471 aarch64_harden_sls_retbr_p (void)
18473 return aarch64_sls_hardening & SLS_RETBR;
18476 /* Return whether we should mitigate Straight Line Speculation for the BLR
18477 instruction. */
18478 bool
18479 aarch64_harden_sls_blr_p (void)
18481 return aarch64_sls_hardening & SLS_BLR;
18484 /* For now we only allow setting these options globally; in the future we may
18485 allow setting them per function. */
18486 static void
18487 aarch64_validate_sls_mitigation (const char *const_str)
18489 char *token_save = NULL;
18490 char *str = NULL;
18492 if (strcmp (const_str, "none") == 0)
18494 aarch64_sls_hardening = SLS_NONE;
18495 return;
18497 if (strcmp (const_str, "all") == 0)
18499 aarch64_sls_hardening = SLS_ALL;
18500 return;
18503 char *str_root = xstrdup (const_str);
18504 str = strtok_r (str_root, ",", &token_save);
18505 if (!str)
18506 error ("invalid argument given to %<-mharden-sls=%>");
18508 int temp = SLS_NONE;
18509 while (str)
18511 if (strcmp (str, "blr") == 0)
18512 temp |= SLS_BLR;
18513 else if (strcmp (str, "retbr") == 0)
18514 temp |= SLS_RETBR;
18515 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18517 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18518 break;
18520 else
18522 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18523 break;
18525 str = strtok_r (NULL, ",", &token_save);
18527 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18528 free (str_root);
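/* Illustrative sketch, not part of the sources: mapping a comma-separated
   -mharden-sls= argument to the SLS_* bitmask defined above, with the
   diagnostics stripped out.  */
#if 0
static int
sketch_parse_sls (const char *arg)
{
  if (strcmp (arg, "none") == 0)
    return SLS_NONE;
  if (strcmp (arg, "all") == 0)
    return SLS_ALL;
  int mask = SLS_NONE;
  char *copy = xstrdup (arg);
  char *save = NULL;
  for (char *tok = strtok_r (copy, ",", &save); tok;
       tok = strtok_r (NULL, ",", &save))
    {
      if (strcmp (tok, "retbr") == 0)
	mask |= SLS_RETBR;
      else if (strcmp (tok, "blr") == 0)
	mask |= SLS_BLR;
      /* The real code rejects anything else, and rejects "none"/"all"
	 mixed with other values.  */
    }
  free (copy);
  return mask;
}
/* sketch_parse_sls ("retbr,blr") == SLS_ALL.  */
#endif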
18531 /* Validate a command-line -march option. Parse the arch and extensions
18532 (if any) specified in STR and throw errors if appropriate. Put the
18533 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18534 option is valid. */
18536 static bool
18537 aarch64_validate_march (const char *str, const struct processor **res,
18538 aarch64_feature_flags *isa_flags)
18540 std::string invalid_extension;
18541 enum aarch_parse_opt_result parse_res
18542 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18544 if (parse_res == AARCH_PARSE_OK)
18545 return true;
18547 switch (parse_res)
18549 case AARCH_PARSE_MISSING_ARG:
18550 error ("missing arch name in %<-march=%s%>", str);
18551 break;
18552 case AARCH_PARSE_INVALID_ARG:
18553 error ("unknown value %qs for %<-march%>", str);
18554 aarch64_print_hint_for_arch (str);
18555 /* A common user error is confusing -march and -mcpu.
18556 If the -march string matches a known CPU suggest -mcpu. */
18557 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18558 if (parse_res == AARCH_PARSE_OK)
18559 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18560 break;
18561 case AARCH_PARSE_INVALID_FEATURE:
18562 error ("invalid feature modifier %qs in %<-march=%s%>",
18563 invalid_extension.c_str (), str);
18564 aarch64_print_hint_for_extensions (invalid_extension);
18565 break;
18566 default:
18567 gcc_unreachable ();
18570 return false;
18573 /* Validate a command-line -mtune option. Parse the cpu
18574 specified in STR and throw errors if appropriate. Put the
18575 result, if it is valid, in RES. Return whether the option is
18576 valid. */
18578 static bool
18579 aarch64_validate_mtune (const char *str, const struct processor **res)
18581 enum aarch_parse_opt_result parse_res
18582 = aarch64_parse_tune (str, res);
18584 if (parse_res == AARCH_PARSE_OK)
18585 return true;
18587 switch (parse_res)
18589 case AARCH_PARSE_MISSING_ARG:
18590 error ("missing cpu name in %<-mtune=%s%>", str);
18591 break;
18592 case AARCH_PARSE_INVALID_ARG:
18593 error ("unknown value %qs for %<-mtune%>", str);
18594 aarch64_print_hint_for_core (str);
18595 break;
18596 default:
18597 gcc_unreachable ();
18599 return false;
18602 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18604 static poly_uint16
18605 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18607 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18608 on big-endian targets, so we would need to forbid subregs that convert
18609 from one to the other. By default a reinterpret sequence would then
18610 involve a store to memory in one mode and a load back in the other.
18611 Even if we optimize that sequence using reverse instructions,
18612 it would still be a significant potential overhead.
18614 For now, it seems better to generate length-agnostic code for that
18615 case instead. */
18616 if (value == SVE_SCALABLE
18617 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18618 return poly_uint16 (2, 2);
18619 else
18620 return (int) value / 64;
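/* Illustrative worked examples for the conversion above (not from the
   sources): VG is the vector length in 64-bit granules, so a fixed
   -msve-vector-bits=N gives VG = N / 64, while the scalable cases return
   poly_uint16 (2, 2), i.e. 2 + 2x granules for some runtime value x >= 0:

     -msve-vector-bits=128       -> VG 2  (scalable instead on big-endian)
     -msve-vector-bits=256       -> VG 4
     -msve-vector-bits=512       -> VG 8
     -msve-vector-bits=2048      -> VG 32
     -msve-vector-bits=scalable  -> poly_uint16 (2, 2), at least 128 bits.  */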
18623 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18624 aarch64_isa_flags accordingly. */
18626 void
18627 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18629 aarch64_set_asm_isa_flags (&global_options, flags);
18632 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18633 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18634 tuning structs. In particular it must set selected_tune and
18635 aarch64_asm_isa_flags that define the available ISA features and tuning
18636 decisions. It must also set selected_arch as this will be used to
18637 output the .arch asm tags for each function. */
18639 static void
18640 aarch64_override_options (void)
18642 aarch64_feature_flags cpu_isa = 0;
18643 aarch64_feature_flags arch_isa = 0;
18644 aarch64_set_asm_isa_flags (0);
18646 const struct processor *cpu = NULL;
18647 const struct processor *arch = NULL;
18648 const struct processor *tune = NULL;
18650 if (aarch64_harden_sls_string)
18651 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18653 if (aarch64_branch_protection_string)
18654 aarch_validate_mbranch_protection (aarch64_branch_protection_string,
18655 "-mbranch-protection=");
18657 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18658 If either of -march or -mtune is given, they override their
18659 respective component of -mcpu. */
18660 if (aarch64_cpu_string)
18661 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18663 if (aarch64_arch_string)
18664 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18666 if (aarch64_tune_string)
18667 aarch64_validate_mtune (aarch64_tune_string, &tune);
18669 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18670 SUBTARGET_OVERRIDE_OPTIONS;
18671 #endif
18673 auto isa_mode = AARCH64_FL_DEFAULT_ISA_MODE;
18674 if (cpu && arch)
18676 /* If both -mcpu and -march are specified, warn if they are not
18677 feature compatible. Feature compatible means that the cpu features are
18678 a superset of the arch features, so that selecting the arch does not
18679 end up enabling an architecture feature that the cpu lacks. Either
18680 way the -march ISA flags are preferred. */
18681 auto full_arch_flags = arch->flags | arch_isa;
18682 auto full_cpu_flags = cpu->flags | cpu_isa;
18683 if (~full_cpu_flags & full_arch_flags)
18685 std::string ext_diff
18686 = aarch64_get_extension_string_for_isa_flags (full_arch_flags,
18687 full_cpu_flags);
18688 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch "
18689 "and resulted in options %<%s%> being added",
18690 aarch64_cpu_string,
18691 aarch64_arch_string,
18692 ext_diff.c_str ());
18695 selected_arch = arch->arch;
18696 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18698 else if (cpu)
18700 selected_arch = cpu->arch;
18701 aarch64_set_asm_isa_flags (cpu_isa | isa_mode);
18703 else if (arch)
18705 cpu = &all_cores[arch->ident];
18706 selected_arch = arch->arch;
18707 aarch64_set_asm_isa_flags (arch_isa | isa_mode);
18709 else
18711 /* No -mcpu or -march specified, so use the default CPU. */
18712 cpu = &all_cores[TARGET_CPU_DEFAULT];
18713 selected_arch = cpu->arch;
18714 aarch64_set_asm_isa_flags (cpu->flags | isa_mode);
18717 selected_tune = tune ? tune->ident : cpu->ident;
18719 if (aarch_enable_bti == 2)
18721 #ifdef TARGET_ENABLE_BTI
18722 aarch_enable_bti = 1;
18723 #else
18724 aarch_enable_bti = 0;
18725 #endif
18728 /* Return address signing is currently not supported for ILP32 targets. For
18729 LP64 targets use the configured option in the absence of a command-line
18730 option for -mbranch-protection. */
18731 if (!TARGET_ILP32 && aarch64_branch_protection_string == NULL)
18733 #ifdef TARGET_ENABLE_PAC_RET
18734 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18735 #else
18736 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18737 #endif
18740 #ifndef HAVE_AS_MABI_OPTION
18741 /* The compiler may have been configured with 2.23.* binutils, which does
18742 not have support for ILP32. */
18743 if (TARGET_ILP32)
18744 error ("assembler does not support %<-mabi=ilp32%>");
18745 #endif
18747 /* Convert -msve-vector-bits to a VG count. */
18748 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18750 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE && TARGET_ILP32)
18751 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18753 /* The pass to insert speculation tracking runs before
18754 shrink-wrapping and the latter does not know how to update the
18755 tracking status. So disable it in this case. */
18756 if (aarch64_track_speculation)
18757 flag_shrink_wrap = 0;
18759 aarch64_override_options_internal (&global_options);
18761 /* Save these options as the default ones in case we push and pop them later
18762 while processing functions with potential target attributes. */
18763 target_option_default_node = target_option_current_node
18764 = build_target_option_node (&global_options, &global_options_set);
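/* Illustrative sketch, not part of the sources: the superset test used
   above when both -mcpu and -march are given.  Any bit set in the arch
   flags but clear in the cpu flags means following the cpu would drop an
   architecture feature, so a warning is issued and the -march flags win.
   The flag values in the example are made up.  */
#if 0
static bool
sketch_cpu_covers_arch (unsigned long long full_cpu_flags,
			unsigned long long full_arch_flags)
{
  return (~full_cpu_flags & full_arch_flags) == 0;
}
/* sketch_cpu_covers_arch (0b0111, 0b0011) -> true  (no warning)
   sketch_cpu_covers_arch (0b0101, 0b0011) -> false (warning: bit 1 added)  */
#endif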
18767 /* Implement targetm.override_options_after_change. */
18769 static void
18770 aarch64_override_options_after_change (void)
18772 aarch64_override_options_after_change_1 (&global_options);
18775 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18776 static char *
18777 aarch64_offload_options (void)
18779 if (TARGET_ILP32)
18780 return xstrdup ("-foffload-abi=ilp32");
18781 else
18782 return xstrdup ("-foffload-abi=lp64");
18785 static struct machine_function *
18786 aarch64_init_machine_status (void)
18788 struct machine_function *machine;
18789 machine = ggc_cleared_alloc<machine_function> ();
18790 return machine;
18793 void
18794 aarch64_init_expanders (void)
18796 init_machine_status = aarch64_init_machine_status;
18799 /* Set aarch64_cmodel from -mcmodel and -fpic/-fPIC, diagnosing unsupported combinations. */
18800 static void
18801 initialize_aarch64_code_model (struct gcc_options *opts)
18803 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18804 switch (opts->x_aarch64_cmodel_var)
18806 case AARCH64_CMODEL_TINY:
18807 if (opts->x_flag_pic)
18808 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18809 break;
18810 case AARCH64_CMODEL_SMALL:
18811 if (opts->x_flag_pic)
18813 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18814 aarch64_cmodel = (flag_pic == 2
18815 ? AARCH64_CMODEL_SMALL_PIC
18816 : AARCH64_CMODEL_SMALL_SPIC);
18817 #else
18818 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18819 #endif
18821 break;
18822 case AARCH64_CMODEL_LARGE:
18823 if (opts->x_flag_pic)
18824 sorry ("code model %qs with %<-f%s%>", "large",
18825 opts->x_flag_pic > 1 ? "PIC" : "pic");
18826 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18827 sorry ("code model %qs not supported in ilp32 mode", "large");
18828 break;
18829 case AARCH64_CMODEL_TINY_PIC:
18830 case AARCH64_CMODEL_SMALL_PIC:
18831 case AARCH64_CMODEL_SMALL_SPIC:
18832 gcc_unreachable ();
18836 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18837 using the information saved in PTR. */
18839 static void
18840 aarch64_option_restore (struct gcc_options *opts,
18841 struct gcc_options * /* opts_set */,
18842 struct cl_target_option * /* ptr */)
18844 aarch64_override_options_internal (opts);
18847 /* Implement TARGET_OPTION_PRINT. */
18849 static void
18850 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18852 const struct processor *cpu
18853 = aarch64_get_tune_cpu (ptr->x_selected_tune);
18854 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
18855 std::string extension
18856 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
18857 arch->flags);
18859 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
18860 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18861 arch->name, extension.c_str ());
18864 static GTY(()) tree aarch64_previous_fndecl;
18866 void
18867 aarch64_reset_previous_fndecl (void)
18869 aarch64_previous_fndecl = NULL;
18872 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18873 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18874 make sure optab availability predicates are recomputed when necessary. */
18876 void
18877 aarch64_save_restore_target_globals (tree new_tree)
18879 if (TREE_TARGET_GLOBALS (new_tree))
18880 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18881 else if (new_tree == target_option_default_node)
18882 restore_target_globals (&default_target_globals);
18883 else
18884 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18887 /* Return the target_option_node for FNDECL, or the current options
18888 if FNDECL is null. */
18890 static tree
18891 aarch64_fndecl_options (tree fndecl)
18893 if (!fndecl)
18894 return target_option_current_node;
18896 if (tree options = DECL_FUNCTION_SPECIFIC_TARGET (fndecl))
18897 return options;
18899 return target_option_default_node;
18902 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
18903 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18904 of the function, if such exists. This function may be called multiple
18905 times on a single function so use aarch64_previous_fndecl to avoid
18906 setting up identical state. */
18908 static void
18909 aarch64_set_current_function (tree fndecl)
18911 tree old_tree = aarch64_fndecl_options (aarch64_previous_fndecl);
18912 tree new_tree = aarch64_fndecl_options (fndecl);
18914 auto new_isa_mode = (fndecl
18915 ? aarch64_fndecl_isa_mode (fndecl)
18916 : AARCH64_FL_DEFAULT_ISA_MODE);
18917 auto isa_flags = TREE_TARGET_OPTION (new_tree)->x_aarch64_isa_flags;
18919 static bool reported_zt0_p;
18920 if (!reported_zt0_p
18921 && !(isa_flags & AARCH64_FL_SME2)
18922 && fndecl
18923 && aarch64_fndecl_has_state (fndecl, "zt0"))
18925 error ("functions with %qs state require the ISA extension %qs",
18926 "zt0", "sme2");
18927 inform (input_location, "you can enable %qs using the command-line"
18928 " option %<-march%>, or by using the %<target%>"
18929 " attribute or pragma", "sme2");
18930 reported_zt0_p = true;
18933 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
18934 the default have been handled by aarch64_save_restore_target_globals from
18935 aarch64_pragma_target_parse. */
18936 if (old_tree == new_tree
18937 && (!fndecl || aarch64_previous_fndecl)
18938 && (isa_flags & AARCH64_FL_ISA_MODES) == new_isa_mode)
18940 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
18941 return;
18944 aarch64_previous_fndecl = fndecl;
18946 /* First set the target options. */
18947 cl_target_option_restore (&global_options, &global_options_set,
18948 TREE_TARGET_OPTION (new_tree));
18950 /* The ISA mode can vary based on function type attributes and
18951 function declaration attributes. Make sure that the target
18952 options correctly reflect these attributes. */
18953 if ((isa_flags & AARCH64_FL_ISA_MODES) != new_isa_mode)
18955 auto base_flags = (aarch64_asm_isa_flags & ~AARCH64_FL_ISA_MODES);
18956 aarch64_set_asm_isa_flags (base_flags | new_isa_mode);
18958 aarch64_override_options_internal (&global_options);
18959 new_tree = build_target_option_node (&global_options,
18960 &global_options_set);
18961 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_tree;
18963 tree new_optimize = build_optimization_node (&global_options,
18964 &global_options_set);
18965 if (new_optimize != optimization_default_node)
18966 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
18969 aarch64_save_restore_target_globals (new_tree);
18971 gcc_assert (AARCH64_ISA_MODE == new_isa_mode);
18974 /* Enum describing the various ways we can handle attributes.
18975 In many cases we can reuse the generic option handling machinery. */
18977 enum aarch64_attr_opt_type
18979 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
18980 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
18981 aarch64_attr_enum, /* Attribute sets an enum variable. */
18982 aarch64_attr_custom /* Attribute requires a custom handling function. */
18985 /* All the information needed to handle a target attribute.
18986 NAME is the name of the attribute.
18987 ATTR_TYPE specifies the type of behavior of the attribute as described
18988 in the definition of enum aarch64_attr_opt_type.
18989 ALLOW_NEG is true if the attribute supports a "no-" form.
18990 HANDLER is the function that takes the attribute string as an argument.
18991 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
18992 OPT_NUM is the enum specifying the option that the attribute modifies.
18993 This is needed for attributes that mirror the behavior of a command-line
18994 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
18995 aarch64_attr_enum. */
18997 struct aarch64_attribute_info
18999 const char *name;
19000 enum aarch64_attr_opt_type attr_type;
19001 bool allow_neg;
19002 bool (*handler) (const char *);
19003 enum opt_code opt_num;
19006 /* Handle the ARCH_STR argument to the arch= target attribute. */
19008 static bool
19009 aarch64_handle_attr_arch (const char *str)
19011 const struct processor *tmp_arch = NULL;
19012 std::string invalid_extension;
19013 aarch64_feature_flags tmp_flags;
19014 enum aarch_parse_opt_result parse_res
19015 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
19017 if (parse_res == AARCH_PARSE_OK)
19019 gcc_assert (tmp_arch);
19020 selected_arch = tmp_arch->arch;
19021 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19022 return true;
19025 switch (parse_res)
19027 case AARCH_PARSE_MISSING_ARG:
19028 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
19029 break;
19030 case AARCH_PARSE_INVALID_ARG:
19031 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
19032 aarch64_print_hint_for_arch (str);
19033 break;
19034 case AARCH_PARSE_INVALID_FEATURE:
19035 error ("invalid feature modifier %s of value %qs in "
19036 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19037 aarch64_print_hint_for_extensions (invalid_extension);
19038 break;
19039 default:
19040 gcc_unreachable ();
19043 return false;
19046 /* Handle the argument CPU_STR to the cpu= target attribute. */
19048 static bool
19049 aarch64_handle_attr_cpu (const char *str)
19051 const struct processor *tmp_cpu = NULL;
19052 std::string invalid_extension;
19053 aarch64_feature_flags tmp_flags;
19054 enum aarch_parse_opt_result parse_res
19055 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
19057 if (parse_res == AARCH_PARSE_OK)
19059 gcc_assert (tmp_cpu);
19060 selected_tune = tmp_cpu->ident;
19061 selected_arch = tmp_cpu->arch;
19062 aarch64_set_asm_isa_flags (tmp_flags | AARCH64_ISA_MODE);
19063 return true;
19066 switch (parse_res)
19068 case AARCH_PARSE_MISSING_ARG:
19069 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
19070 break;
19071 case AARCH_PARSE_INVALID_ARG:
19072 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
19073 aarch64_print_hint_for_core (str);
19074 break;
19075 case AARCH_PARSE_INVALID_FEATURE:
19076 error ("invalid feature modifier %qs of value %qs in "
19077 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19078 aarch64_print_hint_for_extensions (invalid_extension);
19079 break;
19080 default:
19081 gcc_unreachable ();
19084 return false;
19087 /* Handle the argument STR to the branch-protection= attribute. */
19089 static bool
19090 aarch64_handle_attr_branch_protection (const char* str)
19092 return aarch_validate_mbranch_protection (str,
19093 "target(\"branch-protection=\")");
19096 /* Handle the argument STR to the tune= target attribute. */
19098 static bool
19099 aarch64_handle_attr_tune (const char *str)
19101 const struct processor *tmp_tune = NULL;
19102 enum aarch_parse_opt_result parse_res
19103 = aarch64_parse_tune (str, &tmp_tune);
19105 if (parse_res == AARCH_PARSE_OK)
19107 gcc_assert (tmp_tune);
19108 selected_tune = tmp_tune->ident;
19109 return true;
19112 switch (parse_res)
19114 case AARCH_PARSE_INVALID_ARG:
19115 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
19116 aarch64_print_hint_for_core (str);
19117 break;
19118 default:
19119 gcc_unreachable ();
19122 return false;
19125 /* Parse an architecture extensions target attribute string specified in STR.
19126 For example "+fp+nosimd". Show any errors if needed. Return TRUE
19127 if successful. Update aarch64_isa_flags to reflect the ISA features
19128 modified. */
19130 static bool
19131 aarch64_handle_attr_isa_flags (char *str)
19133 enum aarch_parse_opt_result parse_res;
19134 auto isa_flags = aarch64_asm_isa_flags;
19136 /* We allow "+nothing" in the beginning to clear out all architectural
19137 features if the user wants to handpick specific features. */
19138 if (strncmp ("+nothing", str, 8) == 0)
19140 isa_flags = AARCH64_ISA_MODE;
19141 str += 8;
19144 std::string invalid_extension;
19145 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
19147 if (parse_res == AARCH_PARSE_OK)
19149 aarch64_set_asm_isa_flags (isa_flags);
19150 return true;
19153 switch (parse_res)
19155 case AARCH_PARSE_MISSING_ARG:
19156 error ("missing value in %<target()%> pragma or attribute");
19157 break;
19159 case AARCH_PARSE_INVALID_FEATURE:
19160 error ("invalid feature modifier %qs of value %qs in "
19161 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
19162 break;
19164 default:
19165 gcc_unreachable ();
19168 return false;
19171 /* The target attributes that we support. On top of these we also support just
19172 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
19173 handled explicitly in aarch64_process_one_target_attr. */
19175 static const struct aarch64_attribute_info aarch64_attributes[] =
19177 { "general-regs-only", aarch64_attr_mask, false, NULL,
19178 OPT_mgeneral_regs_only },
19179 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
19180 OPT_mfix_cortex_a53_835769 },
19181 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
19182 OPT_mfix_cortex_a53_843419 },
19183 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
19184 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
19185 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
19186 OPT_momit_leaf_frame_pointer },
19187 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
19188 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
19189 OPT_march_ },
19190 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
19191 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
19192 OPT_mtune_ },
19193 { "branch-protection", aarch64_attr_custom, false,
19194 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
19195 { "sign-return-address", aarch64_attr_enum, false, NULL,
19196 OPT_msign_return_address_ },
19197 { "outline-atomics", aarch64_attr_bool, true, NULL,
19198 OPT_moutline_atomics},
19199 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
19202 /* Parse ARG_STR which contains the definition of one target attribute.
19203 Show appropriate errors if any or return true if the attribute is valid. */
19205 static bool
19206 aarch64_process_one_target_attr (char *arg_str)
19208 bool invert = false;
19210 size_t len = strlen (arg_str);
19212 if (len == 0)
19214 error ("malformed %<target()%> pragma or attribute");
19215 return false;
19218 char *str_to_check = (char *) alloca (len + 1);
19219 strcpy (str_to_check, arg_str);
19221 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
19222 It is easier to detect and handle it explicitly here rather than going
19223 through the machinery for the rest of the target attributes in this
19224 function. */
19225 if (*str_to_check == '+')
19226 return aarch64_handle_attr_isa_flags (str_to_check);
19228 if (len > 3 && startswith (str_to_check, "no-"))
19230 invert = true;
19231 str_to_check += 3;
19233 char *arg = strchr (str_to_check, '=');
19235 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
19236 and point ARG to "foo". */
19237 if (arg)
19239 *arg = '\0';
19240 arg++;
19242 const struct aarch64_attribute_info *p_attr;
19243 bool found = false;
19244 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
19246 /* If the names don't match up, or the user has given an argument
19247 to an attribute that doesn't accept one, or didn't give an argument
19248 to an attribute that expects one, fail to match. */
19249 if (strcmp (str_to_check, p_attr->name) != 0)
19250 continue;
19252 found = true;
19253 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
19254 || p_attr->attr_type == aarch64_attr_enum;
19256 if (attr_need_arg_p ^ (arg != NULL))
19258 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
19259 return false;
19262 /* If the name matches but the attribute does not allow "no-" versions
19263 then we can't match. */
19264 if (invert && !p_attr->allow_neg)
19266 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
19267 return false;
19270 switch (p_attr->attr_type)
19272 /* Has a custom handler registered.
19273 For example, cpu=, arch=, tune=. */
19274 case aarch64_attr_custom:
19275 gcc_assert (p_attr->handler);
19276 if (!p_attr->handler (arg))
19277 return false;
19278 break;
19280 /* Either set or unset a boolean option. */
19281 case aarch64_attr_bool:
19283 struct cl_decoded_option decoded;
19285 generate_option (p_attr->opt_num, NULL, !invert,
19286 CL_TARGET, &decoded);
19287 aarch64_handle_option (&global_options, &global_options_set,
19288 &decoded, input_location);
19289 break;
19291 /* Set or unset a bit in the target_flags. aarch64_handle_option
19292 should know what mask to apply given the option number. */
19293 case aarch64_attr_mask:
19295 struct cl_decoded_option decoded;
19296 /* We only need to specify the option number.
19297 aarch64_handle_option will know which mask to apply. */
19298 decoded.opt_index = p_attr->opt_num;
19299 decoded.value = !invert;
19300 aarch64_handle_option (&global_options, &global_options_set,
19301 &decoded, input_location);
19302 break;
19304 /* Use the option setting machinery to set an option to an enum. */
19305 case aarch64_attr_enum:
19307 gcc_assert (arg);
19308 bool valid;
19309 int value;
19310 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
19311 &value, CL_TARGET);
19312 if (valid)
19314 set_option (&global_options, NULL, p_attr->opt_num, value,
19315 NULL, DK_UNSPECIFIED, input_location,
19316 global_dc);
19318 else
19320 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
19322 break;
19324 default:
19325 gcc_unreachable ();
19329 /* If we reached here we either have found an attribute and validated
19330 it or didn't match any. If we matched an attribute but its arguments
19331 were malformed we will have returned false already. */
19332 return found;
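/* Illustrative sketch, not part of the sources: the token decomposition
   performed above before the aarch64_attributes lookup, i.e. stripping an
   optional "no-" prefix and splitting "name=value" at the '='.  The helper
   name is made up; startswith comes from system.h.  */
#if 0
static void
sketch_split_attr_token (char *token, bool *invert,
			 const char **name, const char **value)
{
  *invert = false;
  *value = NULL;
  if (startswith (token, "no-"))
    {
      *invert = true;
      token += 3;
    }
  *name = token;
  if (char *eq = strchr (token, '='))
    {
      *eq = '\0';
      *value = eq + 1;
    }
}
/* "no-omit-leaf-frame-pointer" -> invert, name "omit-leaf-frame-pointer"
   "cmodel=small"               -> name "cmodel", value "small"  */
#endif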
19335 /* Count how many times the character C appears in
19336 NULL-terminated string STR. */
19338 static unsigned int
19339 num_occurences_in_str (char c, char *str)
19341 unsigned int res = 0;
19342 while (*str != '\0')
19344 if (*str == c)
19345 res++;
19347 str++;
19350 return res;
19353 /* Parse the tree in ARGS that contains the target attribute information
19354 and update the global target options space. */
19356 bool
19357 aarch64_process_target_attr (tree args)
19359 if (TREE_CODE (args) == TREE_LIST)
19363 tree head = TREE_VALUE (args);
19364 if (head)
19366 if (!aarch64_process_target_attr (head))
19367 return false;
19369 args = TREE_CHAIN (args);
19370 } while (args);
19372 return true;
19375 if (TREE_CODE (args) != STRING_CST)
19377 error ("attribute %<target%> argument not a string");
19378 return false;
19381 size_t len = strlen (TREE_STRING_POINTER (args));
19382 char *str_to_check = (char *) alloca (len + 1);
19383 strcpy (str_to_check, TREE_STRING_POINTER (args));
19385 if (len == 0)
19387 error ("malformed %<target()%> pragma or attribute");
19388 return false;
19391 /* Used to catch empty tokens between commas, i.e.
19392 attribute ((target ("attr1,,attr2"))). */
19393 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19395 /* Handle multiple target attributes separated by ','. */
19396 char *token = strtok_r (str_to_check, ",", &str_to_check);
19398 unsigned int num_attrs = 0;
19399 while (token)
19401 num_attrs++;
19402 if (!aarch64_process_one_target_attr (token))
19404 /* Check if token is possibly an arch extension without
19405 leading '+'. */
19406 aarch64_feature_flags isa_temp = 0;
19407 auto with_plus = std::string ("+") + token;
19408 enum aarch_parse_opt_result ext_res
19409 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19411 if (ext_res == AARCH_PARSE_OK)
19412 error ("arch extension %<%s%> should be prefixed by %<+%>",
19413 token);
19414 else
19415 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19416 return false;
19419 token = strtok_r (NULL, ",", &str_to_check);
19422 if (num_attrs != num_commas + 1)
19424 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19425 return false;
19428 return true;
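/* Illustrative sketch, not part of the sources: why the comma count above
   is compared against the number of tokens.  strtok_r silently swallows
   empty tokens, so "attr1,,attr2" produces two tokens but two commas and
   is rejected.  */
#if 0
static bool
sketch_commas_match_tokens (const char *attr_string)
{
  unsigned int commas = 0;
  for (const char *p = attr_string; *p; p++)
    if (*p == ',')
      commas++;

  char *copy = xstrdup (attr_string);
  char *save = NULL;
  unsigned int tokens = 0;
  for (char *tok = strtok_r (copy, ",", &save); tok;
       tok = strtok_r (NULL, ",", &save))
    tokens++;
  free (copy);

  return tokens == commas + 1;	/* False for "attr1,,attr2".  */
}
#endif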
19431 static bool aarch64_process_target_version_attr (tree args);
19433 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19434 process attribute ((target ("..."))). */
19436 static bool
19437 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19439 struct cl_target_option cur_target;
19440 bool ret;
19441 tree old_optimize;
19442 tree new_target, new_optimize;
19443 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19445 /* If what we're processing is the current pragma string then the
19446 target option node is already stored in target_option_current_node
19447 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19448 having to re-parse the string. This is especially useful to keep
19449 arm_neon.h compile times down since that header contains a lot
19450 of intrinsics enclosed in pragmas. */
19451 if (!existing_target && args == current_target_pragma)
19453 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19454 return true;
19456 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19458 old_optimize
19459 = build_optimization_node (&global_options, &global_options_set);
19460 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19462 /* If the function changed the optimization levels as well as setting
19463 target options, start with the optimizations specified. */
19464 if (func_optimize && func_optimize != old_optimize)
19465 cl_optimization_restore (&global_options, &global_options_set,
19466 TREE_OPTIMIZATION (func_optimize));
19468 /* Save the current target options to restore at the end. */
19469 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19471 /* If fndecl already has some target attributes applied to it, unpack
19472 them so that we add this attribute on top of them, rather than
19473 overwriting them. */
19474 if (existing_target)
19476 struct cl_target_option *existing_options
19477 = TREE_TARGET_OPTION (existing_target);
19479 if (existing_options)
19480 cl_target_option_restore (&global_options, &global_options_set,
19481 existing_options);
19483 else
19484 cl_target_option_restore (&global_options, &global_options_set,
19485 TREE_TARGET_OPTION (target_option_current_node));
19487 ret = aarch64_process_target_attr (args);
19489 if (ret)
19491 tree version_attr = lookup_attribute ("target_version",
19492 DECL_ATTRIBUTES (fndecl));
19493 if (version_attr != NULL_TREE)
19495 /* Reapply any target_version attribute after target attribute.
19496 This should be equivalent to applying the target_version once
19497 after processing all target attributes. */
19498 tree version_args = TREE_VALUE (version_attr);
19499 ret = aarch64_process_target_version_attr (version_args);
19503 /* Set up any additional state. */
19504 if (ret)
19506 aarch64_override_options_internal (&global_options);
19507 new_target = build_target_option_node (&global_options,
19508 &global_options_set);
19510 else
19511 new_target = NULL;
19513 new_optimize = build_optimization_node (&global_options,
19514 &global_options_set);
19516 if (fndecl && ret)
19518 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19520 if (old_optimize != new_optimize)
19521 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19524 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19526 if (old_optimize != new_optimize)
19527 cl_optimization_restore (&global_options, &global_options_set,
19528 TREE_OPTIMIZATION (old_optimize));
19529 return ret;
19532 typedef unsigned long long aarch64_fmv_feature_mask;
19534 typedef struct
19536 const char *name;
19537 aarch64_fmv_feature_mask feature_mask;
19538 aarch64_feature_flags opt_flags;
19539 } aarch64_fmv_feature_datum;
19541 #define AARCH64_FMV_FEATURE(NAME, FEAT_NAME, C) \
19542 {NAME, 1ULL << FEAT_##FEAT_NAME, ::feature_deps::fmv_deps_##FEAT_NAME},
19544 /* FMV features are listed in priority order, to make it easier to sort target
19545 strings. */
19546 static aarch64_fmv_feature_datum aarch64_fmv_feature_data[] = {
19547 #include "config/aarch64/aarch64-option-extensions.def"
19550 /* Parse a function multiversioning feature string STR, as found in a
19551 target_version or target_clones attribute.
19553 If ISA_FLAGS is nonnull, then update it with the specified architecture
19554 features turned on. If FEATURE_MASK is nonnull, then assign to it a bitmask
19555 representing the set of features explicitly specified in the feature string.
19556 Return an aarch_parse_opt_result describing the result.
19558    When STR contains an invalid or duplicate extension, a copy of
19559    the extension string is created and stored in INVALID_EXTENSION. */
19561 static enum aarch_parse_opt_result
19562 aarch64_parse_fmv_features (const char *str, aarch64_feature_flags *isa_flags,
19563 aarch64_fmv_feature_mask *feature_mask,
19564 std::string *invalid_extension)
19566 if (feature_mask)
19567 *feature_mask = 0ULL;
19569 if (strcmp (str, "default") == 0)
19570 return AARCH_PARSE_OK;
19572 while (str != NULL && *str != 0)
19574 const char *ext;
19575 size_t len;
19577 ext = strchr (str, '+');
19579 if (ext != NULL)
19580 len = ext - str;
19581 else
19582 len = strlen (str);
19584 if (len == 0)
19585 return AARCH_PARSE_MISSING_ARG;
19587 static const int num_features = ARRAY_SIZE (aarch64_fmv_feature_data);
19588 int i;
19589 for (i = 0; i < num_features; i++)
19591 if (strlen (aarch64_fmv_feature_data[i].name) == len
19592 && strncmp (aarch64_fmv_feature_data[i].name, str, len) == 0)
19594 if (isa_flags)
19595 *isa_flags |= aarch64_fmv_feature_data[i].opt_flags;
19596 if (feature_mask)
19598 auto old_feature_mask = *feature_mask;
19599 *feature_mask |= aarch64_fmv_feature_data[i].feature_mask;
19600 if (*feature_mask == old_feature_mask)
19602 /* Duplicate feature. */
19603 if (invalid_extension)
19604 *invalid_extension = std::string (str, len);
19605 return AARCH_PARSE_DUPLICATE_FEATURE;
19608 break;
19612 if (i == num_features)
19614 /* Feature not found in list. */
19615 if (invalid_extension)
19616 *invalid_extension = std::string (str, len);
19617 return AARCH_PARSE_INVALID_FEATURE;
19620 str = ext;
19621 if (str)
19622 /* Skip over the next '+'. */
19623 str++;
19626 return AARCH_PARSE_OK;
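/* Illustrative use of the parser above (the feature names are examples
   only):

     aarch64_feature_flags flags = 0;
     aarch64_fmv_feature_mask mask = 0;
     std::string bad;
     if (aarch64_parse_fmv_features ("sve2+dotprod", &flags, &mask, &bad)
	 == AARCH_PARSE_OK)
       gcc_checking_assert (popcount_hwi (mask) == 2);

   A string such as "sve2+sve2" would instead yield
   AARCH_PARSE_DUPLICATE_FEATURE with "sve2" copied into BAD, and an
   unknown name yields AARCH_PARSE_INVALID_FEATURE.  */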
19629 /* Parse the tree in ARGS that contains the target_version attribute
19630 information and update the global target options space. */
19632 static bool
19633 aarch64_process_target_version_attr (tree args)
19635 if (TREE_CODE (args) == TREE_LIST)
19637 if (TREE_CHAIN (args))
19639 error ("attribute %<target_version%> has multiple values");
19640 return false;
19642 args = TREE_VALUE (args);
19645 if (!args || TREE_CODE (args) != STRING_CST)
19647 error ("attribute %<target_version%> argument not a string");
19648 return false;
19651 const char *str = TREE_STRING_POINTER (args);
19653 enum aarch_parse_opt_result parse_res;
19654 auto isa_flags = aarch64_asm_isa_flags;
19656 std::string invalid_extension;
19657 parse_res = aarch64_parse_fmv_features (str, &isa_flags, NULL,
19658 &invalid_extension);
19660 if (parse_res == AARCH_PARSE_OK)
19662 aarch64_set_asm_isa_flags (isa_flags);
19663 return true;
19666 switch (parse_res)
19668 case AARCH_PARSE_MISSING_ARG:
19669 error ("missing value in %<target_version%> attribute");
19670 break;
19672 case AARCH_PARSE_INVALID_FEATURE:
19673 error ("invalid feature modifier %qs of value %qs in "
19674 "%<target_version%> attribute", invalid_extension.c_str (),
19675 str);
19676 break;
19678 case AARCH_PARSE_DUPLICATE_FEATURE:
19679 error ("duplicate feature modifier %qs of value %qs in "
19680 "%<target_version%> attribute", invalid_extension.c_str (),
19681 str);
19682 break;
19684 default:
19685 gcc_unreachable ();
19688 return false;
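/* For example, a declaration such as

     __attribute__ ((target_version ("sve2"))) int foo (void);

   reaches the function above with ARGS holding the STRING_CST "sve2";
   on success the parsed features are merged into the global ISA flags
   via aarch64_set_asm_isa_flags.  ("sve2" is only an illustrative
   feature name.)  */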
19691 /* Implement TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P. This is used to
19692 process attribute ((target_version ("..."))). */
19694 static bool
19695 aarch64_option_valid_version_attribute_p (tree fndecl, tree, tree args, int)
19697 struct cl_target_option cur_target;
19698 bool ret;
19699 tree new_target;
19700 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19702 /* Save the current target options to restore at the end. */
19703 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19705 /* If fndecl already has some target attributes applied to it, unpack
19706 them so that we add this attribute on top of them, rather than
19707 overwriting them. */
19708 if (existing_target)
19710 struct cl_target_option *existing_options
19711 = TREE_TARGET_OPTION (existing_target);
19713 if (existing_options)
19714 cl_target_option_restore (&global_options, &global_options_set,
19715 existing_options);
19717 else
19718 cl_target_option_restore (&global_options, &global_options_set,
19719 TREE_TARGET_OPTION (target_option_current_node));
19721 ret = aarch64_process_target_version_attr (args);
19723 /* Set up any additional state. */
19724 if (ret)
19726 aarch64_override_options_internal (&global_options);
19727 new_target = build_target_option_node (&global_options,
19728 &global_options_set);
19730 else
19731 new_target = NULL;
19733 if (fndecl && ret)
19734 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19736 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19738 return ret;
19741 /* This parses the attribute arguments to target_version in DECL and returns
19742    the feature mask required to select those targets.  No adjustments are made to
19743 add or remove redundant feature requirements. */
19745 static aarch64_fmv_feature_mask
19746 get_feature_mask_for_version (tree decl)
19748 tree version_attr = lookup_attribute ("target_version",
19749 DECL_ATTRIBUTES (decl));
19750 if (version_attr == NULL)
19751 return 0;
19753 const char *version_string = TREE_STRING_POINTER (TREE_VALUE (TREE_VALUE
19754 (version_attr)));
19755 enum aarch_parse_opt_result parse_res;
19756 aarch64_fmv_feature_mask feature_mask;
19758 parse_res = aarch64_parse_fmv_features (version_string, NULL, &feature_mask,
19759 NULL);
19761 /* We should have detected any errors before getting here. */
19762 gcc_assert (parse_res == AARCH_PARSE_OK);
19764 return feature_mask;
19767 /* Compare priorities of two feature masks. Return:
19768 1: mask1 is higher priority
19769 -1: mask2 is higher priority
19770 0: masks are equal. */
19772 static int
19773 compare_feature_masks (aarch64_fmv_feature_mask mask1,
19774 aarch64_fmv_feature_mask mask2)
19776 int pop1 = popcount_hwi (mask1);
19777 int pop2 = popcount_hwi (mask2);
19778 if (pop1 > pop2)
19779 return 1;
19780 if (pop2 > pop1)
19781 return -1;
19783 auto diff_mask = mask1 ^ mask2;
19784 if (diff_mask == 0ULL)
19785 return 0;
19786 for (int i = FEAT_MAX - 1; i > 0; i--)
19788 auto bit_mask = aarch64_fmv_feature_data[i].feature_mask;
19789 if (diff_mask & bit_mask)
19790 return (mask1 & bit_mask) ? 1 : -1;
19792 gcc_unreachable();
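/* Worked example of the ordering above: masks are ranked first by the
   number of feature bits set; ties are broken by the highest-indexed
   entry of aarch64_fmv_feature_data on which the masks differ, since
   the loop scans from FEAT_MAX - 1 downwards and stops at the first
   differing bit.  Schematically:

     popcount (mask1) > popcount (mask2)   -> 1
     popcount (mask1) < popcount (mask2)   -> -1
     equal popcount, mask1 == mask2        -> 0
     equal popcount, masks differ          -> sign given by which mask
					      owns the differing bit
					      found first in the scan  */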
19795 /* Compare priorities of two version decls. */
19798 aarch64_compare_version_priority (tree decl1, tree decl2)
19800 auto mask1 = get_feature_mask_for_version (decl1);
19801 auto mask2 = get_feature_mask_for_version (decl2);
19803 return compare_feature_masks (mask1, mask2);
19806 /* Build the struct __ifunc_arg_t type:
19808 struct __ifunc_arg_t
19810 unsigned long _size; // Size of the struct, so it can grow.
19811 unsigned long _hwcap;
19812 unsigned long _hwcap2;
19816 static tree
19817 build_ifunc_arg_type ()
19819 tree ifunc_arg_type = lang_hooks.types.make_type (RECORD_TYPE);
19820 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19821 get_identifier ("_size"),
19822 long_unsigned_type_node);
19823 tree field2 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19824 get_identifier ("_hwcap"),
19825 long_unsigned_type_node);
19826 tree field3 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
19827 get_identifier ("_hwcap2"),
19828 long_unsigned_type_node);
19830 DECL_FIELD_CONTEXT (field1) = ifunc_arg_type;
19831 DECL_FIELD_CONTEXT (field2) = ifunc_arg_type;
19832 DECL_FIELD_CONTEXT (field3) = ifunc_arg_type;
19834 TYPE_FIELDS (ifunc_arg_type) = field1;
19835 DECL_CHAIN (field1) = field2;
19836 DECL_CHAIN (field2) = field3;
19838 layout_type (ifunc_arg_type);
19840 tree const_type = build_qualified_type (ifunc_arg_type, TYPE_QUAL_CONST);
19841 tree pointer_type = build_pointer_type (const_type);
19843 return pointer_type;
19846 /* Make the resolver function decl to dispatch the versions of
19847 a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
19848    the ifunc alias that will point to the created resolver.  Create an
19849 empty basic block in the resolver and store the pointer in
19850 EMPTY_BB. Return the decl of the resolver function. */
19852 static tree
19853 make_resolver_func (const tree default_decl,
19854 const tree ifunc_alias_decl,
19855 basic_block *empty_bb)
19857 tree decl, type, t;
19859 /* Create resolver function name based on default_decl. */
19860 tree decl_name = clone_function_name (default_decl, "resolver");
19861 const char *resolver_name = IDENTIFIER_POINTER (decl_name);
19863 /* The resolver function should have signature
19864 (void *) resolver (uint64_t, const __ifunc_arg_t *) */
19865 type = build_function_type_list (ptr_type_node,
19866 uint64_type_node,
19867 build_ifunc_arg_type (),
19868 NULL_TREE);
19870 decl = build_fn_decl (resolver_name, type);
19871 SET_DECL_ASSEMBLER_NAME (decl, decl_name);
19873 DECL_NAME (decl) = decl_name;
19874 TREE_USED (decl) = 1;
19875 DECL_ARTIFICIAL (decl) = 1;
19876 DECL_IGNORED_P (decl) = 1;
19877 TREE_PUBLIC (decl) = 0;
19878 DECL_UNINLINABLE (decl) = 1;
19880 /* Resolver is not external, body is generated. */
19881 DECL_EXTERNAL (decl) = 0;
19882 DECL_EXTERNAL (ifunc_alias_decl) = 0;
19884 DECL_CONTEXT (decl) = NULL_TREE;
19885 DECL_INITIAL (decl) = make_node (BLOCK);
19886 DECL_STATIC_CONSTRUCTOR (decl) = 0;
19888 if (DECL_COMDAT_GROUP (default_decl)
19889 || TREE_PUBLIC (default_decl))
19891 /* In this case, each translation unit with a call to this
19892 versioned function will put out a resolver. Ensure it
19893 is comdat to keep just one copy. */
19894 DECL_COMDAT (decl) = 1;
19895 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
19897 else
19898 TREE_PUBLIC (ifunc_alias_decl) = 0;
19900 /* Build result decl and add to function_decl. */
19901 t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
19902 DECL_CONTEXT (t) = decl;
19903 DECL_ARTIFICIAL (t) = 1;
19904 DECL_IGNORED_P (t) = 1;
19905 DECL_RESULT (decl) = t;
19907 /* Build parameter decls and add to function_decl. */
19908 tree arg1 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
19909 get_identifier ("hwcap"),
19910 uint64_type_node);
19911 tree arg2 = build_decl (UNKNOWN_LOCATION, PARM_DECL,
19912 get_identifier ("arg"),
19913 build_ifunc_arg_type());
19914 DECL_CONTEXT (arg1) = decl;
19915 DECL_CONTEXT (arg2) = decl;
19916 DECL_ARTIFICIAL (arg1) = 1;
19917 DECL_ARTIFICIAL (arg2) = 1;
19918 DECL_IGNORED_P (arg1) = 1;
19919 DECL_IGNORED_P (arg2) = 1;
19920 DECL_ARG_TYPE (arg1) = uint64_type_node;
19921 DECL_ARG_TYPE (arg2) = build_ifunc_arg_type ();
19922 DECL_ARGUMENTS (decl) = arg1;
19923 TREE_CHAIN (arg1) = arg2;
19925 gimplify_function_tree (decl);
19926 push_cfun (DECL_STRUCT_FUNCTION (decl));
19927 *empty_bb = init_lowered_empty_function (decl, false,
19928 profile_count::uninitialized ());
19930 cgraph_node::add_new_function (decl, true);
19931 symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
19933 pop_cfun ();
19935 gcc_assert (ifunc_alias_decl != NULL);
19936 /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
19937 DECL_ATTRIBUTES (ifunc_alias_decl)
19938 = make_attribute ("ifunc", resolver_name,
19939 DECL_ATTRIBUTES (ifunc_alias_decl));
19941 /* Create the alias for dispatch to resolver here. */
19942 cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
19943 return decl;
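/* Sketch of the result (not emitted verbatim): for a default version
   "foo", the code above creates a resolver whose C-level shape is

     void *foo.resolver (uint64_t hwcap, const __ifunc_arg_t *arg);

   (the exact name comes from clone_function_name), marks it comdat when
   the default version is public, and turns IFUNC_ALIAS_DECL into

     __attribute__ ((ifunc ("foo.resolver")))

   so that calls to the multi-versioned function go through the resolver
   chosen at load time.  The body is filled in later by
   dispatch_function_versions.  */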
19946 /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
19947    to return a pointer to VERSION_DECL if none of the feature bits specified in
19948    FEATURE_MASK are set in MASK_VAR.  This function will be called during
19949 version dispatch to decide which function version to execute. It returns
19950 the basic block at the end, to which more conditions can be added. */
19951 static basic_block
19952 add_condition_to_bb (tree function_decl, tree version_decl,
19953 aarch64_fmv_feature_mask feature_mask,
19954 tree mask_var, basic_block new_bb)
19956 gimple *return_stmt;
19957 tree convert_expr, result_var;
19958 gimple *convert_stmt;
19959 gimple *if_else_stmt;
19961 basic_block bb1, bb2, bb3;
19962 edge e12, e23;
19964 gimple_seq gseq;
19966 push_cfun (DECL_STRUCT_FUNCTION (function_decl));
19968 gcc_assert (new_bb != NULL);
19969 gseq = bb_seq (new_bb);
19971 convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
19972 build_fold_addr_expr (version_decl));
19973 result_var = create_tmp_var (ptr_type_node);
19974 convert_stmt = gimple_build_assign (result_var, convert_expr);
19975 return_stmt = gimple_build_return (result_var);
19977 if (feature_mask == 0ULL)
19979 /* Default version. */
19980 gimple_seq_add_stmt (&gseq, convert_stmt);
19981 gimple_seq_add_stmt (&gseq, return_stmt);
19982 set_bb_seq (new_bb, gseq);
19983 gimple_set_bb (convert_stmt, new_bb);
19984 gimple_set_bb (return_stmt, new_bb);
19985 pop_cfun ();
19986 return new_bb;
19989 tree and_expr_var = create_tmp_var (long_long_unsigned_type_node);
19990 tree and_expr = build2 (BIT_AND_EXPR,
19991 long_long_unsigned_type_node,
19992 mask_var,
19993 build_int_cst (long_long_unsigned_type_node,
19994 feature_mask));
19995 gimple *and_stmt = gimple_build_assign (and_expr_var, and_expr);
19996 gimple_set_block (and_stmt, DECL_INITIAL (function_decl));
19997 gimple_set_bb (and_stmt, new_bb);
19998 gimple_seq_add_stmt (&gseq, and_stmt);
20000 tree zero_llu = build_int_cst (long_long_unsigned_type_node, 0);
20001 if_else_stmt = gimple_build_cond (EQ_EXPR, and_expr_var, zero_llu,
20002 NULL_TREE, NULL_TREE);
20003 gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
20004 gimple_set_bb (if_else_stmt, new_bb);
20005 gimple_seq_add_stmt (&gseq, if_else_stmt);
20007 gimple_seq_add_stmt (&gseq, convert_stmt);
20008 gimple_seq_add_stmt (&gseq, return_stmt);
20009 set_bb_seq (new_bb, gseq);
20011 bb1 = new_bb;
20012 e12 = split_block (bb1, if_else_stmt);
20013 bb2 = e12->dest;
20014 e12->flags &= ~EDGE_FALLTHRU;
20015 e12->flags |= EDGE_TRUE_VALUE;
20017 e23 = split_block (bb2, return_stmt);
20019 gimple_set_bb (convert_stmt, bb2);
20020 gimple_set_bb (return_stmt, bb2);
20022 bb3 = e23->dest;
20023 make_edge (bb1, bb3, EDGE_FALSE_VALUE);
20025 remove_edge (e23);
20026 make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
20028 pop_cfun ();
20030 return bb3;
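/* In C-like pseudocode, each call for a non-default version appends

     tmp = MASK_VAR & FEATURE_MASK;    // MASK_VAR is ~__aarch64_cpu_features.features
     if (tmp == 0)                     // every required feature is present
       return (void *) &VERSION_DECL;

   to the resolver and returns the fall-through block, so that the next
   (lower-priority) version can append its own test.  For the default
   version (FEATURE_MASK == 0) the pointer is returned unconditionally.  */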
20033 /* This function generates the dispatch function for
20034 multi-versioned functions. DISPATCH_DECL is the function which will
20035 contain the dispatch logic. FNDECLS are the function choices for
20036 dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
20037 in DISPATCH_DECL in which the dispatch code is generated. */
20039 static int
20040 dispatch_function_versions (tree dispatch_decl,
20041 void *fndecls_p,
20042 basic_block *empty_bb)
20044 gimple *ifunc_cpu_init_stmt;
20045 gimple_seq gseq;
20046 vec<tree> *fndecls;
20048 gcc_assert (dispatch_decl != NULL
20049 && fndecls_p != NULL
20050 && empty_bb != NULL);
20052 push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
20054 gseq = bb_seq (*empty_bb);
20055 /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
20056    constructors, so explicitly call __init_cpu_features_resolver here. */
20057 tree init_fn_type = build_function_type_list (void_type_node,
20058 long_unsigned_type_node,
20059 build_ifunc_arg_type(),
20060 NULL);
20061 tree init_fn_id = get_identifier ("__init_cpu_features_resolver");
20062 tree init_fn_decl = build_decl (UNKNOWN_LOCATION, FUNCTION_DECL,
20063 init_fn_id, init_fn_type);
20064 tree arg1 = DECL_ARGUMENTS (dispatch_decl);
20065 tree arg2 = TREE_CHAIN (arg1);
20066 ifunc_cpu_init_stmt = gimple_build_call (init_fn_decl, 2, arg1, arg2);
20067 gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
20068 gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
20070 /* Build the struct type for __aarch64_cpu_features. */
20071 tree global_type = lang_hooks.types.make_type (RECORD_TYPE);
20072 tree field1 = build_decl (UNKNOWN_LOCATION, FIELD_DECL,
20073 get_identifier ("features"),
20074 long_long_unsigned_type_node);
20075 DECL_FIELD_CONTEXT (field1) = global_type;
20076 TYPE_FIELDS (global_type) = field1;
20077 layout_type (global_type);
20079 tree global_var = build_decl (UNKNOWN_LOCATION, VAR_DECL,
20080 get_identifier ("__aarch64_cpu_features"),
20081 global_type);
20082 DECL_EXTERNAL (global_var) = 1;
20083 tree mask_var = create_tmp_var (long_long_unsigned_type_node);
20085 tree component_expr = build3 (COMPONENT_REF, long_long_unsigned_type_node,
20086 global_var, field1, NULL_TREE);
20087 gimple *component_stmt = gimple_build_assign (mask_var, component_expr);
20088 gimple_set_block (component_stmt, DECL_INITIAL (dispatch_decl));
20089 gimple_set_bb (component_stmt, *empty_bb);
20090 gimple_seq_add_stmt (&gseq, component_stmt);
20092 tree not_expr = build1 (BIT_NOT_EXPR, long_long_unsigned_type_node, mask_var);
20093 gimple *not_stmt = gimple_build_assign (mask_var, not_expr);
20094 gimple_set_block (not_stmt, DECL_INITIAL (dispatch_decl));
20095 gimple_set_bb (not_stmt, *empty_bb);
20096 gimple_seq_add_stmt (&gseq, not_stmt);
20098 set_bb_seq (*empty_bb, gseq);
20100 pop_cfun ();
20102 /* fndecls_p is actually a vector. */
20103 fndecls = static_cast<vec<tree> *> (fndecls_p);
20105 /* At least one more version other than the default. */
20106 unsigned int num_versions = fndecls->length ();
20107 gcc_assert (num_versions >= 2);
20109 struct function_version_info
20111 tree version_decl;
20112 aarch64_fmv_feature_mask feature_mask;
20113 } *function_versions;
20115 function_versions = (struct function_version_info *)
20116 XNEWVEC (struct function_version_info, (num_versions));
20118 unsigned int actual_versions = 0;
20120 for (tree version_decl : *fndecls)
20122 aarch64_fmv_feature_mask feature_mask;
20123 /* Get attribute string, parse it and find the right features. */
20124 feature_mask = get_feature_mask_for_version (version_decl);
20125 function_versions [actual_versions].version_decl = version_decl;
20126 function_versions [actual_versions].feature_mask = feature_mask;
20127 actual_versions++;
20130 auto compare_feature_version_info = [](const void *p1, const void *p2) {
20131 const function_version_info v1 = *(const function_version_info *)p1;
20132 const function_version_info v2 = *(const function_version_info *)p2;
20133 return - compare_feature_masks (v1.feature_mask, v2.feature_mask);
20136 /* Sort the versions according to descending order of dispatch priority. */
20137 qsort (function_versions, actual_versions,
20138 sizeof (struct function_version_info), compare_feature_version_info);
20140 for (unsigned int i = 0; i < actual_versions; ++i)
20141 *empty_bb = add_condition_to_bb (dispatch_decl,
20142 function_versions[i].version_decl,
20143 function_versions[i].feature_mask,
20144 mask_var,
20145 *empty_bb);
20147 free (function_versions);
20148 return 0;
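/* Putting the pieces together, the generated resolver behaves like this
   C sketch (names illustrative):

     __init_cpu_features_resolver (hwcap, arg);
     unsigned long long m = ~__aarch64_cpu_features.features;
     if ((m & mask_of_highest_priority_version) == 0)
       return &that_version;
     ...                               // one test per remaining version
     return &default_version;          // feature mask 0 always matches

   Versions are tried in decreasing dispatch priority because of the
   qsort above, and the default version (empty mask) sorts last.  */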
20151 /* Implement TARGET_GENERATE_VERSION_DISPATCHER_BODY. */
20153 tree
20154 aarch64_generate_version_dispatcher_body (void *node_p)
20156 tree resolver_decl;
20157 basic_block empty_bb;
20158 tree default_ver_decl;
20159 struct cgraph_node *versn;
20160 struct cgraph_node *node;
20162 struct cgraph_function_version_info *node_version_info = NULL;
20163 struct cgraph_function_version_info *versn_info = NULL;
20165 node = (cgraph_node *)node_p;
20167 node_version_info = node->function_version ();
20168 gcc_assert (node->dispatcher_function
20169 && node_version_info != NULL);
20171 if (node_version_info->dispatcher_resolver)
20172 return node_version_info->dispatcher_resolver;
20174 /* The first version in the chain corresponds to the default version. */
20175 default_ver_decl = node_version_info->next->this_node->decl;
20177 /* node is going to be an alias, so remove the finalized bit. */
20178 node->definition = false;
20180 resolver_decl = make_resolver_func (default_ver_decl,
20181 node->decl, &empty_bb);
20183 node_version_info->dispatcher_resolver = resolver_decl;
20185 push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
20187 auto_vec<tree, 2> fn_ver_vec;
20189 for (versn_info = node_version_info->next; versn_info;
20190 versn_info = versn_info->next)
20192 versn = versn_info->this_node;
20193 /* Check for virtual functions here again, as by this time it should
20194 have been determined if this function needs a vtable index or
20195 not. This happens for methods in derived classes that override
20196 virtual methods in base classes but are not explicitly marked as
20197 virtual. */
20198 if (DECL_VINDEX (versn->decl))
20199 sorry ("virtual function multiversioning not supported");
20201 fn_ver_vec.safe_push (versn->decl);
20204 dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
20205 cgraph_edge::rebuild_edges ();
20206 pop_cfun ();
20207 return resolver_decl;
20210 /* Make a dispatcher declaration for the multi-versioned function DECL.
20211 Calls to DECL function will be replaced with calls to the dispatcher
20212 by the front-end. Returns the decl of the dispatcher function. */
20214 tree
20215 aarch64_get_function_versions_dispatcher (void *decl)
20217 tree fn = (tree) decl;
20218 struct cgraph_node *node = NULL;
20219 struct cgraph_node *default_node = NULL;
20220 struct cgraph_function_version_info *node_v = NULL;
20221 struct cgraph_function_version_info *first_v = NULL;
20223 tree dispatch_decl = NULL;
20225 struct cgraph_function_version_info *default_version_info = NULL;
20227 gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
20229 node = cgraph_node::get (fn);
20230 gcc_assert (node != NULL);
20232 node_v = node->function_version ();
20233 gcc_assert (node_v != NULL);
20235 if (node_v->dispatcher_resolver != NULL)
20236 return node_v->dispatcher_resolver;
20238 /* Find the default version and make it the first node. */
20239 first_v = node_v;
20240 /* Go to the beginning of the chain. */
20241 while (first_v->prev != NULL)
20242 first_v = first_v->prev;
20243 default_version_info = first_v;
20244 while (default_version_info != NULL)
20246 if (get_feature_mask_for_version
20247 (default_version_info->this_node->decl) == 0ULL)
20248 break;
20249 default_version_info = default_version_info->next;
20252 /* If there is no default node, just return NULL. */
20253 if (default_version_info == NULL)
20254 return NULL;
20256 /* Make default info the first node. */
20257 if (first_v != default_version_info)
20259 default_version_info->prev->next = default_version_info->next;
20260 if (default_version_info->next)
20261 default_version_info->next->prev = default_version_info->prev;
20262 first_v->prev = default_version_info;
20263 default_version_info->next = first_v;
20264 default_version_info->prev = NULL;
20267 default_node = default_version_info->this_node;
20269 if (targetm.has_ifunc_p ())
20271 struct cgraph_function_version_info *it_v = NULL;
20272 struct cgraph_node *dispatcher_node = NULL;
20273 struct cgraph_function_version_info *dispatcher_version_info = NULL;
20275 /* Right now, the dispatching is done via ifunc. */
20276 dispatch_decl = make_dispatcher_decl (default_node->decl);
20277 TREE_NOTHROW (dispatch_decl) = TREE_NOTHROW (fn);
20279 dispatcher_node = cgraph_node::get_create (dispatch_decl);
20280 gcc_assert (dispatcher_node != NULL);
20281 dispatcher_node->dispatcher_function = 1;
20282 dispatcher_version_info
20283 = dispatcher_node->insert_new_function_version ();
20284 dispatcher_version_info->next = default_version_info;
20285 dispatcher_node->definition = 1;
20287 /* Set the dispatcher for all the versions. */
20288 it_v = default_version_info;
20289 while (it_v != NULL)
20291 it_v->dispatcher_resolver = dispatch_decl;
20292 it_v = it_v->next;
20295 else
20297 error_at (DECL_SOURCE_LOCATION (default_node->decl),
20298 "multiversioning needs %<ifunc%> which is not supported "
20299 "on this target");
20302 return dispatch_decl;
20305 /* This function returns true if FN1 and FN2 are versions of the same function,
20306 that is, the target_version attributes of the function decls are different.
20307 This assumes that FN1 and FN2 have the same signature. */
20309 bool
20310 aarch64_common_function_versions (tree fn1, tree fn2)
20312 if (TREE_CODE (fn1) != FUNCTION_DECL
20313 || TREE_CODE (fn2) != FUNCTION_DECL)
20314 return false;
20316 return (aarch64_compare_version_priority (fn1, fn2) != 0);
20319 /* Implement TARGET_MANGLE_DECL_ASSEMBLER_NAME, to add function multiversioning
20320 suffixes. */
20322 tree
20323 aarch64_mangle_decl_assembler_name (tree decl, tree id)
20325 /* For function version, add the target suffix to the assembler name. */
20326 if (TREE_CODE (decl) == FUNCTION_DECL
20327 && DECL_FUNCTION_VERSIONED (decl))
20329 aarch64_fmv_feature_mask feature_mask = get_feature_mask_for_version (decl);
20331 /* No suffix for the default version. */
20332 if (feature_mask == 0ULL)
20333 return id;
20335 std::string name = IDENTIFIER_POINTER (id);
20336 name += "._";
20338 for (int i = 0; i < FEAT_MAX; i++)
20340 if (feature_mask & aarch64_fmv_feature_data[i].feature_mask)
20342 name += "M";
20343 name += aarch64_fmv_feature_data[i].name;
20347 if (DECL_ASSEMBLER_NAME_SET_P (decl))
20348 SET_DECL_RTL (decl, NULL);
20350 id = get_identifier (name.c_str());
20352 return id;
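/* For example (illustrative feature names), a version declared with
   target_version ("dotprod+sve2") and base assembler name "foo" gets a
   suffix of the form

     foo._MdotprodMsve2

   with one "M<feature>" group per selected feature, emitted in the
   order the features appear in aarch64_fmv_feature_data (so the exact
   suffix follows aarch64-option-extensions.def, not the attribute
   string).  The default version keeps the plain name "foo".  */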
20355 /* Implement TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P. Use an opt-out
20356 rather than an opt-in list. */
20358 static bool
20359 aarch64_function_attribute_inlinable_p (const_tree fndecl)
20361 /* A function that has local SME state cannot be inlined into its caller,
20362 since we only support managing PSTATE.ZA switches at function scope. */
20363 return (!aarch64_fndecl_has_new_state (fndecl, "za")
20364 && !aarch64_fndecl_has_new_state (fndecl, "zt0"));
20367 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
20368 tri-bool options (yes, no, don't care) and the default value is
20369 DEF, determine whether to reject inlining. */
20371 static bool
20372 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
20373 int dont_care, int def)
20375 /* If the callee doesn't care, always allow inlining. */
20376 if (callee == dont_care)
20377 return true;
20379 /* If the caller doesn't care, always allow inlining. */
20380 if (caller == dont_care)
20381 return true;
20383 /* Otherwise, allow inlining if either the callee and caller values
20384 agree, or if the callee is using the default value. */
20385 return (callee == caller || callee == def);
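/* For example, with DONT_CARE == 2 and DEF == 1 (the values used for
   -momit-leaf-frame-pointer below):

     aarch64_tribools_ok_for_inlining_p (0, 2, 2, 1) -> true   (callee doesn't care)
     aarch64_tribools_ok_for_inlining_p (2, 0, 2, 1) -> true   (caller doesn't care)
     aarch64_tribools_ok_for_inlining_p (1, 1, 2, 1) -> true   (settings agree)
     aarch64_tribools_ok_for_inlining_p (0, 1, 2, 1) -> true   (callee uses the default)
     aarch64_tribools_ok_for_inlining_p (1, 0, 2, 1) -> false  (explicit mismatch)  */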
20388 /* Bit allocations for ipa_fn_summary::target_info. */
20390 /* Set if the function contains a stmt that relies on the function's
20391 choice of PSTATE.SM setting (0 for non-streaming, 1 for streaming).
20392 Not meaningful for streaming-compatible functions. */
20393 constexpr auto AARCH64_IPA_SM_FIXED = 1U << 0;
20395 /* Set if the function clobbers ZA and ZT0. Not meaningful for functions that
20396 have ZA state. */
20397 constexpr auto AARCH64_IPA_CLOBBERS_ZA = 1U << 1;
20398 constexpr auto AARCH64_IPA_CLOBBERS_ZT0 = 1U << 2;
20400 /* Implement TARGET_NEED_IPA_FN_TARGET_INFO. */
20402 static bool
20403 aarch64_need_ipa_fn_target_info (const_tree, unsigned int &)
20405 /* We could in principle skip this for streaming-compatible functions
20406 that have ZA state, but that's a rare combination. */
20407 return true;
20410 /* Implement TARGET_UPDATE_IPA_FN_TARGET_INFO. */
20412 static bool
20413 aarch64_update_ipa_fn_target_info (unsigned int &info, const gimple *stmt)
20415 if (auto *ga = dyn_cast<const gasm *> (stmt))
20417 /* We don't know what the asm does, so conservatively assume that
20418 it requires the function's current SM mode. */
20419 info |= AARCH64_IPA_SM_FIXED;
20420 for (unsigned int i = 0; i < gimple_asm_nclobbers (ga); ++i)
20422 tree op = gimple_asm_clobber_op (ga, i);
20423 const char *clobber = TREE_STRING_POINTER (TREE_VALUE (op));
20424 if (strcmp (clobber, "za") == 0)
20425 info |= AARCH64_IPA_CLOBBERS_ZA;
20426 if (strcmp (clobber, "zt0") == 0)
20427 info |= AARCH64_IPA_CLOBBERS_ZT0;
20430 if (auto *call = dyn_cast<const gcall *> (stmt))
20432 if (gimple_call_builtin_p (call, BUILT_IN_MD))
20434 /* The attributes on AArch64 builtins are supposed to be accurate.
20435 If the function isn't marked streaming-compatible then it
20436 needs whichever SM mode it selects. */
20437 tree decl = gimple_call_fndecl (call);
20438 if (aarch64_fndecl_pstate_sm (decl) != 0)
20439 info |= AARCH64_IPA_SM_FIXED;
20442 return true;
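/* For instance, a statement such as

     asm volatile ("" ::: "za");

   marks the containing function as both AARCH64_IPA_SM_FIXED and
   AARCH64_IPA_CLOBBERS_ZA, while a call to an AArch64 builtin that is
   not streaming-compatible only adds AARCH64_IPA_SM_FIXED.  */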
20445 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
20446 to inline CALLEE into CALLER based on target-specific info.
20447 Make sure that the caller and callee have compatible architectural
20448 features. Then go through the other possible target attributes
20449 and see if they can block inlining. Try not to reject always_inline
20450 callees unless they are incompatible architecturally. */
20452 static bool
20453 aarch64_can_inline_p (tree caller, tree callee)
20455 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
20456 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
20458 struct cl_target_option *caller_opts
20459 = TREE_TARGET_OPTION (caller_tree ? caller_tree
20460 : target_option_default_node);
20462 struct cl_target_option *callee_opts
20463 = TREE_TARGET_OPTION (callee_tree ? callee_tree
20464 : target_option_default_node);
20466 /* Callee's ISA flags should be a subset of the caller's. */
20467 auto caller_asm_isa = (caller_opts->x_aarch64_asm_isa_flags
20468 & ~AARCH64_FL_ISA_MODES);
20469 auto callee_asm_isa = (callee_opts->x_aarch64_asm_isa_flags
20470 & ~AARCH64_FL_ISA_MODES);
20471 if (callee_asm_isa & ~caller_asm_isa)
20472 return false;
20474 auto caller_isa = (caller_opts->x_aarch64_isa_flags
20475 & ~AARCH64_FL_ISA_MODES);
20476 auto callee_isa = (callee_opts->x_aarch64_isa_flags
20477 & ~AARCH64_FL_ISA_MODES);
20478 if (callee_isa & ~caller_isa)
20479 return false;
20481 /* Return true if the callee might have target_info property PROPERTY.
20482 The answer must be true unless we have positive proof to the contrary. */
20483 auto callee_has_property = [&](unsigned int property)
20485 if (ipa_fn_summaries)
20486 if (auto *summary = ipa_fn_summaries->get (cgraph_node::get (callee)))
20487 if (!(summary->target_info & property))
20488 return false;
20489 return true;
20492 /* Streaming-compatible code can be inlined into functions with any
20493 PSTATE.SM mode. Otherwise the caller and callee must agree on
20494 PSTATE.SM mode, unless we can prove that the callee is naturally
20495 streaming-compatible. */
20496 auto caller_sm = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20497 auto callee_sm = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_SM_STATE);
20498 if (callee_sm
20499 && caller_sm != callee_sm
20500 && callee_has_property (AARCH64_IPA_SM_FIXED))
20501 return false;
20503 /* aarch64_function_attribute_inlinable_p prevents new-ZA and new-ZT0
20504 functions from being inlined into others. We also need to prevent
20505 inlining of shared-ZA functions into functions without ZA state,
20506 since this is an error condition.
20508 The only other problematic case for ZA is inlining a function that
20509 directly clobbers ZA or ZT0 into a function that has ZA or ZT0 state. */
20510 auto caller_za = (caller_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20511 auto callee_za = (callee_opts->x_aarch64_isa_flags & AARCH64_FL_ZA_ON);
20512 if (!caller_za && callee_za)
20513 return false;
20514 if (!callee_za
20515 && aarch64_fndecl_has_state (caller, "za")
20516 && callee_has_property (AARCH64_IPA_CLOBBERS_ZA))
20517 return false;
20518 if (!callee_za
20519 && aarch64_fndecl_has_state (caller, "zt0")
20520 && callee_has_property (AARCH64_IPA_CLOBBERS_ZT0))
20521 return false;
20523 /* Allow non-strict aligned functions inlining into strict
20524 aligned ones. */
20525 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
20526 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
20527 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
20528 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
20529 return false;
20531 bool always_inline = lookup_attribute ("always_inline",
20532 DECL_ATTRIBUTES (callee));
20534 /* If the architectural features match up and the callee is always_inline
20535 then the other attributes don't matter. */
20536 if (always_inline)
20537 return true;
20539 if (caller_opts->x_aarch64_cmodel_var
20540 != callee_opts->x_aarch64_cmodel_var)
20541 return false;
20543 if (caller_opts->x_aarch64_tls_dialect
20544 != callee_opts->x_aarch64_tls_dialect)
20545 return false;
20547  /* Honour explicit requests to work around errata. */
20548 if (!aarch64_tribools_ok_for_inlining_p (
20549 caller_opts->x_aarch64_fix_a53_err835769,
20550 callee_opts->x_aarch64_fix_a53_err835769,
20551 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
20552 return false;
20554 if (!aarch64_tribools_ok_for_inlining_p (
20555 caller_opts->x_aarch64_fix_a53_err843419,
20556 callee_opts->x_aarch64_fix_a53_err843419,
20557 2, TARGET_FIX_ERR_A53_843419))
20558 return false;
20560 /* If the user explicitly specified -momit-leaf-frame-pointer for the
20561     caller and callee and they don't match up, reject inlining. */
20562 if (!aarch64_tribools_ok_for_inlining_p (
20563 caller_opts->x_flag_omit_leaf_frame_pointer,
20564 callee_opts->x_flag_omit_leaf_frame_pointer,
20565 2, 1))
20566 return false;
20568 /* If the callee has specific tuning overrides, respect them. */
20569 if (callee_opts->x_aarch64_override_tune_string != NULL
20570 && caller_opts->x_aarch64_override_tune_string == NULL)
20571 return false;
20573 /* If the user specified tuning override strings for the
20574 caller and callee and they don't match up, reject inlining.
20575 We just do a string compare here, we don't analyze the meaning
20576 of the string, as it would be too costly for little gain. */
20577 if (callee_opts->x_aarch64_override_tune_string
20578 && caller_opts->x_aarch64_override_tune_string
20579 && (strcmp (callee_opts->x_aarch64_override_tune_string,
20580 caller_opts->x_aarch64_override_tune_string) != 0))
20581 return false;
20583 return true;
20586 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
20587    hasn't been already. */
20589 arm_pcs
20590 aarch64_tlsdesc_abi_id ()
20592 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
20593 if (!tlsdesc_abi.initialized_p ())
20595 HARD_REG_SET full_reg_clobbers;
20596 CLEAR_HARD_REG_SET (full_reg_clobbers);
20597 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
20598 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
20599 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
20600 SET_HARD_REG_BIT (full_reg_clobbers, regno);
20601 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
20603 return ARM_PCS_TLSDESC;
20606 /* Return true if SYMBOL_REF X binds locally. */
20608 static bool
20609 aarch64_symbol_binds_local_p (const_rtx x)
20611 return (SYMBOL_REF_DECL (x)
20612 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
20613 : SYMBOL_REF_LOCAL_P (x));
20616 /* Return true if SYMBOL_REF X is thread local */
20617 static bool
20618 aarch64_tls_symbol_p (rtx x)
20620 if (! TARGET_HAVE_TLS)
20621 return false;
20623 x = strip_salt (x);
20624 if (!SYMBOL_REF_P (x))
20625 return false;
20627 return SYMBOL_REF_TLS_MODEL (x) != 0;
20630 /* Classify a TLS symbol into one of the TLS kinds. */
20631 enum aarch64_symbol_type
20632 aarch64_classify_tls_symbol (rtx x)
20634 enum tls_model tls_kind = tls_symbolic_operand_type (x);
20636 switch (tls_kind)
20638 case TLS_MODEL_GLOBAL_DYNAMIC:
20639 case TLS_MODEL_LOCAL_DYNAMIC:
20640 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
20642 case TLS_MODEL_INITIAL_EXEC:
20643 switch (aarch64_cmodel)
20645 case AARCH64_CMODEL_TINY:
20646 case AARCH64_CMODEL_TINY_PIC:
20647 return SYMBOL_TINY_TLSIE;
20648 default:
20649 return SYMBOL_SMALL_TLSIE;
20652 case TLS_MODEL_LOCAL_EXEC:
20653 if (aarch64_tls_size == 12)
20654 return SYMBOL_TLSLE12;
20655 else if (aarch64_tls_size == 24)
20656 return SYMBOL_TLSLE24;
20657 else if (aarch64_tls_size == 32)
20658 return SYMBOL_TLSLE32;
20659 else if (aarch64_tls_size == 48)
20660 return SYMBOL_TLSLE48;
20661 else
20662 gcc_unreachable ();
20664 case TLS_MODEL_EMULATED:
20665 case TLS_MODEL_NONE:
20666 return SYMBOL_FORCE_TO_MEM;
20668 default:
20669 gcc_unreachable ();
20673 /* Return the correct method for accessing X + OFFSET, where X is either
20674 a SYMBOL_REF or LABEL_REF. */
20676 enum aarch64_symbol_type
20677 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
20679 x = strip_salt (x);
20681 if (LABEL_REF_P (x))
20683 switch (aarch64_cmodel)
20685 case AARCH64_CMODEL_LARGE:
20686 return SYMBOL_FORCE_TO_MEM;
20688 case AARCH64_CMODEL_TINY_PIC:
20689 case AARCH64_CMODEL_TINY:
20690 return SYMBOL_TINY_ABSOLUTE;
20692 case AARCH64_CMODEL_SMALL_SPIC:
20693 case AARCH64_CMODEL_SMALL_PIC:
20694 case AARCH64_CMODEL_SMALL:
20695 return SYMBOL_SMALL_ABSOLUTE;
20697 default:
20698 gcc_unreachable ();
20702 if (SYMBOL_REF_P (x))
20704 if (aarch64_tls_symbol_p (x))
20705 return aarch64_classify_tls_symbol (x);
20707 switch (aarch64_cmodel)
20709 case AARCH64_CMODEL_TINY_PIC:
20710 case AARCH64_CMODEL_TINY:
20711 /* With -fPIC non-local symbols use the GOT. For orthogonality
20712 always use the GOT for extern weak symbols. */
20713 if ((flag_pic || SYMBOL_REF_WEAK (x))
20714 && !aarch64_symbol_binds_local_p (x))
20715 return SYMBOL_TINY_GOT;
20717 /* When we retrieve symbol + offset address, we have to make sure
20718 the offset does not cause overflow of the final address. But
20719 we have no way of knowing the address of symbol at compile time
20720 so we can't accurately say if the distance between the PC and
20721      symbol + offset is outside the addressable range of +/-1MB in the
20722 TINY code model. So we limit the maximum offset to +/-64KB and
20723 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
20724 If offset_within_block_p is true we allow larger offsets. */
20725 if (!(IN_RANGE (offset, -0x10000, 0x10000)
20726 || offset_within_block_p (x, offset)))
20727 return SYMBOL_FORCE_TO_MEM;
20729 return SYMBOL_TINY_ABSOLUTE;
20732 case AARCH64_CMODEL_SMALL_SPIC:
20733 case AARCH64_CMODEL_SMALL_PIC:
20734 case AARCH64_CMODEL_SMALL:
20735 if ((flag_pic || SYMBOL_REF_WEAK (x))
20736 && !aarch64_symbol_binds_local_p (x))
20737 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
20738 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
20740 /* Same reasoning as the tiny code model, but the offset cap here is
20741 1MB, allowing +/-3.9GB for the offset to the symbol. */
20742 if (!(IN_RANGE (offset, -0x100000, 0x100000)
20743 || offset_within_block_p (x, offset)))
20744 return SYMBOL_FORCE_TO_MEM;
20746 return SYMBOL_SMALL_ABSOLUTE;
20748 case AARCH64_CMODEL_LARGE:
20749 /* This is alright even in PIC code as the constant
20750 pool reference is always PC relative and within
20751 the same translation unit. */
20752 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
20753 return SYMBOL_SMALL_ABSOLUTE;
20754 else
20755 return SYMBOL_FORCE_TO_MEM;
20757 default:
20758 gcc_unreachable ();
20762 /* By default push everything into the constant pool. */
20763 return SYMBOL_FORCE_TO_MEM;
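/* Example for the small code model: an address like SYM + 0x80000 for a
   locally binding symbol is classified SYMBOL_SMALL_ABSOLUTE (the offset
   is within the +/-1MB cap), whereas SYM + 0x200000 is forced to the
   constant pool unless offset_within_block_p can prove the address stays
   inside SYM's own object.  The tiny model applies the same logic with a
   +/-64KB cap.  */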
20766 bool
20767 aarch64_constant_address_p (rtx x)
20769 return (CONSTANT_P (x) && memory_address_p (DImode, x));
20772 bool
20773 aarch64_legitimate_pic_operand_p (rtx x)
20775 poly_int64 offset;
20776 x = strip_offset_and_salt (x, &offset);
20777 if (SYMBOL_REF_P (x))
20778 return false;
20780 return true;
20783 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
20784 that should be rematerialized rather than spilled. */
20786 static bool
20787 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
20789 /* Support CSE and rematerialization of common constants. */
20790 if (CONST_INT_P (x)
20791 || CONST_DOUBLE_P (x))
20792 return true;
20794 /* Only accept variable-length vector constants if they can be
20795 handled directly.
20797 ??? It would be possible (but complex) to handle rematerialization
20798 of other constants via secondary reloads. */
20799 if (!GET_MODE_SIZE (mode).is_constant ())
20800 return aarch64_simd_valid_immediate (x, NULL);
20802 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
20803 least be forced to memory and loaded from there. */
20804 if (CONST_VECTOR_P (x))
20805 return !targetm.cannot_force_const_mem (mode, x);
20807 /* Do not allow vector struct mode constants for Advanced SIMD.
20808 We could support 0 and -1 easily, but they need support in
20809 aarch64-simd.md. */
20810 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20811 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
20812 return false;
20814 if (GET_CODE (x) == HIGH)
20815 x = XEXP (x, 0);
20817 /* Accept polynomial constants that can be calculated by using the
20818 destination of a move as the sole temporary. Constants that
20819 require a second temporary cannot be rematerialized (they can't be
20820 forced to memory and also aren't legitimate constants). */
20821 poly_int64 offset;
20822 if (poly_int_rtx_p (x, &offset))
20823 return aarch64_offset_temporaries (false, offset) <= 1;
20825 /* If an offset is being added to something else, we need to allow the
20826 base to be moved into the destination register, meaning that there
20827 are no free temporaries for the offset. */
20828 x = strip_offset_and_salt (x, &offset);
20829 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
20830 return false;
20832 /* Do not allow const (plus (anchor_symbol, const_int)). */
20833 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
20834 return false;
20836 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
20837 so spilling them is better than rematerialization. */
20838 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
20839 return true;
20841 /* Label references are always constant. */
20842 if (LABEL_REF_P (x))
20843 return true;
20845 return false;
20849 aarch64_load_tp (rtx target)
20851 if (!target
20852 || GET_MODE (target) != Pmode
20853 || !register_operand (target, Pmode))
20854 target = gen_reg_rtx (Pmode);
20856 /* Can return in any reg. */
20857 emit_insn (gen_aarch64_load_tp_hard (target));
20858 return target;
20861 /* On AAPCS systems, this is the "struct __va_list". */
20862 static GTY(()) tree va_list_type;
20864 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
20865 Return the type to use as __builtin_va_list.
20867 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
20869 struct __va_list
20871 void *__stack;
20872 void *__gr_top;
20873 void *__vr_top;
20874 int __gr_offs;
20875 int __vr_offs;
20876 }; */
20878 static tree
20879 aarch64_build_builtin_va_list (void)
20881 tree va_list_name;
20882 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
20884 /* Create the type. */
20885 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
20886 /* Give it the required name. */
20887 va_list_name = build_decl (BUILTINS_LOCATION,
20888 TYPE_DECL,
20889 get_identifier ("__va_list"),
20890 va_list_type);
20891 DECL_ARTIFICIAL (va_list_name) = 1;
20892 TYPE_NAME (va_list_type) = va_list_name;
20893 TYPE_STUB_DECL (va_list_type) = va_list_name;
20895 /* Create the fields. */
20896 f_stack = build_decl (BUILTINS_LOCATION,
20897 FIELD_DECL, get_identifier ("__stack"),
20898 ptr_type_node);
20899 f_grtop = build_decl (BUILTINS_LOCATION,
20900 FIELD_DECL, get_identifier ("__gr_top"),
20901 ptr_type_node);
20902 f_vrtop = build_decl (BUILTINS_LOCATION,
20903 FIELD_DECL, get_identifier ("__vr_top"),
20904 ptr_type_node);
20905 f_groff = build_decl (BUILTINS_LOCATION,
20906 FIELD_DECL, get_identifier ("__gr_offs"),
20907 integer_type_node);
20908 f_vroff = build_decl (BUILTINS_LOCATION,
20909 FIELD_DECL, get_identifier ("__vr_offs"),
20910 integer_type_node);
20912 /* Tell tree-stdarg pass about our internal offset fields.
20913      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
20914      purposes, to identify whether the code is updating the va_list internal
20915      offset fields in an irregular way. */
20916 va_list_gpr_counter_field = f_groff;
20917 va_list_fpr_counter_field = f_vroff;
20919 DECL_ARTIFICIAL (f_stack) = 1;
20920 DECL_ARTIFICIAL (f_grtop) = 1;
20921 DECL_ARTIFICIAL (f_vrtop) = 1;
20922 DECL_ARTIFICIAL (f_groff) = 1;
20923 DECL_ARTIFICIAL (f_vroff) = 1;
20925 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
20926 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
20927 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
20928 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
20929 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
20931 TYPE_FIELDS (va_list_type) = f_stack;
20932 DECL_CHAIN (f_stack) = f_grtop;
20933 DECL_CHAIN (f_grtop) = f_vrtop;
20934 DECL_CHAIN (f_vrtop) = f_groff;
20935 DECL_CHAIN (f_groff) = f_vroff;
20937 /* Compute its layout. */
20938 layout_type (va_list_type);
20940 return va_list_type;
20943 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
20944 static void
20945 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
20947 const CUMULATIVE_ARGS *cum;
20948 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
20949 tree stack, grtop, vrtop, groff, vroff;
20950 tree t;
20951 int gr_save_area_size = cfun->va_list_gpr_size;
20952 int vr_save_area_size = cfun->va_list_fpr_size;
20953 int vr_offset;
20955 cum = &crtl->args.info;
20956 if (cfun->va_list_gpr_size)
20957 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
20958 cfun->va_list_gpr_size);
20959 if (cfun->va_list_fpr_size)
20960 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
20961 * UNITS_PER_VREG, cfun->va_list_fpr_size);
20963 if (!TARGET_FLOAT)
20965 gcc_assert (cum->aapcs_nvrn == 0);
20966 vr_save_area_size = 0;
20969 f_stack = TYPE_FIELDS (va_list_type_node);
20970 f_grtop = DECL_CHAIN (f_stack);
20971 f_vrtop = DECL_CHAIN (f_grtop);
20972 f_groff = DECL_CHAIN (f_vrtop);
20973 f_vroff = DECL_CHAIN (f_groff);
20975 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
20976 NULL_TREE);
20977 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
20978 NULL_TREE);
20979 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
20980 NULL_TREE);
20981 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
20982 NULL_TREE);
20983 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
20984 NULL_TREE);
20986 /* Emit code to initialize STACK, which points to the next varargs stack
20987 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
20988 by named arguments. STACK is 8-byte aligned. */
20989 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
20990 if (cum->aapcs_stack_size > 0)
20991 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
20992 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
20993 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
20995 /* Emit code to initialize GRTOP, the top of the GR save area.
20996 virtual_incoming_args_rtx should have been 16 byte aligned. */
20997 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
20998 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
20999 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21001 /* Emit code to initialize VRTOP, the top of the VR save area.
21002 This address is gr_save_area_bytes below GRTOP, rounded
21003 down to the next 16-byte boundary. */
21004 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
21005 vr_offset = ROUND_UP (gr_save_area_size,
21006 STACK_BOUNDARY / BITS_PER_UNIT);
21008 if (vr_offset)
21009 t = fold_build_pointer_plus_hwi (t, -vr_offset);
21010 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
21011 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21013 /* Emit code to initialize GROFF, the offset from GRTOP of the
21014 next GPR argument. */
21015 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
21016 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
21017 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
21019  /* Likewise emit code to initialize VROFF, the offset from VRTOP
21020 of the next VR argument. */
21021 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
21022 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
21023 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
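/* Net effect of the code above, as a C-level sketch (AP is the va_list
   being initialized, INCOMING stands for virtual_incoming_args_rtx):

     ap.__stack   = INCOMING + aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = INCOMING;
     ap.__vr_top  = INCOMING - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;

   so both offsets count upwards towards zero as registers are consumed
   by va_arg.  */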
21026 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
21028 static tree
21029 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
21030 gimple_seq *post_p ATTRIBUTE_UNUSED)
21032 tree addr;
21033 bool indirect_p;
21034 bool is_ha; /* is HFA or HVA. */
21035 bool dw_align; /* double-word align. */
21036 machine_mode ag_mode = VOIDmode;
21037 int nregs;
21038 machine_mode mode;
21040 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
21041 tree stack, f_top, f_off, off, arg, roundup, on_stack;
21042 HOST_WIDE_INT size, rsize, adjust, align;
21043 tree t, u, cond1, cond2;
21045 indirect_p = pass_va_arg_by_reference (type);
21046 if (indirect_p)
21047 type = build_pointer_type (type);
21049 mode = TYPE_MODE (type);
21051 f_stack = TYPE_FIELDS (va_list_type_node);
21052 f_grtop = DECL_CHAIN (f_stack);
21053 f_vrtop = DECL_CHAIN (f_grtop);
21054 f_groff = DECL_CHAIN (f_vrtop);
21055 f_vroff = DECL_CHAIN (f_groff);
21057 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
21058 f_stack, NULL_TREE);
21059 size = int_size_in_bytes (type);
21061 unsigned int abi_break_gcc_9;
21062 unsigned int abi_break_gcc_13;
21063 unsigned int abi_break_gcc_14;
21064 align
21065 = aarch64_function_arg_alignment (mode, type, &abi_break_gcc_9,
21066 &abi_break_gcc_13, &abi_break_gcc_14)
21067 / BITS_PER_UNIT;
21069 dw_align = false;
21070 adjust = 0;
21071 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
21072 &is_ha, false))
21074 /* No frontends can create types with variable-sized modes, so we
21075 shouldn't be asked to pass or return them. */
21076 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
21078 /* TYPE passed in fp/simd registers. */
21079 if (!TARGET_FLOAT)
21080 aarch64_err_no_fpadvsimd (mode);
21082 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
21083 unshare_expr (valist), f_vrtop, NULL_TREE);
21084 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
21085 unshare_expr (valist), f_vroff, NULL_TREE);
21087 rsize = nregs * UNITS_PER_VREG;
21089 if (is_ha)
21091 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
21092 adjust = UNITS_PER_VREG - ag_size;
21094 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21095 && size < UNITS_PER_VREG)
21097 adjust = UNITS_PER_VREG - size;
21100 else
21102 /* TYPE passed in general registers. */
21103 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
21104 unshare_expr (valist), f_grtop, NULL_TREE);
21105 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
21106 unshare_expr (valist), f_groff, NULL_TREE);
21107 rsize = ROUND_UP (size, UNITS_PER_WORD);
21108 nregs = rsize / UNITS_PER_WORD;
21110 if (align <= 8 && abi_break_gcc_13 && warn_psabi)
21111 inform (input_location, "parameter passing for argument of type "
21112 "%qT changed in GCC 13.1", type);
21114 if (warn_psabi
21115 && abi_break_gcc_14
21116 && (abi_break_gcc_14 > 8 * BITS_PER_UNIT) != (align > 8))
21117 inform (input_location, "parameter passing for argument of type "
21118 "%qT changed in GCC 14.1", type);
21120 if (align > 8)
21122 if (abi_break_gcc_9 && warn_psabi)
21123 inform (input_location, "parameter passing for argument of type "
21124 "%qT changed in GCC 9.1", type);
21125 dw_align = true;
21128 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21129 && size < UNITS_PER_WORD)
21131 adjust = UNITS_PER_WORD - size;
21135 /* Get a local temporary for the field value. */
21136 off = get_initialized_tmp_var (f_off, pre_p, NULL);
21138 /* Emit code to branch if off >= 0. */
21139 t = build2 (GE_EXPR, boolean_type_node, off,
21140 build_int_cst (TREE_TYPE (off), 0));
21141 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
21143 if (dw_align)
21145 /* Emit: offs = (offs + 15) & -16. */
21146 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21147 build_int_cst (TREE_TYPE (off), 15));
21148 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
21149 build_int_cst (TREE_TYPE (off), -16));
21150 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
21152 else
21153 roundup = NULL;
21155 /* Update ap.__[g|v]r_offs */
21156 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
21157 build_int_cst (TREE_TYPE (off), rsize));
21158 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
21160 /* String up. */
21161 if (roundup)
21162 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21164 /* [cond2] if (ap.__[g|v]r_offs > 0) */
21165 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
21166 build_int_cst (TREE_TYPE (f_off), 0));
21167 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
21169 /* String up: make sure the assignment happens before the use. */
21170 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
21171 COND_EXPR_ELSE (cond1) = t;
21173 /* Prepare the trees handling the argument that is passed on the stack;
21174     the top-level node will be stored in ON_STACK. */
21175 arg = get_initialized_tmp_var (stack, pre_p, NULL);
21176 if (align > 8)
21178 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
21179 t = fold_build_pointer_plus_hwi (arg, 15);
21180 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21181 build_int_cst (TREE_TYPE (t), -16));
21182 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
21184 else
21185 roundup = NULL;
21186 /* Advance ap.__stack */
21187 t = fold_build_pointer_plus_hwi (arg, size + 7);
21188 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
21189 build_int_cst (TREE_TYPE (t), -8));
21190 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
21191 /* String up roundup and advance. */
21192 if (roundup)
21193 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
21194 /* String up with arg */
21195 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
21196 /* Big-endianness related address adjustment. */
21197 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
21198 && size < UNITS_PER_WORD)
21200 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
21201 size_int (UNITS_PER_WORD - size));
21202 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
21205 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
21206 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
21208 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
21209 t = off;
21210 if (adjust)
21211 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
21212 build_int_cst (TREE_TYPE (off), adjust));
21214 t = fold_convert (sizetype, t);
21215 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
21217 if (is_ha)
21219 /* type ha; // treat as "struct {ftype field[n];}"
21220 ... [computing offs]
21221 for (i = 0; i <nregs; ++i, offs += 16)
21222 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
21223 return ha; */
21224 int i;
21225 tree tmp_ha, field_t, field_ptr_t;
21227 /* Declare a local variable. */
21228 tmp_ha = create_tmp_var_raw (type, "ha");
21229 gimple_add_tmp_var (tmp_ha);
21231 /* Establish the base type. */
21232 switch (ag_mode)
21234 case E_SFmode:
21235 field_t = float_type_node;
21236 field_ptr_t = float_ptr_type_node;
21237 break;
21238 case E_DFmode:
21239 field_t = double_type_node;
21240 field_ptr_t = double_ptr_type_node;
21241 break;
21242 case E_TFmode:
21243 field_t = long_double_type_node;
21244 field_ptr_t = long_double_ptr_type_node;
21245 break;
21246 case E_SDmode:
21247 field_t = dfloat32_type_node;
21248 field_ptr_t = build_pointer_type (dfloat32_type_node);
21249 break;
21250 case E_DDmode:
21251 field_t = dfloat64_type_node;
21252 field_ptr_t = build_pointer_type (dfloat64_type_node);
21253 break;
21254 case E_TDmode:
21255 field_t = dfloat128_type_node;
21256 field_ptr_t = build_pointer_type (dfloat128_type_node);
21257 break;
21258 case E_HFmode:
21259 field_t = aarch64_fp16_type_node;
21260 field_ptr_t = aarch64_fp16_ptr_type_node;
21261 break;
21262 case E_BFmode:
21263 field_t = bfloat16_type_node;
21264 field_ptr_t = aarch64_bf16_ptr_type_node;
21265 break;
21266 case E_V2SImode:
21267 case E_V4SImode:
21269 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
21270 field_t = build_vector_type_for_mode (innertype, ag_mode);
21271 field_ptr_t = build_pointer_type (field_t);
21273 break;
21274 default:
21275 gcc_assert (0);
21278      /* *((field_ptr_t)&ha) = *((field_ptr_t)vr_saved_area) */
21279 TREE_ADDRESSABLE (tmp_ha) = 1;
21280 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
21281 addr = t;
21282 t = fold_convert (field_ptr_t, addr);
21283 t = build2 (MODIFY_EXPR, field_t,
21284 build1 (INDIRECT_REF, field_t, tmp_ha),
21285 build1 (INDIRECT_REF, field_t, t));
21287 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
21288 for (i = 1; i < nregs; ++i)
21290 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
21291 u = fold_convert (field_ptr_t, addr);
21292 u = build2 (MODIFY_EXPR, field_t,
21293 build2 (MEM_REF, field_t, tmp_ha,
21294 build_int_cst (field_ptr_t,
21295 (i *
21296 int_size_in_bytes (field_t)))),
21297 build1 (INDIRECT_REF, field_t, u));
21298 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
21301 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
21302 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
21305 COND_EXPR_ELSE (cond2) = t;
21306 addr = fold_convert (build_pointer_type (type), cond1);
21307 addr = build_va_arg_indirect_ref (addr);
21309 if (indirect_p)
21310 addr = build_va_arg_indirect_ref (addr);
21312 return addr;
21315 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
21317 static void
21318 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
21319 const function_arg_info &arg,
21320 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
21322 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
21323 CUMULATIVE_ARGS local_cum;
21324 int gr_saved = cfun->va_list_gpr_size;
21325 int vr_saved = cfun->va_list_fpr_size;
21327 /* The caller has advanced CUM up to, but not beyond, the last named
21328 argument. Advance a local copy of CUM past the last "real" named
21329 argument, to find out how many registers are left over. */
21330 local_cum = *cum;
21331 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
21332 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
21334 /* Find out how many registers we need to save.
21335 Honor tree-stdarg analysis results. */
21336 if (cfun->va_list_gpr_size)
21337 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
21338 cfun->va_list_gpr_size / UNITS_PER_WORD);
21339 if (cfun->va_list_fpr_size)
21340 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
21341 cfun->va_list_fpr_size / UNITS_PER_VREG);
21343 if (!TARGET_FLOAT)
21345 gcc_assert (local_cum.aapcs_nvrn == 0);
21346 vr_saved = 0;
21349 if (!no_rtl)
21351 if (gr_saved > 0)
21353 rtx ptr, mem;
21355 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
21356 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
21357 - gr_saved * UNITS_PER_WORD);
21358 mem = gen_frame_mem (BLKmode, ptr);
21359 set_mem_alias_set (mem, get_varargs_alias_set ());
21361 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
21362 mem, gr_saved);
21364 if (vr_saved > 0)
21366 /* We can't use move_block_from_reg, because it will use
21367 the wrong mode, storing D regs only. */
21368 machine_mode mode = TImode;
21369 int off, i, vr_start;
21371 /* Set OFF to the offset from virtual_incoming_args_rtx of
21372 the first vector register. The VR save area lies below
21373 the GR one, and is aligned to 16 bytes. */
21374 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
21375 STACK_BOUNDARY / BITS_PER_UNIT);
21376 off -= vr_saved * UNITS_PER_VREG;
21378 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
21379 for (i = 0; i < vr_saved; ++i)
21381 rtx ptr, mem;
21383 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
21384 mem = gen_frame_mem (mode, ptr);
21385 set_mem_alias_set (mem, get_varargs_alias_set ());
21386 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
21387 off += UNITS_PER_VREG;
21392 /* We don't save the size into *PRETEND_SIZE because we want to avoid
21393 the complication of having crtl->args.pretend_args_size change. */
21394 cfun->machine->frame.saved_varargs_size
21395 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
21396 STACK_BOUNDARY / BITS_PER_UNIT)
21397 + vr_saved * UNITS_PER_VREG);
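/* As a worked illustration with hypothetical counts: if three GP argument
   registers and two FP/SIMD argument registers remain unnamed, then
   GR_SAVED == 3 and VR_SAVED == 2.  With the usual AArch64 values
   (UNITS_PER_WORD == 8, UNITS_PER_VREG == 16, 16-byte STACK_BOUNDARY)
   the GR save area takes ROUND_UP (3 * 8, 16) == 32 bytes, the VR save
   area takes 2 * 16 == 32 bytes, and saved_varargs_size is 64.  */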
21400 static void
21401 aarch64_conditional_register_usage (void)
21403 int i;
21404 if (!TARGET_FLOAT)
21406 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
21408 fixed_regs[i] = 1;
21409 call_used_regs[i] = 1;
21410 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21413 if (!TARGET_SVE)
21414 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
21416 fixed_regs[i] = 1;
21417 call_used_regs[i] = 1;
21420 /* Only allow these registers to be accessed via special patterns. */
21421 CLEAR_HARD_REG_BIT (operand_reg_set, VG_REGNUM);
21422 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
21423 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
21424 for (int i = FIRST_FAKE_REGNUM; i <= LAST_FAKE_REGNUM; ++i)
21425 CLEAR_HARD_REG_BIT (operand_reg_set, i);
21427 /* When tracking speculation, we need a couple of call-clobbered registers
21428 to track the speculation state. It would be nice to just use
21429 IP0 and IP1, but currently there are numerous places that just
21430 assume these registers are free for other uses (e.g. pointer
21431 authentication). */
21432 if (aarch64_track_speculation)
21434 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
21435 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
21436 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21437 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
21441 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
21443 bool
21444 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
21446 /* For records we're passed a FIELD_DECL, for arrays we're passed
21447 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
21448 const_tree type = TREE_TYPE (field_or_array);
21450 /* Assign BLKmode to anything that contains more than 2 SVE predicates.
21451 For structures, the "multiple" case is indicated by MODE being
21452 VOIDmode. */
21453 unsigned int num_zr, num_pr;
21454 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr > 2)
21456 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
21457 return !simple_cst_equal (TYPE_SIZE (field_or_array),
21458 TYPE_SIZE (type));
21459 return mode == VOIDmode;
21462 return default_member_type_forces_blk (field_or_array, mode);
21465 /* Bitmasks that indicate whether earlier versions of GCC would have
21466 taken a different path through the ABI logic. This should result in
21467 a -Wpsabi warning if the earlier path led to a different ABI decision.
21469 WARN_PSABI_EMPTY_CXX17_BASE
21470 Indicates that the type includes an artificial empty C++17 base field
21471 that, prior to GCC 10.1, would prevent the type from being treated as
21472 a HFA or HVA. See PR94383 for details.
21474 WARN_PSABI_NO_UNIQUE_ADDRESS
21475 Indicates that the type includes an empty [[no_unique_address]] field
21476 that, prior to GCC 10.1, would prevent the type from being treated as
21477 a HFA or HVA. */
21478 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
21479 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
21480 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
21482 /* Walk down the type tree of TYPE counting consecutive base elements.
21483 If *MODEP is VOIDmode, then set it to the first valid floating point
21484 type. If a non-floating point type is found, or if a floating point
21485 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
21486 otherwise return the count in the sub-tree.
21488 The WARN_PSABI_FLAGS argument allows the caller to check whether this
21489 function has changed its behavior relative to earlier versions of GCC.
21490 Normally the argument should be nonnull and point to a zero-initialized
21491 variable. The function then records whether the ABI decision might
21492 be affected by a known fix to the ABI logic, setting the associated
21493 WARN_PSABI_* bits if so.
21495 When the argument is instead a null pointer, the function tries to
21496 simulate the behavior of GCC before all such ABI fixes were made.
21497 This is useful to check whether the function returns something
21498 different after the ABI fixes. */
21499 static int
21500 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
21501 unsigned int *warn_psabi_flags)
21503 machine_mode mode;
21504 HOST_WIDE_INT size;
21506 if (aarch64_sve::builtin_type_p (type))
21507 return -1;
21509 switch (TREE_CODE (type))
21511 case REAL_TYPE:
21512 mode = TYPE_MODE (type);
21513 if (mode != DFmode && mode != SFmode
21514 && mode != TFmode && mode != HFmode
21515 && mode != SDmode && mode != DDmode && mode != TDmode)
21516 return -1;
21518 if (*modep == VOIDmode)
21519 *modep = mode;
21521 if (*modep == mode)
21522 return 1;
21524 break;
21526 case COMPLEX_TYPE:
21527 mode = TYPE_MODE (TREE_TYPE (type));
21528 if (mode != DFmode && mode != SFmode
21529 && mode != TFmode && mode != HFmode)
21530 return -1;
21532 if (*modep == VOIDmode)
21533 *modep = mode;
21535 if (*modep == mode)
21536 return 2;
21538 break;
21540 case VECTOR_TYPE:
21541 /* Use V2SImode and V4SImode as representatives of all 64-bit
21542 and 128-bit vector types. */
21543 size = int_size_in_bytes (type);
21544 switch (size)
21546 case 8:
21547 mode = V2SImode;
21548 break;
21549 case 16:
21550 mode = V4SImode;
21551 break;
21552 default:
21553 return -1;
21556 if (*modep == VOIDmode)
21557 *modep = mode;
21559 /* Vector modes are considered to be opaque: two vectors are
21560 equivalent for the purposes of being homogeneous aggregates
21561 if they are the same size. */
21562 if (*modep == mode)
21563 return 1;
21565 break;
21567 case ARRAY_TYPE:
21569 int count;
21570 tree index = TYPE_DOMAIN (type);
21572 /* Can't handle incomplete types nor sizes that are not
21573 fixed. */
21574 if (!COMPLETE_TYPE_P (type)
21575 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21576 return -1;
21578 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
21579 warn_psabi_flags);
21580 if (count == -1
21581 || !index
21582 || !TYPE_MAX_VALUE (index)
21583 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
21584 || !TYPE_MIN_VALUE (index)
21585 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
21586 || count < 0)
21587 return -1;
21589 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
21590 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
21592 /* There must be no padding. */
21593 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21594 count * GET_MODE_BITSIZE (*modep)))
21595 return -1;
21597 return count;
21600 case RECORD_TYPE:
21602 int count = 0;
21603 int sub_count;
21604 tree field;
21606 /* Can't handle incomplete types nor sizes that are not
21607 fixed. */
21608 if (!COMPLETE_TYPE_P (type)
21609 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21610 return -1;
21612 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21614 if (TREE_CODE (field) != FIELD_DECL)
21615 continue;
21617 if (DECL_FIELD_ABI_IGNORED (field))
21619 /* See whether this is something that earlier versions of
21620 GCC failed to ignore. */
21621 unsigned int flag;
21622 if (lookup_attribute ("no_unique_address",
21623 DECL_ATTRIBUTES (field)))
21624 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
21625 else if (cxx17_empty_base_field_p (field))
21626 flag = WARN_PSABI_EMPTY_CXX17_BASE;
21627 else
21628 /* No compatibility problem. */
21629 continue;
21631 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
21632 if (warn_psabi_flags)
21634 *warn_psabi_flags |= flag;
21635 continue;
21638 /* A zero-width bitfield may affect layout in some
21639 circumstances, but adds no members. The determination
21640 of whether or not a type is an HFA is performed after
21641 layout is complete, so if the type still looks like an
21642 HFA afterwards, it is still classed as one. This is
21643 potentially an ABI break for the hard-float ABI. */
21644 else if (DECL_BIT_FIELD (field)
21645 && integer_zerop (DECL_SIZE (field)))
21647 /* Prior to GCC 12 these fields were stripped early,
21648 hiding them from the back-end entirely and
21649 resulting in the correct behaviour for argument
21650 passing. Simulate that old behaviour without
21651 generating a warning. */
21652 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
21653 continue;
21654 if (warn_psabi_flags)
21656 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
21657 continue;
21661 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21662 warn_psabi_flags);
21663 if (sub_count < 0)
21664 return -1;
21665 count += sub_count;
21668 /* There must be no padding. */
21669 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21670 count * GET_MODE_BITSIZE (*modep)))
21671 return -1;
21673 return count;
21676 case UNION_TYPE:
21677 case QUAL_UNION_TYPE:
21679 /* These aren't very interesting except in a degenerate case. */
21680 int count = 0;
21681 int sub_count;
21682 tree field;
21684 /* Can't handle incomplete types nor sizes that are not
21685 fixed. */
21686 if (!COMPLETE_TYPE_P (type)
21687 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
21688 return -1;
21690 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
21692 if (TREE_CODE (field) != FIELD_DECL)
21693 continue;
21695 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
21696 warn_psabi_flags);
21697 if (sub_count < 0)
21698 return -1;
21699 count = count > sub_count ? count : sub_count;
21702 /* There must be no padding. */
21703 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
21704 count * GET_MODE_BITSIZE (*modep)))
21705 return -1;
21707 return count;
21710 default:
21711 break;
21714 return -1;
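/* As an illustrative sketch (hypothetical C type): for

     struct hfa { double a; double b; double c; };

   the RECORD_TYPE case above visits three consecutive REAL_TYPE fields,
   leaves *MODEP as DFmode and returns 3, so the struct is an HFA
   candidate.  Adding an "int" member, or mixing "float" and "double"
   fields, makes one of the recursive calls fail the *MODEP check and the
   whole walk returns -1.  */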
21717 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
21718 type as described in AAPCS64 \S 4.1.2.
21720 See the comment above aarch64_composite_type_p for the notes on MODE. */
21722 static bool
21723 aarch64_short_vector_p (const_tree type,
21724 machine_mode mode)
21726 poly_int64 size = -1;
21728 if (type && VECTOR_TYPE_P (type))
21730 if (aarch64_sve::builtin_type_p (type))
21731 return false;
21732 size = int_size_in_bytes (type);
21734 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
21735 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
21737 /* The containing "else if" is too loose: it means that we look at TYPE
21738 if the type is a vector type (good), but that we otherwise ignore TYPE
21739 and look only at the mode. This is wrong because the type describes
21740 the language-level information whereas the mode is purely an internal
21741 GCC concept. We can therefore reach here for types that are not
21742 vectors in the AAPCS64 sense.
21744 We can't "fix" that for the traditional Advanced SIMD vector modes
21745 without breaking backwards compatibility. However, there's no such
21746 baggage for the structure modes, which were introduced in GCC 12. */
21747 if (aarch64_advsimd_struct_mode_p (mode))
21748 return false;
21750 /* For similar reasons, rely only on the type, not the mode, when
21751 processing SVE types. */
21752 if (type && aarch64_some_values_include_pst_objects_p (type))
21753 /* Leave later code to report an error if SVE is disabled. */
21754 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
21755 else
21756 size = GET_MODE_SIZE (mode);
21758 if (known_eq (size, 8) || known_eq (size, 16))
21760 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
21761 they are being treated as scalable AAPCS64 types. */
21762 gcc_assert (!aarch64_sve_mode_p (mode)
21763 && !aarch64_advsimd_struct_mode_p (mode));
21764 return true;
21766 return false;
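/* Illustrative example (hypothetical typedef): a GNU vector type such as

     typedef int v4si __attribute__ ((vector_size (16)));

   has a 16-byte size and mode V4SImode, so it counts as a short vector
   here, whereas an SVE ACLE type such as svint32_t is rejected by the
   builtin_type_p check above even though its mode is also a vector
   mode.  */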
21769 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
21770 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
21771 array types. The C99 floating-point complex types are also considered
21772 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
21773 types, which are GCC extensions and out of the scope of AAPCS64, are
21774 treated as composite types here as well.
21776 Note that MODE itself is not sufficient in determining whether a type
21777 is such a composite type or not. This is because
21778 stor-layout.cc:compute_record_mode may have already changed the MODE
21779 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
21780 structure with only one field may have its MODE set to the mode of the
21781 field. Also an integer mode whose size matches the size of the
21782 RECORD_TYPE type may be used to substitute the original mode
21783 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
21784 solely relied on. */
21786 static bool
21787 aarch64_composite_type_p (const_tree type,
21788 machine_mode mode)
21790 if (aarch64_short_vector_p (type, mode))
21791 return false;
21793 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
21794 return true;
21796 if (mode == BLKmode
21797 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
21798 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
21799 return true;
21801 return false;
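/* For instance (hypothetical types): "struct s { float f; }" may be given
   SFmode by compute_record_mode, but it is still AGGREGATE_TYPE_P and so
   composite here, while a plain "float" argument with the same SFmode is
   not.  This is why the TYPE check comes before the fallback on MODE.  */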
21804 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
21805 shall be passed or returned in simd/fp register(s) (providing these
21806 parameter passing registers are available).
21808 Upon successful return, *COUNT returns the number of needed registers,
21809 *BASE_MODE returns the mode of the individual register and when IS_HA
21810 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
21811 floating-point aggregate or a homogeneous short-vector aggregate.
21813 SILENT_P is true if the function should refrain from reporting any
21814 diagnostics. This should only be used if the caller is certain that
21815 any ABI decisions would eventually come through this function with
21816 SILENT_P set to false. */
21818 static bool
21819 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
21820 const_tree type,
21821 machine_mode *base_mode,
21822 int *count,
21823 bool *is_ha,
21824 bool silent_p)
21826 if (is_ha != NULL) *is_ha = false;
21828 machine_mode new_mode = VOIDmode;
21829 bool composite_p = aarch64_composite_type_p (type, mode);
21831 if ((!composite_p
21832 && (GET_MODE_CLASS (mode) == MODE_FLOAT
21833 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
21834 || aarch64_short_vector_p (type, mode))
21836 *count = 1;
21837 new_mode = mode;
21839 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
21841 if (is_ha != NULL) *is_ha = true;
21842 *count = 2;
21843 new_mode = GET_MODE_INNER (mode);
21845 else if (type && composite_p)
21847 unsigned int warn_psabi_flags = 0;
21848 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
21849 &warn_psabi_flags);
21850 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
21852 static unsigned last_reported_type_uid;
21853 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
21854 int alt;
21855 if (!silent_p
21856 && warn_psabi
21857 && warn_psabi_flags
21858 && uid != last_reported_type_uid
21859 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
21860 != ag_count))
21862 const char *url10
21863 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
21864 const char *url12
21865 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
21866 gcc_assert (alt == -1);
21867 last_reported_type_uid = uid;
21868 /* Use TYPE_MAIN_VARIANT to strip any redundant const
21869 qualification. */
21870 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
21871 inform (input_location, "parameter passing for argument of "
21872 "type %qT with %<[[no_unique_address]]%> members "
21873 "changed %{in GCC 10.1%}",
21874 TYPE_MAIN_VARIANT (type), url10);
21875 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
21876 inform (input_location, "parameter passing for argument of "
21877 "type %qT when C++17 is enabled changed to match "
21878 "C++14 %{in GCC 10.1%}",
21879 TYPE_MAIN_VARIANT (type), url10);
21880 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
21881 inform (input_location, "parameter passing for argument of "
21882 "type %qT changed %{in GCC 12.1%}",
21883 TYPE_MAIN_VARIANT (type), url12);
21886 if (is_ha != NULL) *is_ha = true;
21887 *count = ag_count;
21889 else
21890 return false;
21892 else
21893 return false;
21895 gcc_assert (!aarch64_sve_mode_p (new_mode));
21896 *base_mode = new_mode;
21897 return true;
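/* A minimal usage sketch (the real callers are elsewhere in this file):

     machine_mode base_mode;
     int count;
     bool is_ha;
     if (aarch64_vfp_is_call_or_return_candidate (mode, type, &base_mode,
                                                  &count, &is_ha, false))
       ... allocate COUNT consecutive FP/SIMD registers of BASE_MODE ...

   For the HFA "struct { double a, b; }" this gives BASE_MODE == DFmode,
   COUNT == 2 and *IS_HA set to true.  */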
21900 /* Implement TARGET_STRUCT_VALUE_RTX. */
21902 static rtx
21903 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
21904 int incoming ATTRIBUTE_UNUSED)
21906 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
21909 /* Implements target hook vector_mode_supported_p. */
21910 static bool
21911 aarch64_vector_mode_supported_p (machine_mode mode)
21913 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21914 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
21917 /* Implements target hook vector_mode_supported_any_target_p. */
21918 static bool
21919 aarch64_vector_mode_supported_any_target_p (machine_mode mode)
21921 unsigned int vec_flags = aarch64_classify_vector_mode (mode, true);
21922 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
21925 /* Return the full-width SVE vector mode for element mode MODE, if one
21926 exists. */
21927 opt_machine_mode
21928 aarch64_full_sve_mode (scalar_mode mode)
21930 switch (mode)
21932 case E_DFmode:
21933 return VNx2DFmode;
21934 case E_SFmode:
21935 return VNx4SFmode;
21936 case E_HFmode:
21937 return VNx8HFmode;
21938 case E_BFmode:
21939 return VNx8BFmode;
21940 case E_DImode:
21941 return VNx2DImode;
21942 case E_SImode:
21943 return VNx4SImode;
21944 case E_HImode:
21945 return VNx8HImode;
21946 case E_QImode:
21947 return VNx16QImode;
21948 default:
21949 return opt_machine_mode ();
21953 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
21954 if it exists. */
21955 opt_machine_mode
21956 aarch64_vq_mode (scalar_mode mode)
21958 switch (mode)
21960 case E_DFmode:
21961 return V2DFmode;
21962 case E_SFmode:
21963 return V4SFmode;
21964 case E_HFmode:
21965 return V8HFmode;
21966 case E_BFmode:
21967 return V8BFmode;
21968 case E_SImode:
21969 return V4SImode;
21970 case E_HImode:
21971 return V8HImode;
21972 case E_QImode:
21973 return V16QImode;
21974 case E_DImode:
21975 return V2DImode;
21976 default:
21977 return opt_machine_mode ();
21981 /* Return appropriate SIMD container
21982 for MODE within a vector of WIDTH bits. */
21983 static machine_mode
21984 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
21986 if (TARGET_SVE
21987 && maybe_ne (width, 128)
21988 && known_eq (width, BITS_PER_SVE_VECTOR))
21989 return aarch64_full_sve_mode (mode).else_mode (word_mode);
21991 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
21992 if (TARGET_BASE_SIMD)
21994 if (known_eq (width, 128))
21995 return aarch64_vq_mode (mode).else_mode (word_mode);
21996 else
21997 switch (mode)
21999 case E_SFmode:
22000 return V2SFmode;
22001 case E_HFmode:
22002 return V4HFmode;
22003 case E_BFmode:
22004 return V4BFmode;
22005 case E_SImode:
22006 return V2SImode;
22007 case E_HImode:
22008 return V4HImode;
22009 case E_QImode:
22010 return V8QImode;
22011 default:
22012 break;
22015 return word_mode;
22018 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
22019 and return whether the SVE mode should be preferred over the
22020 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
22021 static bool
22022 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
22024 /* Take into account the aarch64-autovec-preference param if non-zero. */
22025 bool only_asimd_p = aarch64_autovec_preference == 1;
22026 bool only_sve_p = aarch64_autovec_preference == 2;
22028 if (only_asimd_p)
22029 return false;
22030 if (only_sve_p)
22031 return true;
22033 /* The preference in case of a tie in costs. */
22034 bool prefer_asimd = aarch64_autovec_preference == 3;
22035 bool prefer_sve = aarch64_autovec_preference == 4;
22037 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
22038 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
22039 /* If the CPU information does not have an SVE width registered use the
22040 generic poly_int comparison that prefers SVE. If a preference is
22041 explicitly requested avoid this path. */
22042 if (aarch64_tune_params.sve_width == SVE_SCALABLE
22043 && !prefer_asimd
22044 && !prefer_sve)
22045 return maybe_gt (nunits_sve, nunits_asimd);
22047 /* Otherwise estimate the runtime width of the modes involved. */
22048 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
22049 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
22051 /* Preferring SVE means picking it first unless the Advanced SIMD mode
22052 is clearly wider. */
22053 if (prefer_sve)
22054 return est_sve >= est_asimd;
22055 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
22056 is clearly wider. */
22057 if (prefer_asimd)
22058 return est_sve > est_asimd;
22060 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
22061 return est_sve > est_asimd;
22064 /* Return 128-bit container as the preferred SIMD mode for MODE. */
22065 static machine_mode
22066 aarch64_preferred_simd_mode (scalar_mode mode)
22068 /* Take into account explicit auto-vectorization ISA preferences through
22069 aarch64_cmp_autovec_modes. */
22070 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
22071 return aarch64_full_sve_mode (mode).else_mode (word_mode);
22072 if (TARGET_SIMD)
22073 return aarch64_vq_mode (mode).else_mode (word_mode);
22074 return word_mode;
22077 /* Return a list of possible vector sizes for the vectorizer
22078 to iterate over. */
22079 static unsigned int
22080 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
22082 static const machine_mode sve_modes[] = {
22083 /* Try using full vectors for all element types. */
22084 VNx16QImode,
22086 /* Try using 16-bit containers for 8-bit elements and full vectors
22087 for wider elements. */
22088 VNx8QImode,
22090 /* Try using 32-bit containers for 8-bit and 16-bit elements and
22091 full vectors for wider elements. */
22092 VNx4QImode,
22094 /* Try using 64-bit containers for all element types. */
22095 VNx2QImode
22098 static const machine_mode advsimd_modes[] = {
22099 /* Try using 128-bit vectors for all element types. */
22100 V16QImode,
22102 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
22103 for wider elements. */
22104 V8QImode,
22106 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
22107 for wider elements.
22109 TODO: We could support a limited form of V4QImode too, so that
22110 we use 32-bit vectors for 8-bit elements. */
22111 V4HImode,
22113 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
22114 for 64-bit elements.
22116 TODO: We could similarly support limited forms of V2QImode and V2HImode
22117 for this case. */
22118 V2SImode
22121 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
22122 This is because:
22124 - If we can't use N-byte Advanced SIMD vectors then the placement
22125 doesn't matter; we'll just continue as though the Advanced SIMD
22126 entry didn't exist.
22128 - If an SVE main loop with N bytes ends up being cheaper than an
22129 Advanced SIMD main loop with N bytes then by default we'll replace
22130 the Advanced SIMD version with the SVE one.
22132 - If an Advanced SIMD main loop with N bytes ends up being cheaper
22133 than an SVE main loop with N bytes then by default we'll try to
22134 use the SVE loop to vectorize the epilogue instead. */
22136 bool only_asimd_p = aarch64_autovec_preference == 1;
22137 bool only_sve_p = aarch64_autovec_preference == 2;
22139 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
22140 unsigned int advsimd_i = 0;
22142 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
22144 if (sve_i < ARRAY_SIZE (sve_modes)
22145 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
22146 advsimd_modes[advsimd_i]))
22147 modes->safe_push (sve_modes[sve_i++]);
22148 else
22149 modes->safe_push (advsimd_modes[advsimd_i++]);
22151 while (sve_i < ARRAY_SIZE (sve_modes))
22152 modes->safe_push (sve_modes[sve_i++]);
22154 unsigned int flags = 0;
22155 if (aarch64_vect_compare_costs)
22156 flags |= VECT_COMPARE_COSTS;
22157 return flags;
22160 /* Implement TARGET_MANGLE_TYPE. */
22162 static const char *
22163 aarch64_mangle_type (const_tree type)
22165 /* The AArch64 ABI documents say that "__va_list" has to be
22166 mangled as if it is in the "std" namespace. */
22167 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
22168 return "St9__va_list";
22170 /* Half-precision floating point types. */
22171 if (SCALAR_FLOAT_TYPE_P (type) && TYPE_PRECISION (type) == 16)
22173 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
22174 return NULL;
22175 if (TYPE_MODE (type) == BFmode)
22176 return "u6__bf16";
22177 else
22178 return "Dh";
22181 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
22182 builtin types. */
22183 if (TYPE_NAME (type) != NULL)
22185 const char *res;
22186 if ((res = aarch64_general_mangle_builtin_type (type))
22187 || (res = aarch64_sve::mangle_builtin_type (type)))
22188 return res;
22191 /* Use the default mangling. */
22192 return NULL;
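/* For example (illustrative): __fp16 has 16-bit precision but is neither
   float16_type_node nor of BFmode, so it mangles as "Dh"; __bf16 mangles
   as "u6__bf16"; _Float16 (float16_type_node) falls through to the
   default mangling; and __builtin_va_list mangles as "St9__va_list",
   i.e. as if it were std::__va_list.  */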
22195 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
22197 static bool
22198 aarch64_verify_type_context (location_t loc, type_context_kind context,
22199 const_tree type, bool silent_p)
22201 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
22204 /* Find the first rtx_insn before insn that will generate an assembly
22205 instruction. */
22207 static rtx_insn *
22208 aarch64_prev_real_insn (rtx_insn *insn)
22210 if (!insn)
22211 return NULL;
22215 insn = prev_real_insn (insn);
22217 while (insn && recog_memoized (insn) < 0);
22219 return insn;
22222 static bool
22223 is_madd_op (enum attr_type t1)
22225 unsigned int i;
22226 /* A number of these may be AArch32 only. */
22227 enum attr_type mlatypes[] = {
22228 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
22229 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
22230 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
22233 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
22235 if (t1 == mlatypes[i])
22236 return true;
22239 return false;
22242 /* Check if there is a register dependency between a load and the insn
22243 for which we hold recog_data. */
22245 static bool
22246 dep_between_memop_and_curr (rtx memop)
22248 rtx load_reg;
22249 int opno;
22251 gcc_assert (GET_CODE (memop) == SET);
22253 if (!REG_P (SET_DEST (memop)))
22254 return false;
22256 load_reg = SET_DEST (memop);
22257 for (opno = 1; opno < recog_data.n_operands; opno++)
22259 rtx operand = recog_data.operand[opno];
22260 if (REG_P (operand)
22261 && reg_overlap_mentioned_p (load_reg, operand))
22262 return true;
22265 return false;
22269 /* When working around the Cortex-A53 erratum 835769,
22270 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
22271 instruction and has a preceding memory instruction such that a NOP
22272 should be inserted between them. */
22274 bool
22275 aarch64_madd_needs_nop (rtx_insn* insn)
22277 enum attr_type attr_type;
22278 rtx_insn *prev;
22279 rtx body;
22281 if (!TARGET_FIX_ERR_A53_835769)
22282 return false;
22284 if (!INSN_P (insn) || recog_memoized (insn) < 0)
22285 return false;
22287 attr_type = get_attr_type (insn);
22288 if (!is_madd_op (attr_type))
22289 return false;
22291 prev = aarch64_prev_real_insn (insn);
22292 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
22293 Restore recog state to INSN to avoid state corruption. */
22294 extract_constrain_insn_cached (insn);
22296 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
22297 return false;
22299 body = single_set (prev);
22301 /* If the previous insn is a memory op and there is no dependency between
22302 it and the DImode madd, emit a NOP between them. If body is NULL then we
22303 have a complex memory operation, probably a load/store pair.
22304 Be conservative for now and emit a NOP. */
22305 if (GET_MODE (recog_data.operand[0]) == DImode
22306 && (!body || !dep_between_memop_and_curr (body)))
22307 return true;
22309 return false;
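/* Illustrative sketch (registers chosen arbitrarily): with
   -mfix-cortex-a53-835769 enabled, a sequence such as

     ldr  x1, [x2]
     madd x3, x4, x5, x6

   has no register dependency between the load and the 64-bit
   multiply-accumulate, so this function returns true and the
   FINAL_PRESCAN_INSN hook below emits a NOP between the two
   instructions.  */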
22314 /* Implement FINAL_PRESCAN_INSN. */
22316 void
22317 aarch64_final_prescan_insn (rtx_insn *insn)
22319 if (aarch64_madd_needs_nop (insn))
22320 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
22324 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
22325 instruction. */
22327 bool
22328 aarch64_sve_index_immediate_p (rtx base_or_step)
22330 return (CONST_INT_P (base_or_step)
22331 && IN_RANGE (INTVAL (base_or_step), -16, 15));
22334 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
22335 when applied to mode MODE. Negate X first if NEGATE_P is true. */
22337 bool
22338 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
22340 rtx elt = unwrap_const_vec_duplicate (x);
22341 if (!CONST_INT_P (elt))
22342 return false;
22344 HOST_WIDE_INT val = INTVAL (elt);
22345 if (negate_p)
22346 val = -val;
22347 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
22349 if (val & 0xff)
22350 return IN_RANGE (val, 0, 0xff);
22351 return IN_RANGE (val, 0, 0xff00);
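/* Worked example (arbitrary values): after masking, 0x1200 is accepted
   because its low byte is clear and it is no larger than 0xff00 (it can
   be encoded as 0x12 shifted left by 8), while 0x101 is rejected because
   its low byte is set and the value exceeds 0xff.  */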
22354 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
22355 instructions when applied to mode MODE. Negate X first if NEGATE_P
22356 is true. */
22358 bool
22359 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
22361 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
22362 return false;
22364 /* After the optional negation, the immediate must be nonnegative.
22365 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
22366 instead of SQADD Zn.B, Zn.B, #129. */
22367 rtx elt = unwrap_const_vec_duplicate (x);
22368 return negate_p == (INTVAL (elt) < 0);
22371 /* Return true if X is a valid immediate operand for an SVE logical
22372 instruction such as AND. */
22374 bool
22375 aarch64_sve_bitmask_immediate_p (rtx x)
22377 rtx elt;
22379 return (const_vec_duplicate_p (x, &elt)
22380 && CONST_INT_P (elt)
22381 && aarch64_bitmask_imm (INTVAL (elt),
22382 GET_MODE_INNER (GET_MODE (x))));
22385 /* Return true if X is a valid immediate for the SVE DUP and CPY
22386 instructions. */
22388 bool
22389 aarch64_sve_dup_immediate_p (rtx x)
22391 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
22392 if (!CONST_INT_P (x))
22393 return false;
22395 HOST_WIDE_INT val = INTVAL (x);
22396 if (val & 0xff)
22397 return IN_RANGE (val, -0x80, 0x7f);
22398 return IN_RANGE (val, -0x8000, 0x7f00);
22401 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
22402 SIGNED_P says whether the operand is signed rather than unsigned. */
22404 bool
22405 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
22407 x = unwrap_const_vec_duplicate (x);
22408 return (CONST_INT_P (x)
22409 && (signed_p
22410 ? IN_RANGE (INTVAL (x), -16, 15)
22411 : IN_RANGE (INTVAL (x), 0, 127)));
22414 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
22415 instruction. Negate X first if NEGATE_P is true. */
22417 bool
22418 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
22420 rtx elt;
22421 REAL_VALUE_TYPE r;
22423 if (!const_vec_duplicate_p (x, &elt)
22424 || !CONST_DOUBLE_P (elt))
22425 return false;
22427 r = *CONST_DOUBLE_REAL_VALUE (elt);
22429 if (negate_p)
22430 r = real_value_negate (&r);
22432 if (real_equal (&r, &dconst1))
22433 return true;
22434 if (real_equal (&r, &dconsthalf))
22435 return true;
22436 return false;
22439 /* Return true if X is a valid immediate operand for an SVE FMUL
22440 instruction. */
22442 bool
22443 aarch64_sve_float_mul_immediate_p (rtx x)
22445 rtx elt;
22447 return (const_vec_duplicate_p (x, &elt)
22448 && CONST_DOUBLE_P (elt)
22449 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
22450 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
22453 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
22454 for the Advanced SIMD operation described by WHICH and INSN. If INFO
22455 is nonnull, use it to describe valid immediates. */
22456 static bool
22457 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
22458 simd_immediate_info *info,
22459 enum simd_immediate_check which,
22460 simd_immediate_info::insn_type insn)
22462 /* Try a 4-byte immediate with LSL. */
22463 for (unsigned int shift = 0; shift < 32; shift += 8)
22464 if ((val32 & (0xff << shift)) == val32)
22466 if (info)
22467 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22468 simd_immediate_info::LSL, shift);
22469 return true;
22472 /* Try a 2-byte immediate with LSL. */
22473 unsigned int imm16 = val32 & 0xffff;
22474 if (imm16 == (val32 >> 16))
22475 for (unsigned int shift = 0; shift < 16; shift += 8)
22476 if ((imm16 & (0xff << shift)) == imm16)
22478 if (info)
22479 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
22480 simd_immediate_info::LSL, shift);
22481 return true;
22484 /* Try a 4-byte immediate with MSL, except for cases that MVN
22485 can handle. */
22486 if (which == AARCH64_CHECK_MOV)
22487 for (unsigned int shift = 8; shift < 24; shift += 8)
22489 unsigned int low = (1 << shift) - 1;
22490 if (((val32 & (0xff << shift)) | low) == val32)
22492 if (info)
22493 *info = simd_immediate_info (SImode, val32 >> shift, insn,
22494 simd_immediate_info::MSL, shift);
22495 return true;
22499 return false;
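/* Worked examples (arbitrary values): VAL32 == 0x00ab0000 is matched by
   the 4-byte loop as 0xab with LSL #16; VAL32 == 0x00ab00ab is matched by
   the 2-byte loop as 0xab with LSL #0; and VAL32 == 0x0000abff is only
   matched by the MSL case, which is tried for AARCH64_CHECK_MOV.  */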
22502 /* Return true if replicating VAL64 is a valid immediate for the
22503 Advanced SIMD operation described by WHICH. If INFO is nonnull,
22504 use it to describe valid immediates. */
22505 static bool
22506 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
22507 simd_immediate_info *info,
22508 enum simd_immediate_check which)
22510 unsigned int val32 = val64 & 0xffffffff;
22511 unsigned int val16 = val64 & 0xffff;
22512 unsigned int val8 = val64 & 0xff;
22514 if (val32 == (val64 >> 32))
22516 if ((which & AARCH64_CHECK_ORR) != 0
22517 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
22518 simd_immediate_info::MOV))
22519 return true;
22521 if ((which & AARCH64_CHECK_BIC) != 0
22522 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
22523 simd_immediate_info::MVN))
22524 return true;
22526 /* Try using a replicated byte. */
22527 if (which == AARCH64_CHECK_MOV
22528 && val16 == (val32 >> 16)
22529 && val8 == (val16 >> 8))
22531 if (info)
22532 *info = simd_immediate_info (QImode, val8);
22533 return true;
22537 /* Try using a bit-to-bytemask. */
22538 if (which == AARCH64_CHECK_MOV)
22540 unsigned int i;
22541 for (i = 0; i < 64; i += 8)
22543 unsigned char byte = (val64 >> i) & 0xff;
22544 if (byte != 0 && byte != 0xff)
22545 break;
22547 if (i == 64)
22549 if (info)
22550 *info = simd_immediate_info (DImode, val64);
22551 return true;
22554 return false;
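/* As a concrete illustration (arbitrary value): VAL64 ==
   0xff0000ffff0000ff is not a shifted 8-bit immediate in either its
   2-byte or 4-byte replicated form, and MVN cannot form it either, but
   every byte is either 0 or 0xff, so it is accepted by the
   bit-to-bytemask case above.  */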
22557 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
22558 instruction. If INFO is nonnull, use it to describe valid immediates. */
22560 static bool
22561 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
22562 simd_immediate_info *info)
22564 scalar_int_mode mode = DImode;
22565 unsigned int val32 = val64 & 0xffffffff;
22566 if (val32 == (val64 >> 32))
22568 mode = SImode;
22569 unsigned int val16 = val32 & 0xffff;
22570 if (val16 == (val32 >> 16))
22572 mode = HImode;
22573 unsigned int val8 = val16 & 0xff;
22574 if (val8 == (val16 >> 8))
22575 mode = QImode;
22578 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
22579 if (IN_RANGE (val, -0x80, 0x7f))
22581 /* DUP with no shift. */
22582 if (info)
22583 *info = simd_immediate_info (mode, val);
22584 return true;
22586 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
22588 /* DUP with LSL #8. */
22589 if (info)
22590 *info = simd_immediate_info (mode, val);
22591 return true;
22593 if (aarch64_bitmask_imm (val64, mode))
22595 /* DUPM. */
22596 if (info)
22597 *info = simd_immediate_info (mode, val);
22598 return true;
22600 return false;
22603 /* Return true if X is an UNSPEC_PTRUE constant of the form:
22605 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
22607 where PATTERN is the svpattern as a CONST_INT and where ZERO
22608 is a zero constant of the required PTRUE mode (which can have
22609 fewer elements than X's mode, if zero bits are significant).
22611 If so, and if INFO is nonnull, describe the immediate in INFO. */
22612 bool
22613 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
22615 if (GET_CODE (x) != CONST)
22616 return false;
22618 x = XEXP (x, 0);
22619 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
22620 return false;
22622 if (info)
22624 aarch64_svpattern pattern
22625 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
22626 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
22627 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
22628 *info = simd_immediate_info (int_mode, pattern);
22630 return true;
22633 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
22634 it to describe valid immediates. */
22636 static bool
22637 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
22639 if (aarch64_sve_ptrue_svpattern_p (x, info))
22640 return true;
22642 if (x == CONST0_RTX (GET_MODE (x)))
22644 if (info)
22645 *info = simd_immediate_info (DImode, 0);
22646 return true;
22649 /* Analyze the value as a VNx16BImode. This should be relatively
22650 efficient, since rtx_vector_builder has enough built-in capacity
22651 to store all VLA predicate constants without needing the heap. */
22652 rtx_vector_builder builder;
22653 if (!aarch64_get_sve_pred_bits (builder, x))
22654 return false;
22656 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
22657 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
22659 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
22660 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
22661 if (pattern != AARCH64_NUM_SVPATTERNS)
22663 if (info)
22665 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
22666 *info = simd_immediate_info (int_mode, pattern);
22668 return true;
22671 return false;
22674 /* Return true if OP is a valid SIMD immediate for the operation
22675 described by WHICH. If INFO is nonnull, use it to describe valid
22676 immediates. */
22677 bool
22678 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
22679 enum simd_immediate_check which)
22681 machine_mode mode = GET_MODE (op);
22682 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
22683 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
22684 return false;
22686 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
22687 return false;
22689 if (vec_flags == (VEC_SVE_PRED | VEC_STRUCT))
22690 return op == CONST0_RTX (mode) || op == CONSTM1_RTX (mode);
22692 if (vec_flags & VEC_SVE_PRED)
22693 return aarch64_sve_pred_valid_immediate (op, info);
22695 scalar_mode elt_mode = GET_MODE_INNER (mode);
22696 rtx base, step;
22697 unsigned int n_elts;
22698 if (CONST_VECTOR_P (op)
22699 && CONST_VECTOR_DUPLICATE_P (op))
22700 n_elts = CONST_VECTOR_NPATTERNS (op);
22701 else if ((vec_flags & VEC_SVE_DATA)
22702 && const_vec_series_p (op, &base, &step))
22704 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
22705 if (!aarch64_sve_index_immediate_p (base)
22706 || !aarch64_sve_index_immediate_p (step))
22707 return false;
22709 if (info)
22711 /* Get the corresponding container mode. E.g. an INDEX on V2SI
22712 should yield two integer values per 128-bit block, meaning
22713 that we need to treat it in the same way as V2DI and then
22714 ignore the upper 32 bits of each element. */
22715 elt_mode = aarch64_sve_container_int_mode (mode);
22716 *info = simd_immediate_info (elt_mode, base, step);
22718 return true;
22720 else if (CONST_VECTOR_P (op)
22721 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
22722 /* N_ELTS set above. */;
22723 else
22724 return false;
22726 scalar_float_mode elt_float_mode;
22727 if (n_elts == 1
22728 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
22730 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
22731 if (aarch64_float_const_zero_rtx_p (elt)
22732 || aarch64_float_const_representable_p (elt))
22734 if (info)
22735 *info = simd_immediate_info (elt_float_mode, elt);
22736 return true;
22740 /* If all elements in an SVE vector have the same value, we have a free
22741 choice between using the element mode and using the container mode.
22742 Using the element mode means that unused parts of the vector are
22743 duplicates of the used elements, while using the container mode means
22744 that the unused parts are an extension of the used elements. Using the
22745 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
22746 for its container mode VNx4SI while 0x00000101 isn't.
22748 If not all elements in an SVE vector have the same value, we need the
22749 transition from one element to the next to occur at container boundaries.
22750 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
22751 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
22752 scalar_int_mode elt_int_mode;
22753 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
22754 elt_int_mode = aarch64_sve_container_int_mode (mode);
22755 else
22756 elt_int_mode = int_mode_for_mode (elt_mode).require ();
22758 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
22759 if (elt_size > 8)
22760 return false;
22762 /* Expand the vector constant out into a byte vector, with the least
22763 significant byte of the register first. */
22764 auto_vec<unsigned char, 16> bytes;
22765 bytes.reserve (n_elts * elt_size);
22766 for (unsigned int i = 0; i < n_elts; i++)
22768 /* The vector is provided in gcc endian-neutral fashion.
22769 For aarch64_be Advanced SIMD, it must be laid out in the vector
22770 register in reverse order. */
22771 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
22772 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
22774 if (elt_mode != elt_int_mode)
22775 elt = gen_lowpart (elt_int_mode, elt);
22777 if (!CONST_INT_P (elt))
22778 return false;
22780 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
22781 for (unsigned int byte = 0; byte < elt_size; byte++)
22783 bytes.quick_push (elt_val & 0xff);
22784 elt_val >>= BITS_PER_UNIT;
22788 /* The immediate must repeat every eight bytes. */
22789 unsigned int nbytes = bytes.length ();
22790 for (unsigned i = 8; i < nbytes; ++i)
22791 if (bytes[i] != bytes[i - 8])
22792 return false;
22794 /* Get the repeating 8-byte value as an integer. No endian correction
22795 is needed here because bytes is already in lsb-first order. */
22796 unsigned HOST_WIDE_INT val64 = 0;
22797 for (unsigned int i = 0; i < 8; i++)
22798 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
22799 << (i * BITS_PER_UNIT));
22801 if (vec_flags & VEC_SVE_DATA)
22802 return aarch64_sve_valid_immediate (val64, info);
22803 else
22804 return aarch64_advsimd_valid_immediate (val64, info, which);
22807 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
22808 has a step in the range of INDEX. Return the index expression if so,
22809 otherwise return null. */
22811 aarch64_check_zero_based_sve_index_immediate (rtx x)
22813 rtx base, step;
22814 if (const_vec_series_p (x, &base, &step)
22815 && base == const0_rtx
22816 && aarch64_sve_index_immediate_p (step))
22817 return step;
22818 return NULL_RTX;
22821 /* Check that immediate shift constants are within range. */
22822 bool
22823 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
22825 x = unwrap_const_vec_duplicate (x);
22826 if (!CONST_INT_P (x))
22827 return false;
22828 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
22829 if (left)
22830 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
22831 else
22832 return IN_RANGE (INTVAL (x), 1, bit_width);
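/* For example, for V4SImode the unit is 32 bits, so a left-shift
   immediate must lie in [0, 31] while a right-shift immediate must lie
   in [1, 32], matching the ranges accepted by the SHL and SSHR/USHR
   encodings.  */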
22835 /* Return the bitmask CONST_INT to select the bits required by a zero extract
22836 operation of width WIDTH at bit position POS. */
22839 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
22841 gcc_assert (CONST_INT_P (width));
22842 gcc_assert (CONST_INT_P (pos));
22844 unsigned HOST_WIDE_INT mask
22845 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
22846 return GEN_INT (mask << UINTVAL (pos));
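/* Worked example (arbitrary operands): WIDTH == 8 and POS == 16 give
   ((1 << 8) - 1) << 16 == 0xff0000, i.e. a mask selecting bits 16-23.  */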
22849 bool
22850 aarch64_mov_operand_p (rtx x, machine_mode mode)
22852 if (GET_CODE (x) == HIGH
22853 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
22854 return true;
22856 if (CONST_INT_P (x))
22857 return true;
22859 if (VECTOR_MODE_P (GET_MODE (x)))
22861 /* Require predicate constants to be VNx16BI before RA, so that we
22862 force everything to have a canonical form. */
22863 if (!lra_in_progress
22864 && !reload_completed
22865 && aarch64_sve_pred_mode_p (GET_MODE (x))
22866 && known_eq (GET_MODE_SIZE (GET_MODE (x)), BYTES_PER_SVE_PRED)
22867 && GET_MODE (x) != VNx16BImode)
22868 return false;
22870 return aarch64_simd_valid_immediate (x, NULL);
22873 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
22874 x = strip_salt (x);
22876 /* GOT accesses are valid moves. */
22877 if (SYMBOL_REF_P (x)
22878 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
22879 return true;
22881 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
22882 return true;
22884 if (TARGET_SVE
22885 && (aarch64_sve_cnt_immediate_p (x)
22886 || aarch64_sve_rdvl_immediate_p (x)))
22887 return true;
22889 if (aarch64_rdsvl_immediate_p (x))
22890 return true;
22892 return aarch64_classify_symbolic_expression (x)
22893 == SYMBOL_TINY_ABSOLUTE;
22896 /* Return a function-invariant register that contains VALUE. *CACHED_INSN
22897 caches instructions that set up such registers, so that they can be
22898 reused by future calls. */
22900 static rtx
22901 aarch64_get_shareable_reg (rtx_insn **cached_insn, rtx value)
22903 rtx_insn *insn = *cached_insn;
22904 if (insn && INSN_P (insn) && !insn->deleted ())
22906 rtx pat = PATTERN (insn);
22907 if (GET_CODE (pat) == SET)
22909 rtx dest = SET_DEST (pat);
22910 if (REG_P (dest)
22911 && !HARD_REGISTER_P (dest)
22912 && rtx_equal_p (SET_SRC (pat), value))
22913 return dest;
22916 rtx reg = gen_reg_rtx (GET_MODE (value));
22917 *cached_insn = emit_insn_before (gen_rtx_SET (reg, value),
22918 function_beg_insn);
22919 return reg;
22922 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
22923 the constant creation. */
22926 aarch64_gen_shareable_zero (machine_mode mode)
22928 rtx reg = aarch64_get_shareable_reg (&cfun->machine->advsimd_zero_insn,
22929 CONST0_RTX (V4SImode));
22930 return lowpart_subreg (mode, reg, GET_MODE (reg));
22933 /* INSN is some form of extension or shift that can be split into a
22934 permutation involving a shared zero. Return true if we should
22935 perform such a split.
22937 ??? For now, make sure that the split instruction executes more
22938 frequently than the zero that feeds it. In future it would be good
22939 to split without that restriction and instead recombine shared zeros
22940 if they turn out not to be worthwhile. This would allow splits in
22941 single-block functions and would also cope more naturally with
22942 rematerialization. */
22944 bool
22945 aarch64_split_simd_shift_p (rtx_insn *insn)
22947 return (can_create_pseudo_p ()
22948 && optimize_bb_for_speed_p (BLOCK_FOR_INSN (insn))
22949 && (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count
22950 < BLOCK_FOR_INSN (insn)->count));
22953 /* Return a const_int vector of VAL. */
22955 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
22957 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
22958 return gen_const_vec_duplicate (mode, c);
22961 /* Check OP is a legal scalar immediate for the MOVI instruction. */
22963 bool
22964 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
22966 machine_mode vmode;
22968 vmode = aarch64_simd_container_mode (mode, 64);
22969 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
22970 return aarch64_simd_valid_immediate (op_v, NULL);
22973 /* Construct and return a PARALLEL RTX vector with elements numbering the
22974 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
22975 the vector - from the perspective of the architecture. This does not
22976 line up with GCC's perspective on lane numbers, so we end up with
22977 different masks depending on our target endian-ness. The diagram
22978 below may help. We must draw the distinction when building masks
22979 which select one half of the vector. An instruction selecting
22980 architectural low-lanes for a big-endian target, must be described using
22981 a mask selecting GCC high-lanes.
22983 Big-Endian Little-Endian
22985 GCC 0 1 2 3 3 2 1 0
22986 | x | x | x | x | | x | x | x | x |
22987 Architecture 3 2 1 0 3 2 1 0
22989 Low Mask: { 2, 3 } { 0, 1 }
22990 High Mask: { 0, 1 } { 2, 3 }
22992 MODE Is the mode of the vector and NUNITS is the number of units in it. */
22995 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
22997 rtvec v = rtvec_alloc (nunits / 2);
22998 int high_base = nunits / 2;
22999 int low_base = 0;
23000 int base;
23001 rtx t1;
23002 int i;
23004 if (BYTES_BIG_ENDIAN)
23005 base = high ? low_base : high_base;
23006 else
23007 base = high ? high_base : low_base;
23009 for (i = 0; i < nunits / 2; i++)
23010 RTVEC_ELT (v, i) = GEN_INT (base + i);
23012 t1 = gen_rtx_PARALLEL (mode, v);
23013 return t1;
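/* Concrete illustration for V4SImode with NUNITS == 4 and HIGH == true:
   on little-endian this returns (parallel [2 3]), on big-endian it
   returns (parallel [0 1]), matching the "High Mask" row in the diagram
   above.  */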
23016 /* Check OP for validity as a PARALLEL RTX vector with elements
23017 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
23018 from the perspective of the architecture. See the diagram above
23019 aarch64_simd_vect_par_cnst_half for more details. */
23021 bool
23022 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
23023 bool high)
23025 int nelts;
23026 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
23027 return false;
23029 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
23030 HOST_WIDE_INT count_op = XVECLEN (op, 0);
23031 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
23032 int i = 0;
23034 if (count_op != count_ideal)
23035 return false;
23037 for (i = 0; i < count_ideal; i++)
23039 rtx elt_op = XVECEXP (op, 0, i);
23040 rtx elt_ideal = XVECEXP (ideal, 0, i);
23042 if (!CONST_INT_P (elt_op)
23043 || INTVAL (elt_ideal) != INTVAL (elt_op))
23044 return false;
23046 return true;
23049 /* Return a PARALLEL containing NELTS elements, with element I equal
23050 to BASE + I * STEP. */
23053 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
23055 rtvec vec = rtvec_alloc (nelts);
23056 for (unsigned int i = 0; i < nelts; ++i)
23057 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
23058 return gen_rtx_PARALLEL (VOIDmode, vec);
23061 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
23062 series with step STEP. */
23064 bool
23065 aarch64_stepped_int_parallel_p (rtx op, int step)
23067 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
23068 return false;
23070 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
23071 for (int i = 1; i < XVECLEN (op, 0); ++i)
23072 if (!CONST_INT_P (XVECEXP (op, 0, i))
23073 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
23074 return false;
23076 return true;
23079 /* Return true if OPERANDS[0] to OPERANDS[NUM_OPERANDS - 1] form a
23080 sequence of strided registers, with the stride being equal STRIDE.
23081 The operands are already known to be FPRs. */
23082 bool
23083 aarch64_strided_registers_p (rtx *operands, unsigned int num_operands,
23084 unsigned int stride)
23086 for (unsigned int i = 1; i < num_operands; ++i)
23087 if (REGNO (operands[i]) != REGNO (operands[0]) + i * stride)
23088 return false;
23089 return true;
23092 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
23093 HIGH (exclusive). */
23094 void
23095 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
23096 const_tree exp)
23098 HOST_WIDE_INT lane;
23099 gcc_assert (CONST_INT_P (operand));
23100 lane = INTVAL (operand);
23102 if (lane < low || lane >= high)
23104 if (exp)
23105 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
23106 lane, low, high - 1);
23107 else
23108 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
23112 /* Perform endian correction on lane number N, which indexes a vector
23113 of mode MODE, and return the result as an SImode rtx. */
23116 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
23118 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
23121 /* Return TRUE if OP is a valid vector addressing mode. */
23123 bool
23124 aarch64_simd_mem_operand_p (rtx op)
23126 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
23127 || REG_P (XEXP (op, 0)));
23130 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
23132 bool
23133 aarch64_sve_ld1r_operand_p (rtx op)
23135 struct aarch64_address_info addr;
23136 scalar_mode mode;
23138 return (MEM_P (op)
23139 && is_a <scalar_mode> (GET_MODE (op), &mode)
23140 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
23141 && addr.type == ADDRESS_REG_IMM
23142 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
23145 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
23146 where the size of the read data is specified by `mode` and the size of the
23147 vector elements is specified by `elem_mode`. */
23148 bool
23149 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
23150 scalar_mode elem_mode)
23152 struct aarch64_address_info addr;
23153 if (!MEM_P (op)
23154 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
23155 return false;
23157 if (addr.type == ADDRESS_REG_IMM)
23158 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
23160 if (addr.type == ADDRESS_REG_REG)
23161 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
23163 return false;
23166 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
23167 bool
23168 aarch64_sve_ld1rq_operand_p (rtx op)
23170 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
23171 GET_MODE_INNER (GET_MODE (op)));
23174 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
23175 accessing a vector where the element size is specified by `elem_mode`. */
23176 bool
23177 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
23179 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
23182 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
23183 bool
23184 aarch64_sve_ldff1_operand_p (rtx op)
23186 if (!MEM_P (op))
23187 return false;
23189 struct aarch64_address_info addr;
23190 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
23191 return false;
23193 if (addr.type == ADDRESS_REG_IMM)
23194 return known_eq (addr.const_offset, 0);
23196 return addr.type == ADDRESS_REG_REG;
23199 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
23200 bool
23201 aarch64_sve_ldnf1_operand_p (rtx op)
23203 struct aarch64_address_info addr;
23205 return (MEM_P (op)
23206 && aarch64_classify_address (&addr, XEXP (op, 0),
23207 GET_MODE (op), false)
23208 && addr.type == ADDRESS_REG_IMM);
23211 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
23212 The conditions for STR are the same. */
23213 bool
23214 aarch64_sve_ldr_operand_p (rtx op)
23216 struct aarch64_address_info addr;
23218 return (MEM_P (op)
23219 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
23220 false, ADDR_QUERY_ANY)
23221 && addr.type == ADDRESS_REG_IMM);
23224 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
23225 addressing memory of mode MODE. */
23226 bool
23227 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
23229 struct aarch64_address_info addr;
23230 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
23231 return false;
23233 if (addr.type == ADDRESS_REG_IMM)
23234 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
23236 return addr.type == ADDRESS_REG_REG;
23239 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
23240 We need to be able to access the individual pieces, so the range
23241 is different from LD[234] and ST[234]. */
23242 bool
23243 aarch64_sve_struct_memory_operand_p (rtx op)
23245 if (!MEM_P (op))
23246 return false;
23248 machine_mode mode = GET_MODE (op);
23249 struct aarch64_address_info addr;
23250 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
23251 ADDR_QUERY_ANY)
23252 || addr.type != ADDRESS_REG_IMM)
23253 return false;
23255 poly_int64 first = addr.const_offset;
23256 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
23257 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
23258 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
23261 /* Return true if OFFSET is a constant integer and if VNUM is
23262 OFFSET * the number of bytes in an SVE vector. This is the requirement
23263 that exists in SME LDR and STR instructions, where the VL offset must
23264 equal the ZA slice offset. */
23265 bool
23266 aarch64_sme_ldr_vnum_offset_p (rtx offset, rtx vnum)
23268 if (!CONST_INT_P (offset) || !IN_RANGE (INTVAL (offset), 0, 15))
23269 return false;
23271 if (TARGET_STREAMING)
23273 poly_int64 const_vnum;
23274 return (poly_int_rtx_p (vnum, &const_vnum)
23275 && known_eq (const_vnum,
23276 INTVAL (offset) * BYTES_PER_SVE_VECTOR));
23278 else
23280 HOST_WIDE_INT factor;
23281 return (aarch64_sme_vq_unspec_p (vnum, &factor)
23282 && factor == INTVAL (offset) * 16);
23286 /* Emit a register copy from operand to operand, taking care not to
23287 early-clobber source registers in the process.
23289 COUNT is the number of components into which the copy needs to be
23290 decomposed. */
23291 void
23292 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
23293 unsigned int count)
23295 unsigned int i;
23296 int rdest = REGNO (operands[0]);
23297 int rsrc = REGNO (operands[1]);
23299 if (!reg_overlap_mentioned_p (operands[0], operands[1])
23300 || rdest < rsrc)
23301 for (i = 0; i < count; i++)
23302 emit_move_insn (gen_rtx_REG (mode, rdest + i),
23303 gen_rtx_REG (mode, rsrc + i));
23304 else
23305 for (i = 0; i < count; i++)
23306 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
23307 gen_rtx_REG (mode, rsrc + count - i - 1));
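/* Illustrative note (added for exposition, not part of the original
   sources): suppose COUNT = 2 and the destination list starts one
   register above the source, e.g. copying {V0, V1} to {V1, V2}.  The
   lists overlap and RDEST > RSRC, so the backwards loop above emits
   V2 <- V1 first and then V1 <- V0, reading V1 before it is
   overwritten; a forwards copy would have clobbered V1 first.  */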
23310 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
23311 one of VSTRUCT modes: OI, CI, or XI. */
23313 aarch64_simd_attr_length_rglist (machine_mode mode)
23315 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
23316 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
23319 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
23320 alignment of a vector to 128 bits. SVE predicates have an alignment of
23321 16 bits. */
23322 static HOST_WIDE_INT
23323 aarch64_simd_vector_alignment (const_tree type)
23325 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
23326 be set for non-predicate vectors of booleans. Modes are the most
23327 direct way we have of identifying real SVE predicate types. */
23328 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
23329 return 16;
23330 widest_int min_size
23331 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
23332 return wi::umin (min_size, 128).to_uhwi ();
23335 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
23336 static poly_uint64
23337 aarch64_vectorize_preferred_vector_alignment (const_tree type)
23339 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
23341 /* If the length of the vector is a fixed power of 2, try to align
23342 to that length, otherwise don't try to align at all. */
23343 HOST_WIDE_INT result;
23344 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
23345 || !pow2p_hwi (result))
23346 result = TYPE_ALIGN (TREE_TYPE (type));
23347 return result;
23349 return TYPE_ALIGN (type);
23352 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
23353 static bool
23354 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
23356 if (is_packed)
23357 return false;
23359 /* For fixed-length vectors, check that the vectorizer will aim for
23360 full-vector alignment. This isn't true for generic GCC vectors
23361 that are wider than the ABI maximum of 128 bits. */
23362 poly_uint64 preferred_alignment =
23363 aarch64_vectorize_preferred_vector_alignment (type);
23364 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
23365 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
23366 preferred_alignment))
23367 return false;
23369 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
23370 return true;
23373 /* Return true if the vector misalignment factor is supported by the
23374 target. */
23375 static bool
23376 aarch64_builtin_support_vector_misalignment (machine_mode mode,
23377 const_tree type, int misalignment,
23378 bool is_packed)
23380 if (TARGET_SIMD && STRICT_ALIGNMENT)
23382 /* Return false if the movmisalign pattern is not supported for this mode. */
23383 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
23384 return false;
23386 /* Misalignment factor is unknown at compile time. */
23387 if (misalignment == -1)
23388 return false;
23390 return default_builtin_support_vector_misalignment (mode, type, misalignment,
23391 is_packed);
23394 /* If VALS is a vector constant that can be loaded into a register
23395 using DUP, generate instructions to do so and return an RTX to
23396 assign to the register. Otherwise return NULL_RTX. */
23397 static rtx
23398 aarch64_simd_dup_constant (rtx vals)
23400 machine_mode mode = GET_MODE (vals);
23401 machine_mode inner_mode = GET_MODE_INNER (mode);
23402 rtx x;
23404 if (!const_vec_duplicate_p (vals, &x))
23405 return NULL_RTX;
23407 /* We can load this constant by using DUP and a constant in a
23408 single ARM register. This will be cheaper than a vector
23409 load. */
23410 x = force_reg (inner_mode, x);
23411 return gen_vec_duplicate (mode, x);
23415 /* Generate code to load VALS, which is a PARALLEL containing only
23416 constants (for vec_init) or CONST_VECTOR, efficiently into a
23417 register. Returns an RTX to copy into the register, or NULL_RTX
23418 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
23419 static rtx
23420 aarch64_simd_make_constant (rtx vals)
23422 machine_mode mode = GET_MODE (vals);
23423 rtx const_dup;
23424 rtx const_vec = NULL_RTX;
23425 int n_const = 0;
23426 int i;
23428 if (CONST_VECTOR_P (vals))
23429 const_vec = vals;
23430 else if (GET_CODE (vals) == PARALLEL)
23432 /* A CONST_VECTOR must contain only CONST_INTs and
23433 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
23434 Only store valid constants in a CONST_VECTOR. */
23435 int n_elts = XVECLEN (vals, 0);
23436 for (i = 0; i < n_elts; ++i)
23438 rtx x = XVECEXP (vals, 0, i);
23439 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23440 n_const++;
23442 if (n_const == n_elts)
23443 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
23445 else
23446 gcc_unreachable ();
23448 if (const_vec != NULL_RTX
23449 && aarch64_simd_valid_immediate (const_vec, NULL))
23450 /* Load using MOVI/MVNI. */
23451 return const_vec;
23452 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
23453 /* Loaded using DUP. */
23454 return const_dup;
23455 else if (const_vec != NULL_RTX)
23456 /* Load from constant pool. We cannot take advantage of single-cycle
23457 LD1 because we need a PC-relative addressing mode. */
23458 return const_vec;
23459 else
23460 /* A PARALLEL containing something not valid inside CONST_VECTOR.
23461 We cannot construct an initializer. */
23462 return NULL_RTX;
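/* Illustrative note (added for exposition; the exact instruction chosen
   depends on aarch64_simd_valid_immediate): for V4SImode,
   {1, 1, 1, 1} is a valid SIMD immediate and can be materialised with a
   single MOVI; {0x12345678, 0x12345678, 0x12345678, 0x12345678} is not
   an immediate but is a duplicate, so it is built with a scalar
   register move followed by DUP; and {1, 2, 3, 4} falls through to a
   literal-pool load.  */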
23465 /* A subroutine of aarch64_expand_vector_init, with the same interface.
23466 The caller has already tried a divide-and-conquer approach, so do
23467 not consider that case here. */
23469 void
23470 aarch64_expand_vector_init_fallback (rtx target, rtx vals)
23472 machine_mode mode = GET_MODE (target);
23473 scalar_mode inner_mode = GET_MODE_INNER (mode);
23474 /* The number of vector elements. */
23475 int n_elts = XVECLEN (vals, 0);
23476 /* The number of vector elements which are not constant. */
23477 int n_var = 0;
23478 rtx any_const = NULL_RTX;
23479 /* The first element of vals. */
23480 rtx v0 = XVECEXP (vals, 0, 0);
23481 bool all_same = true;
23483 /* This is a special vec_init<M><N> where N is not an element mode but a
23484 vector mode with half the elements of M. We expect to find two entries
23485 of mode N in VALS and we must put their concatenation into TARGET. */
23486 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
23488 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
23489 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
23490 && known_eq (GET_MODE_SIZE (mode),
23491 2 * GET_MODE_SIZE (narrow_mode)));
23492 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
23493 XVECEXP (vals, 0, 0),
23494 XVECEXP (vals, 0, 1)));
23495 return;
23498 /* Count the number of variable elements to initialise. */
23499 for (int i = 0; i < n_elts; ++i)
23501 rtx x = XVECEXP (vals, 0, i);
23502 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
23503 ++n_var;
23504 else
23505 any_const = x;
23507 all_same &= rtx_equal_p (x, v0);
23510 /* No variable elements, hand off to aarch64_simd_make_constant which knows
23511 how best to handle this. */
23512 if (n_var == 0)
23514 rtx constant = aarch64_simd_make_constant (vals);
23515 if (constant != NULL_RTX)
23517 emit_move_insn (target, constant);
23518 return;
23522 /* Splat a single non-constant element if we can. */
23523 if (all_same)
23525 rtx x = force_reg (inner_mode, v0);
23526 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23527 return;
23530 enum insn_code icode = optab_handler (vec_set_optab, mode);
23531 gcc_assert (icode != CODE_FOR_nothing);
23533 /* If there are only variable elements, try to optimize
23534 the insertion using dup for the most common element
23535 followed by insertions. */
23537 /* The algorithm will fill matches[*][0] with the earliest matching element,
23538 and matches[X][1] with the count of duplicate elements (if X is the
23539 earliest element which has duplicates). */
23541 if (n_var >= n_elts - 1 && n_elts <= 16)
23543 int matches[16][2] = {0};
23544 for (int i = 0; i < n_elts; i++)
23546 for (int j = 0; j <= i; j++)
23548 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
23550 matches[i][0] = j;
23551 matches[j][1]++;
23552 break;
23556 int maxelement = 0;
23557 int maxv = 0;
23558 rtx const_elem = NULL_RTX;
23559 int const_elem_pos = 0;
23561 for (int i = 0; i < n_elts; i++)
23563 if (matches[i][1] > maxv)
23565 maxelement = i;
23566 maxv = matches[i][1];
23568 if (CONST_INT_P (XVECEXP (vals, 0, i))
23569 || CONST_DOUBLE_P (XVECEXP (vals, 0, i)))
23571 const_elem_pos = i;
23572 const_elem = XVECEXP (vals, 0, i);
23576 /* Create a duplicate of the most common element, unless all elements
23577 are equally useless to us, in which case just immediately set the
23578 vector register using the first element. */
23580 if (maxv == 1)
23582 /* For vectors of two 64-bit elements, we can do even better. */
23583 if (n_elts == 2
23584 && (inner_mode == E_DImode
23585 || inner_mode == E_DFmode))
23588 rtx x0 = XVECEXP (vals, 0, 0);
23589 rtx x1 = XVECEXP (vals, 0, 1);
23590 /* Combine can pick up this case, but handling it directly
23591 here leaves clearer RTL.
23593 This is load_pair_lanes<mode>, and also gives us a clean-up
23594 for store_pair_lanes<mode>. */
23595 if (memory_operand (x0, inner_mode)
23596 && memory_operand (x1, inner_mode)
23597 && aarch64_mergeable_load_pair_p (mode, x0, x1))
23599 rtx t;
23600 if (inner_mode == DFmode)
23601 t = gen_load_pair_lanesdf (target, x0, x1);
23602 else
23603 t = gen_load_pair_lanesdi (target, x0, x1);
23604 emit_insn (t);
23605 return;
23608 /* The subreg-move sequence below will move into lane zero of the
23609 vector register. For big-endian we want that position to hold
23610 the last element of VALS. */
23611 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
23613 /* If we have a single constant element, use that for duplicating
23614 instead. */
23615 if (const_elem)
23617 maxelement = const_elem_pos;
23618 aarch64_emit_move (target, gen_vec_duplicate (mode, const_elem));
23620 else
23622 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23623 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
23626 else
23628 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
23629 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
23632 /* Insert the rest. */
23633 for (int i = 0; i < n_elts; i++)
23635 rtx x = XVECEXP (vals, 0, i);
23636 if (matches[i][0] == maxelement)
23637 continue;
23638 x = force_reg (inner_mode, x);
23639 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
23641 return;
23644 /* Initialise a vector which is part-variable. We want to first try
23645 to build those lanes which are constant in the most efficient way we
23646 can. */
23647 if (n_var != n_elts)
23649 rtx copy = copy_rtx (vals);
23651 /* Load constant part of vector. We really don't care what goes into the
23652 parts we will overwrite, but we're more likely to be able to load the
23653 constant efficiently if it has fewer, larger, repeating parts
23654 (see aarch64_simd_valid_immediate). */
23655 for (int i = 0; i < n_elts; i++)
23657 rtx x = XVECEXP (vals, 0, i);
23658 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23659 continue;
23660 rtx subst = any_const;
23661 for (int bit = n_elts / 2; bit > 0; bit /= 2)
23663 /* Look in the copied vector, as more elements are const. */
23664 rtx test = XVECEXP (copy, 0, i ^ bit);
23665 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
23667 subst = test;
23668 break;
23671 XVECEXP (copy, 0, i) = subst;
23673 aarch64_expand_vector_init_fallback (target, copy);
23676 /* Insert the variable lanes directly. */
23677 for (int i = 0; i < n_elts; i++)
23679 rtx x = XVECEXP (vals, 0, i);
23680 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
23681 continue;
23682 x = force_reg (inner_mode, x);
23683 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
23687 /* Return even or odd half of VALS depending on EVEN_P. */
23689 static rtx
23690 aarch64_unzip_vector_init (machine_mode mode, rtx vals, bool even_p)
23692 int n = XVECLEN (vals, 0);
23693 machine_mode new_mode
23694 = aarch64_simd_container_mode (GET_MODE_INNER (mode),
23695 GET_MODE_BITSIZE (mode).to_constant () / 2);
23696 rtvec vec = rtvec_alloc (n / 2);
23697 for (int i = 0; i < n / 2; i++)
23698 RTVEC_ELT (vec, i) = (even_p) ? XVECEXP (vals, 0, 2 * i)
23699 : XVECEXP (vals, 0, 2 * i + 1);
23700 return gen_rtx_PARALLEL (new_mode, vec);
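/* Illustrative note (added for exposition, not part of the original
   sources): for a V4SImode PARALLEL {a, b, c, d},
   aarch64_unzip_vector_init returns the V2SImode PARALLEL {a, c} when
   EVEN_P is true and {b, d} when EVEN_P is false; the element mode is
   unchanged and only the container is halved.  */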
23703 /* Return true if SET is a scalar move. */
23705 static bool
23706 scalar_move_insn_p (rtx set)
23708 rtx src = SET_SRC (set);
23709 rtx dest = SET_DEST (set);
23710 return (is_a<scalar_mode> (GET_MODE (dest))
23711 && aarch64_mov_operand (src, GET_MODE (dest)));
23714 /* Similar to seq_cost, but ignore cost for scalar moves. */
23716 static unsigned
23717 seq_cost_ignoring_scalar_moves (const rtx_insn *seq, bool speed)
23719 unsigned cost = 0;
23721 for (; seq; seq = NEXT_INSN (seq))
23722 if (NONDEBUG_INSN_P (seq))
23724 if (rtx set = single_set (seq))
23726 if (!scalar_move_insn_p (set))
23727 cost += set_rtx_cost (set, speed);
23729 else
23731 int this_cost = insn_cost (CONST_CAST_RTX_INSN (seq), speed);
23732 if (this_cost > 0)
23733 cost += this_cost;
23734 else
23735 cost++;
23739 return cost;
23742 /* Expand a vector initialization sequence, such that TARGET is
23743 initialized to contain VALS. */
23745 void
23746 aarch64_expand_vector_init (rtx target, rtx vals)
23748 /* Try decomposing the initializer into even and odd halves and
23749 then ZIP them together. Use the resulting sequence if it is
23750 strictly cheaper than loading VALS directly.
23752 Prefer the fallback sequence in the event of a tie, since it
23753 will tend to use fewer registers. */
23755 machine_mode mode = GET_MODE (target);
23756 int n_elts = XVECLEN (vals, 0);
23758 if (n_elts < 4
23759 || maybe_ne (GET_MODE_BITSIZE (mode), 128))
23761 aarch64_expand_vector_init_fallback (target, vals);
23762 return;
23765 start_sequence ();
23766 rtx halves[2];
23767 unsigned costs[2];
23768 for (int i = 0; i < 2; i++)
23770 start_sequence ();
23771 rtx new_vals = aarch64_unzip_vector_init (mode, vals, i == 0);
23772 rtx tmp_reg = gen_reg_rtx (GET_MODE (new_vals));
23773 aarch64_expand_vector_init (tmp_reg, new_vals);
23774 halves[i] = gen_rtx_SUBREG (mode, tmp_reg, 0);
23775 rtx_insn *rec_seq = get_insns ();
23776 end_sequence ();
23777 costs[i] = seq_cost_ignoring_scalar_moves (rec_seq, !optimize_size);
23778 emit_insn (rec_seq);
23781 rtvec v = gen_rtvec (2, halves[0], halves[1]);
23782 rtx_insn *zip1_insn
23783 = emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
23784 unsigned seq_total_cost
23785 = (!optimize_size) ? std::max (costs[0], costs[1]) : costs[0] + costs[1];
23786 seq_total_cost += insn_cost (zip1_insn, !optimize_size);
23788 rtx_insn *seq = get_insns ();
23789 end_sequence ();
23791 start_sequence ();
23792 aarch64_expand_vector_init_fallback (target, vals);
23793 rtx_insn *fallback_seq = get_insns ();
23794 unsigned fallback_seq_cost
23795 = seq_cost_ignoring_scalar_moves (fallback_seq, !optimize_size);
23796 end_sequence ();
23798 emit_insn (seq_total_cost < fallback_seq_cost ? seq : fallback_seq);
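/* Illustrative note (added for exposition, not part of the original
   sources): for a 128-bit V4SImode initialiser {a, b, c, d}, the code
   above builds the even half {a, c} and the odd half {b, d} as 64-bit
   vectors and interleaves them with ZIP1 to recreate {a, b, c, d}.
   That candidate is emitted only when its cost (ignoring scalar moves)
   is strictly lower than the cost of the direct fallback expansion;
   ties go to the fallback, which tends to use fewer registers.  */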
23801 /* Emit RTL corresponding to:
23802 insr TARGET, ELEM. */
23804 static void
23805 emit_insr (rtx target, rtx elem)
23807 machine_mode mode = GET_MODE (target);
23808 scalar_mode elem_mode = GET_MODE_INNER (mode);
23809 elem = force_reg (elem_mode, elem);
23811 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
23812 gcc_assert (icode != CODE_FOR_nothing);
23813 emit_insn (GEN_FCN (icode) (target, target, elem));
23816 /* Subroutine of aarch64_sve_expand_vector_init for handling
23817 trailing constants.
23818 This function works as follows:
23819 (a) Create a new vector consisting of trailing constants.
23820 (b) Initialize TARGET with the constant vector using emit_move_insn.
23821 (c) Insert remaining elements in TARGET using insr.
23822 NELTS is the total number of elements in the original vector, while
23823 NELTS_REQD is the number of elements that are actually
23824 significant.
23826 ??? The heuristic used is to do the above only if the number of constants
23827 is at least half the total number of elements. May need fine tuning. */
23829 static bool
23830 aarch64_sve_expand_vector_init_handle_trailing_constants
23831 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
23833 machine_mode mode = GET_MODE (target);
23834 scalar_mode elem_mode = GET_MODE_INNER (mode);
23835 int n_trailing_constants = 0;
23837 for (int i = nelts_reqd - 1;
23838 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
23839 i--)
23840 n_trailing_constants++;
23842 if (n_trailing_constants >= nelts_reqd / 2)
23844 /* Try to use the natural pattern of BUILDER to extend the trailing
23845 constant elements to a full vector. Replace any variables in the
23846 extra elements with zeros.
23848 ??? It would be better if the builders supported "don't care"
23849 elements, with the builder filling in whichever elements
23850 give the most compact encoding. */
23851 rtx_vector_builder v (mode, nelts, 1);
23852 for (int i = 0; i < nelts; i++)
23854 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
23855 if (!valid_for_const_vector_p (elem_mode, x))
23856 x = CONST0_RTX (elem_mode);
23857 v.quick_push (x);
23859 rtx const_vec = v.build ();
23860 emit_move_insn (target, const_vec);
23862 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
23863 emit_insr (target, builder.elt (i));
23865 return true;
23868 return false;
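/* Illustrative sketch (added for exposition; the padding details follow
   from the code above): for BUILDER = {a, b, 1, 2} with
   NELTS = NELTS_REQD = 4, the two trailing constants meet the
   "at least half" threshold.  TARGET is first loaded with a constant
   vector starting {1, 2, 0, 0, ...} (the trailing constants, with
   variable positions replaced by zero), and the leading variable
   elements are then inserted in reverse order:

     insr  target, b        (significant elements now {b, 1, 2, 0})
     insr  target, a        (significant elements now {a, b, 1, 2})

   Each INSR shifts the vector up by one element and writes the scalar
   into element 0.  */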
23871 /* Subroutine of aarch64_sve_expand_vector_init.
23872 Works as follows:
23873 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
23874 (b) Skip trailing elements from BUILDER, which are the same as
23875 element NELTS_REQD - 1.
23876 (c) Insert earlier elements in reverse order in TARGET using insr. */
23878 static void
23879 aarch64_sve_expand_vector_init_insert_elems (rtx target,
23880 const rtx_vector_builder &builder,
23881 int nelts_reqd)
23883 machine_mode mode = GET_MODE (target);
23884 scalar_mode elem_mode = GET_MODE_INNER (mode);
23886 struct expand_operand ops[2];
23887 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
23888 gcc_assert (icode != CODE_FOR_nothing);
23890 create_output_operand (&ops[0], target, mode);
23891 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
23892 expand_insn (icode, 2, ops);
23894 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
23895 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
23896 emit_insr (target, builder.elt (i));
23899 /* Subroutine of aarch64_sve_expand_vector_init to handle the case
23900 when all trailing elements of BUILDER are the same.
23901 This works as follows:
23902 (a) Use expand_insn interface to broadcast last vector element in TARGET.
23903 (b) Insert remaining elements in TARGET using insr.
23905 ??? The heuristic used is to do the above if the number of identical
23906 trailing elements is at least 3/4 of the total number of elements,
23907 loosely based on the heuristic from mostly_zeros_p. May need fine-tuning. */
23909 static bool
23910 aarch64_sve_expand_vector_init_handle_trailing_same_elem
23911 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
23913 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
23914 if (ndups >= (3 * nelts_reqd) / 4)
23916 aarch64_sve_expand_vector_init_insert_elems (target, builder,
23917 nelts_reqd - ndups + 1);
23918 return true;
23921 return false;
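/* Illustrative note (added for exposition, not part of the original
   sources): for BUILDER = {a, b, c, c, c, c, c, c} with NELTS_REQD = 8,
   the six trailing copies of c meet the 3/4 threshold, so the expansion
   is

     dup   target, c
     insr  target, b
     insr  target, a

   which yields {a, b, c, c, c, c, c, c}, mirroring the worked example
   in the comment before aarch64_sve_expand_vector_init below.  */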
23924 /* Initialize register TARGET from BUILDER. NELTS is the constant number
23925 of elements in BUILDER.
23927 The function tries to initialize TARGET from BUILDER if it fits one
23928 of the special cases outlined below.
23930 Failing that, the function divides BUILDER into two sub-vectors:
23931 v_even = even elements of BUILDER;
23932 v_odd = odd elements of BUILDER;
23934 and recursively calls itself with v_even and v_odd.
23936 if (recursive call succeeded for v_even or v_odd)
23937 TARGET = zip (v_even, v_odd)
23939 The function returns true if it managed to build TARGET from BUILDER
23940 with one of the special cases, false otherwise.
23942 Example: {a, 1, b, 2, c, 3, d, 4}
23944 The vector gets divided into:
23945 v_even = {a, b, c, d}
23946 v_odd = {1, 2, 3, 4}
23948 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
23949 initializes tmp2 from the constant vector v_odd using emit_move_insn.
23951 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
23952 4 elements, so we construct tmp1 from v_even using insr:
23953 tmp1 = dup(d)
23954 insr tmp1, c
23955 insr tmp1, b
23956 insr tmp1, a
23958 And finally:
23959 TARGET = zip (tmp1, tmp2)
23960 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
23962 static bool
23963 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
23964 int nelts, int nelts_reqd)
23966 machine_mode mode = GET_MODE (target);
23968 /* Case 1: Vector contains trailing constants. */
23970 if (aarch64_sve_expand_vector_init_handle_trailing_constants
23971 (target, builder, nelts, nelts_reqd))
23972 return true;
23974 /* Case 2: Vector contains leading constants. */
23976 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
23977 for (int i = 0; i < nelts_reqd; i++)
23978 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
23979 rev_builder.finalize ();
23981 if (aarch64_sve_expand_vector_init_handle_trailing_constants
23982 (target, rev_builder, nelts, nelts_reqd))
23984 emit_insn (gen_aarch64_sve_rev (mode, target, target));
23985 return true;
23988 /* Case 3: Vector contains trailing same element. */
23990 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
23991 (target, builder, nelts_reqd))
23992 return true;
23994 /* Case 4: Vector contains leading same element. */
23996 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
23997 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
23999 emit_insn (gen_aarch64_sve_rev (mode, target, target));
24000 return true;
24003 /* Avoid recursing below 4 elements.
24004 ??? The threshold 4 may need fine-tuning. */
24006 if (nelts_reqd <= 4)
24007 return false;
24009 rtx_vector_builder v_even (mode, nelts, 1);
24010 rtx_vector_builder v_odd (mode, nelts, 1);
24012 for (int i = 0; i < nelts * 2; i += 2)
24014 v_even.quick_push (builder.elt (i));
24015 v_odd.quick_push (builder.elt (i + 1));
24018 v_even.finalize ();
24019 v_odd.finalize ();
24021 rtx tmp1 = gen_reg_rtx (mode);
24022 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
24023 nelts, nelts_reqd / 2);
24025 rtx tmp2 = gen_reg_rtx (mode);
24026 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
24027 nelts, nelts_reqd / 2);
24029 if (!did_even_p && !did_odd_p)
24030 return false;
24032 /* Initialize v_even and v_odd using INSR if it didn't match any of the
24033 special cases and zip v_even, v_odd. */
24035 if (!did_even_p)
24036 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
24038 if (!did_odd_p)
24039 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
24041 rtvec v = gen_rtvec (2, tmp1, tmp2);
24042 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
24043 return true;
24046 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
24048 void
24049 aarch64_sve_expand_vector_init (rtx target, rtx vals)
24051 machine_mode mode = GET_MODE (target);
24052 int nelts = XVECLEN (vals, 0);
24054 rtx_vector_builder v (mode, nelts, 1);
24055 for (int i = 0; i < nelts; i++)
24056 v.quick_push (XVECEXP (vals, 0, i));
24057 v.finalize ();
24059 /* If neither sub-vector of v could be initialized specially,
24060 then use INSR to insert all elements from v into TARGET.
24061 ??? This might not be optimal for vectors with large
24062 initializers of 16 elements or more.
24063 For nelts < 4, it probably isn't useful to handle specially. */
24065 if (nelts < 4
24066 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
24067 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
24070 /* Check whether VALUE is a vector constant in which every element
24071 is either a power of 2 or a negated power of 2. If so, return
24072 a constant vector of log2s, and flip CODE between PLUS and MINUS
24073 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
24075 static rtx
24076 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
24078 if (!CONST_VECTOR_P (value))
24079 return NULL_RTX;
24081 rtx_vector_builder builder;
24082 if (!builder.new_unary_operation (GET_MODE (value), value, false))
24083 return NULL_RTX;
24085 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
24086 /* 1 if the result of the multiplication must be negated,
24087 0 if it mustn't, or -1 if we don't yet care. */
24088 int negate = -1;
24089 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
24090 for (unsigned int i = 0; i < encoded_nelts; ++i)
24092 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
24093 if (!CONST_SCALAR_INT_P (elt))
24094 return NULL_RTX;
24095 rtx_mode_t val (elt, int_mode);
24096 wide_int pow2 = wi::neg (val);
24097 if (val != pow2)
24099 /* It matters whether we negate or not. Make that choice,
24100 and make sure that it's consistent with previous elements. */
24101 if (negate == !wi::neg_p (val))
24102 return NULL_RTX;
24103 negate = wi::neg_p (val);
24104 if (!negate)
24105 pow2 = val;
24107 /* POW2 is now the value that we want to be a power of 2. */
24108 int shift = wi::exact_log2 (pow2);
24109 if (shift < 0)
24110 return NULL_RTX;
24111 builder.quick_push (gen_int_mode (shift, int_mode));
24113 if (negate == -1)
24114 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
24115 code = PLUS;
24116 else if (negate == 1)
24117 code = code == PLUS ? MINUS : PLUS;
24118 return builder.build ();
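/* Illustrative note (added for exposition, not part of the original
   sources): if the multiplier vector is {4, 4, 4, 4}, every element is
   2^2, so the function returns the shift vector {2, 2, 2, 2} and leaves
   CODE unchanged; the callers below then turn x * 4 + z into
   z + (x << 2).  If the multiplier is {-8, -8, -8, -8}, the returned
   shifts are {3, 3, 3, 3} and CODE is flipped from PLUS to MINUS,
   giving z - (x << 3).  */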
24121 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
24122 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
24123 operands array, in the same order as for fma_optab. Return true if
24124 the function emitted all the necessary instructions, false if the caller
24125 should generate the pattern normally with the new OPERANDS array. */
24127 bool
24128 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
24130 machine_mode mode = GET_MODE (operands[0]);
24131 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
24133 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
24134 NULL_RTX, true, OPTAB_DIRECT);
24135 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
24136 operands[3], product, operands[0], true,
24137 OPTAB_DIRECT);
24138 return true;
24140 operands[2] = force_reg (mode, operands[2]);
24141 return false;
24144 /* Likewise, but for a conditional pattern. */
24146 bool
24147 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
24149 machine_mode mode = GET_MODE (operands[0]);
24150 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
24152 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
24153 NULL_RTX, true, OPTAB_DIRECT);
24154 emit_insn (gen_cond (code, mode, operands[0], operands[1],
24155 operands[4], product, operands[5]));
24156 return true;
24158 operands[3] = force_reg (mode, operands[3]);
24159 return false;
24162 static unsigned HOST_WIDE_INT
24163 aarch64_shift_truncation_mask (machine_mode mode)
24165 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
24166 return 0;
24167 return GET_MODE_UNIT_BITSIZE (mode) - 1;
24170 /* Select a format to encode pointers in exception handling data. */
24172 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
24174 int type;
24175 switch (aarch64_cmodel)
24177 case AARCH64_CMODEL_TINY:
24178 case AARCH64_CMODEL_TINY_PIC:
24179 case AARCH64_CMODEL_SMALL:
24180 case AARCH64_CMODEL_SMALL_PIC:
24181 case AARCH64_CMODEL_SMALL_SPIC:
24182 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
24183 for everything. */
24184 type = DW_EH_PE_sdata4;
24185 break;
24186 default:
24187 /* No assumptions here. 8-byte relocs required. */
24188 type = DW_EH_PE_sdata8;
24189 break;
24191 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
24194 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
24196 static void
24197 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
24199 if (TREE_CODE (decl) == FUNCTION_DECL)
24201 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
24202 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
24204 fprintf (stream, "\t.variant_pcs\t");
24205 assemble_name (stream, name);
24206 fprintf (stream, "\n");
24211 /* The last .arch and .tune assembly strings that we printed. */
24212 static std::string aarch64_last_printed_arch_string;
24213 static std::string aarch64_last_printed_tune_string;
24215 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
24216 by the function fndecl. */
24218 void
24219 aarch64_declare_function_name (FILE *stream, const char* name,
24220 tree fndecl)
24222 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
24224 struct cl_target_option *targ_options;
24225 if (target_parts)
24226 targ_options = TREE_TARGET_OPTION (target_parts);
24227 else
24228 targ_options = TREE_TARGET_OPTION (target_option_current_node);
24229 gcc_assert (targ_options);
24231 const struct processor *this_arch
24232 = aarch64_get_arch (targ_options->x_selected_arch);
24234 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
24235 std::string extension
24236 = aarch64_get_extension_string_for_isa_flags (isa_flags,
24237 this_arch->flags);
24238 /* Only update the assembler .arch string if it is distinct from the last
24239 such string we printed. */
24240 std::string to_print = this_arch->name + extension;
24241 if (to_print != aarch64_last_printed_arch_string)
24243 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
24244 aarch64_last_printed_arch_string = to_print;
24247 /* Print the cpu name we're tuning for in the comments; this might be
24248 useful to readers of the generated asm. Do it only when it changes
24249 from function to function and verbose assembly is requested. */
24250 const struct processor *this_tune
24251 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
24253 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
24255 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
24256 this_tune->name);
24257 aarch64_last_printed_tune_string = this_tune->name;
24260 aarch64_asm_output_variant_pcs (stream, fndecl, name);
24262 /* Don't forget the type directive for ELF. */
24263 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
24264 ASM_OUTPUT_FUNCTION_LABEL (stream, name, fndecl);
24266 cfun->machine->label_is_assembled = true;
24269 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
24271 void
24272 aarch64_print_patchable_function_entry (FILE *file,
24273 unsigned HOST_WIDE_INT patch_area_size,
24274 bool record_p)
24276 if (!cfun->machine->label_is_assembled)
24278 /* Emit the patching area before the entry label, if any. */
24279 default_print_patchable_function_entry (file, patch_area_size,
24280 record_p);
24281 return;
24284 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
24285 GEN_INT (record_p));
24286 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
24288 if (!aarch_bti_enabled ()
24289 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
24291 /* Emit the patchable_area at the beginning of the function. */
24292 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
24293 INSN_ADDRESSES_NEW (insn, -1);
24294 return;
24297 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
24298 if (!insn
24299 || !INSN_P (insn)
24300 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
24301 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
24303 /* Emit a BTI_C. */
24304 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
24307 /* Emit the patchable_area after BTI_C. */
24308 insn = emit_insn_after (pa, insn);
24309 INSN_ADDRESSES_NEW (insn, -1);
24312 /* Output patchable area. */
24314 void
24315 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
24317 default_print_patchable_function_entry (asm_out_file, patch_area_size,
24318 record_p);
24321 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
24323 void
24324 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
24326 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
24327 const char *value = IDENTIFIER_POINTER (target);
24328 aarch64_asm_output_variant_pcs (stream, decl, name);
24329 ASM_OUTPUT_DEF (stream, name, value);
24332 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
24333 function symbol references. */
24335 void
24336 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
24338 default_elf_asm_output_external (stream, decl, name);
24339 aarch64_asm_output_variant_pcs (stream, decl, name);
24342 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
24343 Used to output the .cfi_b_key_frame directive when signing the current
24344 function with the B key. */
24346 void
24347 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
24349 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
24350 && aarch_ra_sign_key == AARCH_KEY_B)
24351 asm_fprintf (f, "\t.cfi_b_key_frame\n");
24354 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
24356 static void
24357 aarch64_start_file (void)
24359 struct cl_target_option *default_options
24360 = TREE_TARGET_OPTION (target_option_default_node);
24362 const struct processor *default_arch
24363 = aarch64_get_arch (default_options->x_selected_arch);
24364 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
24365 std::string extension
24366 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
24367 default_arch->flags);
24369 aarch64_last_printed_arch_string = default_arch->name + extension;
24370 aarch64_last_printed_tune_string = "";
24371 asm_fprintf (asm_out_file, "\t.arch %s\n",
24372 aarch64_last_printed_arch_string.c_str ());
24374 default_file_start ();
24377 /* Emit load exclusive. */
24379 static void
24380 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
24381 rtx mem, rtx model_rtx)
24383 if (mode == TImode)
24384 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
24385 gen_highpart (DImode, rval),
24386 mem, model_rtx));
24387 else
24388 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
24391 /* Emit store exclusive. */
24393 static void
24394 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
24395 rtx mem, rtx rval, rtx model_rtx)
24397 if (mode == TImode)
24398 emit_insn (gen_aarch64_store_exclusive_pair
24399 (bval, mem, operand_subword (rval, 0, 0, TImode),
24400 operand_subword (rval, 1, 0, TImode), model_rtx));
24401 else
24402 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
24405 /* Emit jump instruction INSN and mark it as unlikely to be taken. */
24407 static void
24408 aarch64_emit_unlikely_jump (rtx insn)
24410 rtx_insn *jump = emit_jump_insn (insn);
24411 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
24414 /* We store the names of the various atomic helpers in a 5x5 array.
24415 Return the libcall function given MODE, MODEL and NAMES. */
24418 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
24419 const atomic_ool_names *names)
24421 memmodel model = memmodel_from_int (INTVAL (model_rtx));
24422 int mode_idx, model_idx;
24424 switch (mode)
24426 case E_QImode:
24427 mode_idx = 0;
24428 break;
24429 case E_HImode:
24430 mode_idx = 1;
24431 break;
24432 case E_SImode:
24433 mode_idx = 2;
24434 break;
24435 case E_DImode:
24436 mode_idx = 3;
24437 break;
24438 case E_TImode:
24439 mode_idx = 4;
24440 break;
24441 default:
24442 gcc_unreachable ();
24445 switch (model)
24447 case MEMMODEL_RELAXED:
24448 model_idx = 0;
24449 break;
24450 case MEMMODEL_CONSUME:
24451 case MEMMODEL_ACQUIRE:
24452 model_idx = 1;
24453 break;
24454 case MEMMODEL_RELEASE:
24455 model_idx = 2;
24456 break;
24457 case MEMMODEL_ACQ_REL:
24458 case MEMMODEL_SEQ_CST:
24459 model_idx = 3;
24460 break;
24461 case MEMMODEL_SYNC_ACQUIRE:
24462 case MEMMODEL_SYNC_RELEASE:
24463 case MEMMODEL_SYNC_SEQ_CST:
24464 model_idx = 4;
24465 break;
24466 default:
24467 gcc_unreachable ();
24470 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
24471 VISIBILITY_HIDDEN);
24474 #define DEF0(B, N) \
24475 { "__aarch64_" #B #N "_relax", \
24476 "__aarch64_" #B #N "_acq", \
24477 "__aarch64_" #B #N "_rel", \
24478 "__aarch64_" #B #N "_acq_rel", \
24479 "__aarch64_" #B #N "_sync" }
24481 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
24482 { NULL, NULL, NULL, NULL }
24483 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
24485 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
24486 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
24487 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
24488 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
24489 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
24490 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
24492 #undef DEF0
24493 #undef DEF4
24494 #undef DEF5
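/* Illustrative note (added for exposition, not part of the original
   sources): DEF0 (cas, 4) expands to the five names
   "__aarch64_cas4_relax", "__aarch64_cas4_acq", "__aarch64_cas4_rel",
   "__aarch64_cas4_acq_rel" and "__aarch64_cas4_sync", so a 4-byte
   (SImode) compare-and-swap with an ACQUIRE memory model resolves to
   str[2][1], i.e. "__aarch64_cas4_acq", in aarch64_atomic_ool_func
   above.  */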
24496 /* Expand a compare and swap pattern. */
24498 void
24499 aarch64_expand_compare_and_swap (rtx operands[])
24501 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
24502 machine_mode mode, r_mode;
24504 bval = operands[0];
24505 rval = operands[1];
24506 mem = operands[2];
24507 oldval = operands[3];
24508 newval = operands[4];
24509 is_weak = operands[5];
24510 mod_s = operands[6];
24511 mod_f = operands[7];
24512 mode = GET_MODE (mem);
24514 /* Normally the succ memory model must be stronger than fail, but in the
24515 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
24516 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
24517 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
24518 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
24519 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
24521 r_mode = mode;
24522 if (mode == QImode || mode == HImode)
24524 r_mode = SImode;
24525 rval = gen_reg_rtx (r_mode);
24528 if (TARGET_LSE)
24530 /* The CAS insn requires oldval and rval to overlap, but we need to
24531 have a copy of oldval saved across the operation to tell if
24532 the operation is successful. */
24533 if (reg_overlap_mentioned_p (rval, oldval))
24534 rval = copy_to_mode_reg (r_mode, oldval);
24535 else
24536 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
24538 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
24539 newval, mod_s));
24540 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24542 else if (TARGET_OUTLINE_ATOMICS)
24544 /* Oldval must satisfy compare afterward. */
24545 if (!aarch64_plus_operand (oldval, mode))
24546 oldval = force_reg (mode, oldval);
24547 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
24548 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
24549 oldval, mode, newval, mode,
24550 XEXP (mem, 0), Pmode);
24551 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24553 else
24555 /* The oldval predicate varies by mode. Test it and force to reg. */
24556 insn_code code = code_for_aarch64_compare_and_swap (mode);
24557 if (!insn_data[code].operand[2].predicate (oldval, mode))
24558 oldval = force_reg (mode, oldval);
24560 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
24561 is_weak, mod_s, mod_f));
24562 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
24565 if (r_mode != mode)
24566 rval = gen_lowpart (mode, rval);
24567 emit_move_insn (operands[1], rval);
24569 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
24570 emit_insn (gen_rtx_SET (bval, x));
24573 /* Emit a barrier that is appropriate for memory model MODEL, at the end of a
24574 sequence implementing an atomic operation. */
24576 static void
24577 aarch64_emit_post_barrier (enum memmodel model)
24579 const enum memmodel base_model = memmodel_base (model);
24581 if (is_mm_sync (model)
24582 && (base_model == MEMMODEL_ACQUIRE
24583 || base_model == MEMMODEL_ACQ_REL
24584 || base_model == MEMMODEL_SEQ_CST))
24586 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
24590 /* Split a compare and swap pattern. */
24592 void
24593 aarch64_split_compare_and_swap (rtx operands[])
24595 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24596 gcc_assert (epilogue_completed);
24598 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
24599 machine_mode mode;
24600 bool is_weak;
24601 rtx_code_label *label1, *label2;
24602 enum memmodel model;
24604 rval = operands[0];
24605 mem = operands[1];
24606 oldval = operands[2];
24607 newval = operands[3];
24608 model_rtx = operands[5];
24609 scratch = operands[7];
24610 mode = GET_MODE (mem);
24611 model = memmodel_from_int (INTVAL (model_rtx));
24612 is_weak = operands[4] != const0_rtx && mode != TImode;
24614 /* When OLDVAL is zero and we want the strong version we can emit a tighter
24615 loop:
24616 .label1:
24617 LD[A]XR rval, [mem]
24618 CBNZ rval, .label2
24619 ST[L]XR scratch, newval, [mem]
24620 CBNZ scratch, .label1
24621 .label2:
24622 CMP rval, 0. */
24623 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
24624 oldval == const0_rtx && mode != TImode);
24626 label1 = NULL;
24627 if (!is_weak)
24629 label1 = gen_label_rtx ();
24630 emit_label (label1);
24632 label2 = gen_label_rtx ();
24634 /* The initial load can be relaxed for a __sync operation since a final
24635 barrier will be emitted to stop code hoisting. */
24636 if (is_mm_sync (model))
24637 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
24638 else
24639 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
24641 if (strong_zero_p)
24642 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
24643 else
24645 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
24646 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
24648 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24649 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
24650 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24652 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
24654 if (!is_weak)
24656 if (aarch64_track_speculation)
24658 /* Emit an explicit compare instruction, so that we can correctly
24659 track the condition codes. */
24660 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
24661 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
24663 else
24664 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
24666 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24667 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
24668 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24670 else
24671 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
24673 /* 128-bit LDAXP is not atomic unless STLXP succeeds. So for a mismatch,
24674 store the returned value and loop if the STLXP fails. */
24675 if (mode == TImode)
24677 rtx_code_label *label3 = gen_label_rtx ();
24678 emit_jump_insn (gen_rtx_SET (pc_rtx, gen_rtx_LABEL_REF (Pmode, label3)));
24679 emit_barrier ();
24681 emit_label (label2);
24682 aarch64_emit_store_exclusive (mode, scratch, mem, rval, model_rtx);
24684 if (aarch64_track_speculation)
24686 /* Emit an explicit compare instruction, so that we can correctly
24687 track the condition codes. */
24688 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
24689 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
24691 else
24692 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
24693 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24694 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
24695 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24697 label2 = label3;
24700 emit_label (label2);
24702 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
24703 to set the condition flags. If this is not used it will be removed by
24704 later passes. */
24705 if (strong_zero_p)
24706 aarch64_gen_compare_reg (NE, rval, const0_rtx);
24708 /* Emit any final barrier needed for a __sync operation. */
24709 if (is_mm_sync (model))
24710 aarch64_emit_post_barrier (model);
24713 /* Split an atomic operation. */
24715 void
24716 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
24717 rtx value, rtx model_rtx, rtx cond)
24719 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
24720 gcc_assert (epilogue_completed);
24722 machine_mode mode = GET_MODE (mem);
24723 machine_mode wmode = (mode == DImode ? DImode : SImode);
24724 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
24725 const bool is_sync = is_mm_sync (model);
24726 rtx_code_label *label;
24727 rtx x;
24729 /* Split the atomic operation into a sequence. */
24730 label = gen_label_rtx ();
24731 emit_label (label);
24733 if (new_out)
24734 new_out = gen_lowpart (wmode, new_out);
24735 if (old_out)
24736 old_out = gen_lowpart (wmode, old_out);
24737 else
24738 old_out = new_out;
24739 value = simplify_gen_subreg (wmode, value, mode, 0);
24741 /* The initial load can be relaxed for a __sync operation since a final
24742 barrier will be emitted to stop code hoisting. */
24743 if (is_sync)
24744 aarch64_emit_load_exclusive (mode, old_out, mem,
24745 GEN_INT (MEMMODEL_RELAXED));
24746 else
24747 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
24749 switch (code)
24751 case SET:
24752 new_out = value;
24753 break;
24755 case NOT:
24756 x = gen_rtx_AND (wmode, old_out, value);
24757 emit_insn (gen_rtx_SET (new_out, x));
24758 x = gen_rtx_NOT (wmode, new_out);
24759 emit_insn (gen_rtx_SET (new_out, x));
24760 break;
24762 case MINUS:
24763 if (CONST_INT_P (value))
24765 value = GEN_INT (-UINTVAL (value));
24766 code = PLUS;
24768 /* Fall through. */
24770 default:
24771 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
24772 emit_insn (gen_rtx_SET (new_out, x));
24773 break;
24776 aarch64_emit_store_exclusive (mode, cond, mem,
24777 gen_lowpart (mode, new_out), model_rtx);
24779 if (aarch64_track_speculation)
24781 /* Emit an explicit compare instruction, so that we can correctly
24782 track the condition codes. */
24783 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
24784 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
24786 else
24787 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
24789 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
24790 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
24791 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
24793 /* Emit any final barrier needed for a __sync operation. */
24794 if (is_sync)
24795 aarch64_emit_post_barrier (model);
24798 static void
24799 aarch64_init_libfuncs (void)
24801 /* Half-precision float operations. The compiler handles all operations
24802 with NULL libfuncs by converting to SFmode. */
24804 /* Conversions. */
24805 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
24806 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
24808 /* Arithmetic. */
24809 set_optab_libfunc (add_optab, HFmode, NULL);
24810 set_optab_libfunc (sdiv_optab, HFmode, NULL);
24811 set_optab_libfunc (smul_optab, HFmode, NULL);
24812 set_optab_libfunc (neg_optab, HFmode, NULL);
24813 set_optab_libfunc (sub_optab, HFmode, NULL);
24815 /* Comparisons. */
24816 set_optab_libfunc (eq_optab, HFmode, NULL);
24817 set_optab_libfunc (ne_optab, HFmode, NULL);
24818 set_optab_libfunc (lt_optab, HFmode, NULL);
24819 set_optab_libfunc (le_optab, HFmode, NULL);
24820 set_optab_libfunc (ge_optab, HFmode, NULL);
24821 set_optab_libfunc (gt_optab, HFmode, NULL);
24822 set_optab_libfunc (unord_optab, HFmode, NULL);
24825 /* Target hook for c_mode_for_suffix. */
24826 static machine_mode
24827 aarch64_c_mode_for_suffix (char suffix)
24829 if (suffix == 'q')
24830 return TFmode;
24832 return VOIDmode;
24835 /* We can only represent floating point constants which will fit in
24836 "quarter-precision" values. These values are characterised by
24837 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by
24840 (-1)^s * (n/16) * 2^r
24842 Where:
24843 's' is the sign bit.
24844 'n' is an integer in the range 16 <= n <= 31.
24845 'r' is an integer in the range -3 <= r <= 4. */
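/* Illustrative note (added for exposition, not part of the original
   sources): 1.5 = (24/16) * 2^0 and 0.25 = (16/16) * 2^-2 both fit the
   form above and so are valid FMOV immediates, whereas 100.0 would need
   n = 100 with r = 4, outside the 16..31 range for n, and is therefore
   rejected by aarch64_float_const_representable_p below.  */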
24847 /* Return true iff X can be represented by a quarter-precision
24848 floating point immediate operand. Note, we cannot represent 0.0. */
24849 bool
24850 aarch64_float_const_representable_p (rtx x)
24852 /* This represents our current view of how many bits
24853 make up the mantissa. */
24854 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
24855 int exponent;
24856 unsigned HOST_WIDE_INT mantissa, mask;
24857 REAL_VALUE_TYPE r, m;
24858 bool fail;
24860 x = unwrap_const_vec_duplicate (x);
24861 if (!CONST_DOUBLE_P (x))
24862 return false;
24864 if (GET_MODE (x) == VOIDmode
24865 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
24866 return false;
24868 r = *CONST_DOUBLE_REAL_VALUE (x);
24870 /* We cannot represent infinities, NaNs or +/-zero. We won't
24871 know if we have +zero until we analyse the mantissa, but we
24872 can reject the other invalid values. */
24873 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
24874 || REAL_VALUE_MINUS_ZERO (r))
24875 return false;
24877 /* For BFmode, only handle 0.0. */
24878 if (GET_MODE (x) == BFmode)
24879 return real_iszero (&r, false);
24881 /* Extract exponent. */
24882 r = real_value_abs (&r);
24883 exponent = REAL_EXP (&r);
24885 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
24886 highest (sign) bit, with a fixed binary point at bit point_pos.
24887 The low and high halves of the result hold the low and high parts of the mantissa.
24888 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
24889 bits for the mantissa, this can fail (low bits will be lost). */
24890 real_ldexp (&m, &r, point_pos - exponent);
24891 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
24893 /* If the low part of the mantissa has bits set we cannot represent
24894 the value. */
24895 if (w.ulow () != 0)
24896 return false;
24897 /* We have rejected the lower HOST_WIDE_INT, so update our
24898 understanding of how many bits lie in the mantissa and
24899 look only at the high HOST_WIDE_INT. */
24900 mantissa = w.elt (1);
24901 point_pos -= HOST_BITS_PER_WIDE_INT;
24903 /* We can only represent values with a mantissa of the form 1.xxxx. */
24904 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
24905 if ((mantissa & mask) != 0)
24906 return false;
24908 /* Having filtered unrepresentable values, we may now remove all
24909 but the highest 5 bits. */
24910 mantissa >>= point_pos - 5;
24912 /* We cannot represent the value 0.0, so reject it. This is handled
24913 elsewhere. */
24914 if (mantissa == 0)
24915 return false;
24917 /* Then, as bit 4 is always set, we can mask it off, leaving
24918 the mantissa in the range [0, 15]. */
24919 mantissa &= ~(1 << 4);
24920 gcc_assert (mantissa <= 15);
24922 /* GCC internally does not use IEEE754-like encoding (where normalized
24923 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.cc).
24924 Our mantissa values are shifted 4 places to the left relative to
24925 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
24926 by 5 places to correct for GCC's representation. */
24927 exponent = 5 - exponent;
24929 return (exponent >= 0 && exponent <= 7);
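/* As a worked illustration of the check above (an illustrative sketch, not
   part of the compiler): the representable immediates are exactly the
   2 * 16 * 8 = 256 values (-1)^s * (n/16) * 2^r, for example 1.0 (n=16, r=0),
   1.5 (n=24, r=0), 0.125 (n=16, r=-3) and 31.0 (n=31, r=4).  A standalone
   program that enumerates them, assuming nothing beyond the formula quoted
   above:

     #include <stdio.h>

     int
     main (void)
     {
       for (int s = 0; s <= 1; s++)
         for (int r = -3; r <= 4; r++)
           for (int n = 16; n <= 31; n++)
             {
               double val = (double) n / 16.0;
               val = r >= 0 ? val * (1 << r) : val / (1 << -r);
               printf ("%g\n", s ? -val : val);
             }
       return 0;
     }  */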
24932 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
24933 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
24934 output MOVI/MVNI, ORR or BIC immediate. */
24935 char*
24936 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
24937 enum simd_immediate_check which)
24939 bool is_valid;
24940 static char templ[40];
24941 const char *mnemonic;
24942 const char *shift_op;
24943 unsigned int lane_count = 0;
24944 char element_char;
24946 struct simd_immediate_info info;
24948 /* This will return true to show const_vector is legal for use as either
24949 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
24950 It will also update INFO to show how the immediate should be generated.
24951 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
24952 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
24953 gcc_assert (is_valid);
24955 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
24956 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
24958 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
24960 gcc_assert (info.insn == simd_immediate_info::MOV
24961 && info.u.mov.shift == 0);
24962 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
24963 move immediate path. */
24964 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
24965 info.u.mov.value = GEN_INT (0);
24966 else
24968 const unsigned int buf_size = 20;
24969 char float_buf[buf_size] = {'\0'};
24970 real_to_decimal_for_mode (float_buf,
24971 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
24972 buf_size, buf_size, 1, info.elt_mode);
24974 if (lane_count == 1)
24975 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
24976 else
24977 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
24978 lane_count, element_char, float_buf);
24979 return templ;
24983 gcc_assert (CONST_INT_P (info.u.mov.value));
24985 if (which == AARCH64_CHECK_MOV)
24987 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
24988 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
24989 ? "msl" : "lsl");
24990 if (lane_count == 1)
24991 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
24992 mnemonic, UINTVAL (info.u.mov.value));
24993 else if (info.u.mov.shift)
24994 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
24995 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
24996 element_char, UINTVAL (info.u.mov.value), shift_op,
24997 info.u.mov.shift);
24998 else
24999 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
25000 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
25001 element_char, UINTVAL (info.u.mov.value));
25003 else
25005 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
25006 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
25007 if (info.u.mov.shift)
25008 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25009 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
25010 element_char, UINTVAL (info.u.mov.value), "lsl",
25011 info.u.mov.shift);
25012 else
25013 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
25014 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
25015 element_char, UINTVAL (info.u.mov.value));
25017 return templ;
25020 char*
25021 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
25024 /* If a floating point number was passed and we desire to use it in an
25025 integer mode, do the conversion to integer. */
25026 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
25028 unsigned HOST_WIDE_INT ival;
25029 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
25030 gcc_unreachable ();
25031 immediate = gen_int_mode (ival, mode);
25034 machine_mode vmode;
25035 /* Use a 64-bit mode for everything except for DI/DF/DD mode, where we use
25036 a 128-bit vector mode. */
25037 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
25039 vmode = aarch64_simd_container_mode (mode, width);
25040 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
25041 return aarch64_output_simd_mov_immediate (v_op, width);
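/* aarch64_reinterpret_float_as_int above recovers the raw bit pattern of the
   floating-point constant so it can be handled as an integer immediate.
   Outside the compiler the same idea can be sketched (illustrative only,
   not GCC API) as:

     #include <stdint.h>
     #include <string.h>

     static uint64_t
     double_bits (double d)
     {
       uint64_t u;
       memcpy (&u, &d, sizeof u);  // reinterpret the bits, don't convert
       return u;
     }  */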
25044 /* Return the output string to use for moving immediate CONST_VECTOR
25045 into an SVE register. */
25047 char *
25048 aarch64_output_sve_mov_immediate (rtx const_vector)
25050 static char templ[40];
25051 struct simd_immediate_info info;
25052 char element_char;
25054 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
25055 gcc_assert (is_valid);
25057 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25059 machine_mode vec_mode = GET_MODE (const_vector);
25060 if (aarch64_sve_pred_mode_p (vec_mode))
25062 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
25063 if (info.insn == simd_immediate_info::MOV)
25065 gcc_assert (info.u.mov.value == const0_rtx);
25066 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
25068 else
25070 gcc_assert (info.insn == simd_immediate_info::PTRUE);
25071 unsigned int total_bytes;
25072 if (info.u.pattern == AARCH64_SV_ALL
25073 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
25074 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
25075 total_bytes / GET_MODE_SIZE (info.elt_mode));
25076 else
25077 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
25078 svpattern_token (info.u.pattern));
25080 return buf;
25083 if (info.insn == simd_immediate_info::INDEX)
25085 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
25086 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
25087 element_char, INTVAL (info.u.index.base),
25088 INTVAL (info.u.index.step));
25089 return templ;
25092 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
25094 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
25095 info.u.mov.value = GEN_INT (0);
25096 else
25098 const int buf_size = 20;
25099 char float_buf[buf_size] = {};
25100 real_to_decimal_for_mode (float_buf,
25101 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
25102 buf_size, buf_size, 1, info.elt_mode);
25104 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
25105 element_char, float_buf);
25106 return templ;
25110 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
25111 element_char, INTVAL (info.u.mov.value));
25112 return templ;
25115 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
25116 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
25117 pattern. */
25119 char *
25120 aarch64_output_sve_ptrues (rtx const_unspec)
25122 static char templ[40];
25124 struct simd_immediate_info info;
25125 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
25126 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
25128 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
25129 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
25130 svpattern_token (info.u.pattern));
25131 return templ;
25134 /* Split operands into moves from op[1] + op[2] into op[0]. */
25136 void
25137 aarch64_split_combinev16qi (rtx operands[3])
25139 unsigned int dest = REGNO (operands[0]);
25140 unsigned int src1 = REGNO (operands[1]);
25141 unsigned int src2 = REGNO (operands[2]);
25142 machine_mode halfmode = GET_MODE (operands[1]);
25143 unsigned int halfregs = REG_NREGS (operands[1]);
25144 rtx destlo, desthi;
25146 gcc_assert (halfmode == V16QImode);
25148 if (src1 == dest && src2 == dest + halfregs)
25150 /* No-op move. Can't split to nothing; emit something. */
25151 emit_note (NOTE_INSN_DELETED);
25152 return;
25155 /* Preserve register attributes for variable tracking. */
25156 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
25157 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
25158 GET_MODE_SIZE (halfmode));
25160 /* Special case of reversed high/low parts. */
25161 if (reg_overlap_mentioned_p (operands[2], destlo)
25162 && reg_overlap_mentioned_p (operands[1], desthi))
25164 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25165 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
25166 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
25168 else if (!reg_overlap_mentioned_p (operands[2], destlo))
25170 /* Try to avoid unnecessary moves if part of the result
25171 is in the right place already. */
25172 if (src1 != dest)
25173 emit_move_insn (destlo, operands[1]);
25174 if (src2 != dest + halfregs)
25175 emit_move_insn (desthi, operands[2]);
25177 else
25179 if (src2 != dest + halfregs)
25180 emit_move_insn (desthi, operands[2]);
25181 if (src1 != dest)
25182 emit_move_insn (destlo, operands[1]);
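/* The three XORs emitted for the reversed case above swap the two halves
   without needing a scratch register.  A scalar sketch of the identity,
   assuming the two operands are distinct (as they are here):

     static void
     xor_swap (unsigned *a, unsigned *b)
     {
       *a ^= *b;  // a = a0 ^ b0
       *b ^= *a;  // b = b0 ^ (a0 ^ b0) = a0
       *a ^= *b;  // a = (a0 ^ b0) ^ a0 = b0
     }  */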
25186 /* vec_perm support. */
25188 struct expand_vec_perm_d
25190 rtx target, op0, op1;
25191 vec_perm_indices perm;
25192 machine_mode vmode;
25193 machine_mode op_mode;
25194 unsigned int vec_flags;
25195 unsigned int op_vec_flags;
25196 bool one_vector_p;
25197 bool testing_p;
25200 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
25202 /* Generate a variable permutation. */
25204 static void
25205 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
25207 machine_mode vmode = GET_MODE (target);
25208 bool one_vector_p = rtx_equal_p (op0, op1);
25210 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
25211 gcc_checking_assert (GET_MODE (op0) == vmode);
25212 gcc_checking_assert (GET_MODE (op1) == vmode);
25213 gcc_checking_assert (GET_MODE (sel) == vmode);
25214 gcc_checking_assert (TARGET_SIMD);
25216 if (one_vector_p)
25218 if (vmode == V8QImode)
25220 /* Expand the argument to a V16QI mode by duplicating it. */
25221 rtx pair = gen_reg_rtx (V16QImode);
25222 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
25223 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25225 else
25227 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
25230 else
25232 rtx pair;
25234 if (vmode == V8QImode)
25236 pair = gen_reg_rtx (V16QImode);
25237 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
25238 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
25240 else
25242 pair = gen_reg_rtx (V2x16QImode);
25243 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
25244 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
25249 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
25250 NELT is the number of elements in the vector. */
25252 void
25253 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
25254 unsigned int nelt)
25256 machine_mode vmode = GET_MODE (target);
25257 bool one_vector_p = rtx_equal_p (op0, op1);
25258 rtx mask;
25260 /* The TBL instruction does not use a modulo index, so we must take care
25261 of that ourselves. */
25262 mask = aarch64_simd_gen_const_vector_dup (vmode,
25263 one_vector_p ? nelt - 1 : 2 * nelt - 1);
25264 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
25266 /* For big-endian, we also need to reverse the index within the vector
25267 (but not which vector). */
25268 if (BYTES_BIG_ENDIAN)
25270 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
25271 if (!one_vector_p)
25272 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
25273 sel = expand_simple_binop (vmode, XOR, sel, mask,
25274 NULL, 0, OPTAB_LIB_WIDEN);
25276 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
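/* A scalar model of the selector fixups above (illustrative only).  For two
   V16QI inputs (NELT = 16) a vec_perm index of 35 wraps to 35 & 31 = 3, and
   on big-endian the lane index within each input vector is additionally
   flipped, e.g. 3 ^ 15 = 12, while the bit selecting which vector is kept:

     static unsigned
     fixup_index (unsigned idx, unsigned nelt, bool one_vector_p,
                  bool big_endian)
     {
       idx &= one_vector_p ? nelt - 1 : 2 * nelt - 1;  // modulo wrap
       if (big_endian)
         idx ^= nelt - 1;  // reverse the index within each vector
       return idx;
     }  */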
25279 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
25281 static void
25282 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
25284 emit_insn (gen_rtx_SET (target,
25285 gen_rtx_UNSPEC (GET_MODE (target),
25286 gen_rtvec (2, op0, op1), code)));
25289 /* Expand an SVE vec_perm with the given operands. */
25291 void
25292 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
25294 machine_mode data_mode = GET_MODE (target);
25295 machine_mode sel_mode = GET_MODE (sel);
25296 /* Enforced by the pattern condition. */
25297 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
25299 /* Note: vec_perm indices are supposed to wrap when they go beyond the
25300 size of the two value vectors, i.e. the upper bits of the indices
25301 are effectively ignored. SVE TBL instead produces 0 for any
25302 out-of-range indices, so we need to modulo all the vec_perm indices
25303 to ensure they are all in range. */
25304 rtx sel_reg = force_reg (sel_mode, sel);
25306 /* Check if the sel only references the first values vector. */
25307 if (CONST_VECTOR_P (sel)
25308 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
25310 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
25311 return;
25314 /* Check if the two values vectors are the same. */
25315 if (rtx_equal_p (op0, op1))
25317 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
25318 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25319 NULL, 0, OPTAB_DIRECT);
25320 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
25321 return;
25324 /* Run TBL for each value vector and combine the results. */
25326 rtx res0 = gen_reg_rtx (data_mode);
25327 rtx res1 = gen_reg_rtx (data_mode);
25328 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
25329 if (!CONST_VECTOR_P (sel)
25330 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
25332 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
25333 2 * nunits - 1);
25334 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
25335 NULL, 0, OPTAB_DIRECT);
25337 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
25338 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
25339 NULL, 0, OPTAB_DIRECT);
25340 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
25341 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
25342 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
25343 else
25344 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
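/* A scalar model of the two-TBL combination above (illustrative only).  With
   SEL already reduced modulo 2 * NELT, SVE TBL returns 0 for out-of-range
   selector elements, so the lookup in OP0 and the lookup in OP1 (with the
   selector rebased by -NELT) give the selected element from one table and
   zero from the other, and an OR (or UNSPEC_IORF for FP modes) merges them:

     static unsigned char
     tbl2 (const unsigned char *op0, const unsigned char *op1,
           unsigned sel, unsigned nelt)
     {
       unsigned char a = sel < nelt ? op0[sel] : 0;  // TBL on op0
       unsigned char b = (sel >= nelt && sel < 2 * nelt
                          ? op1[sel - nelt] : 0);    // TBL on op1
       return a | b;
     }  */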
25347 /* Recognize patterns suitable for the TRN instructions. */
25348 static bool
25349 aarch64_evpc_trn (struct expand_vec_perm_d *d)
25351 HOST_WIDE_INT odd;
25352 poly_uint64 nelt = d->perm.length ();
25353 rtx out, in0, in1;
25354 machine_mode vmode = d->vmode;
25356 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25357 return false;
25359 /* Note that these are little-endian tests.
25360 We correct for big-endian later. */
25361 if (!d->perm[0].is_constant (&odd)
25362 || (odd != 0 && odd != 1)
25363 || !d->perm.series_p (0, 2, odd, 2)
25364 || !d->perm.series_p (1, 2, nelt + odd, 2))
25365 return false;
25367 /* Success! */
25368 if (d->testing_p)
25369 return true;
25371 in0 = d->op0;
25372 in1 = d->op1;
25373 /* We don't need a big-endian lane correction for SVE; see the comment
25374 at the head of aarch64-sve.md for details. */
25375 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25377 std::swap (in0, in1);
25378 odd = !odd;
25380 out = d->target;
25382 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25383 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
25384 return true;
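/* Concretely, for V4SI (NELT = 4) the permutations accepted above are
   { 0, 4, 2, 6 } (TRN1, odd == 0) and { 1, 5, 3, 7 } (TRN2, odd == 1).
   An equivalent check over a fully constant index array (illustrative
   sketch only):

     static bool
     is_trn (const int *perm, int nelt, int odd)
     {
       for (int i = 0; i < nelt; i += 2)
         if (perm[i] != i + odd || perm[i + 1] != nelt + i + odd)
           return false;
       return true;
     }  */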
25387 /* Try to re-encode the PERM constant so it combines odd and even elements.
25388 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
25389 We retry with this new constant with the full suite of patterns. */
25390 static bool
25391 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
25393 expand_vec_perm_d newd;
25394 unsigned HOST_WIDE_INT nelt;
25396 if (d->vec_flags != VEC_ADVSIMD)
25397 return false;
25399 /* Get the new mode. Always twice the size of the inner
25400 and half the elements. */
25401 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
25402 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
25403 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
25404 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
25406 if (new_mode == word_mode)
25407 return false;
25409 /* to_constant is safe since this routine is specific to Advanced SIMD
25410 vectors. */
25411 nelt = d->perm.length ().to_constant ();
25413 vec_perm_builder newpermconst;
25414 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
25416 /* Convert the perm constant if we can. Require even, odd as the pairs. */
25417 for (unsigned int i = 0; i < nelt; i += 2)
25419 poly_int64 elt0 = d->perm[i];
25420 poly_int64 elt1 = d->perm[i + 1];
25421 poly_int64 newelt;
25422 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
25423 return false;
25424 newpermconst.quick_push (newelt.to_constant ());
25426 newpermconst.finalize ();
25428 newd.vmode = new_mode;
25429 newd.vec_flags = VEC_ADVSIMD;
25430 newd.op_mode = newd.vmode;
25431 newd.op_vec_flags = newd.vec_flags;
25432 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
25433 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
25434 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
25435 newd.testing_p = d->testing_p;
25436 newd.one_vector_p = d->one_vector_p;
25438 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
25439 return aarch64_expand_vec_perm_const_1 (&newd);
25442 /* Recognize patterns suitable for the UZP instructions. */
25443 static bool
25444 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
25446 HOST_WIDE_INT odd;
25447 rtx out, in0, in1;
25448 machine_mode vmode = d->vmode;
25450 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25451 return false;
25453 /* Note that these are little-endian tests.
25454 We correct for big-endian later. */
25455 if (!d->perm[0].is_constant (&odd)
25456 || (odd != 0 && odd != 1)
25457 || !d->perm.series_p (0, 1, odd, 2))
25458 return false;
25460 /* Success! */
25461 if (d->testing_p)
25462 return true;
25464 in0 = d->op0;
25465 in1 = d->op1;
25466 /* We don't need a big-endian lane correction for SVE; see the comment
25467 at the head of aarch64-sve.md for details. */
25468 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25470 std::swap (in0, in1);
25471 odd = !odd;
25473 out = d->target;
25475 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25476 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
25477 return true;
25480 /* Recognize patterns suitable for the ZIP instructions. */
25481 static bool
25482 aarch64_evpc_zip (struct expand_vec_perm_d *d)
25484 unsigned int high;
25485 poly_uint64 nelt = d->perm.length ();
25486 rtx out, in0, in1;
25487 machine_mode vmode = d->vmode;
25489 if (GET_MODE_UNIT_SIZE (vmode) > 8)
25490 return false;
25492 /* Note that these are little-endian tests.
25493 We correct for big-endian later. */
25494 poly_uint64 first = d->perm[0];
25495 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
25496 || !d->perm.series_p (0, 2, first, 1)
25497 || !d->perm.series_p (1, 2, first + nelt, 1))
25498 return false;
25499 high = maybe_ne (first, 0U);
25501 /* Success! */
25502 if (d->testing_p)
25503 return true;
25505 in0 = d->op0;
25506 in1 = d->op1;
25507 /* We don't need a big-endian lane correction for SVE; see the comment
25508 at the head of aarch64-sve.md for details. */
25509 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
25511 std::swap (in0, in1);
25512 high = !high;
25514 out = d->target;
25516 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
25517 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
25518 return true;
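/* Concretely, for V4SI (NELT = 4) the UZP recognizer above accepts
   { 0, 2, 4, 6 } (UZP1) and { 1, 3, 5, 7 } (UZP2), and the ZIP recognizer
   accepts { 0, 4, 1, 5 } (ZIP1) and { 2, 6, 3, 7 } (ZIP2).  An equivalent
   check for the ZIP case over a fully constant index array (illustrative
   sketch only):

     static bool
     is_zip (const int *perm, int nelt, bool high)
     {
       int base = high ? nelt / 2 : 0;
       for (int i = 0; i < nelt / 2; i++)
         if (perm[2 * i] != base + i || perm[2 * i + 1] != nelt + base + i)
           return false;
       return true;
     }  */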
25521 /* Recognize patterns for the EXT insn. */
25523 static bool
25524 aarch64_evpc_ext (struct expand_vec_perm_d *d)
25526 HOST_WIDE_INT location;
25527 rtx offset;
25529 /* The first element always refers to the first vector.
25530 Check if the extracted indices are increasing by one. */
25531 if ((d->vec_flags & VEC_SVE_PRED)
25532 || !d->perm[0].is_constant (&location)
25533 || !d->perm.series_p (0, 1, location, 1))
25534 return false;
25536 /* Success! */
25537 if (d->testing_p)
25538 return true;
25540 /* The case where (location == 0) is a no-op for both big- and little-endian,
25541 and is removed by the mid-end at optimization levels -O1 and higher.
25543 We don't need a big-endian lane correction for SVE; see the comment
25544 at the head of aarch64-sve.md for details. */
25545 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
25547 /* After setup, we want the high elements of the first vector (stored
25548 at the LSB end of the register), and the low elements of the second
25549 vector (stored at the MSB end of the register). So swap. */
25550 std::swap (d->op0, d->op1);
25551 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
25552 to_constant () is safe since this is restricted to Advanced SIMD
25553 vectors. */
25554 location = d->perm.length ().to_constant () - location;
25557 offset = GEN_INT (location);
25558 emit_set_insn (d->target,
25559 gen_rtx_UNSPEC (d->vmode,
25560 gen_rtvec (3, d->op0, d->op1, offset),
25561 UNSPEC_EXT));
25562 return true;
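/* As a worked example of the EXT match above: for V4SI the permutation
   { 1, 2, 3, 4 } selects consecutive elements starting at LOCATION == 1
   (the last three elements of the first vector followed by the first
   element of the second), so it maps to a single EXT starting at element
   offset 1.  A sketch of the index test (illustrative only):

     static bool
     is_ext (const int *perm, int nelt, int *location)
     {
       for (int i = 1; i < nelt; i++)
         if (perm[i] != perm[0] + i)
           return false;
       *location = perm[0];
       return true;
     }  */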
25565 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
25566 within each 64-bit, 32-bit or 16-bit granule. */
25568 static bool
25569 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
25571 HOST_WIDE_INT diff;
25572 unsigned int i, size, unspec;
25573 machine_mode pred_mode;
25575 if ((d->vec_flags & VEC_SVE_PRED)
25576 || !d->one_vector_p
25577 || !d->perm[0].is_constant (&diff)
25578 || !diff)
25579 return false;
25581 if (d->vec_flags & VEC_SVE_DATA)
25582 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
25583 else
25584 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
25585 if (size == 64)
25587 unspec = UNSPEC_REV64;
25588 pred_mode = VNx2BImode;
25590 else if (size == 32)
25592 unspec = UNSPEC_REV32;
25593 pred_mode = VNx4BImode;
25595 else if (size == 16)
25597 unspec = UNSPEC_REV16;
25598 pred_mode = VNx8BImode;
25600 else
25601 return false;
25603 unsigned int step = diff + 1;
25604 for (i = 0; i < step; ++i)
25605 if (!d->perm.series_p (i, step, diff - i, step))
25606 return false;
25608 /* Success! */
25609 if (d->testing_p)
25610 return true;
25612 if (d->vec_flags & VEC_SVE_DATA)
25614 rtx pred = aarch64_ptrue_reg (pred_mode);
25615 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
25616 d->target, pred, d->op0));
25617 return true;
25619 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
25620 emit_set_insn (d->target, src);
25621 return true;
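/* As a worked example of the match above: for V16QI the permutation
   { 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 } has DIFF == 7,
   giving SIZE == 64, so it becomes REV64 reversing the bytes within each
   64-bit granule.  A sketch of the index pattern, with STEP = DIFF + 1
   elements per granule (illustrative only):

     static bool
     is_rev_local (const int *perm, int nelt, int diff)
     {
       int step = diff + 1;
       for (int i = 0; i < nelt; i++)
         if (perm[i] != (i - i % step) + (diff - i % step))
           return false;
       return true;
     }  */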
25624 /* Recognize patterns for the REV insn, which reverses elements within
25625 a full vector. */
25627 static bool
25628 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
25630 poly_uint64 nelt = d->perm.length ();
25632 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
25633 return false;
25635 if (!d->perm.series_p (0, 1, nelt - 1, -1))
25636 return false;
25638 /* Success! */
25639 if (d->testing_p)
25640 return true;
25642 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
25643 emit_set_insn (d->target, src);
25644 return true;
25647 static bool
25648 aarch64_evpc_dup (struct expand_vec_perm_d *d)
25650 rtx out = d->target;
25651 rtx in0;
25652 HOST_WIDE_INT elt;
25653 machine_mode vmode = d->vmode;
25654 rtx lane;
25656 if ((d->vec_flags & VEC_SVE_PRED)
25657 || d->perm.encoding ().encoded_nelts () != 1
25658 || !d->perm[0].is_constant (&elt))
25659 return false;
25661 if ((d->vec_flags & VEC_SVE_DATA)
25662 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
25663 return false;
25665 /* Success! */
25666 if (d->testing_p)
25667 return true;
25669 /* The generic preparation in aarch64_expand_vec_perm_const_1
25670 swaps the operand order and the permute indices if it finds
25671 d->perm[0] to be in the second operand. Thus, we can always
25672 use d->op0 and need not do any extra arithmetic to get the
25673 correct lane number. */
25674 in0 = d->op0;
25675 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
25677 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
25678 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
25679 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
25680 return true;
25683 static bool
25684 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
25686 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
25687 machine_mode vmode = d->vmode;
25689 /* Make sure that the indices are constant. */
25690 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
25691 for (unsigned int i = 0; i < encoded_nelts; ++i)
25692 if (!d->perm[i].is_constant ())
25693 return false;
25695 if (d->testing_p)
25696 return true;
25698 /* Generic code will try constant permutation twice: once with the
25699 original mode and again with the elements lowered to QImode.
25700 So wait and don't do the selector expansion ourselves. */
25701 if (vmode != V8QImode && vmode != V16QImode)
25702 return false;
25704 /* to_constant is safe since this routine is specific to Advanced SIMD
25705 vectors. */
25706 unsigned int nelt = d->perm.length ().to_constant ();
25707 for (unsigned int i = 0; i < nelt; ++i)
25708 /* If big-endian and using two vectors, we end up with a weird mixed-endian
25709 mode on NEON. Reverse the index within each word but not the word
25710 itself. to_constant is safe because we checked is_constant above. */
25711 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
25712 ? d->perm[i].to_constant () ^ (nelt - 1)
25713 : d->perm[i].to_constant ());
25715 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
25716 sel = force_reg (vmode, sel);
25718 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
25719 return true;
25722 /* Try to implement D using an SVE TBL instruction. */
25724 static bool
25725 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
25727 unsigned HOST_WIDE_INT nelt;
25729 /* Permuting two variable-length vectors could overflow the
25730 index range. */
25731 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
25732 return false;
25734 if (d->testing_p)
25735 return true;
25737 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
25738 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
25739 if (d->one_vector_p)
25740 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
25741 else
25742 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
25743 return true;
25746 /* Try to implement D using SVE dup instruction. */
25748 static bool
25749 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
25751 if (BYTES_BIG_ENDIAN
25752 || !d->one_vector_p
25753 || d->vec_flags != VEC_SVE_DATA
25754 || d->op_vec_flags != VEC_ADVSIMD
25755 || d->perm.encoding ().nelts_per_pattern () != 1
25756 || !known_eq (d->perm.encoding ().npatterns (),
25757 GET_MODE_NUNITS (d->op_mode))
25758 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
25759 return false;
25761 int npatterns = d->perm.encoding ().npatterns ();
25762 for (int i = 0; i < npatterns; i++)
25763 if (!known_eq (d->perm[i], i))
25764 return false;
25766 if (d->testing_p)
25767 return true;
25769 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
25770 return true;
25773 /* Try to implement D using SVE SEL instruction. */
25775 static bool
25776 aarch64_evpc_sel (struct expand_vec_perm_d *d)
25778 machine_mode vmode = d->vmode;
25779 int unit_size = GET_MODE_UNIT_SIZE (vmode);
25781 if (d->vec_flags != VEC_SVE_DATA
25782 || unit_size > 8)
25783 return false;
25785 int n_patterns = d->perm.encoding ().npatterns ();
25786 poly_int64 vec_len = d->perm.length ();
25788 for (int i = 0; i < n_patterns; ++i)
25789 if (!known_eq (d->perm[i], i)
25790 && !known_eq (d->perm[i], vec_len + i))
25791 return false;
25793 for (int i = n_patterns; i < n_patterns * 2; i++)
25794 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
25795 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
25796 return false;
25798 if (d->testing_p)
25799 return true;
25801 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
25803 /* Build a predicate that is true when op0 elements should be used. */
25804 rtx_vector_builder builder (pred_mode, n_patterns, 2);
25805 for (int i = 0; i < n_patterns * 2; i++)
25807 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
25808 : CONST0_RTX (BImode);
25809 builder.quick_push (elem);
25812 rtx const_vec = builder.build ();
25813 rtx pred = force_reg (pred_mode, const_vec);
25814 /* TARGET = PRED ? OP0 : OP1. */
25815 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
25816 return true;
25819 /* Recognize patterns suitable for the INS instructions. */
25820 static bool
25821 aarch64_evpc_ins (struct expand_vec_perm_d *d)
25823 machine_mode mode = d->vmode;
25824 unsigned HOST_WIDE_INT nelt;
25826 if (d->vec_flags != VEC_ADVSIMD)
25827 return false;
25829 /* to_constant is safe since this routine is specific to Advanced SIMD
25830 vectors. */
25831 nelt = d->perm.length ().to_constant ();
25832 rtx insv = d->op0;
25834 HOST_WIDE_INT idx = -1;
25836 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
25838 HOST_WIDE_INT elt;
25839 if (!d->perm[i].is_constant (&elt))
25840 return false;
25841 if (elt == (HOST_WIDE_INT) i)
25842 continue;
25843 if (idx != -1)
25845 idx = -1;
25846 break;
25848 idx = i;
25851 if (idx == -1)
25853 insv = d->op1;
25854 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
25856 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
25857 continue;
25858 if (idx != -1)
25859 return false;
25860 idx = i;
25863 if (idx == -1)
25864 return false;
25867 if (d->testing_p)
25868 return true;
25870 gcc_assert (idx != -1);
25872 unsigned extractindex = d->perm[idx].to_constant ();
25873 rtx extractv = d->op0;
25874 if (extractindex >= nelt)
25876 extractv = d->op1;
25877 extractindex -= nelt;
25879 gcc_assert (extractindex < nelt);
25881 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
25882 expand_operand ops[5];
25883 create_output_operand (&ops[0], d->target, mode);
25884 create_input_operand (&ops[1], insv, mode);
25885 create_integer_operand (&ops[2], 1 << idx);
25886 create_input_operand (&ops[3], extractv, mode);
25887 create_integer_operand (&ops[4], extractindex);
25888 expand_insn (icode, 5, ops);
25890 return true;
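/* As a worked example of the INS match above: for V4SI the permutation
   { 0, 1, 6, 3 } differs from the identity only in lane 2, where it selects
   element 6, i.e. lane 2 of the second operand, so it becomes a single INS
   of that lane into lane 2 of a copy of the first operand.  (The code above
   also handles the symmetric case where all but one lane come from the
   second operand.)  A sketch of the "exactly one mismatch" test
   (illustrative only):

     static int
     single_non_identity_lane (const int *perm, int nelt)
     {
       int idx = -1;
       for (int i = 0; i < nelt; i++)
         if (perm[i] != i)
           {
             if (idx != -1)
               return -1;  // more than one lane differs
             idx = i;
           }
       return idx;
     }  */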
25893 static bool
25894 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
25896 gcc_assert (d->op_mode != E_VOIDmode);
25898 /* The pattern matching functions above are written to look for a small
25899 number to begin the sequence (0, 1, N/2). If we begin with an index
25900 from the second operand, we can swap the operands. */
25901 poly_int64 nelt = d->perm.length ();
25902 if (known_ge (d->perm[0], nelt))
25904 d->perm.rotate_inputs (1);
25905 std::swap (d->op0, d->op1);
25908 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
25909 || d->vec_flags == VEC_SVE_DATA
25910 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
25911 || d->vec_flags == VEC_SVE_PRED)
25912 && known_gt (nelt, 1))
25914 if (d->vmode == d->op_mode)
25916 if (aarch64_evpc_rev_local (d))
25917 return true;
25918 else if (aarch64_evpc_rev_global (d))
25919 return true;
25920 else if (aarch64_evpc_ext (d))
25921 return true;
25922 else if (aarch64_evpc_dup (d))
25923 return true;
25924 else if (aarch64_evpc_zip (d))
25925 return true;
25926 else if (aarch64_evpc_uzp (d))
25927 return true;
25928 else if (aarch64_evpc_trn (d))
25929 return true;
25930 else if (aarch64_evpc_sel (d))
25931 return true;
25932 else if (aarch64_evpc_ins (d))
25933 return true;
25934 else if (aarch64_evpc_reencode (d))
25935 return true;
25937 if (d->vec_flags == VEC_SVE_DATA)
25938 return aarch64_evpc_sve_tbl (d);
25939 else if (d->vec_flags == VEC_ADVSIMD)
25940 return aarch64_evpc_tbl (d);
25942 else
25944 if (aarch64_evpc_sve_dup (d))
25945 return true;
25948 return false;
25951 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
25953 static bool
25954 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
25955 rtx target, rtx op0, rtx op1,
25956 const vec_perm_indices &sel)
25958 struct expand_vec_perm_d d;
25960 /* Check whether the mask can be applied to a single vector. */
25961 if (sel.ninputs () == 1
25962 || (op0 && rtx_equal_p (op0, op1)))
25963 d.one_vector_p = true;
25964 else if (sel.all_from_input_p (0))
25966 d.one_vector_p = true;
25967 op1 = op0;
25969 else if (sel.all_from_input_p (1))
25971 d.one_vector_p = true;
25972 op0 = op1;
25974 else
25975 d.one_vector_p = false;
25977 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
25978 sel.nelts_per_input ());
25979 d.vmode = vmode;
25980 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
25981 d.op_mode = op_mode;
25982 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
25983 d.target = target;
25984 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
25985 if (op0 == op1)
25986 d.op1 = d.op0;
25987 else
25988 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
25989 d.testing_p = !target;
25991 if (!d.testing_p)
25992 return aarch64_expand_vec_perm_const_1 (&d);
25994 rtx_insn *last = get_last_insn ();
25995 bool ret = aarch64_expand_vec_perm_const_1 (&d);
25996 gcc_assert (last == get_last_insn ());
25998 return ret;
26000 /* Generate a byte permute mask for a register of mode MODE,
26001 which has NUNITS units. */
26004 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
26006 /* We have to reverse each vector because we don't have
26007 a permuted load that can reverse-load according to ABI rules. */
26008 rtx mask;
26009 rtvec v = rtvec_alloc (16);
26010 unsigned int i, j;
26011 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
26013 gcc_assert (BYTES_BIG_ENDIAN);
26014 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
26016 for (i = 0; i < nunits; i++)
26017 for (j = 0; j < usize; j++)
26018 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
26019 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
26020 return force_reg (V16QImode, mask);
26023 /* Expand an SVE integer comparison using the SVE equivalent of:
26025 (set TARGET (CODE OP0 OP1)). */
26027 void
26028 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
26030 machine_mode pred_mode = GET_MODE (target);
26031 machine_mode data_mode = GET_MODE (op0);
26032 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
26033 op0, op1);
26034 if (!rtx_equal_p (target, res))
26035 emit_move_insn (target, res);
26038 /* Return the UNSPEC_COND_* code for comparison CODE. */
26040 static unsigned int
26041 aarch64_unspec_cond_code (rtx_code code)
26043 switch (code)
26045 case NE:
26046 return UNSPEC_COND_FCMNE;
26047 case EQ:
26048 return UNSPEC_COND_FCMEQ;
26049 case LT:
26050 return UNSPEC_COND_FCMLT;
26051 case GT:
26052 return UNSPEC_COND_FCMGT;
26053 case LE:
26054 return UNSPEC_COND_FCMLE;
26055 case GE:
26056 return UNSPEC_COND_FCMGE;
26057 case UNORDERED:
26058 return UNSPEC_COND_FCMUO;
26059 default:
26060 gcc_unreachable ();
26064 /* Emit:
26066 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26068 where <X> is the operation associated with comparison CODE.
26069 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26071 static void
26072 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
26073 bool known_ptrue_p, rtx op0, rtx op1)
26075 rtx flag = gen_int_mode (known_ptrue_p, SImode);
26076 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
26077 gen_rtvec (4, pred, flag, op0, op1),
26078 aarch64_unspec_cond_code (code));
26079 emit_set_insn (target, unspec);
26082 /* Emit the SVE equivalent of:
26084 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
26085 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
26086 (set TARGET (ior:PRED_MODE TMP1 TMP2))
26088 where <Xi> is the operation associated with comparison CODEi.
26089 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26091 static void
26092 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
26093 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
26095 machine_mode pred_mode = GET_MODE (pred);
26096 rtx tmp1 = gen_reg_rtx (pred_mode);
26097 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
26098 rtx tmp2 = gen_reg_rtx (pred_mode);
26099 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
26100 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
26103 /* Emit the SVE equivalent of:
26105 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
26106 (set TARGET (not TMP))
26108 where <X> is the operation associated with comparison CODE.
26109 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
26111 static void
26112 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
26113 bool known_ptrue_p, rtx op0, rtx op1)
26115 machine_mode pred_mode = GET_MODE (pred);
26116 rtx tmp = gen_reg_rtx (pred_mode);
26117 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
26118 aarch64_emit_unop (target, one_cmpl_optab, tmp);
26121 /* Expand an SVE floating-point comparison using the SVE equivalent of:
26123 (set TARGET (CODE OP0 OP1))
26125 If CAN_INVERT_P is true, the caller can also handle inverted results;
26126 return true if the result is in fact inverted. */
26128 bool
26129 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
26130 rtx op0, rtx op1, bool can_invert_p)
26132 machine_mode pred_mode = GET_MODE (target);
26133 machine_mode data_mode = GET_MODE (op0);
26135 rtx ptrue = aarch64_ptrue_reg (pred_mode);
26136 switch (code)
26138 case UNORDERED:
26139 /* UNORDERED has no immediate form. */
26140 op1 = force_reg (data_mode, op1);
26141 /* fall through */
26142 case LT:
26143 case LE:
26144 case GT:
26145 case GE:
26146 case EQ:
26147 case NE:
26149 /* There is native support for the comparison. */
26150 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26151 return false;
26154 case LTGT:
26155 /* This is a trapping operation (LT or GT). */
26156 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
26157 return false;
26159 case UNEQ:
26160 if (!flag_trapping_math)
26162 /* This would trap for signaling NaNs. */
26163 op1 = force_reg (data_mode, op1);
26164 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
26165 ptrue, true, op0, op1);
26166 return false;
26168 /* fall through */
26169 case UNLT:
26170 case UNLE:
26171 case UNGT:
26172 case UNGE:
26173 if (flag_trapping_math)
26175 /* Work out which elements are ordered. */
26176 rtx ordered = gen_reg_rtx (pred_mode);
26177 op1 = force_reg (data_mode, op1);
26178 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
26179 ptrue, true, op0, op1);
26181 /* Test the opposite condition for the ordered elements,
26182 then invert the result. */
26183 if (code == UNEQ)
26184 code = NE;
26185 else
26186 code = reverse_condition_maybe_unordered (code);
26187 if (can_invert_p)
26189 aarch64_emit_sve_fp_cond (target, code,
26190 ordered, false, op0, op1);
26191 return true;
26193 aarch64_emit_sve_invert_fp_cond (target, code,
26194 ordered, false, op0, op1);
26195 return false;
26197 break;
26199 case ORDERED:
26200 /* ORDERED has no immediate form. */
26201 op1 = force_reg (data_mode, op1);
26202 break;
26204 default:
26205 gcc_unreachable ();
26208 /* There is native support for the inverse comparison. */
26209 code = reverse_condition_maybe_unordered (code);
26210 if (can_invert_p)
26212 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
26213 return true;
26215 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
26216 return false;
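/* As a worked example of the trapping-math path above: UNGE (x, y) means
   "unordered or x >= y".  The expansion first computes an ORDERED predicate,
   then tests the reversed condition (LT for UNGE) on the ordered elements
   only and takes the inverse, using UNGE (x, y) == !(ordered && x < y).
   A scalar model (illustrative only):

     #include <cmath>

     static bool
     unge (double x, double y)
     {
       bool ordered = !(std::isnan (x) || std::isnan (y));
       return !(ordered && x < y);
     }  */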
26219 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
26220 of the data being selected and CMP_MODE is the mode of the values being
26221 compared. */
26223 void
26224 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
26225 rtx *ops)
26227 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
26228 rtx pred = gen_reg_rtx (pred_mode);
26229 if (FLOAT_MODE_P (cmp_mode))
26231 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
26232 ops[4], ops[5], true))
26233 std::swap (ops[1], ops[2]);
26235 else
26236 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
26238 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
26239 ops[1] = force_reg (data_mode, ops[1]);
26240 /* The "false" value can only be zero if the "true" value is a constant. */
26241 if (register_operand (ops[1], data_mode)
26242 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
26243 ops[2] = force_reg (data_mode, ops[2]);
26245 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
26246 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
26249 /* Return true if:
26251 (a) MODE1 and MODE2 use the same layout for bytes that are common
26252 to both modes;
26254 (b) subregs involving the two modes behave as the target-independent
26255 subreg rules require; and
26257 (c) there is at least one register that can hold both modes.
26259 Return false otherwise. */
26261 static bool
26262 aarch64_modes_compatible_p (machine_mode mode1, machine_mode mode2)
26264 unsigned int flags1 = aarch64_classify_vector_mode (mode1);
26265 unsigned int flags2 = aarch64_classify_vector_mode (mode2);
26267 bool sve1_p = (flags1 & VEC_ANY_SVE);
26268 bool sve2_p = (flags2 & VEC_ANY_SVE);
26270 bool partial_sve1_p = sve1_p && (flags1 & VEC_PARTIAL);
26271 bool partial_sve2_p = sve2_p && (flags2 & VEC_PARTIAL);
26273 bool pred1_p = (flags1 & VEC_SVE_PRED);
26274 bool pred2_p = (flags2 & VEC_SVE_PRED);
26276 bool partial_advsimd_struct1_p = (flags1 == (VEC_ADVSIMD | VEC_STRUCT
26277 | VEC_PARTIAL));
26278 bool partial_advsimd_struct2_p = (flags2 == (VEC_ADVSIMD | VEC_STRUCT
26279 | VEC_PARTIAL));
26281 /* Don't allow changes between predicate modes and other modes.
26282 Only predicate registers can hold predicate modes and only
26283 non-predicate registers can hold non-predicate modes, so any
26284 attempt to mix them would require a round trip through memory. */
26285 if (pred1_p != pred2_p)
26286 return false;
26288 /* The contents of partial SVE modes are distributed evenly across
26289 the register, whereas GCC expects them to be clustered together.
26290 We therefore need to be careful about mode changes involving them. */
26291 if (partial_sve1_p && partial_sve2_p)
26293 /* Reject changes between partial SVE modes that have different
26294 patterns of significant and insignificant bits. */
26295 if ((aarch64_sve_container_bits (mode1)
26296 != aarch64_sve_container_bits (mode2))
26297 || GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26298 return false;
26300 else if (partial_sve1_p)
26302 /* The first lane of MODE1 is where GCC expects it, but anything
26303 bigger than that is not. */
26304 if (maybe_gt (GET_MODE_SIZE (mode2), GET_MODE_UNIT_SIZE (mode1)))
26305 return false;
26307 else if (partial_sve2_p)
26309 /* Similarly in reverse. */
26310 if (maybe_gt (GET_MODE_SIZE (mode1), GET_MODE_UNIT_SIZE (mode2)))
26311 return false;
26314 /* Don't allow changes between partial Advanced SIMD structure modes
26315 and other modes that are bigger than 8 bytes. E.g. V16QI and V2x8QI
26316 are the same size, but the former occupies one Q register while the
26317 latter occupies two D registers. */
26318 if (partial_advsimd_struct1_p != partial_advsimd_struct2_p
26319 && maybe_gt (GET_MODE_SIZE (mode1), 8)
26320 && maybe_gt (GET_MODE_SIZE (mode2), 8))
26321 return false;
26323 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26325 /* Don't allow changes between SVE modes and other modes that might
26326 be bigger than 128 bits. In particular, OImode, CImode and XImode
26327 divide into 128-bit quantities while SVE modes divide into
26328 BITS_PER_SVE_VECTOR quantities. */
26329 if (sve1_p && !sve2_p && maybe_gt (GET_MODE_BITSIZE (mode2), 128))
26330 return false;
26331 if (sve2_p && !sve1_p && maybe_gt (GET_MODE_BITSIZE (mode1), 128))
26332 return false;
26335 if (BYTES_BIG_ENDIAN)
26337 /* Don't allow changes between SVE data modes and non-SVE modes.
26338 See the comment at the head of aarch64-sve.md for details. */
26339 if (sve1_p != sve2_p)
26340 return false;
26342 /* Don't allow changes in element size: lane 0 of the new vector
26343 would not then be lane 0 of the old vector. See the comment
26344 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26345 description.
26347 In the worst case, this forces a register to be spilled in
26348 one mode and reloaded in the other, which handles the
26349 endianness correctly. */
26350 if (sve1_p && GET_MODE_UNIT_SIZE (mode1) != GET_MODE_UNIT_SIZE (mode2))
26351 return false;
26353 return true;
26356 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always defer
26357 to aarch64_modes_compatible_p. However due to issues with register
26358 allocation it is preferable to avoid tying integer scalar and FP
26359 scalar modes. Executing integer operations in general registers is
26360 better than treating them as scalar vector operations. This reduces
26361 latency and avoids redundant int<->FP moves. So tie modes if they
26362 are in the same class, or if one of them is a vector mode. */
26364 static bool
26365 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
26367 if (aarch64_modes_compatible_p (mode1, mode2))
26369 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
26370 return true;
26371 if (VECTOR_MODE_P (mode1) || VECTOR_MODE_P (mode2))
26372 return true;
26374 return false;
26377 /* Return a new RTX holding the result of moving POINTER forward by
26378 AMOUNT bytes. */
26380 static rtx
26381 aarch64_move_pointer (rtx pointer, poly_int64 amount)
26383 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
26385 return adjust_automodify_address (pointer, GET_MODE (pointer),
26386 next, amount);
26389 /* Return a new RTX holding the result of moving POINTER forward by the
26390 size of the mode it points to. */
26392 static rtx
26393 aarch64_progress_pointer (rtx pointer)
26395 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
26398 typedef auto_vec<std::pair<rtx, rtx>, 12> copy_ops;
26400 /* Copy one block of size MODE from SRC to DST at offset OFFSET. */
26401 static void
26402 aarch64_copy_one_block (copy_ops &ops, rtx src, rtx dst,
26403 int offset, machine_mode mode)
26405 /* Emit explicit load/store pair instructions for 32-byte copies. */
26406 if (known_eq (GET_MODE_SIZE (mode), 32))
26408 mode = V4SImode;
26409 rtx src1 = adjust_address (src, mode, offset);
26410 rtx dst1 = adjust_address (dst, mode, offset);
26411 rtx reg1 = gen_reg_rtx (mode);
26412 rtx reg2 = gen_reg_rtx (mode);
26413 rtx load = aarch64_gen_load_pair (reg1, reg2, src1);
26414 rtx store = aarch64_gen_store_pair (dst1, reg1, reg2);
26415 ops.safe_push ({ load, store });
26416 return;
26419 rtx reg = gen_reg_rtx (mode);
26420 rtx load = gen_move_insn (reg, adjust_address (src, mode, offset));
26421 rtx store = gen_move_insn (adjust_address (dst, mode, offset), reg);
26422 ops.safe_push ({ load, store });
26425 /* Expand a cpymem/movmem using the MOPS extension. OPERANDS are taken
26426 from the cpymem/movmem pattern. IS_MEMMOVE is true if this is a memmove
26427 rather than memcpy. Return true iff we succeeded. */
26428 bool
26429 aarch64_expand_cpymem_mops (rtx *operands, bool is_memmove)
26431 if (!TARGET_MOPS)
26432 return false;
26434 /* All three registers are changed by the instruction, so each one
26435 must be a fresh pseudo. */
26436 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26437 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
26438 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26439 rtx src_mem = replace_equiv_address (operands[1], src_addr);
26440 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
26441 if (is_memmove)
26442 emit_insn (gen_aarch64_movmemdi (dst_mem, src_mem, sz_reg));
26443 else
26444 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
26445 return true;
26448 /* Expand cpymem/movmem, as if from a __builtin_memcpy/memmove.
26449 OPERANDS are taken from the cpymem/movmem pattern. IS_MEMMOVE is true
26450 if this is a memmove rather than memcpy. Return true if we succeed,
26451 otherwise return false, indicating that a libcall should be emitted. */
26452 bool
26453 aarch64_expand_cpymem (rtx *operands, bool is_memmove)
26455 int mode_bytes;
26456 rtx dst = operands[0];
26457 rtx src = operands[1];
26458 unsigned align = UINTVAL (operands[3]);
26459 rtx base;
26460 machine_mode cur_mode = BLKmode, next_mode;
26462 /* Variable-sized or strict-align copies may use the MOPS expansion. */
26463 if (!CONST_INT_P (operands[2]) || (STRICT_ALIGNMENT && align < 16))
26464 return aarch64_expand_cpymem_mops (operands, is_memmove);
26466 unsigned HOST_WIDE_INT size = UINTVAL (operands[2]);
26467 bool use_ldpq = TARGET_SIMD && !(aarch64_tune_params.extra_tuning_flags
26468 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS);
26470 /* Set inline limits for memmove/memcpy. MOPS has a separate threshold. */
26471 unsigned max_copy_size = use_ldpq ? 256 : 128;
26472 unsigned mops_threshold = is_memmove ? aarch64_mops_memmove_size_threshold
26473 : aarch64_mops_memcpy_size_threshold;
26475 /* Reduce the maximum size with -Os. */
26476 if (optimize_function_for_size_p (cfun))
26477 max_copy_size /= 4;
26479 /* Large copies use MOPS when available, otherwise a library call. */
26480 if (size > max_copy_size || (TARGET_MOPS && size > mops_threshold))
26481 return aarch64_expand_cpymem_mops (operands, is_memmove);
26483 unsigned copy_max = 32;
26485 /* Default to 32-byte LDP/STP on large copies; however, small copies, lack of
26486 SIMD support, or slow LDP/STP fall back to 16-byte chunks.
26488 ??? Although it would be possible to use LDP/STP Qn in streaming mode
26489 (so using TARGET_BASE_SIMD instead of TARGET_SIMD), it isn't clear
26490 whether that would improve performance. */
26491 if (size <= 24 || !use_ldpq)
26492 copy_max = 16;
26494 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26495 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26497 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
26498 src = adjust_automodify_address (src, VOIDmode, base, 0);
26500 copy_ops ops;
26501 int offset = 0;
26503 while (size > 0)
26505 /* Find the largest mode in which to do the copy without over-reading
26506 or over-writing. */
26507 opt_scalar_int_mode mode_iter;
26508 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26509 if (GET_MODE_SIZE (mode_iter.require ()) <= MIN (size, copy_max))
26510 cur_mode = mode_iter.require ();
26512 gcc_assert (cur_mode != BLKmode);
26514 mode_bytes = GET_MODE_SIZE (cur_mode).to_constant ();
26516 /* Prefer Q-register accesses for the last bytes. */
26517 if (mode_bytes == 16 && copy_max == 32)
26518 cur_mode = V4SImode;
26519 aarch64_copy_one_block (ops, src, dst, offset, cur_mode);
26520 size -= mode_bytes;
26521 offset += mode_bytes;
26523 /* Emit trailing copies using overlapping unaligned accesses
26524 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26525 if (size > 0 && size < copy_max / 2 && !STRICT_ALIGNMENT)
26527 next_mode = smallest_mode_for_size (size * BITS_PER_UNIT, MODE_INT);
26528 int n_bytes = GET_MODE_SIZE (next_mode).to_constant ();
26529 gcc_assert (n_bytes <= mode_bytes);
26530 offset -= n_bytes - size;
26531 size = n_bytes;
26535 /* Memcpy interleaves loads with stores; memmove emits all loads first. */
26536 int nops = ops.length();
26537 int inc = is_memmove ? nops : nops == 4 ? 2 : 3;
26539 for (int i = 0; i < nops; i += inc)
26541 int m = MIN (nops, i + inc);
26542 /* Emit loads. */
26543 for (int j = i; j < m; j++)
26544 emit_insn (ops[j].first);
26545 /* Emit stores. */
26546 for (int j = i; j < m; j++)
26547 emit_insn (ops[j].second);
26549 return true;
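/* As a worked example of the trailing-copy handling above: a 21-byte copy
   with COPY_MAX == 16 first copies one 16-byte (TImode) block, leaving 5
   bytes.  Since 5 < COPY_MAX / 2 and !STRICT_ALIGNMENT, the remainder is
   widened to the next power-of-two size (8 bytes, DImode) and the offset is
   pulled back by 3, so the copy finishes with an 8-byte access at offset 13
   that overlaps 3 already-copied bytes instead of issuing several narrow
   accesses.  A sketch of the adjustment (illustrative only):

     static void
     tail_adjust (int *offset, int *size)
     {
       int mode_bytes = 1;
       while (mode_bytes < *size)  // smallest power-of-two size covering it
         mode_bytes *= 2;
       *offset -= mode_bytes - *size;
       *size = mode_bytes;
     }  */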
26552 /* Set one block of mode MODE at *DST and advance *DST, where SRC is a
26553 register we have created with the duplicated value to be set. */
26554 static void
26555 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
26556 machine_mode mode)
26558 /* If we are copying 128 bits or 256 bits, we can do that straight from
26559 the SIMD register we prepared. */
26560 if (known_eq (GET_MODE_BITSIZE (mode), 256))
26562 mode = GET_MODE (src);
26563 /* "Cast" the *dst to the correct mode. */
26564 *dst = adjust_address (*dst, mode, 0);
26565 /* Emit the memset. */
26566 emit_insn (aarch64_gen_store_pair (*dst, src, src));
26568 /* Move the pointers forward. */
26569 *dst = aarch64_move_pointer (*dst, 32);
26570 return;
26572 if (known_eq (GET_MODE_BITSIZE (mode), 128))
26574 /* "Cast" the *dst to the correct mode. */
26575 *dst = adjust_address (*dst, GET_MODE (src), 0);
26576 /* Emit the memset. */
26577 emit_move_insn (*dst, src);
26578 /* Move the pointers forward. */
26579 *dst = aarch64_move_pointer (*dst, 16);
26580 return;
26582 /* For copying less, we have to extract the right amount from src. */
26583 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
26585 /* "Cast" the *dst to the correct mode. */
26586 *dst = adjust_address (*dst, mode, 0);
26587 /* Emit the memset. */
26588 emit_move_insn (*dst, reg);
26589 /* Move the pointer forward. */
26590 *dst = aarch64_progress_pointer (*dst);
26593 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
26594 as for the setmem pattern. Return true iff we succeed. */
26595 static bool
26596 aarch64_expand_setmem_mops (rtx *operands)
26598 if (!TARGET_MOPS)
26599 return false;
26601 /* The first two registers are changed by the instruction, so both
26602 of them must be a fresh pseudo. */
26603 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
26604 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
26605 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
26606 rtx val = operands[2];
26607 if (val != CONST0_RTX (QImode))
26608 val = force_reg (QImode, val);
26609 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
26610 return true;
26613 /* Expand setmem, as if from a __builtin_memset. Return true if
26614 we succeed, otherwise return false. */
26616 bool
26617 aarch64_expand_setmem (rtx *operands)
26619 int n, mode_bits;
26620 unsigned HOST_WIDE_INT len;
26621 rtx dst = operands[0];
26622 rtx val = operands[2], src;
26623 unsigned align = UINTVAL (operands[3]);
26624 rtx base;
26625 machine_mode cur_mode = BLKmode, next_mode;
26627 /* Variable-sized or strict-align memset may use the MOPS expansion. */
26628 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD
26629 || (STRICT_ALIGNMENT && align < 16))
26630 return aarch64_expand_setmem_mops (operands);
26632 bool size_p = optimize_function_for_size_p (cfun);
26634 /* Default the maximum to 256 bytes when considering only a libcall vs
26635 the SIMD broadcast sequence. */
26636 unsigned max_set_size = 256;
26637 unsigned mops_threshold = aarch64_mops_memset_size_threshold;
26639 len = UINTVAL (operands[1]);
26641 /* Large memset uses MOPS when available, otherwise a library call. */
26642 if (len > max_set_size || (TARGET_MOPS && len > mops_threshold))
26643 return aarch64_expand_setmem_mops (operands);
26645 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
26646 /* The MOPS sequence takes:
26647 3 instructions for the memory storing
26648 + 1 to move the constant size into a reg
26649 + 1 if VAL is a non-zero constant to move into a reg
26650 (zero constants can use XZR directly). */
26651 unsigned mops_cost = 3 + 1 + cst_val;
26652 /* A libcall to memset in the worst case takes 3 instructions to prepare
26653 the arguments + 1 for the call. */
26654 unsigned libcall_cost = 4;
26656 /* Attempt a sequence with a vector broadcast followed by stores.
26657 Count the number of operations involved to see if it's worth it
26658 against the alternatives. A simple counter simd_ops on the
26659 algorithmically-relevant operations is used rather than an rtx_insn count
26660 as all the pointer adjustments and mode reinterprets will be optimized
26661 away later. */
26662 start_sequence ();
26663 unsigned simd_ops = 0;
26665 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
26666 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
26668 /* Prepare the val using a DUP/MOVI v0.16B, val. */
26669 src = expand_vector_broadcast (V16QImode, val);
26670 src = force_reg (V16QImode, src);
26671 simd_ops++;
26672 /* Convert len to bits to make the rest of the code simpler. */
26673 n = len * BITS_PER_UNIT;
26675 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
26676 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
26677 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
26678 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
26679 ? GET_MODE_BITSIZE (TImode) : 256;
26681 while (n > 0)
26683 /* Find the largest mode in which to do the copy without
26684 overwriting. */
26685 opt_scalar_int_mode mode_iter;
26686 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
26687 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
26688 cur_mode = mode_iter.require ();
26690 gcc_assert (cur_mode != BLKmode);
26692 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
26693 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
26694 simd_ops++;
26695 n -= mode_bits;
26697 /* Emit trailing writes using overlapping unaligned accesses
26698 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
26699 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
26701 next_mode = smallest_mode_for_size (n, MODE_INT);
26702 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
26703 gcc_assert (n_bits <= mode_bits);
26704 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
26705 n = n_bits;
26708 rtx_insn *seq = get_insns ();
26709 end_sequence ();
26711 if (size_p)
26713 /* When optimizing for size we have 3 options: the SIMD broadcast sequence,
26714 call to memset or the MOPS expansion. */
26715 if (TARGET_MOPS
26716 && mops_cost <= libcall_cost
26717 && mops_cost <= simd_ops)
26718 return aarch64_expand_setmem_mops (operands);
26719       /* If MOPS is not available or not shorter, pick a libcall if the SIMD
26720 	 sequence is too long.  */
26721 else if (libcall_cost < simd_ops)
26722 return false;
26723 emit_insn (seq);
26724 return true;
26727 /* At this point the SIMD broadcast sequence is the best choice when
26728 optimizing for speed. */
26729 emit_insn (seq);
26730 return true;
26734 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
26735 SImode stores. Handle the case when the constant has identical
26736 bottom and top halves. This is beneficial when the two stores can be
26737 merged into an STP and we avoid synthesising potentially expensive
26738 immediates twice. Return true if such a split is possible. */
26740 bool
26741 aarch64_split_dimode_const_store (rtx dst, rtx src)
26743 rtx lo = gen_lowpart (SImode, src);
26744 rtx hi = gen_highpart_mode (SImode, DImode, src);
26746 if (!rtx_equal_p (lo, hi))
26747 return false;
26749 unsigned int orig_cost
26750 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
26751 unsigned int lo_cost
26752 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
26754 /* We want to transform:
26755 MOV x1, 49370
26756 MOVK x1, 0x140, lsl 16
26757 MOVK x1, 0xc0da, lsl 32
26758 MOVK x1, 0x140, lsl 48
26759 STR x1, [x0]
26760 into:
26761 MOV w1, 49370
26762 MOVK w1, 0x140, lsl 16
26763 STP w1, w1, [x0]
26764 So we want to perform this when we save at least one instruction. */
26765 if (orig_cost <= lo_cost)
26766 return false;
26768 rtx mem_lo = adjust_address (dst, SImode, 0);
26769 if (!aarch64_mem_pair_operand (mem_lo, SImode))
26770 return false;
26772 rtx tmp_reg = gen_reg_rtx (SImode);
26773 aarch64_expand_mov_immediate (tmp_reg, lo);
26774 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
26775   /* Don't emit an explicit store pair as this may not always be profitable.
26776 Let the sched-fusion logic decide whether to merge them. */
26777 emit_move_insn (mem_lo, tmp_reg);
26778 emit_move_insn (mem_hi, tmp_reg);
26780 return true;
26783 /* Generate RTL for a conditional branch with rtx comparison CODE in
26784 mode CC_MODE. The destination of the unlikely conditional branch
26785 is LABEL_REF. */
26787 void
26788 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
26789 rtx label_ref)
26791 rtx x;
26792 x = gen_rtx_fmt_ee (code, VOIDmode,
26793 gen_rtx_REG (cc_mode, CC_REGNUM),
26794 const0_rtx);
26796 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
26797 gen_rtx_LABEL_REF (VOIDmode, label_ref),
26798 pc_rtx);
26799 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
26802 /* Generate DImode scratch registers for 128-bit (TImode) addition.
26804 OP1 represents the TImode destination operand 1
26805 OP2 represents the TImode destination operand 2
26806 LOW_DEST represents the low half (DImode) of TImode operand 0
26807 LOW_IN1 represents the low half (DImode) of TImode operand 1
26808 LOW_IN2 represents the low half (DImode) of TImode operand 2
26809 HIGH_DEST represents the high half (DImode) of TImode operand 0
26810 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26811 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26813 void
26814 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26815 rtx *low_in1, rtx *low_in2,
26816 rtx *high_dest, rtx *high_in1,
26817 rtx *high_in2)
26819 *low_dest = gen_reg_rtx (DImode);
26820 *low_in1 = gen_lowpart (DImode, op1);
26821 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
26822 subreg_lowpart_offset (DImode, TImode));
26823 *high_dest = gen_reg_rtx (DImode);
26824 *high_in1 = gen_highpart (DImode, op1);
26825 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
26826 subreg_highpart_offset (DImode, TImode));
26829 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
26831    This function differs from 'aarch64_addti_scratch_regs' in that
26832    OP1 can be an immediate constant (zero).  We must call
26833    subreg_highpart_offset with DImode and TImode arguments, otherwise
26834    VOIDmode will be used for the const_int, which generates an internal
26835    error from subreg_size_highpart_offset, which does not expect a size of zero.
26837 OP1 represents the TImode destination operand 1
26838 OP2 represents the TImode destination operand 2
26839 LOW_DEST represents the low half (DImode) of TImode operand 0
26840 LOW_IN1 represents the low half (DImode) of TImode operand 1
26841 LOW_IN2 represents the low half (DImode) of TImode operand 2
26842 HIGH_DEST represents the high half (DImode) of TImode operand 0
26843 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26844 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
26847 void
26848 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
26849 rtx *low_in1, rtx *low_in2,
26850 rtx *high_dest, rtx *high_in1,
26851 rtx *high_in2)
26853 *low_dest = gen_reg_rtx (DImode);
26854 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
26855 subreg_lowpart_offset (DImode, TImode));
26857 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
26858 subreg_lowpart_offset (DImode, TImode));
26859 *high_dest = gen_reg_rtx (DImode);
26861 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
26862 subreg_highpart_offset (DImode, TImode));
26863 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
26864 subreg_highpart_offset (DImode, TImode));
26867 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
26869 OP0 represents the TImode destination operand 0
26870 LOW_DEST represents the low half (DImode) of TImode operand 0
26871 LOW_IN1 represents the low half (DImode) of TImode operand 1
26872 LOW_IN2 represents the low half (DImode) of TImode operand 2
26873 HIGH_DEST represents the high half (DImode) of TImode operand 0
26874 HIGH_IN1 represents the high half (DImode) of TImode operand 1
26875 HIGH_IN2 represents the high half (DImode) of TImode operand 2
26876 UNSIGNED_P is true if the operation is being performed on unsigned
26877 values. */
26878 void
26879 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
26880 rtx low_in2, rtx high_dest, rtx high_in1,
26881 rtx high_in2, bool unsigned_p)
26883 if (low_in2 == const0_rtx)
26885 low_dest = low_in1;
26886 high_in2 = force_reg (DImode, high_in2);
26887 if (unsigned_p)
26888 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
26889 else
26890 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
26892 else
26894 if (aarch64_plus_immediate (low_in2, DImode))
26895 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
26896 GEN_INT (-UINTVAL (low_in2))));
26897 else
26899 low_in2 = force_reg (DImode, low_in2);
26900 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
26902 high_in2 = force_reg (DImode, high_in2);
26904 if (unsigned_p)
26905 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
26906 else
26907 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
26910 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
26911 emit_move_insn (gen_highpart (DImode, op0), high_dest);
26915 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
26917 static unsigned HOST_WIDE_INT
26918 aarch64_asan_shadow_offset (void)
26920 if (TARGET_ILP32)
26921 return (HOST_WIDE_INT_1 << 29);
26922 else
26923 return (HOST_WIDE_INT_1 << 36);
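/* For reference, ASan computes a shadow address as
     shadow = (addr >> ASAN_SHADOW_SHIFT) + offset
   with ASAN_SHADOW_SHIFT == 3, so with the LP64 offset above the shadow of
   address A lives at (A >> 3) + (1 << 36).  */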
26926 static rtx
26927 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
26928 rtx_code code, tree treeop0, tree treeop1)
26930 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
26931 rtx op0, op1;
26932 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
26933 insn_code icode;
26934 struct expand_operand ops[4];
26936 start_sequence ();
26937 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
26939 op_mode = GET_MODE (op0);
26940 if (op_mode == VOIDmode)
26941 op_mode = GET_MODE (op1);
26943 switch (op_mode)
26945 case E_QImode:
26946 case E_HImode:
26947 case E_SImode:
26948 cmp_mode = SImode;
26949 icode = CODE_FOR_cmpsi;
26950 break;
26952 case E_DImode:
26953 cmp_mode = DImode;
26954 icode = CODE_FOR_cmpdi;
26955 break;
26957 case E_SFmode:
26958 cmp_mode = SFmode;
26959 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
26960 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
26961 break;
26963 case E_DFmode:
26964 cmp_mode = DFmode;
26965 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
26966 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
26967 break;
26969 default:
26970 end_sequence ();
26971 return NULL_RTX;
26974 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
26975 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
26976 if (!op0 || !op1)
26978 end_sequence ();
26979 return NULL_RTX;
26981 *prep_seq = get_insns ();
26982 end_sequence ();
26984 create_fixed_operand (&ops[0], op0);
26985 create_fixed_operand (&ops[1], op1);
26987 start_sequence ();
26988 if (!maybe_expand_insn (icode, 2, ops))
26990 end_sequence ();
26991 return NULL_RTX;
26993 *gen_seq = get_insns ();
26994 end_sequence ();
26996 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
26997 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
27000 static rtx
27001 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
27002 rtx_code cmp_code, tree treeop0, tree treeop1,
27003 rtx_code bit_code)
27005 rtx op0, op1, target;
27006 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27007 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27008 insn_code icode;
27009 struct expand_operand ops[6];
27010 int aarch64_cond;
27012 push_to_sequence (*prep_seq);
27013 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27015 op_mode = GET_MODE (op0);
27016 if (op_mode == VOIDmode)
27017 op_mode = GET_MODE (op1);
27019 switch (op_mode)
27021 case E_QImode:
27022 case E_HImode:
27023 case E_SImode:
27024 cmp_mode = SImode;
27025 break;
27027 case E_DImode:
27028 cmp_mode = DImode;
27029 break;
27031 case E_SFmode:
27032 cmp_mode = SFmode;
27033 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27034 break;
27036 case E_DFmode:
27037 cmp_mode = DFmode;
27038 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
27039 break;
27041 default:
27042 end_sequence ();
27043 return NULL_RTX;
27046 icode = code_for_ccmp (cc_mode, cmp_mode);
27048 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
27049 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
27050 if (!op0 || !op1)
27052 end_sequence ();
27053 return NULL_RTX;
27055 *prep_seq = get_insns ();
27056 end_sequence ();
27058 target = gen_rtx_REG (cc_mode, CC_REGNUM);
27059 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
27061 if (bit_code != AND)
27063 /* Treat the ccmp patterns as canonical and use them where possible,
27064 but fall back to ccmp_rev patterns if there's no other option. */
27065 rtx_code prev_code = GET_CODE (prev);
27066 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
27067 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
27068 && !(prev_code == EQ
27069 || prev_code == NE
27070 || prev_code == ORDERED
27071 || prev_code == UNORDERED))
27072 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
27073 else
27075 rtx_code code = reverse_condition (prev_code);
27076 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
27078 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
27081 create_fixed_operand (&ops[0], XEXP (prev, 0));
27082 create_fixed_operand (&ops[1], target);
27083 create_fixed_operand (&ops[2], op0);
27084 create_fixed_operand (&ops[3], op1);
27085 create_fixed_operand (&ops[4], prev);
27086 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
27088 push_to_sequence (*gen_seq);
27089 if (!maybe_expand_insn (icode, 6, ops))
27091 end_sequence ();
27092 return NULL_RTX;
27095 *gen_seq = get_insns ();
27096 end_sequence ();
27098 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
27101 #undef TARGET_GEN_CCMP_FIRST
27102 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
27104 #undef TARGET_GEN_CCMP_NEXT
27105 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
27107 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
27108 instruction fusion of some sort. */
27110 static bool
27111 aarch64_macro_fusion_p (void)
27113 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
27117 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
27118 should be kept together during scheduling. */
27120 static bool
27121 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
27123 rtx set_dest;
27124 rtx prev_set = single_set (prev);
27125 rtx curr_set = single_set (curr);
27126 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
27127 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
27129 if (!aarch64_macro_fusion_p ())
27130 return false;
27132 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
27134 /* We are trying to match:
27135 prev (mov) == (set (reg r0) (const_int imm16))
27136 curr (movk) == (set (zero_extract (reg r0)
27137 (const_int 16)
27138 (const_int 16))
27139 (const_int imm16_1)) */
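      /* In assembly terms this keeps sequences such as
	   mov  w1, #0x1234
	   movk w1, #0x5678, lsl 16
	 adjacent so that cores which fuse MOV/MOVK can treat them as one
	 unit (an illustrative example; the immediates are arbitrary).  */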
27141 set_dest = SET_DEST (curr_set);
27143 if (GET_CODE (set_dest) == ZERO_EXTRACT
27144 && CONST_INT_P (SET_SRC (curr_set))
27145 && CONST_INT_P (SET_SRC (prev_set))
27146 && CONST_INT_P (XEXP (set_dest, 2))
27147 && INTVAL (XEXP (set_dest, 2)) == 16
27148 && REG_P (XEXP (set_dest, 0))
27149 && REG_P (SET_DEST (prev_set))
27150 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
27152 return true;
27156 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
27159 /* We're trying to match:
27160 prev (adrp) == (set (reg r1)
27161 (high (symbol_ref ("SYM"))))
27162 curr (add) == (set (reg r0)
27163 (lo_sum (reg r1)
27164 (symbol_ref ("SYM"))))
27165 Note that r0 need not necessarily be the same as r1, especially
27166 during pre-regalloc scheduling. */
27168 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27169 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27171 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
27172 && REG_P (XEXP (SET_SRC (curr_set), 0))
27173 && REGNO (XEXP (SET_SRC (curr_set), 0))
27174 == REGNO (SET_DEST (prev_set))
27175 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
27176 XEXP (SET_SRC (curr_set), 1)))
27177 return true;
27181 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
27184 /* We're trying to match:
27185 prev (movk) == (set (zero_extract (reg r0)
27186 (const_int 16)
27187 (const_int 32))
27188 (const_int imm16_1))
27189 curr (movk) == (set (zero_extract (reg r0)
27190 (const_int 16)
27191 (const_int 48))
27192 (const_int imm16_2)) */
27194 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
27195 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
27196 && REG_P (XEXP (SET_DEST (prev_set), 0))
27197 && REG_P (XEXP (SET_DEST (curr_set), 0))
27198 && REGNO (XEXP (SET_DEST (prev_set), 0))
27199 == REGNO (XEXP (SET_DEST (curr_set), 0))
27200 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
27201 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
27202 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
27203 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
27204 && CONST_INT_P (SET_SRC (prev_set))
27205 && CONST_INT_P (SET_SRC (curr_set)))
27206 return true;
27209 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
27211 /* We're trying to match:
27212 prev (adrp) == (set (reg r0)
27213 (high (symbol_ref ("SYM"))))
27214 curr (ldr) == (set (reg r1)
27215 (mem (lo_sum (reg r0)
27216 (symbol_ref ("SYM")))))
27218 curr (ldr) == (set (reg r1)
27219 (zero_extend (mem
27220 (lo_sum (reg r0)
27221 (symbol_ref ("SYM")))))) */
27222 if (satisfies_constraint_Ush (SET_SRC (prev_set))
27223 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
27225 rtx curr_src = SET_SRC (curr_set);
27227 if (GET_CODE (curr_src) == ZERO_EXTEND)
27228 curr_src = XEXP (curr_src, 0);
27230 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
27231 && REG_P (XEXP (XEXP (curr_src, 0), 0))
27232 && REGNO (XEXP (XEXP (curr_src, 0), 0))
27233 == REGNO (SET_DEST (prev_set))
27234 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
27235 XEXP (SET_SRC (prev_set), 0)))
27236 return true;
27240 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
27241 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
27242 && prev_set && curr_set && any_condjump_p (curr)
27243 && GET_CODE (SET_SRC (prev_set)) == COMPARE
27244 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
27245 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
27246 return true;
27248 /* Fuse flag-setting ALU instructions and conditional branch. */
27249 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
27250 && any_condjump_p (curr))
27252 unsigned int condreg1, condreg2;
27253 rtx cc_reg_1;
27254 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
27255 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
27257 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
27258 && prev
27259 && modified_in_p (cc_reg_1, prev))
27261 enum attr_type prev_type = get_attr_type (prev);
27263 	  /* FIXME: this misses some instructions that are considered simple
27264 	     arithmetic for ThunderX.  Simple shifts are missed here.  */
27265 if (prev_type == TYPE_ALUS_SREG
27266 || prev_type == TYPE_ALUS_IMM
27267 || prev_type == TYPE_LOGICS_REG
27268 || prev_type == TYPE_LOGICS_IMM)
27269 return true;
27273 /* Fuse ALU instructions and CBZ/CBNZ. */
27274 if (prev_set
27275 && curr_set
27276 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
27277 && any_condjump_p (curr))
27279 /* We're trying to match:
27280 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
27281 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
27282 (const_int 0))
27283 (label_ref ("SYM"))
27284 (pc)) */
27285 if (SET_DEST (curr_set) == (pc_rtx)
27286 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
27287 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
27288 && REG_P (SET_DEST (prev_set))
27289 && REGNO (SET_DEST (prev_set))
27290 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
27292 /* Fuse ALU operations followed by conditional branch instruction. */
27293 switch (get_attr_type (prev))
27295 case TYPE_ALU_IMM:
27296 case TYPE_ALU_SREG:
27297 case TYPE_ADC_REG:
27298 case TYPE_ADC_IMM:
27299 case TYPE_ADCS_REG:
27300 case TYPE_ADCS_IMM:
27301 case TYPE_LOGIC_REG:
27302 case TYPE_LOGIC_IMM:
27303 case TYPE_CSEL:
27304 case TYPE_ADR:
27305 case TYPE_MOV_IMM:
27306 case TYPE_SHIFT_REG:
27307 case TYPE_SHIFT_IMM:
27308 case TYPE_BFM:
27309 case TYPE_RBIT:
27310 case TYPE_REV:
27311 case TYPE_EXTEND:
27312 return true;
27314 default:;
27319 /* Fuse A+B+1 and A-B-1 */
27320 if (simple_sets_p
27321 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
27323 /* We're trying to match:
27324 prev == (set (r0) (plus (r0) (r1)))
27325 curr == (set (r0) (plus (r0) (const_int 1)))
27327 prev == (set (r0) (minus (r0) (r1)))
27328 curr == (set (r0) (plus (r0) (const_int -1))) */
27330 rtx prev_src = SET_SRC (prev_set);
27331 rtx curr_src = SET_SRC (curr_set);
27333 int polarity = 1;
27334 if (GET_CODE (prev_src) == MINUS)
27335 polarity = -1;
27337 if (GET_CODE (curr_src) == PLUS
27338 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
27339 && CONST_INT_P (XEXP (curr_src, 1))
27340 && INTVAL (XEXP (curr_src, 1)) == polarity
27341 && REG_P (XEXP (curr_src, 0))
27342 && REG_P (SET_DEST (prev_set))
27343 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
27344 return true;
27347 return false;
27350 /* Return true iff the instruction fusion described by OP is enabled. */
27352 bool
27353 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
27355 return (aarch64_tune_params.fusible_ops & op) != 0;
27358 /* If MEM is in the form of [base+offset], extract the two parts
27359    of the address and store them in BASE and OFFSET; otherwise return false
27360    after clearing BASE and OFFSET.  */
27362 bool
27363 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
27365 rtx addr;
27367 gcc_assert (MEM_P (mem));
27369 addr = XEXP (mem, 0);
27371 if (REG_P (addr))
27373 *base = addr;
27374 *offset = const0_rtx;
27375 return true;
27378 if (GET_CODE (addr) == PLUS
27379 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
27381 *base = XEXP (addr, 0);
27382 *offset = XEXP (addr, 1);
27383 return true;
27386 *base = NULL_RTX;
27387 *offset = NULL_RTX;
27389 return false;
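/* For example, a MEM whose address is (plus (reg x1) (const_int 16)) yields
   BASE == (reg x1) and OFFSET == (const_int 16), and a bare (reg x1) yields
   OFFSET == (const_int 0); other forms such as reg+reg addressing make the
   function return false.  (Illustrative register names only.)  */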
27392 /* Types for scheduling fusion. */
27393 enum sched_fusion_type
27395 SCHED_FUSION_NONE = 0,
27396 SCHED_FUSION_LD_SIGN_EXTEND,
27397 SCHED_FUSION_LD_ZERO_EXTEND,
27398 SCHED_FUSION_LD,
27399 SCHED_FUSION_ST,
27400 SCHED_FUSION_NUM
27403 /* If INSN is a load or store whose address is in the form [base+offset],
27404    extract the two parts and store them in BASE and OFFSET.  Return the
27405    scheduling fusion type of INSN.  */
27407 static enum sched_fusion_type
27408 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
27410 rtx x, dest, src;
27411 enum sched_fusion_type fusion = SCHED_FUSION_LD;
27413 gcc_assert (INSN_P (insn));
27414 x = PATTERN (insn);
27415 if (GET_CODE (x) != SET)
27416 return SCHED_FUSION_NONE;
27418 src = SET_SRC (x);
27419 dest = SET_DEST (x);
27421 machine_mode dest_mode = GET_MODE (dest);
27423 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
27424 return SCHED_FUSION_NONE;
27426 if (GET_CODE (src) == SIGN_EXTEND)
27428 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
27429 src = XEXP (src, 0);
27430 if (!MEM_P (src) || GET_MODE (src) != SImode)
27431 return SCHED_FUSION_NONE;
27433 else if (GET_CODE (src) == ZERO_EXTEND)
27435 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
27436 src = XEXP (src, 0);
27437 if (!MEM_P (src) || GET_MODE (src) != SImode)
27438 return SCHED_FUSION_NONE;
27441 if (MEM_P (src) && REG_P (dest))
27442 extract_base_offset_in_addr (src, base, offset);
27443 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
27445 fusion = SCHED_FUSION_ST;
27446 extract_base_offset_in_addr (dest, base, offset);
27448 else
27449 return SCHED_FUSION_NONE;
27451 if (*base == NULL_RTX || *offset == NULL_RTX)
27452 fusion = SCHED_FUSION_NONE;
27454 return fusion;
27457 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
27459    Currently we only support fusing ldr and str instructions, so FUSION_PRI
27460    and PRI are only calculated for these instructions.  For other instructions,
27461    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
27462    types of instruction fusion can be added by returning different priorities.
27464 It's important that irrelevant instructions get the largest FUSION_PRI. */
27466 static void
27467 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
27468 int *fusion_pri, int *pri)
27470 int tmp, off_val;
27471 rtx base, offset;
27472 enum sched_fusion_type fusion;
27474 gcc_assert (INSN_P (insn));
27476 tmp = max_pri - 1;
27477 fusion = fusion_load_store (insn, &base, &offset);
27478 if (fusion == SCHED_FUSION_NONE)
27480 *pri = tmp;
27481 *fusion_pri = tmp;
27482 return;
27485 /* Set FUSION_PRI according to fusion type and base register. */
27486 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
27488 /* Calculate PRI. */
27489 tmp /= 2;
27491 /* INSN with smaller offset goes first. */
27492 off_val = (int)(INTVAL (offset));
27493 if (off_val >= 0)
27494 tmp -= (off_val & 0xfffff);
27495 else
27496 tmp += ((- off_val) & 0xfffff);
27498 *pri = tmp;
27499 return;
27502 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
27503 Adjust priority of sha1h instructions so they are scheduled before
27504 other SHA1 instructions. */
27506 static int
27507 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
27509 rtx x = PATTERN (insn);
27511 if (GET_CODE (x) == SET)
27513 x = SET_SRC (x);
27515 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
27516 return priority + 10;
27519 return priority;
27522 /* If REVERSED is null, return true if memory reference *MEM2 comes
27523 immediately after memory reference *MEM1. Do not change the references
27524 in this case.
27526 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
27527 if they are, try to make them use constant offsets from the same base
27528 register. Return true on success. When returning true, set *REVERSED
27529 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
27530 static bool
27531 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
27533 if (reversed)
27534 *reversed = false;
27536 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
27537 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
27538 return false;
27540 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
27541 return false;
27543 auto size1 = MEM_SIZE (*mem1);
27544 auto size2 = MEM_SIZE (*mem2);
27546 rtx base1, base2, offset1, offset2;
27547 extract_base_offset_in_addr (*mem1, &base1, &offset1);
27548 extract_base_offset_in_addr (*mem2, &base2, &offset2);
27550 /* Make sure at least one memory is in base+offset form. */
27551 if (!(base1 && offset1) && !(base2 && offset2))
27552 return false;
27554 /* If both mems already use the same base register, just check the
27555 offsets. */
27556 if (base1 && base2 && rtx_equal_p (base1, base2))
27558 if (!offset1 || !offset2)
27559 return false;
27561 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
27562 return true;
27564 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
27566 *reversed = true;
27567 return true;
27570 return false;
27573 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
27574 guarantee that the values are consecutive. */
27575 if (MEM_EXPR (*mem1)
27576 && MEM_EXPR (*mem2)
27577 && MEM_OFFSET_KNOWN_P (*mem1)
27578 && MEM_OFFSET_KNOWN_P (*mem2))
27580 poly_int64 expr_offset1;
27581 poly_int64 expr_offset2;
27582 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
27583 &expr_offset1);
27584 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
27585 &expr_offset2);
27586 if (!expr_base1
27587 || !expr_base2
27588 || !DECL_P (expr_base1)
27589 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
27590 return false;
27592 expr_offset1 += MEM_OFFSET (*mem1);
27593 expr_offset2 += MEM_OFFSET (*mem2);
27595 if (known_eq (expr_offset1 + size1, expr_offset2))
27597 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
27598 *reversed = true;
27599 else
27600 return false;
27602 if (reversed)
27604 if (base2)
27606 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
27607 expr_offset1 - expr_offset2);
27608 *mem1 = replace_equiv_address_nv (*mem1, addr1);
27610 else
27612 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
27613 expr_offset2 - expr_offset1);
27614 *mem2 = replace_equiv_address_nv (*mem2, addr2);
27617 return true;
27620 return false;
27623 /* Test if MODE is suitable for a single transfer register in an ldp or stp
27624 instruction. */
27626 bool
27627 aarch64_ldpstp_operand_mode_p (machine_mode mode)
27629 if (!targetm.hard_regno_mode_ok (V0_REGNUM, mode)
27630 || hard_regno_nregs (V0_REGNUM, mode) > 1)
27631 return false;
27633 const auto size = GET_MODE_SIZE (mode);
27634 return known_eq (size, 4) || known_eq (size, 8) || known_eq (size, 16);
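/* For example SImode, DImode, DFmode and V4SImode satisfy the check above
   (4, 8 or 16 bytes held in a single register), whereas OImode needs two
   registers and variable-length SVE modes have no fixed 4/8/16-byte size, so
   both are rejected.  (A sketch of the intent, not an exhaustive list.)  */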
27637 /* Return true if MEM1 and MEM2 can be combined into a single access
27638 of mode MODE, with the combined access having the same address as MEM1. */
27640 bool
27641 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
27643 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
27644 return false;
27645 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
27648 /* Return true if MEM agrees with the ldp-stp policy model.
27649 Otherwise, false. */
27651 bool
27652 aarch64_mem_ok_with_ldpstp_policy_model (rtx mem, bool load, machine_mode mode)
27654 auto policy = (load
27655 ? aarch64_tune_params.ldp_policy_model
27656 : aarch64_tune_params.stp_policy_model);
27658 /* If we have AARCH64_LDP_STP_POLICY_NEVER, reject the load pair. */
27659 if (policy == AARCH64_LDP_STP_POLICY_NEVER)
27660 return false;
27662 /* If we have AARCH64_LDP_STP_POLICY_ALIGNED,
27663 do not emit the load pair unless the alignment is checked to be
27664 at least double the alignment of the type. */
27665 if (policy == AARCH64_LDP_STP_POLICY_ALIGNED
27666 && !optimize_function_for_size_p (cfun)
27667 && MEM_ALIGN (mem) < 2 * GET_MODE_ALIGNMENT (mode))
27668 return false;
27670 return true;
27673 /* Given OPERANDS of consecutive load/store, check if we can merge
27674 them into ldp/stp. LOAD is true if they are load instructions.
27675 MODE is the mode of memory operands. */
27677 bool
27678 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
27679 machine_mode mode)
27681 enum reg_class rclass_1, rclass_2;
27682 rtx mem_1, mem_2, reg_1, reg_2;
27684 if (load)
27686 mem_1 = operands[1];
27687 mem_2 = operands[3];
27688 reg_1 = operands[0];
27689 reg_2 = operands[2];
27690 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
27691 if (REGNO (reg_1) == REGNO (reg_2))
27692 return false;
27693 if (reg_overlap_mentioned_p (reg_1, mem_2))
27694 return false;
27696 else
27698 mem_1 = operands[0];
27699 mem_2 = operands[2];
27700 reg_1 = operands[1];
27701 reg_2 = operands[3];
27704 /* The mems cannot be volatile. */
27705 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
27706 return false;
27708 /* Check if mem_1 is ok with the ldp-stp policy model. */
27709 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem_1, load, mode))
27710 return false;
27712 /* Check if the addresses are in the form of [base+offset]. */
27713 bool reversed = false;
27714 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
27715 return false;
27717 /* The operands must be of the same size. */
27718 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
27719 GET_MODE_SIZE (GET_MODE (mem_2))));
27721 /* The lower memory access must be a mem-pair operand. */
27722 rtx lower_mem = reversed ? mem_2 : mem_1;
27723 if (!aarch64_mem_pair_operand (lower_mem, GET_MODE (lower_mem)))
27724 return false;
27726 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
27727 rclass_1 = FP_REGS;
27728 else
27729 rclass_1 = GENERAL_REGS;
27731 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
27732 rclass_2 = FP_REGS;
27733 else
27734 rclass_2 = GENERAL_REGS;
27736 /* Check if the registers are of same class. */
27737 if (rclass_1 != rclass_2)
27738 return false;
27740 return true;
27743 /* Given OPERANDS of consecutive load/store that can be merged,
27744 swap them if they are not in ascending order. */
27745 void
27746 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
27748 int mem_op = load ? 1 : 0;
27749 bool reversed = false;
27750 if (!aarch64_check_consecutive_mems (operands + mem_op,
27751 operands + mem_op + 2, &reversed))
27752 gcc_unreachable ();
27754 if (reversed)
27756 /* Irrespective of whether this is a load or a store,
27757 we do the same swap. */
27758 std::swap (operands[0], operands[2]);
27759 std::swap (operands[1], operands[3]);
27763 /* Helper function used for generation of load/store pair instructions, called
27764 from peepholes in aarch64-ldpstp.md. OPERANDS is an array of
27765 operands as matched by the peepholes in that file. LOAD_P is true if we're
27766 generating a load pair, otherwise we're generating a store pair. CODE is
27767 either {ZERO,SIGN}_EXTEND for extending loads or UNKNOWN if we're generating a
27768 standard load/store pair. */
27770 void
27771 aarch64_finish_ldpstp_peephole (rtx *operands, bool load_p, enum rtx_code code)
27773 aarch64_swap_ldrstr_operands (operands, load_p);
27775 if (load_p)
27776 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
27777 operands[1], code));
27778 else
27780 gcc_assert (code == UNKNOWN);
27781 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
27782 operands[3]));
27786 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
27787 comparison between the two. */
27789 aarch64_host_wide_int_compare (const void *x, const void *y)
27791 return wi::cmps (* ((const HOST_WIDE_INT *) x),
27792 * ((const HOST_WIDE_INT *) y));
27795 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
27796 other pointing to a REG rtx containing an offset, compare the offsets
27797 of the two pairs.
27799 Return:
27801 1 iff offset (X) > offset (Y)
27802 0 iff offset (X) == offset (Y)
27803 -1 iff offset (X) < offset (Y) */
27805 aarch64_ldrstr_offset_compare (const void *x, const void *y)
27807 const rtx * operands_1 = (const rtx *) x;
27808 const rtx * operands_2 = (const rtx *) y;
27809 rtx mem_1, mem_2, base, offset_1, offset_2;
27811 if (MEM_P (operands_1[0]))
27812 mem_1 = operands_1[0];
27813 else
27814 mem_1 = operands_1[1];
27816 if (MEM_P (operands_2[0]))
27817 mem_2 = operands_2[0];
27818 else
27819 mem_2 = operands_2[1];
27821 /* Extract the offsets. */
27822 extract_base_offset_in_addr (mem_1, &base, &offset_1);
27823 extract_base_offset_in_addr (mem_2, &base, &offset_2);
27825 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
27827 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
27830 /* Given OPERANDS of consecutive load/store, check if we can merge
27831 them into ldp/stp by adjusting the offset. LOAD is true if they
27832 are load instructions. MODE is the mode of memory operands.
27834 Given below consecutive stores:
27836 str w1, [xb, 0x100]
27837 str w1, [xb, 0x104]
27838 str w1, [xb, 0x108]
27839 str w1, [xb, 0x10c]
27841 Though the offsets are out of the range supported by stp, we can
27842 still pair them after adjusting the offset, like:
27844 add scratch, xb, 0x100
27845 stp w1, w1, [scratch]
27846 stp w1, w1, [scratch, 0x8]
27848 The peephole patterns detecting this opportunity should guarantee
27849    the scratch register is available.  */
27851 bool
27852 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
27853 machine_mode mode)
27855 const int num_insns = 4;
27856 enum reg_class rclass;
27857 HOST_WIDE_INT offvals[num_insns], msize;
27858 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
27860 if (load)
27862 for (int i = 0; i < num_insns; i++)
27864 reg[i] = operands[2 * i];
27865 mem[i] = operands[2 * i + 1];
27867 gcc_assert (REG_P (reg[i]));
27870 /* Do not attempt to merge the loads if the loads clobber each other. */
27871 for (int i = 0; i < 8; i += 2)
27872 for (int j = i + 2; j < 8; j += 2)
27873 if (reg_overlap_mentioned_p (operands[i], operands[j]))
27874 return false;
27876 else
27877 for (int i = 0; i < num_insns; i++)
27879 mem[i] = operands[2 * i];
27880 reg[i] = operands[2 * i + 1];
27883 /* Skip if memory operand is by itself valid for ldp/stp. */
27884 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
27885 return false;
27887 for (int i = 0; i < num_insns; i++)
27889 /* The mems cannot be volatile. */
27890 if (MEM_VOLATILE_P (mem[i]))
27891 return false;
27893 /* Check if the addresses are in the form of [base+offset]. */
27894 extract_base_offset_in_addr (mem[i], base + i, offset + i);
27895 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
27896 return false;
27899 /* Check if the registers are of same class. */
27900 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
27901 ? FP_REGS : GENERAL_REGS;
27903 for (int i = 1; i < num_insns; i++)
27904 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
27906 if (rclass != FP_REGS)
27907 return false;
27909 else
27911 if (rclass != GENERAL_REGS)
27912 return false;
27915 /* Only the last register in the order in which they occur
27916 may be clobbered by the load. */
27917 if (rclass == GENERAL_REGS && load)
27918 for (int i = 0; i < num_insns - 1; i++)
27919 if (reg_mentioned_p (reg[i], mem[i]))
27920 return false;
27922 /* Check if the bases are same. */
27923 for (int i = 0; i < num_insns - 1; i++)
27924 if (!rtx_equal_p (base[i], base[i + 1]))
27925 return false;
27927 for (int i = 0; i < num_insns; i++)
27928 offvals[i] = INTVAL (offset[i]);
27930 msize = GET_MODE_SIZE (mode).to_constant ();
27932 /* Check if the offsets can be put in the right order to do a ldp/stp. */
27933 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
27934 aarch64_host_wide_int_compare);
27936 if (!(offvals[1] == offvals[0] + msize
27937 && offvals[3] == offvals[2] + msize))
27938 return false;
27940 /* Check that offsets are within range of each other. The ldp/stp
27941 instructions have 7 bit immediate offsets, so use 0x80. */
27942 if (offvals[2] - offvals[0] >= msize * 0x80)
27943 return false;
27945 /* The offsets must be aligned with respect to each other. */
27946 if (offvals[0] % msize != offvals[2] % msize)
27947 return false;
27949 /* Check if mem[0] is ok with the ldp-stp policy model. */
27950 if (!aarch64_mem_ok_with_ldpstp_policy_model (mem[0], load, mode))
27951 return false;
27953 return true;
27956 /* Given OPERANDS of consecutive load/store, this function pairs them
27957 into LDP/STP after adjusting the offset. It depends on the fact
27958 that the operands can be sorted so the offsets are correct for STP.
27959 MODE is the mode of memory operands. CODE is the rtl operator
27960 which should be applied to all memory operands, it's SIGN_EXTEND,
27961 ZERO_EXTEND or UNKNOWN. */
27963 bool
27964 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
27965 machine_mode mode, RTX_CODE code)
27967 rtx base, offset_1, offset_2;
27968 rtx mem_1, mem_2;
27969 rtx temp_operands[8];
27970 HOST_WIDE_INT off_val_1, off_val_2, base_off, new_off_1, new_off_2,
27971 stp_off_upper_limit, stp_off_lower_limit, msize;
27973 /* We make changes on a copy as we may still bail out. */
27974 for (int i = 0; i < 8; i ++)
27975 temp_operands[i] = operands[i];
27977 /* Sort the operands. Note for cases as below:
27978 [base + 0x310] = A
27979 [base + 0x320] = B
27980 [base + 0x330] = C
27981 [base + 0x320] = D
27982      We need stable sorting, otherwise wrong data may be stored to offset 0x320.
27983      Also note the dead store in the above case should be optimized away, but no
27984 guarantees here. */
27985 gcc_stablesort(temp_operands, 4, 2 * sizeof (rtx *),
27986 aarch64_ldrstr_offset_compare);
27988 /* Copy the memory operands so that if we have to bail for some
27989 reason the original addresses are unchanged. */
27990 if (load)
27992 mem_1 = copy_rtx (temp_operands[1]);
27993 mem_2 = copy_rtx (temp_operands[5]);
27995 else
27997 mem_1 = copy_rtx (temp_operands[0]);
27998 mem_2 = copy_rtx (temp_operands[4]);
27999 gcc_assert (code == UNKNOWN);
28002 extract_base_offset_in_addr (mem_1, &base, &offset_1);
28003 extract_base_offset_in_addr (mem_2, &base, &offset_2);
28004 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
28005 && offset_2 != NULL_RTX);
28007 /* Adjust offset so it can fit in LDP/STP instruction. */
28008 msize = GET_MODE_SIZE (mode).to_constant();
28009 stp_off_upper_limit = msize * (0x40 - 1);
28010 stp_off_lower_limit = - msize * 0x40;
28012 off_val_1 = INTVAL (offset_1);
28013 off_val_2 = INTVAL (offset_2);
28015 /* The base offset is optimally half way between the two STP/LDP offsets. */
28016 if (msize <= 4)
28017 base_off = (off_val_1 + off_val_2) / 2;
28018 else
28019     /* However, due to issues with negative LDP/STP offset generation for
28020        larger modes (DF, DD, DI and vector modes), we must not use negative
28021        addresses beyond what 9 signed unadjusted bits can store.  This
28022        provides the most range in this case.  */
28023 base_off = off_val_1;
28025 /* Adjust the base so that it is aligned with the addresses but still
28026 optimal. */
28027 if (base_off % msize != off_val_1 % msize)
28028 /* Fix the offset, bearing in mind we want to make it bigger not
28029 smaller. */
28030 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28031 else if (msize <= 4)
28032 /* The negative range of LDP/STP is one larger than the positive range. */
28033 base_off += msize;
28035 /* Check if base offset is too big or too small. We can attempt to resolve
28036 this issue by setting it to the maximum value and seeing if the offsets
28037 still fit. */
28038 if (base_off >= 0x1000)
28040 base_off = 0x1000 - 1;
28041 /* We must still make sure that the base offset is aligned with respect
28042 to the address. But it may not be made any bigger. */
28043 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28046 /* Likewise for the case where the base is too small. */
28047 if (base_off <= -0x1000)
28049 base_off = -0x1000 + 1;
28050 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
28053 /* Offset of the first STP/LDP. */
28054 new_off_1 = off_val_1 - base_off;
28056 /* Offset of the second STP/LDP. */
28057 new_off_2 = off_val_2 - base_off;
28059 /* The offsets must be within the range of the LDP/STP instructions. */
28060 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
28061 || new_off_2 > stp_off_upper_limit || new_off_2 < stp_off_lower_limit)
28062 return false;
28064 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
28065 new_off_1), true);
28066 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
28067 new_off_2), true);
28069 if (!aarch64_mem_pair_operand (mem_1, mode)
28070 || !aarch64_mem_pair_operand (mem_2, mode))
28071 return false;
28073 if (load)
28075 operands[0] = temp_operands[0];
28076 operands[1] = mem_1;
28077 operands[2] = temp_operands[2];
28078 operands[4] = temp_operands[4];
28079 operands[5] = mem_2;
28080 operands[6] = temp_operands[6];
28082 else
28084 operands[0] = mem_1;
28085 operands[1] = temp_operands[1];
28086 operands[3] = temp_operands[3];
28087 operands[4] = mem_2;
28088 operands[5] = temp_operands[5];
28089 operands[7] = temp_operands[7];
28092 /* Emit adjusting instruction. */
28093 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
28094 /* Emit ldp/stp instructions. */
28095 if (load)
28097 emit_insn (aarch64_gen_load_pair (operands[0], operands[2],
28098 operands[1], code));
28099 emit_insn (aarch64_gen_load_pair (operands[4], operands[6],
28100 operands[5], code));
28102 else
28104 emit_insn (aarch64_gen_store_pair (operands[0], operands[1],
28105 operands[3]));
28106 emit_insn (aarch64_gen_store_pair (operands[4], operands[5],
28107 operands[7]));
28109 return true;
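/* Rough trace of the offset arithmetic above for the four-str example in the
   comment before aarch64_operands_adjust_ok_for_ldpstp (SImode, msize == 4,
   offsets 0x100..0x10c): base_off starts at (0x100 + 0x108) / 2 == 0x104, is
   already aligned and so is bumped by msize to 0x108, giving new offsets -8
   and 0 and a sequence along the lines of
     add scratch, xb, 0x108
     stp w1, w1, [scratch, -8]
     stp w1, w1, [scratch]
   This illustrates the calculation only; it is not a guaranteed output.  */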
28112 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
28113 it isn't worth branching around empty masked ops (including masked
28114 stores). */
28116 static bool
28117 aarch64_empty_mask_is_expensive (unsigned)
28119 return false;
28122 /* Return 1 if pseudo register should be created and used to hold
28123 GOT address for PIC code. */
28125 bool
28126 aarch64_use_pseudo_pic_reg (void)
28128 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
28131 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
28133 static int
28134 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
28136 switch (XINT (x, 1))
28138 case UNSPEC_GOTSMALLPIC:
28139 case UNSPEC_GOTSMALLPIC28K:
28140 case UNSPEC_GOTTINYPIC:
28141 return 0;
28142 default:
28143 break;
28146 return default_unspec_may_trap_p (x, flags);
28150 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
28151 return the log2 of that value. Otherwise return -1. */
28154 aarch64_fpconst_pow_of_2 (rtx x)
28156 const REAL_VALUE_TYPE *r;
28158 if (!CONST_DOUBLE_P (x))
28159 return -1;
28161 r = CONST_DOUBLE_REAL_VALUE (x);
28163 if (REAL_VALUE_NEGATIVE (*r)
28164 || REAL_VALUE_ISNAN (*r)
28165 || REAL_VALUE_ISINF (*r)
28166 || !real_isinteger (r, DFmode))
28167 return -1;
28169 return exact_log2 (real_to_integer (r));
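/* For example, 8.0 gives 3 and 1.0 gives 0, while 0.5, 3.0 and -4.0 all
   give -1.  */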
28172 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
28173    power of 2 (i.e. 1/2^n), return the exponent n; e.g. for x == (1/2^n)
28174    return n.  Otherwise return -1.  */
28177 aarch64_fpconst_pow2_recip (rtx x)
28179 REAL_VALUE_TYPE r0;
28181 if (!CONST_DOUBLE_P (x))
28182 return -1;
28184 r0 = *CONST_DOUBLE_REAL_VALUE (x);
28185 if (exact_real_inverse (DFmode, &r0)
28186 && !REAL_VALUE_NEGATIVE (r0))
28188 int ret = exact_log2 (real_to_integer (&r0));
28189 if (ret >= 1 && ret <= 32)
28190 return ret;
28192 return -1;
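/* For example, 0.25 gives 2 and 0.125 gives 3, while 1.0 (whose reciprocal
   has log2 equal to 0, outside the accepted 1..32 range) and 3.0 give -1.  */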
28195 /* If X is a vector of equal CONST_DOUBLE values and that value is
28196 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
28199 aarch64_vec_fpconst_pow_of_2 (rtx x)
28201 int nelts;
28202 if (!CONST_VECTOR_P (x)
28203 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
28204 return -1;
28206 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
28207 return -1;
28209 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
28210 if (firstval <= 0)
28211 return -1;
28213 for (int i = 1; i < nelts; i++)
28214 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
28215 return -1;
28217 return firstval;
28220 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
28221 to float.
28223 __fp16 always promotes through this hook.
28224 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
28225 through the generic excess precision logic rather than here. */
28227 static tree
28228 aarch64_promoted_type (const_tree t)
28230 if (SCALAR_FLOAT_TYPE_P (t)
28231 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
28232 return float_type_node;
28234 return NULL_TREE;
28237 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
28239 static bool
28240 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
28241 optimization_type opt_type)
28243 switch (op)
28245 case rsqrt_optab:
28246 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
28248 default:
28249 return true;
28253 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
28255 static unsigned int
28256 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
28257 int *offset)
28259 /* Polynomial invariant 1 == (VG / 2) - 1. */
28260 gcc_assert (i == 1);
28261 *factor = 2;
28262 *offset = 1;
28263 return AARCH64_DWARF_VG;
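/* For example, with 256-bit SVE the VG pseudo register holds 4 (the number
   of 64-bit granules per vector), so the indeterminate evaluates to
   4 / 2 - 1 == 1; with 128-bit SVE it evaluates to 0.  */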
28266 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
28267 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28269 static bool
28270 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
28272 return ((mode == HFmode || mode == BFmode)
28273 ? true
28274 : default_libgcc_floating_mode_supported_p (mode));
28277 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
28278 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
28280 static bool
28281 aarch64_scalar_mode_supported_p (scalar_mode mode)
28283 if (DECIMAL_FLOAT_MODE_P (mode))
28284 return default_decimal_float_supported_p ();
28286 return ((mode == HFmode || mode == BFmode)
28287 ? true
28288 : default_scalar_mode_supported_p (mode));
28291 /* Set the value of FLT_EVAL_METHOD.
28292 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
28294 0: evaluate all operations and constants, whose semantic type has at
28295 most the range and precision of type float, to the range and
28296 precision of float; evaluate all other operations and constants to
28297 the range and precision of the semantic type;
28299 N, where _FloatN is a supported interchange floating type
28300 evaluate all operations and constants, whose semantic type has at
28301 most the range and precision of _FloatN type, to the range and
28302 precision of the _FloatN type; evaluate all other operations and
28303 constants to the range and precision of the semantic type;
28305 If we have the ARMv8.2-A extensions then we support _Float16 in native
28306 precision, so we should set this to 16. Otherwise, we support the type,
28307 but want to evaluate expressions in float precision, so set this to
28308 0. */
28310 static enum flt_eval_method
28311 aarch64_excess_precision (enum excess_precision_type type)
28313 switch (type)
28315 case EXCESS_PRECISION_TYPE_FAST:
28316 case EXCESS_PRECISION_TYPE_STANDARD:
28317 /* We can calculate either in 16-bit range and precision or
28318 32-bit range and precision. Make that decision based on whether
28319 we have native support for the ARMv8.2-A 16-bit floating-point
28320 instructions or not. */
28321 return (TARGET_FP_F16INST
28322 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
28323 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
28324 case EXCESS_PRECISION_TYPE_IMPLICIT:
28325 case EXCESS_PRECISION_TYPE_FLOAT16:
28326 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
28327 default:
28328 gcc_unreachable ();
28330 return FLT_EVAL_METHOD_UNPREDICTABLE;
28333 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
28334 scheduled for speculative execution. Reject the long-running division
28335 and square-root instructions. */
28337 static bool
28338 aarch64_sched_can_speculate_insn (rtx_insn *insn)
28340 switch (get_attr_type (insn))
28342 case TYPE_SDIV:
28343 case TYPE_UDIV:
28344 case TYPE_FDIVS:
28345 case TYPE_FDIVD:
28346 case TYPE_FSQRTS:
28347 case TYPE_FSQRTD:
28348 case TYPE_NEON_FP_SQRT_S:
28349 case TYPE_NEON_FP_SQRT_D:
28350 case TYPE_NEON_FP_SQRT_S_Q:
28351 case TYPE_NEON_FP_SQRT_D_Q:
28352 case TYPE_NEON_FP_DIV_S:
28353 case TYPE_NEON_FP_DIV_D:
28354 case TYPE_NEON_FP_DIV_S_Q:
28355 case TYPE_NEON_FP_DIV_D_Q:
28356 return false;
28357 default:
28358 return true;
28362 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
28364 static int
28365 aarch64_compute_pressure_classes (reg_class *classes)
28367 int i = 0;
28368 classes[i++] = GENERAL_REGS;
28369 classes[i++] = FP_REGS;
28370 /* PR_REGS isn't a useful pressure class because many predicate pseudo
28371 registers need to go in PR_LO_REGS at some point during their
28372 lifetime. Splitting it into two halves has the effect of making
28373 all predicates count against PR_LO_REGS, so that we try whenever
28374 possible to restrict the number of live predicates to 8. This
28375 greatly reduces the amount of spilling in certain loops. */
28376 classes[i++] = PR_LO_REGS;
28377 classes[i++] = PR_HI_REGS;
28378 return i;
28381 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
28383 static bool
28384 aarch64_can_change_mode_class (machine_mode from,
28385 machine_mode to, reg_class_t)
28387 return aarch64_modes_compatible_p (from, to);
28390 /* Implement TARGET_EARLY_REMAT_MODES. */
28392 static void
28393 aarch64_select_early_remat_modes (sbitmap modes)
28395 /* SVE values are not normally live across a call, so it should be
28396 worth doing early rematerialization even in VL-specific mode. */
28397 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
28398 if (aarch64_sve_mode_p ((machine_mode) i))
28399 bitmap_set_bit (modes, i);
28402 /* Override the default target speculation_safe_value. */
28403 static rtx
28404 aarch64_speculation_safe_value (machine_mode mode,
28405 rtx result, rtx val, rtx failval)
28407 /* Maybe we should warn if falling back to hard barriers. They are
28408      likely to be noticeably more expensive than the alternative below.  */
28409 if (!aarch64_track_speculation)
28410 return default_speculation_safe_value (mode, result, val, failval);
28412 if (!REG_P (val))
28413 val = copy_to_mode_reg (mode, val);
28415 if (!aarch64_reg_or_zero (failval, mode))
28416 failval = copy_to_mode_reg (mode, failval);
28418 emit_insn (gen_despeculate_copy (mode, result, val, failval));
28419 return result;
28422 /* Implement TARGET_ESTIMATED_POLY_VALUE.
28423 Look into the tuning structure for an estimate.
28424 KIND specifies the type of requested estimate: min, max or likely.
28425 For cores with a known SVE width all three estimates are the same.
28426 For generic SVE tuning we want to distinguish the maximum estimate from
28427 the minimum and likely ones.
28428 The likely estimate is the same as the minimum in that case to give a
28429 conservative behavior of auto-vectorizing with SVE when it is a win
28430 even for 128-bit SVE.
28431 When SVE width information is available VAL.coeffs[1] is multiplied by
28432 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
28434 static HOST_WIDE_INT
28435 aarch64_estimated_poly_value (poly_int64 val,
28436 poly_value_estimate_kind kind
28437 = POLY_VALUE_LIKELY)
28439 unsigned int width_source = aarch64_tune_params.sve_width;
28441 /* If there is no core-specific information then the minimum and likely
28442 values are based on 128-bit vectors and the maximum is based on
28443 the architectural maximum of 2048 bits. */
28444 if (width_source == SVE_SCALABLE)
28445 switch (kind)
28447 case POLY_VALUE_MIN:
28448 case POLY_VALUE_LIKELY:
28449 return val.coeffs[0];
28450 case POLY_VALUE_MAX:
28451 return val.coeffs[0] + val.coeffs[1] * 15;
28454 /* Allow sve_width to be a bitmask of different VL, treating the lowest
28455 as likely. This could be made more general if future -mtune options
28456 need it to be. */
28457 if (kind == POLY_VALUE_MAX)
28458 width_source = 1 << floor_log2 (width_source);
28459 else
28460 width_source = least_bit_hwi (width_source);
28462 /* If the core provides width information, use that. */
28463 HOST_WIDE_INT over_128 = width_source - 128;
28464 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
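/* Worked example: for a poly_int of 16 + 16x (e.g. the byte size of
   VNx16QImode), generic SVE tuning gives a minimum/likely estimate of 16 and
   a maximum of 16 + 16 * 15 == 256, while a core with sve_width == 256 gives
   16 + 16 * 128 / 128 == 32.  */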
28468 /* Return true for types that could be supported as SIMD return or
28469 argument types. */
28471 static bool
28472 supported_simd_type (tree t)
28474 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
28476 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
28477 return s == 1 || s == 2 || s == 4 || s == 8;
28479 return false;
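/* For example int, double and pointer types qualify, whereas __int128,
   long double (16 bytes on AArch64) and aggregate types do not.  */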
28482 /* Determine the lane size for the clone argument/return type. This follows
28483 the LS(P) rule in the VFABIA64. */
28485 static unsigned
28486 lane_size (cgraph_simd_clone_arg_type clone_arg_type, tree type)
28488 gcc_assert (clone_arg_type != SIMD_CLONE_ARG_TYPE_MASK);
28490   /* For non-map-to-vector types that are pointers, we use the type they
28491      point to.  */
28492 if (POINTER_TYPE_P (type))
28493 switch (clone_arg_type)
28495 default:
28496 break;
28497 case SIMD_CLONE_ARG_TYPE_UNIFORM:
28498 case SIMD_CLONE_ARG_TYPE_LINEAR_CONSTANT_STEP:
28499 case SIMD_CLONE_ARG_TYPE_LINEAR_VARIABLE_STEP:
28500 type = TREE_TYPE (type);
28501 break;
28504   /* For types that are integers or floating point (or, for non-map-to-vector
28505      pointer arguments, the types they point to), we use their size if it is 1, 2, 4 or 8 bytes.  */
28507 if (INTEGRAL_TYPE_P (type)
28508 || SCALAR_FLOAT_TYPE_P (type))
28509 switch (TYPE_PRECISION (type) / BITS_PER_UNIT)
28511 default:
28512 break;
28513 case 1:
28514 case 2:
28515 case 4:
28516 case 8:
28517 return TYPE_PRECISION (type);
28519 /* For any other we use the size of uintptr_t. For map-to-vector types that
28520 are pointers, using the size of uintptr_t is the same as using the size of
28521 their type, seeing all pointers are the same size as uintptr_t. */
28522 return POINTER_SIZE;
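/* For example, a uniform or linear 'int *' argument uses the lane size of
   'int' (32 bits), a plain 'char' argument uses 8 bits, and a map-to-vector
   pointer argument falls through to POINTER_SIZE (64 bits for LP64).  */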
28526 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
28528 static int
28529 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
28530 struct cgraph_simd_clone *clonei,
28531 tree base_type ATTRIBUTE_UNUSED,
28532 int num, bool explicit_p)
28534 tree t, ret_type;
28535 unsigned int nds_elt_bits;
28536 unsigned HOST_WIDE_INT const_simdlen;
28538 if (!TARGET_SIMD)
28539 return 0;
28541   /* For now, SVE simdclones won't produce illegal simdlen, so only check
28542 const simdlens here. */
28543 if (maybe_ne (clonei->simdlen, 0U)
28544 && clonei->simdlen.is_constant (&const_simdlen)
28545 && (const_simdlen < 2
28546 || const_simdlen > 1024
28547 || (const_simdlen & (const_simdlen - 1)) != 0))
28549 if (explicit_p)
28550 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28551 "unsupported simdlen %wd", const_simdlen);
28552 return 0;
28555 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
28556 /* According to AArch64's Vector ABI the type that determines the simdlen is
28557 the narrowest of types, so we ignore base_type for AArch64. */
28558 if (TREE_CODE (ret_type) != VOID_TYPE
28559 && !supported_simd_type (ret_type))
28561 if (!explicit_p)
28563 else if (COMPLEX_FLOAT_TYPE_P (ret_type))
28564 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28565 "GCC does not currently support return type %qT "
28566 "for simd", ret_type);
28567 else
28568 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28569 "unsupported return type %qT for simd",
28570 ret_type);
28571 return 0;
28574 auto_vec<std::pair <tree, unsigned int>> vec_elts (clonei->nargs + 1);
28576 /* We are looking for the NDS type here according to the VFABIA64. */
28577 if (TREE_CODE (ret_type) != VOID_TYPE)
28579 nds_elt_bits = lane_size (SIMD_CLONE_ARG_TYPE_VECTOR, ret_type);
28580 vec_elts.safe_push (std::make_pair (ret_type, nds_elt_bits));
28582 else
28583 nds_elt_bits = POINTER_SIZE;
28585 int i;
28586 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
28587 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
28588 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
28589 t && t != void_list_node; t = TREE_CHAIN (t), i++)
28591 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
28592 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
28593 && !supported_simd_type (arg_type))
28595 if (!explicit_p)
28597 else if (COMPLEX_FLOAT_TYPE_P (arg_type))
28598 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28599 "GCC does not currently support argument type %qT "
28600 "for simd", arg_type);
28601 else
28602 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28603 "unsupported argument type %qT for simd",
28604 arg_type);
28605 return 0;
28607 unsigned lane_bits = lane_size (clonei->args[i].arg_type, arg_type);
28608 if (clonei->args[i].arg_type == SIMD_CLONE_ARG_TYPE_VECTOR)
28609 vec_elts.safe_push (std::make_pair (arg_type, lane_bits));
28610 if (nds_elt_bits > lane_bits)
28611 nds_elt_bits = lane_bits;
28614 clonei->vecsize_mangle = 'n';
28615 clonei->mask_mode = VOIDmode;
28616 poly_uint64 simdlen;
28617 auto_vec<poly_uint64> simdlens (2);
28618 /* Keep track of the possible simdlens the clones of this function can have,
28619 and check them later to see if we support them. */
28620 if (known_eq (clonei->simdlen, 0U))
28622 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28623 simdlens.safe_push (simdlen);
28624 simdlens.safe_push (simdlen * 2);
28626 else
28627 simdlens.safe_push (clonei->simdlen);
28629 clonei->vecsize_int = 0;
28630 clonei->vecsize_float = 0;
28632 /* We currently do not support generating simdclones where vector arguments
28633 do not fit into a single vector register, i.e. vector types that are more
28634 than 128 bits in size. This is because of how we currently represent such
28635 types in the ACLE, where we use a struct to allow us to pass them as
28636 arguments and return values.
28637 This is why we have to check whether the simdlens available for this
28638 simdclone would cause a vector type to be larger than 128 bits, and reject
28639 such a clone. */
28640 unsigned j = 0;
28641 while (j < simdlens.length ())
28643 bool remove_simdlen = false;
28644 for (auto elt : vec_elts)
28645 if (known_gt (simdlens[j] * elt.second, 128U))
28647 /* Don't issue a warning for every simdclone when there is no
28648 specific simdlen clause. */
28649 if (explicit_p && maybe_ne (clonei->simdlen, 0U))
28650 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28651 "GCC does not currently support simdlen %wd for "
28652 "type %qT",
28653 constant_lower_bound (simdlens[j]), elt.first);
28654 remove_simdlen = true;
28655 break;
28657 if (remove_simdlen)
28658 simdlens.ordered_remove (j);
28659 else
28660 j++;
28664 int count = simdlens.length ();
28665 if (count == 0)
28667 if (explicit_p && known_eq (clonei->simdlen, 0U))
28669 /* Warn the user if we can't generate any simdclone. */
28670 simdlen = exact_div (poly_uint64 (64), nds_elt_bits);
28671 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
28672 "GCC does not currently support a simdclone with simdlens"
28673 " %wd and %wd for these types",
28674 constant_lower_bound (simdlen),
28675 constant_lower_bound (simdlen*2));
28677 return 0;
28680 gcc_assert (num < count);
28681 clonei->simdlen = simdlens[num];
28682 return count;
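/* As an illustration (hypothetical declaration, not from any test case): for
   a simd function declared as float f (float, float) with no simdlen clause,
   the NDS is 32 bits, so the candidate simdlens pushed above are 64 / 32 = 2
   and 4, i.e. one clone using 64-bit and one using 128-bit Advanced SIMD
   vectors.  Both satisfy the 128-bit limit, so this returns a count of 2 and
   the caller can then request each of those clones in turn via NUM.  */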
28685 /* Implement TARGET_SIMD_CLONE_ADJUST. */
28687 static void
28688 aarch64_simd_clone_adjust (struct cgraph_node *node)
28690 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
28691 use the correct ABI. */
28693 tree t = TREE_TYPE (node->decl);
28694 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
28695 TYPE_ATTRIBUTES (t));
28698 /* Implement TARGET_SIMD_CLONE_USABLE. */
28700 static int
28701 aarch64_simd_clone_usable (struct cgraph_node *node)
28703 switch (node->simdclone->vecsize_mangle)
28705 case 'n':
28706 if (!TARGET_SIMD)
28707 return -1;
28708 return 0;
28709 default:
28710 gcc_unreachable ();
28714 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
28716 static int
28717 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
28719 auto check_attr = [&](const char *ns, const char *name) {
28720 tree attr1 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type1));
28721 tree attr2 = lookup_attribute (ns, name, TYPE_ATTRIBUTES (type2));
28722 if (!attr1 && !attr2)
28723 return true;
28725 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
28728 if (!check_attr ("gnu", "aarch64_vector_pcs"))
28729 return 0;
28730 if (!check_attr ("gnu", "Advanced SIMD type"))
28731 return 0;
28732 if (!check_attr ("gnu", "SVE type"))
28733 return 0;
28734 if (!check_attr ("gnu", "SVE sizeless type"))
28735 return 0;
28736 if (!check_attr ("arm", "streaming"))
28737 return 0;
28738 if (!check_attr ("arm", "streaming_compatible"))
28739 return 0;
28740 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "za")
28741 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "za"))
28742 return 0;
28743 if (aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type1), "zt0")
28744 != aarch64_lookup_shared_state_flags (TYPE_ATTRIBUTES (type2), "zt0"))
28745 return 0;
28746 return 1;
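/* For example, two otherwise-identical function types where only one is
   marked with "aarch64_vector_pcs", or where only one is declared
   [[arm::streaming]], compare as incompatible here; types that agree on all
   of the attributes checked above compare as compatible.  */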
28749 /* Implement TARGET_MERGE_DECL_ATTRIBUTES. */
28751 static tree
28752 aarch64_merge_decl_attributes (tree olddecl, tree newdecl)
28754 tree old_attrs = DECL_ATTRIBUTES (olddecl);
28755 tree old_new = lookup_attribute ("arm", "new", old_attrs);
28757 tree new_attrs = DECL_ATTRIBUTES (newdecl);
28758 tree new_new = lookup_attribute ("arm", "new", new_attrs);
28760 if (DECL_INITIAL (olddecl) && new_new)
28762 error ("cannot apply attribute %qs to %q+D after the function"
28763 " has been defined", "new", newdecl);
28764 inform (DECL_SOURCE_LOCATION (olddecl), "%q+D defined here",
28765 newdecl);
28767 else
28769 if (old_new && new_new)
28771 old_attrs = remove_attribute ("arm", "new", old_attrs);
28772 TREE_VALUE (new_new) = chainon (TREE_VALUE (new_new),
28773 TREE_VALUE (old_new));
28775 if (new_new)
28776 aarch64_check_arm_new_against_type (TREE_VALUE (new_new), newdecl);
28779 return merge_attributes (old_attrs, new_attrs);
28782 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
28784 static const char *
28785 aarch64_get_multilib_abi_name (void)
28787 if (TARGET_BIG_END)
28788 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
28789 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
28792 /* Implement TARGET_STACK_PROTECT_GUARD. In the case of a
28793 global-variable-based guard, use the default; otherwise
28794 return a null tree. */
28795 static tree
28796 aarch64_stack_protect_guard (void)
28798 if (aarch64_stack_protector_guard == SSP_GLOBAL)
28799 return default_stack_protect_guard ();
28801 return NULL_TREE;
28804 /* Return the diagnostic message string if the binary operation OP is
28805 not permitted on TYPE1 and TYPE2, NULL otherwise. */
28807 static const char *
28808 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
28809 const_tree type2)
28811 if (VECTOR_TYPE_P (type1)
28812 && VECTOR_TYPE_P (type2)
28813 && !TYPE_INDIVISIBLE_P (type1)
28814 && !TYPE_INDIVISIBLE_P (type2)
28815 && (aarch64_sve::builtin_type_p (type1)
28816 != aarch64_sve::builtin_type_p (type2)))
28817 return N_("cannot combine GNU and SVE vectors in a binary operation");
28819 /* Operation allowed. */
28820 return NULL;
28823 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
28824 compiler that we automatically ignore the top byte of our pointers, which
28825 allows using -fsanitize=hwaddress. */
28826 bool
28827 aarch64_can_tag_addresses ()
28829 return !TARGET_ILP32;
28832 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
28833 section at the end if needed. */
28834 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
28835 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
28836 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
28837 void
28838 aarch64_file_end_indicate_exec_stack ()
28840 file_end_indicate_exec_stack ();
28842 unsigned feature_1_and = 0;
28843 if (aarch_bti_enabled ())
28844 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
28846 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
28847 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
28849 if (feature_1_and)
28851 /* Generate .note.gnu.property section. */
28852 switch_to_section (get_section (".note.gnu.property",
28853 SECTION_NOTYPE, NULL));
28855 /* PT_NOTE header: namesz, descsz, type.
28856 namesz = 4 ("GNU\0")
28857 descsz = 16 (Size of the program property array)
28858 [(12 + padding) * Number of array elements]
28859 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
28860 assemble_align (POINTER_SIZE);
28861 assemble_integer (GEN_INT (4), 4, 32, 1);
28862 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
28863 assemble_integer (GEN_INT (5), 4, 32, 1);
28865 /* PT_NOTE name. */
28866 assemble_string ("GNU", 4);
28868 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
28869 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
28870 datasz = 4
28871 data = feature_1_and. */
28872 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
28873 assemble_integer (GEN_INT (4), 4, 32, 1);
28874 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
28876 /* Pad the size of the note to the required alignment. */
28877 assemble_align (POINTER_SIZE);
28880 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
28881 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
28882 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
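/* For reference, when both BTI and return-address signing are enabled on an
   LP64 target, the note emitted above amounts to something like the
   following (a sketch only; the exact output comes from assemble_align,
   assemble_integer and assemble_string):

   .section .note.gnu.property
   .align 3
   .word 4 // namesz ("GNU\0")
   .word 16 // descsz = ROUND_UP (12, 8)
   .word 5 // NT_GNU_PROPERTY_TYPE_0
   .string "GNU"
   .word 0xc0000000 // GNU_PROPERTY_AARCH64_FEATURE_1_AND
   .word 4 // datasz
   .word 3 // BTI | PAC
   .align 3  */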
28884 /* Helper function for straight line speculation.
28885 Return what barrier should be emitted for straight line speculation
28886 mitigation.
28887 When not mitigating against straight line speculation this function returns
28888 an empty string.
28889 When mitigating against straight line speculation, use:
28890 * SB when the v8.5-A SB extension is enabled.
28891 * DSB+ISB otherwise. */
28892 const char *
28893 aarch64_sls_barrier (int mitigation_required)
28895 return mitigation_required
28896 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
28897 : "";
28900 static GTY (()) tree aarch64_sls_shared_thunks[30];
28901 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
28902 const char *indirect_symbol_names[30] = {
28903 "__call_indirect_x0",
28904 "__call_indirect_x1",
28905 "__call_indirect_x2",
28906 "__call_indirect_x3",
28907 "__call_indirect_x4",
28908 "__call_indirect_x5",
28909 "__call_indirect_x6",
28910 "__call_indirect_x7",
28911 "__call_indirect_x8",
28912 "__call_indirect_x9",
28913 "__call_indirect_x10",
28914 "__call_indirect_x11",
28915 "__call_indirect_x12",
28916 "__call_indirect_x13",
28917 "__call_indirect_x14",
28918 "__call_indirect_x15",
28919 "", /* "__call_indirect_x16", */
28920 "", /* "__call_indirect_x17", */
28921 "__call_indirect_x18",
28922 "__call_indirect_x19",
28923 "__call_indirect_x20",
28924 "__call_indirect_x21",
28925 "__call_indirect_x22",
28926 "__call_indirect_x23",
28927 "__call_indirect_x24",
28928 "__call_indirect_x25",
28929 "__call_indirect_x26",
28930 "__call_indirect_x27",
28931 "__call_indirect_x28",
28932 "__call_indirect_x29",
28935 /* Function to create a BLR thunk. This thunk is used to mitigate straight
28936 line speculation. Instead of a simple BLR that can be speculated past,
28937 we emit a BL to this thunk, and this thunk contains a BR to the relevant
28938 register. These thunks have the relevant speculation barriers put after
28939 their indirect branch so that speculation is blocked.
28941 We use such a thunk so the speculation barriers are kept off the
28942 architecturally executed path in order to reduce the performance overhead.
28944 When optimizing for size we use stubs shared by the linked object.
28945 When optimizing for performance we emit stubs for each function in the hope
28946 that the branch predictor can better train on jumps specific for a given
28947 function. */
28948 static rtx
28949 aarch64_sls_create_blr_label (int regnum)
28951 gcc_assert (STUB_REGNUM_P (regnum));
28952 if (optimize_function_for_size_p (cfun))
28954 /* For the thunks shared between different functions in this compilation
28955 unit we use a named symbol -- this is just for users to more easily
28956 understand the generated assembly. */
28957 aarch64_sls_shared_thunks_needed = true;
28958 const char *thunk_name = indirect_symbol_names[regnum];
28959 if (aarch64_sls_shared_thunks[regnum] == NULL)
28961 /* Build a decl representing this function stub and record it for
28962 later. We build a decl here so we can use the GCC machinery for
28963 handling sections automatically (through `get_named_section` and
28964 `make_decl_one_only`). That saves us a lot of trouble handling
28965 the specifics of different output file formats. */
28966 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
28967 get_identifier (thunk_name),
28968 build_function_type_list (void_type_node,
28969 NULL_TREE));
28970 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
28971 NULL_TREE, void_type_node);
28972 TREE_PUBLIC (decl) = 1;
28973 TREE_STATIC (decl) = 1;
28974 DECL_IGNORED_P (decl) = 1;
28975 DECL_ARTIFICIAL (decl) = 1;
28976 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
28977 resolve_unique_section (decl, 0, false);
28978 aarch64_sls_shared_thunks[regnum] = decl;
28981 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
28984 if (cfun->machine->call_via[regnum] == NULL)
28985 cfun->machine->call_via[regnum]
28986 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
28987 return cfun->machine->call_via[regnum];
28990 /* Helper function for aarch64_sls_emit_blr_function_thunks and
28991 aarch64_sls_emit_shared_blr_thunks below. */
28992 static void
28993 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
28995 /* Save in x16 and branch to that function so this transformation does
28996 not prevent jumping to `BTI c` instructions. */
28997 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
28998 asm_fprintf (out_file, "\tbr\tx16\n");
29001 /* Emit all BLR stubs for this particular function.
29002 Here we emit all the BLR stubs needed for the current function. Since we
29003 emit these stubs in a consecutive block we know there will be no speculation
29004 gadgets between each stub, and hence we only emit a speculation barrier at
29005 the end of the stub sequences.
29007 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
29008 void
29009 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
29011 if (! aarch64_harden_sls_blr_p ())
29012 return;
29014 bool any_functions_emitted = false;
29015 /* We must save and restore the current function section since this assembly
29016 is emitted at the end of the function. This means it can be emitted *just
29017 after* the cold section of a function. That cold part would be emitted in
29018 a different section. That switch would trigger a `.cfi_endproc` directive
29019 to be emitted in the original section and a `.cfi_startproc` directive to
29020 be emitted in the new section. Switching to the original section without
29021 restoring would mean that the `.cfi_endproc` emitted as a function ends
29022 would happen in a different section -- leaving an unmatched
29023 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
29024 in the standard text section. */
29025 section *save_text_section = in_section;
29026 switch_to_section (function_section (current_function_decl));
29027 for (int regnum = 0; regnum < 30; ++regnum)
29029 rtx specu_label = cfun->machine->call_via[regnum];
29030 if (specu_label == NULL)
29031 continue;
29033 targetm.asm_out.print_operand (out_file, specu_label, 0);
29034 asm_fprintf (out_file, ":\n");
29035 aarch64_sls_emit_function_stub (out_file, regnum);
29036 any_functions_emitted = true;
29038 if (any_functions_emitted)
29039 /* Can use the SB if needs be here, since this stub will only be used
29040 by the current function, and hence for the current target. */
29041 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
29042 switch_to_section (save_text_section);
29045 /* Emit shared BLR stubs for the current compilation unit.
29046 Over the course of compiling this unit we may have converted some BLR
29047 instructions to a BL to a shared stub function. This is where we emit those
29048 stub functions.
29049 This function is for the stubs shared between different functions in this
29050 compilation unit. We share when optimizing for size instead of speed.
29052 This function is called through the TARGET_ASM_FILE_END hook. */
29053 void
29054 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
29056 if (! aarch64_sls_shared_thunks_needed)
29057 return;
29059 for (int regnum = 0; regnum < 30; ++regnum)
29061 tree decl = aarch64_sls_shared_thunks[regnum];
29062 if (!decl)
29063 continue;
29065 const char *name = indirect_symbol_names[regnum];
29066 switch_to_section (get_named_section (decl, NULL, 0));
29067 ASM_OUTPUT_ALIGN (out_file, 2);
29068 targetm.asm_out.globalize_label (out_file, name);
29069 /* Only emits if the compiler is configured for an assembler that can
29070 handle visibility directives. */
29071 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
29072 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
29073 ASM_OUTPUT_LABEL (out_file, name);
29074 aarch64_sls_emit_function_stub (out_file, regnum);
29075 /* Use the most conservative target to ensure it can always be used by any
29076 function in the translation unit. */
29077 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
29078 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
29082 /* Implement TARGET_ASM_FILE_END. */
29083 void
29084 aarch64_asm_file_end ()
29086 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
29087 /* Since this function will be called for the ASM_FILE_END hook, we ensure
29088 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
29089 for FreeBSD) still gets called. */
29090 #ifdef TARGET_ASM_FILE_END
29091 TARGET_ASM_FILE_END ();
29092 #endif
29095 const char *
29096 aarch64_indirect_call_asm (rtx addr)
29098 gcc_assert (REG_P (addr));
29099 if (aarch64_harden_sls_blr_p ())
29101 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
29102 output_asm_insn ("bl\t%0", &stub_label);
29104 else
29105 output_asm_insn ("blr\t%0", &addr);
29106 return "";
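/* For example, with SLS hardening of BLR enabled, an indirect call through
   x3 is not emitted as "blr x3"; instead we emit "bl __call_indirect_x3"
   (the shared stub, when optimizing for size) or a BL to a per-function
   label, and the stub itself performs "mov x16, x3; br x16" followed by the
   speculation barrier emitted with the stub above.  */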
29109 /* Emit the assembly instruction to load the thread pointer into DEST.
29110 Select between different tpidr_elN registers depending on -mtp= setting. */
29112 const char *
29113 aarch64_output_load_tp (rtx dest)
29115 const char *tpidrs[] = {"tpidr_el0", "tpidr_el1", "tpidr_el2",
29116 "tpidr_el3", "tpidrro_el0"};
29117 char buffer[64];
29118 snprintf (buffer, sizeof (buffer), "mrs\t%%0, %s",
29119 tpidrs[aarch64_tpidr_register]);
29120 output_asm_insn (buffer, &dest);
29121 return "";
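/* For instance, when aarch64_tpidr_register selects index 0 in the table
   above, the instruction printed for DEST is "mrs <dest>, tpidr_el0".  */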
29124 /* Set up the value of REG_ALLOC_ORDER from scratch.
29126 It was previously good practice to put call-clobbered registers ahead
29127 of call-preserved registers, but that isn't necessary these days.
29128 IRA's model of register save/restore costs is much more sophisticated
29129 than the model that a simple ordering could provide. We leave
29130 HONOR_REG_ALLOC_ORDER undefined so that we can get the full benefit
29131 of IRA's model.
29133 However, it is still useful to list registers that are members of
29134 multiple classes after registers that are members of fewer classes.
29135 For example, we have:
29137 - FP_LO8_REGS: v0-v7
29138 - FP_LO_REGS: v0-v15
29139 - FP_REGS: v0-v31
29141 If, as a tie-breaker, we allocate FP_REGS in the order v0-v31,
29142 we run the risk of starving other (lower-priority) pseudos that
29143 require FP_LO8_REGS or FP_LO_REGS. Allocating FP_LO_REGS in the
29144 order v0-v15 could similarly starve pseudos that require FP_LO8_REGS.
29145 Allocating downwards rather than upwards avoids this problem, at least
29146 in code that has reasonable register pressure.
29148 The situation for predicate registers is similar. */
29150 void
29151 aarch64_adjust_reg_alloc_order ()
29153 for (int i = 0; i < FIRST_PSEUDO_REGISTER; ++i)
29154 if (IN_RANGE (i, V0_REGNUM, V31_REGNUM))
29155 reg_alloc_order[i] = V31_REGNUM - (i - V0_REGNUM);
29156 else if (IN_RANGE (i, P0_REGNUM, P15_REGNUM))
29157 reg_alloc_order[i] = P15_REGNUM - (i - P0_REGNUM);
29158 else
29159 reg_alloc_order[i] = i;
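/* The resulting order therefore lists v31, v30, ..., v0 for the FP/SIMD
   registers and p15, p14, ..., p0 for the predicate registers, while every
   other register keeps its natural position.  */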
29162 /* Return true if the PARALLEL PAR can be used in a VEC_SELECT expression
29163 of vector mode MODE to select half the elements of that vector.
29164 Allow any combination of indices except duplicates (or indices out of
29165 range of the mode's units). */
29167 bool
29168 aarch64_parallel_select_half_p (machine_mode mode, rtx par)
29170 int nunits = XVECLEN (par, 0);
29171 if (!known_eq (GET_MODE_NUNITS (mode), nunits * 2))
29172 return false;
29173 int mode_nunits = nunits * 2;
29174 /* Put all the elements of PAR into a hash_set and use its
29175 uniqueness guarantees to check that we don't try to insert the same
29176 element twice. */
29177 hash_set<rtx> parset;
29178 for (int i = 0; i < nunits; ++i)
29180 rtx elt = XVECEXP (par, 0, i);
29181 if (!CONST_INT_P (elt)
29182 || !IN_RANGE (INTVAL (elt), 0, mode_nunits - 1)
29183 || parset.add (elt))
29184 return false;
29186 return true;
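/* For example, with MODE == V4SImode (four units) a PARALLEL of CONST_INTs
   [0, 2] or [3, 1] is accepted, whereas [0, 0] (a duplicate) and [0, 4]
   (out of range) are rejected.  This is purely illustrative of the checks
   above.  */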
29189 /* Return true if PAR1 and PAR2, two PARALLEL rtxes of CONST_INT values,
29190 contain any common elements. */
29192 bool
29193 aarch64_pars_overlap_p (rtx par1, rtx par2)
29195 int len1 = XVECLEN (par1, 0);
29196 int len2 = XVECLEN (par2, 0);
29197 hash_set<rtx> parset;
29198 for (int i = 0; i < len1; ++i)
29199 parset.add (XVECEXP (par1, 0, i));
29200 for (int i = 0; i < len2; ++i)
29201 if (parset.contains (XVECEXP (par2, 0, i)))
29202 return true;
29203 return false;
29206 /* Implement OPTIMIZE_MODE_SWITCHING. */
29208 bool
29209 aarch64_optimize_mode_switching (aarch64_mode_entity entity)
29211 bool have_sme_state = (aarch64_cfun_incoming_pstate_za () != 0
29212 || (aarch64_cfun_has_new_state ("za")
29213 && df_regs_ever_live_p (ZA_REGNUM))
29214 || (aarch64_cfun_has_new_state ("zt0")
29215 && df_regs_ever_live_p (ZT0_REGNUM)));
29217 if (have_sme_state && nonlocal_goto_handler_labels)
29219 static bool reported;
29220 if (!reported)
29222 sorry ("non-local gotos in functions with SME state");
29223 reported = true;
29227 switch (entity)
29229 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29230 case aarch64_mode_entity::LOCAL_SME_STATE:
29231 return have_sme_state && !nonlocal_goto_handler_labels;
29233 gcc_unreachable ();
29236 /* Implement TARGET_MODE_EMIT for ZA_SAVE_BUFFER. */
29238 static void
29239 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode mode,
29240 aarch64_tristate_mode prev_mode)
29242 if (mode == aarch64_tristate_mode::YES)
29244 gcc_assert (prev_mode == aarch64_tristate_mode::NO);
29245 aarch64_init_tpidr2_block ();
29247 else
29248 gcc_unreachable ();
29251 /* Implement TARGET_MODE_EMIT for LOCAL_SME_STATE. */
29253 static void
29254 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state mode,
29255 aarch64_local_sme_state prev_mode)
29257 /* Back-propagation should ensure that we're always starting from
29258 a known mode. */
29259 gcc_assert (prev_mode != aarch64_local_sme_state::ANY);
29261 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29263 /* Commit any uncommitted lazy save. This leaves ZA either active
29264 and zero (lazy save case) or off (normal case).
29266 The sequence is:
29268 mrs <temp>, tpidr2_el0
29269 cbz <temp>, no_save
29270 bl __arm_tpidr2_save
29271 msr tpidr2_el0, xzr
29272 zero { za } // Only if ZA is live
29273 no_save: */
29274 bool is_active = (mode == aarch64_local_sme_state::ACTIVE_LIVE
29275 || mode == aarch64_local_sme_state::ACTIVE_DEAD);
29276 auto tmp_reg = gen_reg_rtx (DImode);
29277 auto active_flag = gen_int_mode (is_active, DImode);
29278 emit_insn (gen_aarch64_read_tpidr2 (tmp_reg));
29279 emit_insn (gen_aarch64_commit_lazy_save (tmp_reg, active_flag));
29282 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29283 || mode == aarch64_local_sme_state::ACTIVE_DEAD)
29285 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29287 /* Make ZA active after being inactive.
29289 First handle the case in which the lazy save we set up was
29290 committed by a callee. If the function's source-level ZA state
29291 is live then we must conditionally restore it from the lazy
29292 save buffer. Otherwise we can just force PSTATE.ZA to 1. */
29293 if (mode == aarch64_local_sme_state::ACTIVE_LIVE)
29294 emit_insn (gen_aarch64_restore_za (aarch64_get_tpidr2_ptr ()));
29295 else
29296 emit_insn (gen_aarch64_smstart_za ());
29298 /* Now handle the case in which the lazy save was not committed.
29299 In that case, ZA still contains the current function's ZA state,
29300 and we just need to cancel the lazy save. */
29301 emit_insn (gen_aarch64_clear_tpidr2 ());
29303 /* Restore the ZT0 state, if we have some. */
29304 if (aarch64_cfun_has_state ("zt0"))
29305 aarch64_restore_zt0 (true);
29307 return;
29310 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL)
29312 /* Retrieve the current function's ZA state from the lazy save
29313 buffer. */
29314 aarch64_restore_za (aarch64_get_tpidr2_ptr ());
29316 /* Restore the ZT0 state, if we have some. */
29317 if (aarch64_cfun_has_state ("zt0"))
29318 aarch64_restore_zt0 (true);
29319 return;
29322 if (prev_mode == aarch64_local_sme_state::INACTIVE_CALLER
29323 || prev_mode == aarch64_local_sme_state::OFF)
29325 /* INACTIVE_CALLER means that we are enabling ZA for the first
29326 time in this function. The code above means that ZA is either
29327 active and zero (if we committed a lazy save) or off. Handle
29328 the latter case by forcing ZA on.
29330 OFF means that PSTATE.ZA is guaranteed to be 0. We just need
29331 to force it to 1.
29333 Both cases leave ZA zeroed. */
29334 emit_insn (gen_aarch64_smstart_za ());
29336 /* Restore the ZT0 state, if we have some. */
29337 if (prev_mode == aarch64_local_sme_state::OFF
29338 && aarch64_cfun_has_state ("zt0"))
29339 aarch64_restore_zt0 (true);
29340 return;
29343 if (prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29344 || prev_mode == aarch64_local_sme_state::ACTIVE_LIVE)
29345 /* A simple change in liveness, such as in a CFG structure where
29346 ZA is only conditionally defined. No code is needed. */
29347 return;
29349 gcc_unreachable ();
29352 if (mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29354 if (prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29355 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD
29356 || prev_mode == aarch64_local_sme_state::INACTIVE_CALLER)
29358 /* Save the ZT0 state, if we have some. */
29359 if (aarch64_cfun_has_state ("zt0"))
29360 aarch64_save_zt0 ();
29362 /* A transition from ACTIVE_LIVE to INACTIVE_LOCAL is the usual
29363 case of setting up a lazy save buffer before a call.
29364 A transition from INACTIVE_CALLER is similar, except that
29365 the contents of ZA are known to be zero.
29367 A transition from ACTIVE_DEAD means that ZA is live at the
29368 point of the transition, but is dead on at least one incoming
29369 edge. (That is, ZA is only conditionally initialized.)
29370 For efficiency, we want to set up a lazy save even for
29371 dead contents, since forcing ZA off would make later code
29372 restore ZA from the lazy save buffer. */
29373 emit_insn (gen_aarch64_write_tpidr2 (aarch64_get_tpidr2_ptr ()));
29374 return;
29377 if (prev_mode == aarch64_local_sme_state::SAVED_LOCAL
29378 || prev_mode == aarch64_local_sme_state::OFF)
29379 /* We're simply discarding the information about which inactive
29380 state applies. */
29381 return;
29383 gcc_unreachable ();
29386 if (mode == aarch64_local_sme_state::INACTIVE_CALLER
29387 || mode == aarch64_local_sme_state::OFF)
29389 /* Save the ZT0 state, if we have some. */
29390 if ((prev_mode == aarch64_local_sme_state::ACTIVE_LIVE
29391 || prev_mode == aarch64_local_sme_state::ACTIVE_DEAD)
29392 && mode == aarch64_local_sme_state::OFF
29393 && aarch64_cfun_has_state ("zt0"))
29394 aarch64_save_zt0 ();
29396 /* The transition to INACTIVE_CALLER is used before returning from
29397 new("za") functions. Any state in ZA belongs to the current
29398 function rather than a caller, but that state is no longer
29399 needed. Clear any pending lazy save and turn ZA off.
29401 The transition to OFF is used before calling a private-ZA function.
29402 We committed any incoming lazy save above, so at this point any
29403 contents in ZA belong to the current function. */
29404 if (prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL)
29405 emit_insn (gen_aarch64_clear_tpidr2 ());
29407 if (prev_mode != aarch64_local_sme_state::OFF
29408 && prev_mode != aarch64_local_sme_state::SAVED_LOCAL)
29409 emit_insn (gen_aarch64_smstop_za ());
29411 return;
29414 if (mode == aarch64_local_sme_state::SAVED_LOCAL)
29416 /* This is a transition to an exception handler. */
29417 gcc_assert (prev_mode == aarch64_local_sme_state::OFF
29418 || prev_mode == aarch64_local_sme_state::INACTIVE_LOCAL);
29419 return;
29422 gcc_unreachable ();
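/* To sketch the most common case handled above (not an exhaustive list of
   transitions): around a call to a private-ZA function from a function whose
   ZA state is live, mode switching asks for ACTIVE_LIVE -> INACTIVE_LOCAL
   before the call (set up a lazy save via TPIDR2_EL0) and
   INACTIVE_LOCAL -> ACTIVE_LIVE afterwards (restore from the lazy save
   buffer if the callee committed the save, or simply clear TPIDR2_EL0 if it
   did not).  */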
29425 /* Implement TARGET_MODE_EMIT. */
29427 static void
29428 aarch64_mode_emit (int entity, int mode, int prev_mode, HARD_REG_SET live)
29430 if (mode == prev_mode)
29431 return;
29433 start_sequence ();
29434 switch (aarch64_mode_entity (entity))
29436 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29437 aarch64_mode_emit_za_save_buffer (aarch64_tristate_mode (mode),
29438 aarch64_tristate_mode (prev_mode));
29439 break;
29441 case aarch64_mode_entity::LOCAL_SME_STATE:
29442 aarch64_mode_emit_local_sme_state (aarch64_local_sme_state (mode),
29443 aarch64_local_sme_state (prev_mode));
29444 break;
29446 rtx_insn *seq = get_insns ();
29447 end_sequence ();
29449 /* Get the set of clobbered registers that are currently live. */
29450 HARD_REG_SET clobbers = {};
29451 for (rtx_insn *insn = seq; insn; insn = NEXT_INSN (insn))
29453 vec_rtx_properties properties;
29454 properties.add_insn (insn, false);
29455 for (rtx_obj_reference ref : properties.refs ())
29456 if (ref.is_write () && HARD_REGISTER_NUM_P (ref.regno))
29457 SET_HARD_REG_BIT (clobbers, ref.regno);
29459 clobbers &= live;
29461 /* Emit instructions to save clobbered registers to pseudos. Queue
29462 instructions to restore the registers afterwards.
29464 This should only be needed in rare situations. */
29465 auto_vec<rtx, 33> after;
29466 for (unsigned int regno = R0_REGNUM; regno < R30_REGNUM; ++regno)
29467 if (TEST_HARD_REG_BIT (clobbers, regno))
29469 rtx hard_reg = gen_rtx_REG (DImode, regno);
29470 rtx pseudo_reg = gen_reg_rtx (DImode);
29471 emit_move_insn (pseudo_reg, hard_reg);
29472 after.quick_push (gen_move_insn (hard_reg, pseudo_reg));
29474 if (TEST_HARD_REG_BIT (clobbers, CC_REGNUM))
29476 rtx pseudo_reg = gen_reg_rtx (DImode);
29477 emit_insn (gen_aarch64_save_nzcv (pseudo_reg));
29478 after.quick_push (gen_aarch64_restore_nzcv (pseudo_reg));
29481 /* Emit the transition instructions themselves. */
29482 emit_insn (seq);
29484 /* Restore the clobbered registers. */
29485 for (auto *insn : after)
29486 emit_insn (insn);
29489 /* Return true if INSN references the SME state represented by hard register
29490 REGNO. */
29492 static bool
29493 aarch64_insn_references_sme_state_p (rtx_insn *insn, unsigned int regno)
29495 df_ref ref;
29496 FOR_EACH_INSN_DEF (ref, insn)
29497 if (!DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
29498 && DF_REF_REGNO (ref) == regno)
29499 return true;
29500 FOR_EACH_INSN_USE (ref, insn)
29501 if (DF_REF_REGNO (ref) == regno)
29502 return true;
29503 return false;
29506 /* Implement TARGET_MODE_NEEDED for LOCAL_SME_STATE. */
29508 static aarch64_local_sme_state
29509 aarch64_mode_needed_local_sme_state (rtx_insn *insn, HARD_REG_SET live)
29511 if (!CALL_P (insn)
29512 && find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29514 static bool reported;
29515 if (!reported)
29517 sorry ("catching non-call exceptions in functions with SME state");
29518 reported = true;
29520 /* Aim for graceful error recovery by picking the value that is
29521 least likely to generate an ICE. */
29522 return aarch64_local_sme_state::INACTIVE_LOCAL;
29525 /* A non-local goto is equivalent to a return. We disallow non-local
29526 receivers in functions with SME state, so we know that the target
29527 expects ZA to be dormant or off. */
29528 if (JUMP_P (insn)
29529 && find_reg_note (insn, REG_NON_LOCAL_GOTO, NULL_RTX))
29530 return aarch64_local_sme_state::INACTIVE_CALLER;
29532 /* start_private_za_call and end_private_za_call bracket a sequence
29533 that calls a private-ZA function. Force ZA to be turned off if the
29534 function doesn't have any live ZA state, otherwise require ZA to be
29535 inactive. */
29536 auto icode = recog_memoized (insn);
29537 if (icode == CODE_FOR_aarch64_start_private_za_call
29538 || icode == CODE_FOR_aarch64_end_private_za_call)
29539 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29540 ? aarch64_local_sme_state::INACTIVE_LOCAL
29541 : aarch64_local_sme_state::OFF);
29543 /* Force ZA to contain the current function's ZA state if INSN wants
29544 to access it. Do the same for accesses to ZT0, since ZA and ZT0
29545 are both controlled by PSTATE.ZA. */
29546 if (aarch64_insn_references_sme_state_p (insn, ZA_REGNUM)
29547 || aarch64_insn_references_sme_state_p (insn, ZT0_REGNUM))
29548 return (TEST_HARD_REG_BIT (live, ZA_REGNUM)
29549 ? aarch64_local_sme_state::ACTIVE_LIVE
29550 : aarch64_local_sme_state::ACTIVE_DEAD);
29552 return aarch64_local_sme_state::ANY;
29555 /* Implement TARGET_MODE_NEEDED for ZA_SAVE_BUFFER. */
29557 static aarch64_tristate_mode
29558 aarch64_mode_needed_za_save_buffer (rtx_insn *insn, HARD_REG_SET live)
29560 /* We need to set up a lazy save buffer no later than the first
29561 transition to INACTIVE_LOCAL (which involves setting up a lazy save). */
29562 if (aarch64_mode_needed_local_sme_state (insn, live)
29563 == aarch64_local_sme_state::INACTIVE_LOCAL)
29564 return aarch64_tristate_mode::YES;
29566 /* Also make sure that the lazy save buffer is set up before the first
29567 insn that throws internally. The exception handler will sometimes
29568 load from it. */
29569 if (find_reg_note (insn, REG_EH_REGION, NULL_RTX))
29570 return aarch64_tristate_mode::YES;
29572 return aarch64_tristate_mode::MAYBE;
29575 /* Implement TARGET_MODE_NEEDED. */
29577 static int
29578 aarch64_mode_needed (int entity, rtx_insn *insn, HARD_REG_SET live)
29580 switch (aarch64_mode_entity (entity))
29582 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29583 return int (aarch64_mode_needed_za_save_buffer (insn, live));
29585 case aarch64_mode_entity::LOCAL_SME_STATE:
29586 return int (aarch64_mode_needed_local_sme_state (insn, live));
29588 gcc_unreachable ();
29591 /* Implement TARGET_MODE_AFTER for LOCAL_SME_STATE. */
29593 static aarch64_local_sme_state
29594 aarch64_mode_after_local_sme_state (aarch64_local_sme_state mode,
29595 HARD_REG_SET live)
29597 /* Note places where ZA dies, so that we can try to avoid saving and
29598 restoring state that isn't needed. */
29599 if (mode == aarch64_local_sme_state::ACTIVE_LIVE
29600 && !TEST_HARD_REG_BIT (live, ZA_REGNUM))
29601 return aarch64_local_sme_state::ACTIVE_DEAD;
29603 /* Note where ZA is born, e.g. when moving past an __arm_out("za")
29604 function. */
29605 if (mode == aarch64_local_sme_state::ACTIVE_DEAD
29606 && TEST_HARD_REG_BIT (live, ZA_REGNUM))
29607 return aarch64_local_sme_state::ACTIVE_LIVE;
29609 return mode;
29612 /* Implement TARGET_MODE_AFTER. */
29614 static int
29615 aarch64_mode_after (int entity, int mode, rtx_insn *, HARD_REG_SET live)
29617 switch (aarch64_mode_entity (entity))
29619 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29620 return mode;
29622 case aarch64_mode_entity::LOCAL_SME_STATE:
29623 return int (aarch64_mode_after_local_sme_state
29624 (aarch64_local_sme_state (mode), live));
29626 gcc_unreachable ();
29629 /* Implement TARGET_MODE_CONFLUENCE for LOCAL_SME_STATE. */
29631 static aarch64_local_sme_state
29632 aarch64_local_sme_confluence (aarch64_local_sme_state mode1,
29633 aarch64_local_sme_state mode2)
29635 /* Perform a symmetrical check for two values. */
29636 auto is_pair = [&](aarch64_local_sme_state val1,
29637 aarch64_local_sme_state val2)
29639 return ((mode1 == val1 && mode2 == val2)
29640 || (mode1 == val2 && mode2 == val1));
29643 /* INACTIVE_CALLER means ZA is off or it has dormant contents belonging
29644 to a caller. OFF is one of the options. */
29645 if (is_pair (aarch64_local_sme_state::INACTIVE_CALLER,
29646 aarch64_local_sme_state::OFF))
29647 return aarch64_local_sme_state::INACTIVE_CALLER;
29649 /* Similarly for dormant contents belonging to the current function. */
29650 if (is_pair (aarch64_local_sme_state::INACTIVE_LOCAL,
29651 aarch64_local_sme_state::OFF))
29652 return aarch64_local_sme_state::INACTIVE_LOCAL;
29654 /* Treat a conditionally-initialized value as a fully-initialized value. */
29655 if (is_pair (aarch64_local_sme_state::ACTIVE_LIVE,
29656 aarch64_local_sme_state::ACTIVE_DEAD))
29657 return aarch64_local_sme_state::ACTIVE_LIVE;
29659 return aarch64_local_sme_state::ANY;
29662 /* Implement TARGET_MODE_CONFLUENCE. */
29664 static int
29665 aarch64_mode_confluence (int entity, int mode1, int mode2)
29667 gcc_assert (mode1 != mode2);
29668 switch (aarch64_mode_entity (entity))
29670 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29671 return int (aarch64_tristate_mode::MAYBE);
29673 case aarch64_mode_entity::LOCAL_SME_STATE:
29674 return int (aarch64_local_sme_confluence
29675 (aarch64_local_sme_state (mode1),
29676 aarch64_local_sme_state (mode2)));
29678 gcc_unreachable ();
29681 /* Implement TARGET_MODE_BACKPROP for an entity that either stays
29682 NO throughout, or makes one transition from NO to YES. */
29684 static aarch64_tristate_mode
29685 aarch64_one_shot_backprop (aarch64_tristate_mode mode1,
29686 aarch64_tristate_mode mode2)
29688 /* Keep bringing the transition forward until it starts from NO. */
29689 if (mode1 == aarch64_tristate_mode::MAYBE
29690 && mode2 == aarch64_tristate_mode::YES)
29691 return mode2;
29693 return aarch64_tristate_mode::MAYBE;
29696 /* Implement TARGET_MODE_BACKPROP for LOCAL_SME_STATE. */
29698 static aarch64_local_sme_state
29699 aarch64_local_sme_backprop (aarch64_local_sme_state mode1,
29700 aarch64_local_sme_state mode2)
29702 /* We always need to know what the current state is when transitioning
29703 to a new state. Force any location with indeterminate starting state
29704 to be active. */
29705 if (mode1 == aarch64_local_sme_state::ANY)
29706 switch (mode2)
29708 case aarch64_local_sme_state::INACTIVE_CALLER:
29709 case aarch64_local_sme_state::OFF:
29710 case aarch64_local_sme_state::ACTIVE_DEAD:
29711 /* The current function's ZA state is not live. */
29712 return aarch64_local_sme_state::ACTIVE_DEAD;
29714 case aarch64_local_sme_state::INACTIVE_LOCAL:
29715 case aarch64_local_sme_state::ACTIVE_LIVE:
29716 /* The current function's ZA state is live. */
29717 return aarch64_local_sme_state::ACTIVE_LIVE;
29719 case aarch64_local_sme_state::SAVED_LOCAL:
29720 /* This is a transition to an exception handler. Since we don't
29721 support non-call exceptions for SME functions, the source of
29722 the transition must be known. We'll assert later if that's
29723 not the case. */
29724 return aarch64_local_sme_state::ANY;
29726 case aarch64_local_sme_state::ANY:
29727 return aarch64_local_sme_state::ANY;
29730 return aarch64_local_sme_state::ANY;
29733 /* Implement TARGET_MODE_BACKPROP. */
29735 static int
29736 aarch64_mode_backprop (int entity, int mode1, int mode2)
29738 switch (aarch64_mode_entity (entity))
29740 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29741 return int (aarch64_one_shot_backprop (aarch64_tristate_mode (mode1),
29742 aarch64_tristate_mode (mode2)));
29744 case aarch64_mode_entity::LOCAL_SME_STATE:
29745 return int (aarch64_local_sme_backprop
29746 (aarch64_local_sme_state (mode1),
29747 aarch64_local_sme_state (mode2)));
29749 gcc_unreachable ();
29752 /* Implement TARGET_MODE_ENTRY. */
29754 static int
29755 aarch64_mode_entry (int entity)
29757 switch (aarch64_mode_entity (entity))
29759 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29760 return int (aarch64_tristate_mode::NO);
29762 case aarch64_mode_entity::LOCAL_SME_STATE:
29763 return int (aarch64_cfun_shared_flags ("za") != 0
29764 ? aarch64_local_sme_state::ACTIVE_LIVE
29765 : aarch64_cfun_incoming_pstate_za () != 0
29766 ? aarch64_local_sme_state::ACTIVE_DEAD
29767 : aarch64_local_sme_state::INACTIVE_CALLER);
29769 gcc_unreachable ();
29772 /* Implement TARGET_MODE_EXIT. */
29774 static int
29775 aarch64_mode_exit (int entity)
29777 switch (aarch64_mode_entity (entity))
29779 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29780 return int (aarch64_tristate_mode::MAYBE);
29782 case aarch64_mode_entity::LOCAL_SME_STATE:
29783 return int (aarch64_cfun_shared_flags ("za") != 0
29784 ? aarch64_local_sme_state::ACTIVE_LIVE
29785 : aarch64_cfun_incoming_pstate_za () != 0
29786 ? aarch64_local_sme_state::ACTIVE_DEAD
29787 : aarch64_local_sme_state::INACTIVE_CALLER);
29789 gcc_unreachable ();
29792 /* Implement TARGET_MODE_EH_HANDLER. */
29794 static int
29795 aarch64_mode_eh_handler (int entity)
29797 switch (aarch64_mode_entity (entity))
29799 case aarch64_mode_entity::HAVE_ZA_SAVE_BUFFER:
29800 /* Require a lazy save buffer to be allocated before the first
29801 insn that can throw. */
29802 return int (aarch64_tristate_mode::YES);
29804 case aarch64_mode_entity::LOCAL_SME_STATE:
29805 return int (aarch64_local_sme_state::SAVED_LOCAL);
29807 gcc_unreachable ();
29810 /* Implement TARGET_MODE_PRIORITY. */
29812 static int
29813 aarch64_mode_priority (int, int n)
29815 return n;
29818 /* Implement TARGET_MD_ASM_ADJUST. */
29820 static rtx_insn *
29821 aarch64_md_asm_adjust (vec<rtx> &outputs, vec<rtx> &inputs,
29822 vec<machine_mode> &input_modes,
29823 vec<const char *> &constraints,
29824 vec<rtx> &uses, vec<rtx> &clobbers,
29825 HARD_REG_SET &clobbered_regs, location_t loc)
29827 rtx_insn *seq = arm_md_asm_adjust (outputs, inputs, input_modes, constraints,
29828 uses, clobbers, clobbered_regs, loc);
29830 /* "za" in the clobber list of a function with ZA state is defined to
29831 mean that the asm can read from and write to ZA. We can model the
29832 read using a USE, but unfortunately, it's not possible to model the
29833 write directly. Use a separate insn to model the effect.
29835 We must ensure that ZA is active on entry, which is enforced by using
29836 SME_STATE_REGNUM. The asm must ensure that ZA is active on return.
29838 The same thing applies to ZT0. */
29839 if (TARGET_ZA)
29840 for (unsigned int i = clobbers.length (); i-- > 0; )
29842 rtx x = clobbers[i];
29843 if (REG_P (x)
29844 && (REGNO (x) == ZA_REGNUM || REGNO (x) == ZT0_REGNUM))
29846 auto id = cfun->machine->next_asm_update_za_id++;
29848 start_sequence ();
29849 if (seq)
29850 emit_insn (seq);
29851 rtx id_rtx = gen_int_mode (id, SImode);
29852 emit_insn (REGNO (x) == ZA_REGNUM
29853 ? gen_aarch64_asm_update_za (id_rtx)
29854 : gen_aarch64_asm_update_zt0 (id_rtx));
29855 seq = get_insns ();
29856 end_sequence ();
29858 auto mode = REGNO (x) == ZA_REGNUM ? VNx16QImode : V8DImode;
29859 uses.safe_push (gen_rtx_REG (mode, REGNO (x)));
29860 uses.safe_push (gen_rtx_REG (DImode, SME_STATE_REGNUM));
29862 clobbers.ordered_remove (i);
29863 CLEAR_HARD_REG_BIT (clobbered_regs, REGNO (x));
29866 return seq;
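/* As an example of the transformation above (illustrative only): in a
   function with ZA state, an asm such as asm volatile ("..." ::: "za") no
   longer clobbers ZA; instead the asm gains uses of ZA and SME_STATE_REGNUM
   and is followed by an aarch64_asm_update_za insn that models the possible
   write to ZA.  */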
29869 /* BB is the target of an exception or nonlocal goto edge, which means
29870 that PSTATE.SM is known to be 0 on entry. Put it into the state that
29871 the current function requires. */
29873 static bool
29874 aarch64_switch_pstate_sm_for_landing_pad (basic_block bb)
29876 if (TARGET_NON_STREAMING)
29877 return false;
29879 start_sequence ();
29880 rtx_insn *guard_label = nullptr;
29881 if (TARGET_STREAMING_COMPATIBLE)
29882 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
29883 AARCH64_FL_SM_OFF);
29884 aarch64_sme_mode_switch_regs args_switch;
29885 args_switch.add_call_preserved_regs (df_get_live_in (bb));
29886 args_switch.emit_prologue ();
29887 aarch64_switch_pstate_sm (AARCH64_FL_SM_OFF, AARCH64_FL_SM_ON);
29888 args_switch.emit_epilogue ();
29889 if (guard_label)
29890 emit_label (guard_label);
29891 auto seq = get_insns ();
29892 end_sequence ();
29894 emit_insn_after (seq, bb_note (bb));
29895 return true;
29898 /* JUMP is a nonlocal goto. Its target requires PSTATE.SM to be 0 on entry,
29899 so arrange to make it so. */
29901 static bool
29902 aarch64_switch_pstate_sm_for_jump (rtx_insn *jump)
29904 if (TARGET_NON_STREAMING)
29905 return false;
29907 start_sequence ();
29908 rtx_insn *guard_label = nullptr;
29909 if (TARGET_STREAMING_COMPATIBLE)
29910 guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
29911 AARCH64_FL_SM_OFF);
29912 aarch64_switch_pstate_sm (AARCH64_FL_SM_ON, AARCH64_FL_SM_OFF);
29913 if (guard_label)
29914 emit_label (guard_label);
29915 auto seq = get_insns ();
29916 end_sequence ();
29918 emit_insn_before (seq, jump);
29919 return true;
29922 /* If CALL involves a change in PSTATE.SM, emit the instructions needed
29923 to switch to the new mode and the instructions needed to restore the
29924 original mode. Return true if something changed. */
29925 static bool
29926 aarch64_switch_pstate_sm_for_call (rtx_call_insn *call)
29928 /* Mode switches for sibling calls are handled via the epilogue. */
29929 if (SIBLING_CALL_P (call))
29930 return false;
29932 auto callee_isa_mode = aarch64_insn_callee_isa_mode (call);
29933 if (!aarch64_call_switches_pstate_sm (callee_isa_mode))
29934 return false;
29936 /* Switch mode before the call, preserving any argument registers
29937 across the switch. */
29938 start_sequence ();
29939 rtx_insn *args_guard_label = nullptr;
29940 if (TARGET_STREAMING_COMPATIBLE)
29941 args_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
29942 callee_isa_mode);
29943 aarch64_sme_mode_switch_regs args_switch;
29944 args_switch.add_call_args (call);
29945 args_switch.emit_prologue ();
29946 aarch64_switch_pstate_sm (AARCH64_ISA_MODE, callee_isa_mode);
29947 args_switch.emit_epilogue ();
29948 if (args_guard_label)
29949 emit_label (args_guard_label);
29950 auto args_seq = get_insns ();
29951 end_sequence ();
29952 emit_insn_before (args_seq, call);
29954 if (find_reg_note (call, REG_NORETURN, NULL_RTX))
29955 return true;
29957 /* Switch mode after the call, preserving any return registers across
29958 the switch. */
29959 start_sequence ();
29960 rtx_insn *return_guard_label = nullptr;
29961 if (TARGET_STREAMING_COMPATIBLE)
29962 return_guard_label = aarch64_guard_switch_pstate_sm (IP0_REGNUM,
29963 callee_isa_mode);
29964 aarch64_sme_mode_switch_regs return_switch;
29965 return_switch.add_call_result (call);
29966 return_switch.emit_prologue ();
29967 aarch64_switch_pstate_sm (callee_isa_mode, AARCH64_ISA_MODE);
29968 return_switch.emit_epilogue ();
29969 if (return_guard_label)
29970 emit_label (return_guard_label);
29971 auto result_seq = get_insns ();
29972 end_sequence ();
29973 emit_insn_after (result_seq, call);
29974 return true;
29977 namespace {
29979 const pass_data pass_data_switch_pstate_sm =
29981 RTL_PASS, // type
29982 "smstarts", // name
29983 OPTGROUP_NONE, // optinfo_flags
29984 TV_NONE, // tv_id
29985 0, // properties_required
29986 0, // properties_provided
29987 0, // properties_destroyed
29988 0, // todo_flags_start
29989 TODO_df_finish, // todo_flags_finish
29992 class pass_switch_pstate_sm : public rtl_opt_pass
29994 public:
29995 pass_switch_pstate_sm (gcc::context *ctxt)
29996 : rtl_opt_pass (pass_data_switch_pstate_sm, ctxt)
29999 // opt_pass methods:
30000 bool gate (function *) override final;
30001 unsigned int execute (function *) override final;
30004 bool
30005 pass_switch_pstate_sm::gate (function *fn)
30007 return (aarch64_fndecl_pstate_sm (fn->decl) != AARCH64_FL_SM_OFF
30008 || cfun->machine->call_switches_pstate_sm);
30011 /* Emit any instructions needed to switch PSTATE.SM. */
30012 unsigned int
30013 pass_switch_pstate_sm::execute (function *fn)
30015 basic_block bb;
30017 auto_sbitmap blocks (last_basic_block_for_fn (cfun));
30018 bitmap_clear (blocks);
30019 FOR_EACH_BB_FN (bb, fn)
30021 if (has_abnormal_call_or_eh_pred_edge_p (bb)
30022 && aarch64_switch_pstate_sm_for_landing_pad (bb))
30023 bitmap_set_bit (blocks, bb->index);
30025 if (cfun->machine->call_switches_pstate_sm)
30027 rtx_insn *insn;
30028 FOR_BB_INSNS (bb, insn)
30029 if (auto *call = dyn_cast<rtx_call_insn *> (insn))
30030 if (aarch64_switch_pstate_sm_for_call (call))
30031 bitmap_set_bit (blocks, bb->index);
30034 auto end = BB_END (bb);
30035 if (JUMP_P (end)
30036 && find_reg_note (end, REG_NON_LOCAL_GOTO, NULL_RTX)
30037 && aarch64_switch_pstate_sm_for_jump (end))
30038 bitmap_set_bit (blocks, bb->index);
30040 find_many_sub_basic_blocks (blocks);
30041 clear_aux_for_blocks ();
30042 return 0;
30047 rtl_opt_pass *
30048 make_pass_switch_pstate_sm (gcc::context *ctxt)
30050 return new pass_switch_pstate_sm (ctxt);
30053 /* Parse an implementation-defined system register name of
30054 the form S[0-3]_[0-7]_C[0-15]_C[0-15]_[0-7].
30055 Return true if the name matches the above pattern, false
30056 otherwise. */
30057 bool
30058 aarch64_is_implem_def_reg (const char *regname)
30060 unsigned pos = 0;
30061 unsigned name_len = strlen (regname);
30062 if (name_len < 12 || name_len > 14)
30063 return false;
30065 auto cterm_valid_p = [&]()
30067 bool leading_zero_p = false;
30068 unsigned i = 0;
30069 char n[3] = {0};
30071 if (regname[pos] != 'c')
30072 return false;
30073 pos++;
30074 while (regname[pos] != '_')
30076 if (leading_zero_p)
30077 return false;
30078 if (i == 0 && regname[pos] == '0')
30079 leading_zero_p = true;
30080 if (i > 2)
30081 return false;
30082 if (!ISDIGIT (regname[pos]))
30083 return false;
30084 n[i++] = regname[pos++];
30086 if (atoi (n) > 15)
30087 return false;
30088 return true;
30091 if (regname[pos] != 's')
30092 return false;
30093 pos++;
30094 if (regname[pos] < '0' || regname[pos] > '3')
30095 return false;
30096 pos++;
30097 if (regname[pos++] != '_')
30098 return false;
30099 if (regname[pos] < '0' || regname[pos] > '7')
30100 return false;
30101 pos++;
30102 if (regname[pos++] != '_')
30103 return false;
30104 if (!cterm_valid_p ())
30105 return false;
30106 if (regname[pos++] != '_')
30107 return false;
30108 if (!cterm_valid_p ())
30109 return false;
30110 if (regname[pos++] != '_')
30111 return false;
30112 if (regname[pos] < '0' || regname[pos] > '7')
30113 return false;
30114 return true;
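/* For example, "s3_0_c15_c2_0" is accepted as an implementation-defined
   register name, whereas "s4_0_c15_c2_0" (first field out of range),
   "s3_0_c16_c2_0" (CRn value greater than 15) and "s3_0_c01_c2_0" (leading
   zero) are all rejected by the checks above.  */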
30117 /* Return true if REGNAME matches either a known permitted system
30118 register name, or a generic sysreg specification. For use in
30119 back-end predicate `aarch64_sysreg_string'. */
30120 bool
30121 aarch64_valid_sysreg_name_p (const char *regname)
30123 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30124 if (sysreg == NULL)
30125 return aarch64_is_implem_def_reg (regname);
30126 if (sysreg->arch_reqs)
30127 return (aarch64_isa_flags & sysreg->arch_reqs);
30128 return true;
30131 /* Return the generic sysreg specification for a valid system register
30132 name, otherwise NULL. WRITE_P is true iff the register is being
30133 written to. IS128OP indicates the requested system register should
30134 be checked for a 128-bit implementation. */
30135 const char *
30136 aarch64_retrieve_sysreg (const char *regname, bool write_p, bool is128op)
30138 const sysreg_t *sysreg = aarch64_lookup_sysreg_map (regname);
30139 if (sysreg == NULL)
30141 if (aarch64_is_implem_def_reg (regname))
30142 return regname;
30143 else
30144 return NULL;
30146 if (is128op && !(sysreg->properties & F_REG_128))
30147 return NULL;
30148 if ((write_p && (sysreg->properties & F_REG_READ))
30149 || (!write_p && (sysreg->properties & F_REG_WRITE)))
30150 return NULL;
30151 if ((~aarch64_isa_flags & sysreg->arch_reqs) != 0)
30152 return NULL;
30153 return sysreg->encoding;
30156 /* Target-specific selftests. */
30158 #if CHECKING_P
30160 namespace selftest {
30162 /* Selftest for the RTL loader.
30163 Verify that the RTL loader copes with a dump from
30164 print_rtx_function. This is essentially just a test that class
30165 function_reader can handle a real dump, but it also verifies
30166 that lookup_reg_by_dump_name correctly handles hard regs.
30167 The presence of hard reg names in the dump means that the test is
30168 target-specific, hence it is in this file. */
30170 static void
30171 aarch64_test_loading_full_dump ()
30173 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
30175 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
30177 rtx_insn *insn_1 = get_insn_by_uid (1);
30178 ASSERT_EQ (NOTE, GET_CODE (insn_1));
30180 rtx_insn *insn_15 = get_insn_by_uid (15);
30181 ASSERT_EQ (INSN, GET_CODE (insn_15));
30182 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
30184 /* Verify crtl->return_rtx. */
30185 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
30186 ASSERT_EQ (0, REGNO (crtl->return_rtx));
30187 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
30190 /* Test the fractional_cost class. */
30192 static void
30193 aarch64_test_fractional_cost ()
30195 using cf = fractional_cost;
30197 ASSERT_EQ (cf (0, 20), 0);
30199 ASSERT_EQ (cf (4, 2), 2);
30200 ASSERT_EQ (3, cf (9, 3));
30202 ASSERT_NE (cf (5, 2), 2);
30203 ASSERT_NE (3, cf (8, 3));
30205 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
30206 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
30207 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
30209 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
30210 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
30211 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
30212 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
30213 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
30214 ASSERT_EQ (3 - cf (10, 3), 0);
30216 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
30217 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
30219 ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
30220 ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
30221 ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
30222 ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
30223 ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
30224 ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
30225 ASSERT_TRUE (cf (239, 240) <= 1);
30226 ASSERT_TRUE (cf (240, 240) <= 1);
30227 ASSERT_FALSE (cf (241, 240) <= 1);
30228 ASSERT_FALSE (2 <= cf (207, 104));
30229 ASSERT_TRUE (2 <= cf (208, 104));
30230 ASSERT_TRUE (2 <= cf (209, 104));
30232 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
30233 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
30234 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
30235 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
30236 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
30237 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
30238 ASSERT_TRUE (cf (239, 240) < 1);
30239 ASSERT_FALSE (cf (240, 240) < 1);
30240 ASSERT_FALSE (cf (241, 240) < 1);
30241 ASSERT_FALSE (2 < cf (207, 104));
30242 ASSERT_FALSE (2 < cf (208, 104));
30243 ASSERT_TRUE (2 < cf (209, 104));
30245 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
30246 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
30247 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
30248 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
30249 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
30250 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
30251 ASSERT_FALSE (cf (239, 240) >= 1);
30252 ASSERT_TRUE (cf (240, 240) >= 1);
30253 ASSERT_TRUE (cf (241, 240) >= 1);
30254 ASSERT_TRUE (2 >= cf (207, 104));
30255 ASSERT_TRUE (2 >= cf (208, 104));
30256 ASSERT_FALSE (2 >= cf (209, 104));
30258 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
30259 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
30260 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
30261 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
30262 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
30263 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
30264 ASSERT_FALSE (cf (239, 240) > 1);
30265 ASSERT_FALSE (cf (240, 240) > 1);
30266 ASSERT_TRUE (cf (241, 240) > 1);
30267 ASSERT_TRUE (2 > cf (207, 104));
30268 ASSERT_FALSE (2 > cf (208, 104));
30269 ASSERT_FALSE (2 > cf (209, 104));
30271 ASSERT_EQ (cf (1, 2).ceil (), 1);
30272 ASSERT_EQ (cf (11, 7).ceil (), 2);
30273 ASSERT_EQ (cf (20, 1).ceil (), 20);
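/* The remaining ceil () checks use values close to UINT_MAX; the expected
   results indicate that fractional_cost saturates at 0xffffffff rather
   than wrapping around. */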
30274 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
30275 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
30276 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
30277 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
30278 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
30280 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
30281 }
30283 /* Check whether our system register data, as imported from
30284 `aarch64-sys-regs.def', has any duplicate entries. */
30285 static void
30286 aarch64_test_sysreg_encoding_clashes (void)
30287 {
30288 using dup_instances_t = hash_map<nofree_string_hash,
30289 std::vector<const sysreg_t*>>;
30291 dup_instances_t duplicate_instances;
30293 /* Every encoding that turns out to occur more than once is added to a
30294 "clash-analysis queue", which is then used to extract the necessary
30295 information from our hash map when establishing whether the repeated
30296 encodings are valid. */
30298 /* 1) Collect recurrence information. */
30299 for (unsigned i = 0; i < ARRAY_SIZE (aarch64_sysregs); i++)
30300 {
30301 const sysreg_t *reg = aarch64_sysregs + i;
30303 std::vector<const sysreg_t*> *tmp
30304 = &duplicate_instances.get_or_insert (reg->encoding);
30306 tmp->push_back (reg);
30307 }
30309 /* 2) Carry out analysis on collected data. */
30310 for (auto instance : duplicate_instances)
30311 {
30312 unsigned nrep = instance.second.size ();
30313 if (nrep > 1)
30314 for (unsigned i = 0; i < nrep; i++)
30315 for (unsigned j = i + 1; j < nrep; j++)
30316 {
30317 const sysreg_t *a = instance.second[i];
30318 const sysreg_t *b = instance.second[j];
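/* Registers may share an encoding only if they remain distinguishable,
   i.e. they differ in their properties or in their architecture
   requirements. */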
30319 ASSERT_TRUE ((a->properties != b->properties)
30320 || (a->arch_reqs != b->arch_reqs));
30321 }
30322 }
30323 }
30325 /* Run all target-specific selftests. */
30327 static void
30328 aarch64_run_selftests (void)
30329 {
30330 aarch64_test_loading_full_dump ();
30331 aarch64_test_fractional_cost ();
30332 aarch64_test_sysreg_encoding_clashes ();
30333 }
30335 } // namespace selftest
30337 #endif /* #if CHECKING_P */
30339 #undef TARGET_STACK_PROTECT_GUARD
30340 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
30342 #undef TARGET_ADDRESS_COST
30343 #define TARGET_ADDRESS_COST aarch64_address_cost
30345 /* This hook determines whether unnamed bitfields affect the alignment
30346 of the containing structure. The hook returns true if the structure
30347 should inherit the alignment requirements of an unnamed bitfield's
30348 type. */
30349 #undef TARGET_ALIGN_ANON_BITFIELD
30350 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
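/* For illustration (assuming the usual AAPCS64 layout rules): given

     struct s { char c; int : 0; char d; };

   returning true here means the unnamed 'int' bit-field makes 'struct s'
   inherit int's alignment requirement even though no named member
   needs it. */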
30352 #undef TARGET_ASM_ALIGNED_DI_OP
30353 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
30355 #undef TARGET_ASM_ALIGNED_HI_OP
30356 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
30358 #undef TARGET_ASM_ALIGNED_SI_OP
30359 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
30361 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
30362 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
30363 hook_bool_const_tree_hwi_hwi_const_tree_true
30365 #undef TARGET_ASM_FILE_START
30366 #define TARGET_ASM_FILE_START aarch64_start_file
30368 #undef TARGET_ASM_OUTPUT_MI_THUNK
30369 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
30371 #undef TARGET_ASM_SELECT_RTX_SECTION
30372 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
30374 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
30375 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
30377 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
30378 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
30380 #undef TARGET_BUILD_BUILTIN_VA_LIST
30381 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
30383 #undef TARGET_CALLEE_COPIES
30384 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
30386 #undef TARGET_FRAME_POINTER_REQUIRED
30387 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
30389 #undef TARGET_CAN_ELIMINATE
30390 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
30392 #undef TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P
30393 #define TARGET_FUNCTION_ATTRIBUTE_INLINABLE_P \
30394 aarch64_function_attribute_inlinable_p
30396 #undef TARGET_NEED_IPA_FN_TARGET_INFO
30397 #define TARGET_NEED_IPA_FN_TARGET_INFO aarch64_need_ipa_fn_target_info
30399 #undef TARGET_UPDATE_IPA_FN_TARGET_INFO
30400 #define TARGET_UPDATE_IPA_FN_TARGET_INFO aarch64_update_ipa_fn_target_info
30402 #undef TARGET_CAN_INLINE_P
30403 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
30405 #undef TARGET_CANNOT_FORCE_CONST_MEM
30406 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
30408 #undef TARGET_CASE_VALUES_THRESHOLD
30409 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
30411 #undef TARGET_CONDITIONAL_REGISTER_USAGE
30412 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
30414 #undef TARGET_MEMBER_TYPE_FORCES_BLK
30415 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
30417 /* Only the least significant bit is used for initialization guard
30418 variables. */
30419 #undef TARGET_CXX_GUARD_MASK_BIT
30420 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
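/* Roughly (a sketch, not the exact code GCC emits), the inline fast path
   for a guarded static initialization then only needs to test bit 0:

     if (!(guard & 1))
       slow_path ();    // __cxa_guard_acquire + run the initializer

   rather than comparing the whole guard variable. */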
30422 #undef TARGET_C_MODE_FOR_SUFFIX
30423 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
30425 #ifdef TARGET_BIG_ENDIAN_DEFAULT
30426 #undef TARGET_DEFAULT_TARGET_FLAGS
30427 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
30428 #endif
30430 #undef TARGET_CLASS_MAX_NREGS
30431 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
30433 #undef TARGET_BUILTIN_DECL
30434 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
30436 #undef TARGET_BUILTIN_RECIPROCAL
30437 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
30439 #undef TARGET_C_EXCESS_PRECISION
30440 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
30442 #undef TARGET_EXPAND_BUILTIN
30443 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
30445 #undef TARGET_EXPAND_BUILTIN_VA_START
30446 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
30448 #undef TARGET_FOLD_BUILTIN
30449 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
30451 #undef TARGET_FUNCTION_ARG
30452 #define TARGET_FUNCTION_ARG aarch64_function_arg
30454 #undef TARGET_FUNCTION_ARG_ADVANCE
30455 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
30457 #undef TARGET_FUNCTION_ARG_BOUNDARY
30458 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
30460 #undef TARGET_FUNCTION_ARG_PADDING
30461 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
30463 #undef TARGET_GET_RAW_RESULT_MODE
30464 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
30465 #undef TARGET_GET_RAW_ARG_MODE
30466 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
30468 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
30469 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
30471 #undef TARGET_FUNCTION_VALUE
30472 #define TARGET_FUNCTION_VALUE aarch64_function_value
30474 #undef TARGET_FUNCTION_VALUE_REGNO_P
30475 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
30477 #undef TARGET_START_CALL_ARGS
30478 #define TARGET_START_CALL_ARGS aarch64_start_call_args
30480 #undef TARGET_END_CALL_ARGS
30481 #define TARGET_END_CALL_ARGS aarch64_end_call_args
30483 #undef TARGET_GIMPLE_FOLD_BUILTIN
30484 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
30486 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
30487 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
30489 #undef TARGET_INIT_BUILTINS
30490 #define TARGET_INIT_BUILTINS aarch64_init_builtins
30492 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
30493 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
30494 aarch64_ira_change_pseudo_allocno_class
30496 #undef TARGET_LEGITIMATE_ADDRESS_P
30497 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
30499 #undef TARGET_LEGITIMATE_CONSTANT_P
30500 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
30502 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
30503 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
30504 aarch64_legitimize_address_displacement
30506 #undef TARGET_LIBGCC_CMP_RETURN_MODE
30507 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
30509 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
30510 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
30511 aarch64_libgcc_floating_mode_supported_p
30513 #undef TARGET_MANGLE_TYPE
30514 #define TARGET_MANGLE_TYPE aarch64_mangle_type
30516 #undef TARGET_INVALID_BINARY_OP
30517 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
30519 #undef TARGET_VERIFY_TYPE_CONTEXT
30520 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
30522 #undef TARGET_MEMORY_MOVE_COST
30523 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
30525 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
30526 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
30528 #undef TARGET_MUST_PASS_IN_STACK
30529 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
30531 /* This target hook should return true if accesses to volatile bitfields
30532 should use the narrowest mode possible. It should return false if these
30533 accesses should use the bitfield container type. */
30534 #undef TARGET_NARROW_VOLATILE_BITFIELD
30535 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
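/* For illustration: with this hook returning false, a volatile bit-field
   such as

     struct s { volatile unsigned int f : 8; };

   is accessed through its 32-bit container type rather than through the
   narrowest mode (a single byte) that would cover the field. */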
30537 #undef TARGET_OPTION_OVERRIDE
30538 #define TARGET_OPTION_OVERRIDE aarch64_override_options
30540 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
30541 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
30542 aarch64_override_options_after_change
30544 #undef TARGET_OFFLOAD_OPTIONS
30545 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
30547 #undef TARGET_OPTION_RESTORE
30548 #define TARGET_OPTION_RESTORE aarch64_option_restore
30550 #undef TARGET_OPTION_PRINT
30551 #define TARGET_OPTION_PRINT aarch64_option_print
30553 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
30554 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
30556 #undef TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P
30557 #define TARGET_OPTION_VALID_VERSION_ATTRIBUTE_P \
30558 aarch64_option_valid_version_attribute_p
30560 #undef TARGET_SET_CURRENT_FUNCTION
30561 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
30563 #undef TARGET_PASS_BY_REFERENCE
30564 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
30566 #undef TARGET_PREFERRED_RELOAD_CLASS
30567 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
30569 #undef TARGET_SCHED_REASSOCIATION_WIDTH
30570 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
30572 #undef TARGET_DWARF_FRAME_REG_MODE
30573 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
30575 #undef TARGET_PROMOTED_TYPE
30576 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
30578 #undef TARGET_SECONDARY_RELOAD
30579 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
30581 #undef TARGET_SECONDARY_MEMORY_NEEDED
30582 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
30584 #undef TARGET_SHIFT_TRUNCATION_MASK
30585 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
30587 #undef TARGET_SETUP_INCOMING_VARARGS
30588 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
30590 #undef TARGET_STRUCT_VALUE_RTX
30591 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
30593 #undef TARGET_REGISTER_MOVE_COST
30594 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
30596 #undef TARGET_RETURN_IN_MEMORY
30597 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
30599 #undef TARGET_RETURN_IN_MSB
30600 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
30602 #undef TARGET_RTX_COSTS
30603 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
30605 #undef TARGET_INSN_COST
30606 #define TARGET_INSN_COST aarch64_insn_cost
30608 #undef TARGET_SCALAR_MODE_SUPPORTED_P
30609 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
30611 #undef TARGET_SCHED_ISSUE_RATE
30612 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
30614 #undef TARGET_SCHED_VARIABLE_ISSUE
30615 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
30617 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
30618 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
30619 aarch64_sched_first_cycle_multipass_dfa_lookahead
30621 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
30622 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
30623 aarch64_first_cycle_multipass_dfa_lookahead_guard
30625 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
30626 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
30627 aarch64_get_separate_components
30629 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
30630 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
30631 aarch64_components_for_bb
30633 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
30634 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
30635 aarch64_disqualify_components
30637 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
30638 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
30639 aarch64_emit_prologue_components
30641 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
30642 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
30643 aarch64_emit_epilogue_components
30645 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
30646 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
30647 aarch64_set_handled_components
30649 #undef TARGET_TRAMPOLINE_INIT
30650 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
30652 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
30653 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
30655 #undef TARGET_VECTOR_MODE_SUPPORTED_P
30656 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
30658 #undef TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P
30659 #define TARGET_VECTOR_MODE_SUPPORTED_ANY_TARGET_P aarch64_vector_mode_supported_any_target_p
30661 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
30662 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
30664 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
30665 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
30666 aarch64_builtin_support_vector_misalignment
30668 #undef TARGET_ARRAY_MODE
30669 #define TARGET_ARRAY_MODE aarch64_array_mode
30671 #undef TARGET_ARRAY_MODE_SUPPORTED_P
30672 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
30674 #undef TARGET_VECTORIZE_CREATE_COSTS
30675 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
30677 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
30678 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
30679 aarch64_builtin_vectorization_cost
30681 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
30682 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
30684 #undef TARGET_VECTORIZE_BUILTINS
30685 #define TARGET_VECTORIZE_BUILTINS
30687 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
30688 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
30689 aarch64_autovectorize_vector_modes
30691 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
30692 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
30693 aarch64_atomic_assign_expand_fenv
30695 /* Section anchor support. */
30697 #undef TARGET_MIN_ANCHOR_OFFSET
30698 #define TARGET_MIN_ANCHOR_OFFSET -256
30700 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
30701 byte offset; we can do much more for larger data types, but have no way
30702 to determine the size of the access. We assume accesses are aligned. */
30703 #undef TARGET_MAX_ANCHOR_OFFSET
30704 #define TARGET_MAX_ANCHOR_OFFSET 4095
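/* The [-256, 4095] anchor window matches what a single load/store can
   address relative to an anchor: 4095 is the largest unsigned immediate
   offset for a byte access (e.g. "ldrb w0, [x1, #4095]"), and -256 is
   the most negative offset accepted by the unscaled LDUR/STUR forms. */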
30706 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
30707 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
30708 aarch64_vectorize_preferred_div_as_shifts_over_mult
30710 #undef TARGET_VECTOR_ALIGNMENT
30711 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
30713 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
30714 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
30715 aarch64_vectorize_preferred_vector_alignment
30716 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
30717 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
30718 aarch64_simd_vector_alignment_reachable
30720 /* vec_perm support. */
30722 #undef TARGET_VECTORIZE_VEC_PERM_CONST
30723 #define TARGET_VECTORIZE_VEC_PERM_CONST \
30724 aarch64_vectorize_vec_perm_const
30726 #undef TARGET_VECTORIZE_RELATED_MODE
30727 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
30728 #undef TARGET_VECTORIZE_GET_MASK_MODE
30729 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
30730 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
30731 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
30732 aarch64_empty_mask_is_expensive
30733 #undef TARGET_PREFERRED_ELSE_VALUE
30734 #define TARGET_PREFERRED_ELSE_VALUE \
30735 aarch64_preferred_else_value
30737 #undef TARGET_INIT_LIBFUNCS
30738 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
30740 #undef TARGET_FIXED_CONDITION_CODE_REGS
30741 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
30743 #undef TARGET_FLAGS_REGNUM
30744 #define TARGET_FLAGS_REGNUM CC_REGNUM
30746 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
30747 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
30749 #undef TARGET_ASAN_SHADOW_OFFSET
30750 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
30752 #undef TARGET_LEGITIMIZE_ADDRESS
30753 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
30755 #undef TARGET_SCHED_CAN_SPECULATE_INSN
30756 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
30758 #undef TARGET_CAN_USE_DOLOOP_P
30759 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
30761 #undef TARGET_SCHED_ADJUST_PRIORITY
30762 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
30764 #undef TARGET_SCHED_MACRO_FUSION_P
30765 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
30767 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
30768 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
30770 #undef TARGET_SCHED_FUSION_PRIORITY
30771 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
30773 #undef TARGET_UNSPEC_MAY_TRAP_P
30774 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
30776 #undef TARGET_USE_PSEUDO_PIC_REG
30777 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
30779 #undef TARGET_PRINT_OPERAND
30780 #define TARGET_PRINT_OPERAND aarch64_print_operand
30782 #undef TARGET_PRINT_OPERAND_ADDRESS
30783 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
30785 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
30786 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
30788 #undef TARGET_OPTAB_SUPPORTED_P
30789 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
30791 #undef TARGET_OMIT_STRUCT_RETURN_REG
30792 #define TARGET_OMIT_STRUCT_RETURN_REG true
30794 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
30795 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
30796 aarch64_dwarf_poly_indeterminate_value
30798 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
30799 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
30800 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
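/* That is, 4 == (1 << 2): a function pointer with bit 2 set is taken to
   point at a descriptor rather than directly at code. */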
30802 #undef TARGET_HARD_REGNO_NREGS
30803 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
30804 #undef TARGET_HARD_REGNO_MODE_OK
30805 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
30807 #undef TARGET_MODES_TIEABLE_P
30808 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
30810 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
30811 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
30812 aarch64_hard_regno_call_part_clobbered
30814 #undef TARGET_INSN_CALLEE_ABI
30815 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
30817 #undef TARGET_CONSTANT_ALIGNMENT
30818 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
30820 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
30821 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
30822 aarch64_stack_clash_protection_alloca_probe_range
30824 #undef TARGET_COMPUTE_PRESSURE_CLASSES
30825 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
30827 #undef TARGET_CAN_CHANGE_MODE_CLASS
30828 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
30830 #undef TARGET_SELECT_EARLY_REMAT_MODES
30831 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
30833 #undef TARGET_SPECULATION_SAFE_VALUE
30834 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
30836 #undef TARGET_ESTIMATED_POLY_VALUE
30837 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
30839 #undef TARGET_ATTRIBUTE_TABLE
30840 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
30842 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
30843 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
30844 aarch64_simd_clone_compute_vecsize_and_simdlen
30846 #undef TARGET_SIMD_CLONE_ADJUST
30847 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
30849 #undef TARGET_SIMD_CLONE_USABLE
30850 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
30852 #undef TARGET_COMP_TYPE_ATTRIBUTES
30853 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
30855 #undef TARGET_MERGE_DECL_ATTRIBUTES
30856 #define TARGET_MERGE_DECL_ATTRIBUTES aarch64_merge_decl_attributes
30858 #undef TARGET_GET_MULTILIB_ABI_NAME
30859 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
30861 #undef TARGET_FNTYPE_ABI
30862 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
30864 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
30865 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
30867 #if CHECKING_P
30868 #undef TARGET_RUN_TARGET_SELFTESTS
30869 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
30870 #endif /* #if CHECKING_P */
30872 #undef TARGET_ASM_POST_CFI_STARTPROC
30873 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
30875 #undef TARGET_STRICT_ARGUMENT_NAMING
30876 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
30878 #undef TARGET_MODE_EMIT
30879 #define TARGET_MODE_EMIT aarch64_mode_emit
30881 #undef TARGET_MODE_NEEDED
30882 #define TARGET_MODE_NEEDED aarch64_mode_needed
30884 #undef TARGET_MODE_AFTER
30885 #define TARGET_MODE_AFTER aarch64_mode_after
30887 #undef TARGET_MODE_CONFLUENCE
30888 #define TARGET_MODE_CONFLUENCE aarch64_mode_confluence
30890 #undef TARGET_MODE_BACKPROP
30891 #define TARGET_MODE_BACKPROP aarch64_mode_backprop
30893 #undef TARGET_MODE_ENTRY
30894 #define TARGET_MODE_ENTRY aarch64_mode_entry
30896 #undef TARGET_MODE_EXIT
30897 #define TARGET_MODE_EXIT aarch64_mode_exit
30899 #undef TARGET_MODE_EH_HANDLER
30900 #define TARGET_MODE_EH_HANDLER aarch64_mode_eh_handler
30902 #undef TARGET_MODE_PRIORITY
30903 #define TARGET_MODE_PRIORITY aarch64_mode_priority
30905 #undef TARGET_MD_ASM_ADJUST
30906 #define TARGET_MD_ASM_ADJUST aarch64_md_asm_adjust
30908 #undef TARGET_ASM_FILE_END
30909 #define TARGET_ASM_FILE_END aarch64_asm_file_end
30911 #undef TARGET_ASM_FUNCTION_EPILOGUE
30912 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
30914 #undef TARGET_HAVE_SHADOW_CALL_STACK
30915 #define TARGET_HAVE_SHADOW_CALL_STACK true
30917 #undef TARGET_CONST_ANCHOR
30918 #define TARGET_CONST_ANCHOR 0x1000000
30920 #undef TARGET_EXTRA_LIVE_ON_ENTRY
30921 #define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_extra_live_on_entry
30923 #undef TARGET_USE_LATE_PROLOGUE_EPILOGUE
30924 #define TARGET_USE_LATE_PROLOGUE_EPILOGUE aarch64_use_late_prologue_epilogue
30926 #undef TARGET_EMIT_EPILOGUE_FOR_SIBCALL
30927 #define TARGET_EMIT_EPILOGUE_FOR_SIBCALL aarch64_expand_epilogue
30929 #undef TARGET_OPTION_FUNCTION_VERSIONS
30930 #define TARGET_OPTION_FUNCTION_VERSIONS aarch64_common_function_versions
30932 #undef TARGET_COMPARE_VERSION_PRIORITY
30933 #define TARGET_COMPARE_VERSION_PRIORITY aarch64_compare_version_priority
30935 #undef TARGET_GENERATE_VERSION_DISPATCHER_BODY
30936 #define TARGET_GENERATE_VERSION_DISPATCHER_BODY \
30937 aarch64_generate_version_dispatcher_body
30939 #undef TARGET_GET_FUNCTION_VERSIONS_DISPATCHER
30940 #define TARGET_GET_FUNCTION_VERSIONS_DISPATCHER \
30941 aarch64_get_function_versions_dispatcher
30943 #undef TARGET_MANGLE_DECL_ASSEMBLER_NAME
30944 #define TARGET_MANGLE_DECL_ASSEMBLER_NAME aarch64_mangle_decl_assembler_name
30946 struct gcc_target targetm = TARGET_INITIALIZER;
30948 #include "gt-aarch64.h"